change dockerfile

Files changed:

- search_engines/__init__.py +14 -0
- search_engines/__pycache__/__init__.cpython-312.pyc +0 -0
- search_engines/__pycache__/base.cpython-312.pyc +0 -0
- search_engines/__pycache__/instagram.cpython-312.pyc +0 -0
- search_engines/__pycache__/manager.cpython-312.pyc +0 -0
- search_engines/__pycache__/pinterest.cpython-312.pyc +0 -0
- search_engines/__pycache__/reddit.cpython-312.pyc +0 -0
- search_engines/base.py +96 -0
- search_engines/instagram.py +105 -0
- search_engines/manager.py +196 -0
- search_engines/pinterest.py +117 -0
- search_engines/reddit.py +114 -0
- utils/__init__.py +6 -0
- utils/__pycache__/__init__.cpython-312.pyc +0 -0
- utils/__pycache__/cache.cpython-312.pyc +0 -0
- utils/__pycache__/url_validator.cpython-312.pyc +0 -0
- utils/cache.py +105 -0
- utils/url_validator.py +189 -0
search_engines/__init__.py
ADDED
@@ -0,0 +1,14 @@
+"""Search engines package for tattoo image discovery."""
+
+from .base import BaseSearchEngine, ImageResult, SearchPlatform, SearchResult
+from .pinterest import PinterestSearchEngine
+from .manager import SearchEngineManager
+
+__all__ = [
+    "BaseSearchEngine",
+    "ImageResult",
+    "SearchPlatform",
+    "SearchResult",
+    "PinterestSearchEngine",
+    "SearchEngineManager",
+]
search_engines/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (536 Bytes)

search_engines/__pycache__/base.cpython-312.pyc
ADDED
Binary file (5.01 kB)

search_engines/__pycache__/instagram.cpython-312.pyc
ADDED
Binary file (5.03 kB)

search_engines/__pycache__/manager.cpython-312.pyc
ADDED
Binary file (8.65 kB)

search_engines/__pycache__/pinterest.cpython-312.pyc
ADDED
Binary file (5.45 kB)

search_engines/__pycache__/reddit.cpython-312.pyc
ADDED
Binary file (5.02 kB)
search_engines/base.py
ADDED
@@ -0,0 +1,96 @@
+"""Base classes for image search engines."""
+
+import logging
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from enum import Enum
+from typing import Dict, List, Optional, Set
+
+logger = logging.getLogger(__name__)
+
+
+class SearchPlatform(Enum):
+    """Supported search platforms."""
+    PINTEREST = "pinterest"
+    INSTAGRAM = "instagram"
+    REDDIT = "reddit"
+    FLICKR = "flickr"
+    DEVIANTART = "deviantart"
+    GENERAL = "general"
+
+
+@dataclass
+class ImageResult:
+    """Represents a single image search result."""
+    url: str
+    platform: SearchPlatform
+    quality_score: float = 0.0
+    width: Optional[int] = None
+    height: Optional[int] = None
+    title: Optional[str] = None
+    source_url: Optional[str] = None
+
+    @property
+    def resolution_score(self) -> float:
+        """Calculate score based on image resolution."""
+        if not self.width or not self.height:
+            return 0.5
+
+        total_pixels = self.width * self.height
+        if total_pixels >= 1000000:  # 1MP+
+            return 1.0
+        elif total_pixels >= 500000:  # 0.5MP+
+            return 0.8
+        elif total_pixels >= 250000:  # 0.25MP+
+            return 0.6
+        else:
+            return 0.3
+
+
+@dataclass
+class SearchResult:
+    """Container for all search results from multiple platforms."""
+    images: List[ImageResult]
+    total_found: int
+    platforms_used: Set[SearchPlatform]
+    search_duration: float
+
+    def get_top_results(self, limit: int = 50) -> List[ImageResult]:
+        """Get top results sorted by quality score."""
+        sorted_images = sorted(self.images, key=lambda x: x.quality_score, reverse=True)
+        return sorted_images[:limit]
+
+
+class BaseSearchEngine(ABC):
+    """Abstract base class for image search engines."""
+
+    def __init__(self, platform: SearchPlatform):
+        self.platform = platform
+        self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")
+
+    @abstractmethod
+    def search(self, query: str, max_results: int = 20) -> List[ImageResult]:
+        """Search for images on the platform."""
+        pass
+
+    @abstractmethod
+    def is_valid_url(self, url: str) -> bool:
+        """Check if URL is valid for this platform."""
+        pass
+
+    def get_quality_score(self, url: str, **kwargs) -> float:
+        """Calculate quality score for a URL (0.0 to 1.0)."""
+        score = 0.5  # Base score
+
+        # URL length penalty (very long URLs often broken)
+        if len(url) > 500:
+            score -= 0.2
+        elif len(url) > 300:
+            score -= 0.1
+
+        # Image extension bonus
+        image_extensions = ['.jpg', '.jpeg', '.png', '.webp']
+        if any(ext in url.lower() for ext in image_extensions):
+            score += 0.1
+
+        return max(0.0, min(1.0, score))
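Note: the SearchPlatform enum above also lists FLICKR, DEVIANTART and GENERAL, but this commit only adds Pinterest, Instagram and Reddit engines. As a rough sketch of the contract the ABC imposes, a hypothetical Flickr engine (not part of this commit; the staticflickr.com domain check is an assumption) would only need the two abstract methods:

    from typing import List
    from search_engines.base import BaseSearchEngine, ImageResult, SearchPlatform

    class FlickrSearchEngine(BaseSearchEngine):
        """Hypothetical sketch: would fill the currently unused FLICKR enum value."""

        def __init__(self):
            super().__init__(SearchPlatform.FLICKR)

        def search(self, query: str, max_results: int = 20) -> List[ImageResult]:
            # A real engine would run queries and wrap each hit in an ImageResult
            return []

        def is_valid_url(self, url: str) -> bool:
            # Assumed Flickr CDN domain, for illustration only
            return "staticflickr.com" in url.lower()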
search_engines/instagram.py
ADDED
@@ -0,0 +1,105 @@
+"""Instagram-specific search engine implementation."""
+
+import time
+from typing import List
+
+from ddgs import DDGS
+
+from .base import BaseSearchEngine, ImageResult, SearchPlatform
+
+
+class InstagramSearchEngine(BaseSearchEngine):
+    """Search engine for Instagram images."""
+
+    def __init__(self):
+        super().__init__(SearchPlatform.INSTAGRAM)
+        self.instagram_domains = {
+            "instagram.com",
+            "cdninstagram.com",
+            "scontent.cdninstagram.com",
+            "scontent-",  # Instagram CDN prefix
+        }
+
+    def search(self, query: str, max_results: int = 20) -> List[ImageResult]:
+        """Search Instagram for tattoo images."""
+        results = []
+
+        # Instagram hashtag-based queries
+        instagram_queries = self._build_instagram_queries(query)
+
+        try:
+            with DDGS() as ddgs:
+                for i, instagram_query in enumerate(instagram_queries):
+                    if i > 0:
+                        time.sleep(2)  # Instagram is more sensitive to rate limiting
+
+                    try:
+                        search_results = ddgs.images(
+                            instagram_query,
+                            region="wt-wt",
+                            safesearch="off",
+                            size="Medium",
+                            max_results=max_results // len(instagram_queries)
+                        )
+
+                        for result in search_results:
+                            url = result.get("image")
+                            if url and self.is_valid_url(url):
+                                image_result = self._create_image_result(url, result)
+                                results.append(image_result)
+
+                                if len(results) >= max_results:
+                                    break
+
+                    except Exception as e:
+                        self.logger.warning(f"Instagram query failed: {e}")
+                        continue
+
+        except Exception as e:
+            self.logger.error(f"Instagram search failed: {e}")
+
+        return results[:max_results]
+
+    def is_valid_url(self, url: str) -> bool:
+        """Check if URL is from Instagram domains."""
+        return any(domain in url.lower() for domain in self.instagram_domains)
+
+    def get_quality_score(self, url: str, **kwargs) -> float:
+        """Calculate Instagram-specific quality score."""
+        score = super().get_quality_score(url)
+
+        # Instagram CDN URLs are generally reliable
+        if "cdninstagram.com" in url or "scontent" in url:
+            score += 0.15
+
+        # Instagram posts tend to be high quality
+        if "instagram.com/p/" in url:
+            score += 0.1
+
+        return min(1.0, score)
+
+    def _build_instagram_queries(self, query: str) -> List[str]:
+        """Build Instagram-specific search queries."""
+        queries = []
+
+        # General Instagram search
+        queries.append(f"site:instagram.com {query} tattoo")
+
+        # Hashtag-focused searches
+        hashtag_queries = [
+            f"site:instagram.com #{query.replace(' ', '')}tattoo",
+            f"site:instagram.com #tattoo {query}",
+        ]
+
+        queries.extend(hashtag_queries)
+        return queries
+
+    def _create_image_result(self, url: str, raw_result: dict) -> ImageResult:
+        """Create ImageResult from raw Instagram search result."""
+        return ImageResult(
+            url=url,
+            platform=self.platform,
+            quality_score=self.get_quality_score(url),
+            title=raw_result.get("title"),
+            source_url=raw_result.get("source")
+        )
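For reference, the query builder above expands one user query into three DuckDuckGo queries; e.g. (output traced by hand from the code, input string illustrative):

    from search_engines.instagram import InstagramSearchEngine

    engine = InstagramSearchEngine()
    print(engine._build_instagram_queries("koi fish"))
    # ['site:instagram.com koi fish tattoo',
    #  'site:instagram.com #koifishtattoo',
    #  'site:instagram.com #tattoo koi fish']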
search_engines/manager.py
ADDED
@@ -0,0 +1,196 @@
+"""Search engine manager for coordinating multi-platform searches."""
+
+import asyncio
+import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import Dict, List, Optional, Set
+
+from .base import BaseSearchEngine, ImageResult, SearchPlatform, SearchResult
+from .instagram import InstagramSearchEngine
+from .pinterest import PinterestSearchEngine
+from .reddit import RedditSearchEngine
+
+
+class SearchEngineManager:
+    """Manages and coordinates searches across multiple platforms."""
+
+    def __init__(self, max_workers: int = 5):
+        self.max_workers = max_workers
+        self.engines: Dict[SearchPlatform, BaseSearchEngine] = {
+            SearchPlatform.PINTEREST: PinterestSearchEngine(),
+            SearchPlatform.REDDIT: RedditSearchEngine(),
+            SearchPlatform.INSTAGRAM: InstagramSearchEngine(),
+        }
+
+    def search_all_platforms(
+        self,
+        query: str,
+        max_results_per_platform: int = 20,
+        platforms: Optional[Set[SearchPlatform]] = None
+    ) -> SearchResult:
+        """Search across multiple platforms concurrently."""
+        start_time = time.time()
+
+        if platforms is None:
+            platforms = set(self.engines.keys())
+
+        all_results = []
+        platforms_used = set()
+
+        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+            # Submit search tasks for each platform
+            future_to_platform = {
+                executor.submit(
+                    self._search_single_platform,
+                    platform,
+                    query,
+                    max_results_per_platform
+                ): platform
+                for platform in platforms
+                if platform in self.engines
+            }
+
+            # Collect results as they complete
+            for future in as_completed(future_to_platform):
+                platform = future_to_platform[future]
+                try:
+                    platform_results = future.result(timeout=30)  # 30s timeout per platform
+                    if platform_results:
+                        all_results.extend(platform_results)
+                        platforms_used.add(platform)
+                except Exception as e:
+                    print(f"Platform {platform.value} search failed: {e}")
+
+        # Remove duplicates and sort by quality
+        unique_results = self._deduplicate_results(all_results)
+        sorted_results = sorted(unique_results, key=lambda x: x.quality_score, reverse=True)
+
+        search_duration = time.time() - start_time
+
+        return SearchResult(
+            images=sorted_results,
+            total_found=len(sorted_results),
+            platforms_used=platforms_used,
+            search_duration=search_duration
+        )
+
+    def search_with_fallback(
+        self,
+        query: str,
+        max_results: int = 50,
+        min_results_threshold: int = 10
+    ) -> SearchResult:
+        """Search with intelligent fallback strategies."""
+        # Try primary platforms first
+        primary_platforms = {SearchPlatform.PINTEREST, SearchPlatform.REDDIT}
+        result = self.search_all_platforms(
+            query,
+            max_results_per_platform=max_results // 2,
+            platforms=primary_platforms
+        )
+
+        # If we don't have enough results, try additional platforms
+        if len(result.images) < min_results_threshold:
+            additional_platforms = {SearchPlatform.INSTAGRAM}
+            additional_result = self.search_all_platforms(
+                query,
+                max_results_per_platform=max_results // 2,
+                platforms=additional_platforms
+            )
+
+            # Merge results
+            all_images = result.images + additional_result.images
+            unique_images = self._deduplicate_results(all_images)
+            sorted_images = sorted(unique_images, key=lambda x: x.quality_score, reverse=True)
+
+            result = SearchResult(
+                images=sorted_images,
+                total_found=len(sorted_images),
+                platforms_used=result.platforms_used | additional_result.platforms_used,
+                search_duration=result.search_duration + additional_result.search_duration
+            )
+
+        # If still not enough, try simplified queries
+        if len(result.images) < min_results_threshold:
+            simplified_query = self._simplify_query(query)
+            if simplified_query != query:
+                fallback_result = self.search_all_platforms(
+                    simplified_query,
+                    max_results_per_platform=max_results // 3
+                )
+
+                # Merge with existing results
+                all_images = result.images + fallback_result.images
+                unique_images = self._deduplicate_results(all_images)
+                sorted_images = sorted(unique_images, key=lambda x: x.quality_score, reverse=True)
+
+                result = SearchResult(
+                    images=sorted_images,
+                    total_found=len(sorted_images),
+                    platforms_used=result.platforms_used | fallback_result.platforms_used,
+                    search_duration=result.search_duration + fallback_result.search_duration
+                )
+
+        return result
+
+    def _search_single_platform(
+        self,
+        platform: SearchPlatform,
+        query: str,
+        max_results: int
+    ) -> List[ImageResult]:
+        """Search a single platform (thread-safe)."""
+        engine = self.engines.get(platform)
+        if not engine:
+            return []
+
+        try:
+            return engine.search(query, max_results)
+        except Exception as e:
+            print(f"Error searching {platform.value}: {e}")
+            return []
+
+    def _deduplicate_results(self, results: List[ImageResult]) -> List[ImageResult]:
+        """Remove duplicate URLs while preserving the highest quality version."""
+        seen_urls = {}
+
+        for result in results:
+            if result.url in seen_urls:
+                # Keep the result with higher quality score
+                if result.quality_score > seen_urls[result.url].quality_score:
+                    seen_urls[result.url] = result
+            else:
+                seen_urls[result.url] = result
+
+        return list(seen_urls.values())
+
+    def _simplify_query(self, query: str) -> str:
+        """Simplify query by removing complex terms and keeping core concepts."""
+        # Remove adjectives and keep main nouns
+        words = query.split()
+
+        # Common tattoo-related keywords to keep
+        core_keywords = {
+            'tattoo', 'design', 'art', 'ink', 'traditional', 'realistic', 'geometric',
+            'tribal', 'watercolor', 'minimalist', 'blackwork', 'dotwork',
+            'dragon', 'flower', 'skull', 'rose', 'bird', 'lion', 'butterfly'
+        }
+
+        # Keep important words and first few words
+        simplified_words = []
+        for i, word in enumerate(words):
+            if i < 3 or word.lower() in core_keywords:
+                simplified_words.append(word)
+
+        simplified = ' '.join(simplified_words)
+        return simplified if simplified else 'tattoo art'
+
+    def get_platform_stats(self) -> Dict[str, Dict]:
+        """Get statistics about available platforms."""
+        stats = {}
+        for platform, engine in self.engines.items():
+            stats[platform.value] = {
+                'available': True,
+                'class': engine.__class__.__name__
+            }
+        return stats
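Taken together, the manager is the intended entry point for callers. A minimal usage sketch against the classes added in this commit (the query string is illustrative; none of the lines below appear in the diff itself):

    from search_engines import SearchEngineManager

    manager = SearchEngineManager(max_workers=5)

    # Pinterest + Reddit first; Instagram and a simplified query only as fallbacks
    result = manager.search_with_fallback("japanese dragon", max_results=50)

    print(f"{result.total_found} images from "
          f"{len(result.platforms_used)} platforms in {result.search_duration:.1f}s")
    for image in result.get_top_results(limit=5):
        print(f"{image.quality_score:.2f}  {image.platform.value}  {image.url}")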
search_engines/pinterest.py
ADDED
@@ -0,0 +1,117 @@
+"""Pinterest-specific search engine implementation."""
+
+import re
+import time
+from typing import List, Optional
+
+from ddgs import DDGS
+
+from .base import BaseSearchEngine, ImageResult, SearchPlatform
+
+
+class PinterestSearchEngine(BaseSearchEngine):
+    """Search engine for Pinterest images."""
+
+    def __init__(self):
+        super().__init__(SearchPlatform.PINTEREST)
+        self.pinterest_domains = {
+            "pinterest.com",
+            "pinimg.com",
+            "i.pinimg.com",
+            "media.pinimg.com",
+            "s-media-cache-ak0.pinimg.com"
+        }
+
+    def search(self, query: str, max_results: int = 20) -> List[ImageResult]:
+        """Search Pinterest for tattoo images."""
+        results = []
+
+        pinterest_queries = [
+            f"site:pinterest.com {query} tattoo",
+            f"site:pinterest.com tattoo {query}",
+        ]
+
+        try:
+            with DDGS() as ddgs:
+                for i, pinterest_query in enumerate(pinterest_queries):
+                    if i > 0:
+                        time.sleep(2)  # Rate limiting
+
+                    try:
+                        search_results = ddgs.images(
+                            pinterest_query,
+                            region="wt-wt",
+                            safesearch="off",
+                            size="Medium",
+                            max_results=max_results // 2
+                        )
+
+                        for result in search_results:
+                            url = result.get("image")
+                            if url and self.is_valid_url(url):
+                                image_result = self._create_image_result(url, result)
+                                results.append(image_result)
+
+                                if len(results) >= max_results:
+                                    break
+
+                    except Exception as e:
+                        self.logger.warning(f"Pinterest query failed: {e}")
+                        continue
+
+        except Exception as e:
+            self.logger.error(f"Pinterest search failed: {e}")
+
+        return results[:max_results]
+
+    def is_valid_url(self, url: str) -> bool:
+        """Check if URL is from Pinterest domains."""
+        return any(domain in url.lower() for domain in self.pinterest_domains)
+
+    def get_quality_score(self, url: str, **kwargs) -> float:
+        """Calculate Pinterest-specific quality score."""
+        score = super().get_quality_score(url)
+
+        # Pinterest size indicators (higher resolution = higher score)
+        size_patterns = {
+            "/736x/": 1.0,
+            "/564x/": 0.9,
+            "/474x/": 0.8,
+            "/236x/": 0.6
+        }
+
+        for pattern, bonus in size_patterns.items():
+            if pattern in url:
+                score = bonus
+                break
+
+        # Pinterest CDN reliability bonus
+        if "i.pinimg.com" in url:
+            score += 0.1
+
+        return min(1.0, score)
+
+    def _create_image_result(self, url: str, raw_result: dict) -> ImageResult:
+        """Create ImageResult from raw Pinterest search result."""
+        dimensions = self._extract_dimensions(url)
+
+        return ImageResult(
+            url=url,
+            platform=self.platform,
+            quality_score=self.get_quality_score(url),
+            width=dimensions.get("width"),
+            height=dimensions.get("height"),
+            title=raw_result.get("title"),
+            source_url=raw_result.get("source")
+        )
+
+    def _extract_dimensions(self, url: str) -> dict:
+        """Extract image dimensions from Pinterest URL patterns."""
+        # Pinterest URL pattern: .../236x/... or .../564x314/...
+        size_match = re.search(r"/(\d+)x(\d*)/", url)
+        if size_match:
+            width = int(size_match.group(1))
+            height = int(size_match.group(2)) if size_match.group(2) else None
+            return {"width": width, "height": height}
+
+        return {}
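The size handling above is worth noting: a matching size pattern replaces the base score rather than adding to it, and the dimension regex only yields a height when the CDN path encodes one. A quick sketch of the observable behaviour (the pin URL is made up):

    from search_engines import PinterestSearchEngine

    engine = PinterestSearchEngine()
    url = "https://i.pinimg.com/736x/aa/bb/cc/example.jpg"  # illustrative pin URL

    print(engine._extract_dimensions(url))  # {'width': 736, 'height': None}
    print(engine.get_quality_score(url))    # 1.0: "/736x/" sets the score; CDN bonus is then capped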
search_engines/reddit.py
ADDED
@@ -0,0 +1,114 @@
+"""Reddit-specific search engine implementation."""
+
+import time
+from typing import List
+
+from ddgs import DDGS
+
+from .base import BaseSearchEngine, ImageResult, SearchPlatform
+
+
+class RedditSearchEngine(BaseSearchEngine):
+    """Search engine for Reddit images."""
+
+    def __init__(self):
+        super().__init__(SearchPlatform.REDDIT)
+        self.reddit_domains = {
+            "reddit.com",
+            "i.redd.it",
+            "i.imgur.com",
+            "imgur.com"
+        }
+        self.tattoo_subreddits = [
+            "tattoos",
+            "tattoo",
+            "traditionaltattoos",
+            "blackwork",
+            "sticknpokes",
+            "tattoodesigns"
+        ]
+
+    def search(self, query: str, max_results: int = 20) -> List[ImageResult]:
+        """Search Reddit for tattoo images."""
+        results = []
+
+        # Create Reddit-specific queries
+        reddit_queries = self._build_reddit_queries(query)
+
+        try:
+            with DDGS() as ddgs:
+                for i, reddit_query in enumerate(reddit_queries):
+                    if i > 0:
+                        time.sleep(1.5)  # Rate limiting
+
+                    try:
+                        search_results = ddgs.images(
+                            reddit_query,
+                            region="wt-wt",
+                            safesearch="off",
+                            size="Medium",
+                            max_results=max_results // len(reddit_queries)
+                        )
+
+                        for result in search_results:
+                            url = result.get("image")
+                            if url and self.is_valid_url(url):
+                                image_result = self._create_image_result(url, result)
+                                results.append(image_result)
+
+                                if len(results) >= max_results:
+                                    break
+
+                    except Exception as e:
+                        self.logger.warning(f"Reddit query failed: {e}")
+                        continue
+
+        except Exception as e:
+            self.logger.error(f"Reddit search failed: {e}")
+
+        return results[:max_results]
+
+    def is_valid_url(self, url: str) -> bool:
+        """Check if URL is from Reddit or Reddit-linked domains."""
+        return any(domain in url.lower() for domain in self.reddit_domains)
+
+    def get_quality_score(self, url: str, **kwargs) -> float:
+        """Calculate Reddit-specific quality score."""
+        score = super().get_quality_score(url)
+
+        # i.redd.it is Reddit's native image host (reliable)
+        if "i.redd.it" in url:
+            score += 0.2
+
+        # Imgur is commonly used and reliable
+        elif "imgur.com" in url:
+            score += 0.1
+
+        # Reddit posts tend to be higher quality
+        if "reddit.com" in url:
+            score += 0.1
+
+        return min(1.0, score)
+
+    def _build_reddit_queries(self, query: str) -> List[str]:
+        """Build Reddit-specific search queries."""
+        queries = []
+
+        # General Reddit search
+        queries.append(f"site:reddit.com {query} tattoo")
+
+        # Subreddit-specific searches
+        for subreddit in self.tattoo_subreddits[:3]:  # Limit to top 3 subreddits
+            queries.append(f"site:reddit.com/r/{subreddit} {query}")
+
+        return queries
+
+    def _create_image_result(self, url: str, raw_result: dict) -> ImageResult:
+        """Create ImageResult from raw Reddit search result."""
+        return ImageResult(
+            url=url,
+            platform=self.platform,
+            quality_score=self.get_quality_score(url),
+            title=raw_result.get("title"),
+            source_url=raw_result.get("source")
+        )
utils/__init__.py
ADDED
@@ -0,0 +1,6 @@
+"""Utilities package for the tattoo search engine."""
+
+from .cache import SearchCache
+from .url_validator import URLValidator
+
+__all__ = ["URLValidator", "SearchCache"]
utils/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (371 Bytes)

utils/__pycache__/cache.cpython-312.pyc
ADDED
Binary file (5.47 kB)

utils/__pycache__/url_validator.cpython-312.pyc
ADDED
Binary file (8.57 kB)
utils/cache.py
ADDED
@@ -0,0 +1,105 @@
+"""Simple in-memory caching for search results."""
+
+import hashlib
+import time
+from typing import Any, Dict, Optional, Tuple
+
+
+class SearchCache:
+    """Simple in-memory cache for search results."""
+
+    def __init__(self, default_ttl: int = 3600, max_size: int = 1000):
+        """
+        Initialize cache.
+
+        Args:
+            default_ttl: Default time-to-live in seconds (1 hour)
+            max_size: Maximum number of cached items
+        """
+        self.default_ttl = default_ttl
+        self.max_size = max_size
+        self._cache: Dict[str, Tuple[Any, float]] = {}  # key -> (value, expiry_time)
+
+    def get(self, key: str) -> Optional[Any]:
+        """Get value from cache if not expired."""
+        if key not in self._cache:
+            return None
+
+        value, expiry_time = self._cache[key]
+
+        # Check if expired
+        if time.time() > expiry_time:
+            del self._cache[key]
+            return None
+
+        return value
+
+    def set(self, key: str, value: Any, ttl: Optional[int] = None) -> None:
+        """Set value in cache with optional TTL."""
+        if ttl is None:
+            ttl = self.default_ttl
+
+        expiry_time = time.time() + ttl
+
+        # Evict oldest entries if cache is full
+        if len(self._cache) >= self.max_size:
+            self._evict_expired()
+
+            # If still full, remove oldest entry
+            if len(self._cache) >= self.max_size:
+                oldest_key = min(self._cache.keys(), key=lambda k: self._cache[k][1])
+                del self._cache[oldest_key]
+
+        self._cache[key] = (value, expiry_time)
+
+    def delete(self, key: str) -> bool:
+        """Delete key from cache. Returns True if key existed."""
+        return self._cache.pop(key, None) is not None
+
+    def clear(self) -> None:
+        """Clear all cached items."""
+        self._cache.clear()
+
+    def _evict_expired(self) -> None:
+        """Remove expired entries from cache."""
+        current_time = time.time()
+        expired_keys = [
+            key for key, (_, expiry_time) in self._cache.items()
+            if current_time > expiry_time
+        ]
+
+        for key in expired_keys:
+            del self._cache[key]
+
+    def get_stats(self) -> Dict[str, Any]:
+        """Get cache statistics."""
+        current_time = time.time()
+        expired_count = sum(
+            1 for _, expiry_time in self._cache.values()
+            if current_time > expiry_time
+        )
+
+        return {
+            'total_items': len(self._cache),
+            'expired_items': expired_count,
+            'active_items': len(self._cache) - expired_count,
+            'max_size': self.max_size,
+            'default_ttl': self.default_ttl
+        }
+
+    @staticmethod
+    def create_cache_key(query: str, max_results: int, platforms: Optional[set] = None) -> str:
+        """Create a cache key from search parameters."""
+        # Normalize query
+        normalized_query = query.lower().strip()
+
+        # Create a string representation of platforms
+        platform_str = ''
+        if platforms:
+            platform_str = '_'.join(sorted(p.value for p in platforms))
+
+        # Combine all parameters
+        key_string = f"{normalized_query}_{max_results}_{platform_str}"
+
+        # Hash to create a fixed-length key
+        return hashlib.md5(key_string.encode()).hexdigest()
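A sketch of how SearchCache could sit in front of the manager (hypothetical glue code; nothing in the files shown here wires the cache up):

    from search_engines import SearchEngineManager
    from utils import SearchCache

    manager = SearchEngineManager()
    cache = SearchCache(default_ttl=3600, max_size=1000)

    def cached_search(query: str, max_results: int = 50):
        # Keys are md5 digests of the normalized query plus parameters
        key = SearchCache.create_cache_key(query, max_results)
        result = cache.get(key)
        if result is None:
            result = manager.search_with_fallback(query, max_results=max_results)
            cache.set(key, result)
        return result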
utils/url_validator.py
ADDED
@@ -0,0 +1,189 @@
+"""URL validation and health checking utilities."""
+
+import logging
+import random
+import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import Dict, List, Optional, Set
+from urllib.parse import urlparse
+
+import requests
+
+logger = logging.getLogger(__name__)
+
+
+class URLValidator:
+    """Validates and health-checks URLs before processing."""
+
+    def __init__(self, max_workers: int = 10, timeout: int = 10):
+        self.max_workers = max_workers
+        self.timeout = timeout
+        self.session = requests.Session()
+
+        # Blocked domains that consistently fail or are problematic
+        self.blocked_domains = {
+            'bodyartguru.com',
+            'dcassetcdn.com',
+            'warvox.com',
+            'jenkins-tpp.blackboard.com',
+            'wrdsclassroom.wharton.upenn.edu',
+        }
+
+        # User agents for health checks
+        self.user_agents = [
+            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
+            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
+            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36',
+        ]
+
+    def validate_urls(self, urls: List[str]) -> List[str]:
+        """Validate multiple URLs concurrently."""
+        if not urls:
+            return []
+
+        # First, filter out obviously bad URLs
+        pre_filtered = self._pre_filter_urls(urls)
+
+        if not pre_filtered:
+            return []
+
+        # Health check the remaining URLs
+        valid_urls = self._health_check_urls(pre_filtered)
+
+        logger.info(f"URL validation: {len(urls)} -> {len(pre_filtered)} -> {len(valid_urls)}")
+        return valid_urls
+
+    def _pre_filter_urls(self, urls: List[str]) -> List[str]:
+        """Pre-filter URLs based on basic criteria."""
+        filtered = []
+
+        for url in urls:
+            if not self._is_valid_url_format(url):
+                continue
+
+            if self._is_blocked_domain(url):
+                continue
+
+            if not self._has_image_extension(url):
+                continue
+
+            if len(url) > 500:  # Skip very long URLs
+                continue
+
+            filtered.append(url)
+
+        return filtered
+
+    def _health_check_urls(self, urls: List[str]) -> List[str]:
+        """Perform HEAD requests to check URL accessibility."""
+        valid_urls = []
+
+        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+            # Submit health check tasks
+            future_to_url = {
+                executor.submit(self._check_single_url, url): url
+                for url in urls
+            }
+
+            # Collect results
+            for future in as_completed(future_to_url):
+                url = future_to_url[future]
+                try:
+                    is_valid = future.result(timeout=self.timeout + 5)
+                    if is_valid:
+                        valid_urls.append(url)
+                except Exception as e:
+                    logger.debug(f"Health check failed for {url}: {e}")
+
+                # Small delay to be respectful
+                time.sleep(0.1)
+
+        return valid_urls
+
+    def _check_single_url(self, url: str) -> bool:
+        """Check if a single URL is accessible."""
+        try:
+            headers = {
+                'User-Agent': random.choice(self.user_agents),
+                'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
+                'Accept-Language': 'en-US,en;q=0.9',
+                'Connection': 'keep-alive',
+                'DNT': '1',
+            }
+
+            # Add platform-specific headers
+            if 'pinterest' in url.lower():
+                headers.update({
+                    'Referer': 'https://www.pinterest.com/',
+                    'Origin': 'https://www.pinterest.com',
+                })
+            elif 'instagram' in url.lower():
+                headers.update({
+                    'Referer': 'https://www.instagram.com/',
+                })
+            else:
+                headers['Referer'] = 'https://www.google.com/'
+
+            response = self.session.head(
+                url,
+                headers=headers,
+                timeout=self.timeout,
+                allow_redirects=True
+            )
+
+            # Check status code
+            if response.status_code not in [200, 301, 302]:
+                return False
+
+            # Check content type if available
+            content_type = response.headers.get('content-type', '').lower()
+            if content_type and not content_type.startswith('image/'):
+                return False
+
+            # Check content length if available
+            content_length = response.headers.get('content-length')
+            if content_length:
+                size = int(content_length)
+                if size < 1024 or size > 10 * 1024 * 1024:  # Too small or too large
+                    return False
+
+            return True
+
+        except Exception as e:
+            logger.debug(f"URL check failed for {url}: {e}")
+            return False
+
+    def _is_valid_url_format(self, url: str) -> bool:
+        """Check if URL has valid format."""
+        try:
+            parsed = urlparse(url)
+            return all([parsed.scheme, parsed.netloc])
+        except Exception:
+            return False
+
+    def _is_blocked_domain(self, url: str) -> bool:
+        """Check if URL is from a blocked domain."""
+        try:
+            parsed = urlparse(url)
+            domain = parsed.netloc.lower()
+            return any(blocked in domain for blocked in self.blocked_domains)
+        except Exception:
+            return True  # Block malformed URLs
+
+    def _has_image_extension(self, url: str) -> bool:
+        """Check if URL appears to point to an image."""
+        image_extensions = {'.jpg', '.jpeg', '.png', '.webp', '.gif'}
+        url_lower = url.lower()
+        return any(ext in url_lower for ext in image_extensions)
+
+    def add_blocked_domain(self, domain: str) -> None:
+        """Add a domain to the blocked list."""
+        self.blocked_domains.add(domain.lower())
+
+    def remove_blocked_domain(self, domain: str) -> None:
+        """Remove a domain from the blocked list."""
+        self.blocked_domains.discard(domain.lower())
+
+    def get_blocked_domains(self) -> Set[str]:
+        """Get the set of blocked domains."""
+        return self.blocked_domains.copy()
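Finally, a sketch of the validator's two-stage pipeline in use (the candidate URLs are made up; the pinimg URL passes the pre-filter, while the HTML page is dropped by the image-extension check):

    from utils import URLValidator

    validator = URLValidator(max_workers=10, timeout=10)

    candidates = [
        "https://i.pinimg.com/736x/aa/bb/cc/example.jpg",  # plausible image URL
        "https://example.com/gallery.html",                # dropped: no image extension
    ]
    # Stage 1 pre-filters (format, blocklist, extension, length);
    # stage 2 sends concurrent HEAD requests and checks status, content-type and size.
    live = validator.validate_urls(candidates)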