# Source: Hugging Face Space "DawnC/Pixcribe" (Running on Zero) - file size: 25,639 bytes, revision 6a3bd1f

import json
import warnings
from typing import Dict, List, Optional

import torch
from opencc import OpenCC
from PIL import Image
from qwen_vl_utils import process_vision_info
from transformers import AutoModelForImageTextToText, AutoProcessor

class CaptionGenerationManager:
    """Caption generation using Vision-Language Models (supports Qwen2.5-VL, Qwen3-VL, etc.)"""

    def __init__(self, model_name: str = "Qwen/Qwen2.5-VL-7B-Instruct"):
        """Load the processor/model pair and set up generation defaults.

        Args:
            model_name: Vision-Language model name, e.g.:
                - "Qwen/Qwen2.5-VL-7B-Instruct" (default)
                - "Qwen/Qwen3-VL-8B-Instruct" (2025 latest)
        """
        print(f"Loading Vision-Language Model: {model_name}...")

        # Silence FutureWarning messages raised from the transformers package
        warnings.filterwarnings("ignore", category=FutureWarning, module="transformers")

        # Auto* classes keep the loader model-agnostic (Qwen2.5-VL, Qwen3-VL, etc.)
        self.processor = AutoProcessor.from_pretrained(model_name, use_fast=False)
        model_kwargs = {
            'dtype': torch.bfloat16,  # `dtype` is used here instead of `torch_dtype`
            'device_map': "auto",
        }
        self.model = AutoModelForImageTextToText.from_pretrained(model_name, **model_kwargs)

        # OpenCC 's2t' profile: Simplified Chinese -> Traditional Chinese
        self.cc = OpenCC('s2t')

        # Default sampling settings; 300 new tokens leaves room for full captions
        self.generation_config = dict(
            temperature=0.7,
            top_p=0.9,
            max_new_tokens=300,
            repetition_penalty=1.1,
        )

        # Per-platform style and length targets, interpolated into the prompt
        instagram_template = {
            'style': 'storytelling, aesthetic',
            'emoji_count': '2-3',
            'hashtag_count': '8-10',
            'min_length': 120,
            'max_length': 220,
            'features': ['call-to-action', 'question', 'relatable'],
        }
        tiktok_template = {
            'style': 'brief, punchy',
            'emoji_count': '1-2',
            'hashtag_count': '5-8',
            'min_length': 60,
            'max_length': 120,
            'features': ['trending', 'POV', 'relatable'],
        }
        xiaohongshu_template = {
            'style': 'structured, informative, detailed',
            'emoji_count': '5-8',
            'hashtag_count': '8-12',
            'min_length': 180,
            'max_length': 500,
            'features': ['tips', 'bullets', 'sharing-tone'],
        }
        self.platform_templates = {
            'instagram': instagram_template,
            'tiktok': tiktok_template,
            'xiaohongshu': xiaohongshu_template,
        }

        # Report only the repository part of "org/repo"
        short_name = model_name.split('/')[-1]
        print(f"✓ {short_name} loaded successfully (using Auto* classes for flexibility)")

    def construct_prompt(self, analysis_results: Dict, platform: str = 'instagram', language: str = 'zh') -> str:
        """Construct prompt with language support ensuring consistency

        Args:
            language: 'zh' (Traditional Chinese), 'en' (English), 'zh-en' (Bilingual)
        """
        platform_config = self.platform_templates.get(platform, self.platform_templates['instagram'])

        # Language-specific instructions
        language_instructions = {
            'zh': '請使用繁體中文生成標題和標籤。語言要自然流暢，符合華語社群媒體的表達習慣。避免使用簡體字。當偵測到品牌時，必須在標題中提及品牌名稱。',
            'en': '''🚨 CRITICAL LANGUAGE REQUIREMENT 🚨
            Generate captions and hashtags EXCLUSIVELY in English.
            - NEVER use Chinese characters (Traditional or Simplified)
            - NEVER mix languages
            - Use natural, engaging language suitable for international social media
            - When brands are detected, mention them naturally in English
            - All text output must be 100% English only
            This is MANDATORY and NON-NEGOTIABLE.''',
            'zh-en': '''生成雙語內容：標題使用繁體中文，同時提供英文翻譯。標籤混合使用中英文以擴大觸及範圍。當偵測到品牌時，必須在標題中提及品牌名稱。

            🚨 重要：雙語一致性要求 🚨
            - 中文和英文必須表達相同的核心意義
            - 允許表達方式的差異（形容詞、語法不同）
            - 但整體訊息、語氣、品牌提及必須一致
            - 兩種語言都要朝同一方向詮釋內容'''
        }

        system_instruction = f"""You are a professional social media content strategist.

            {language_instructions.get(language, language_instructions['zh'])}

            Target platform: {platform}
            Content style: Authentic, creative, and optimized for engagement.

            CRITICAL RULE: Never include hashtags (symbols starting with #) in the caption text. Hashtags must only appear in the separate 'hashtags' array."""

        # Extract analysis context
        objects = analysis_results.get('detections', [])
        brands = analysis_results.get('brands', [])
        scene_info = analysis_results.get('scene_analysis', {})
        composition = analysis_results.get('composition', {})

        # FIXED: Get fused lighting from scene_info (it's been updated by DetectionFusionManager)
        lighting = scene_info.get('lighting', {}).get('top', 'natural light')
        lighting_confidence = scene_info.get('lighting', {}).get('confidence', 0.7)

        # Provide explicit Chinese translations to ensure consistency
        lighting_translations_zh = {
            'soft diffused light': '柔和漫射光',
            'overcast atmosphere': '陰天氛圍',
            'natural daylight': '自然日光',
            'warm ambient light': '溫暖環境光',
            'evening light': '傍晚光線',
            'bright sunlight': '明亮陽光',
            'golden hour': '金黃時刻',
            'blue hour': '藍調時刻'
        }

        # Get appropriate lighting description based on language
        if language == 'zh':
            lighting_zh = lighting_translations_zh.get(lighting, lighting)
            lighting_display = lighting_zh
        else:
            # For English and bilingual, use English only
            lighting_display = lighting
            lighting_zh = lighting

        objects_str = ', '.join([obj['class_name'] for obj in objects[:10]])

        # CRITICAL: Emphasize brands EXTREMELY prominently - repeat multiple times
        if brands:
            brands_list = [b[0] for b in brands[:5]]
            brands_str = ', '.join(brands_list)
            brand_emphasis = f"""

            🚨 CRITICAL BRAND REQUIREMENT 🚨
            The following brands were POSITIVELY IDENTIFIED in this image: {brands_str}

            YOU ABSOLUTELY MUST:
            1. Mention the brand name "{brands_list[0]}" explicitly in the FIRST sentence
            2. Use the exact brand name - do not use generic terms like "bag" or "accessory" without the brand
            3. Write naturally as if you're excited to share this {brands_list[0]} item
            4. Example: "在傍晚光線下，這款{brands_list[0]}經典黑色菱格紋皮革包..." (CORRECT)
            5. NOT acceptable: "在傍晚光線下，這款經典黑色菱格紋皮革包..." (WRONG - missing brand name!)

            THIS IS MANDATORY - The caption will be rejected if it doesn't mention {brands_str}.
            """
        else:
            brands_str = 'None detected'
            brand_emphasis = ""

        # Enhanced scene description
        urban_scene = scene_info.get('urban', {}).get('top', 'unknown')
        mood = scene_info.get('mood', {}).get('top', 'neutral')
        comp_type = composition.get('composition_type', 'standard')

        context = f"""
            Analyze this image and generate an engaging, DETAILED social media caption with rich visual descriptions.

            **Visual Elements (Describe in Detail):**
            - Detected objects: {objects_str}
            - Scene composition: {comp_type}
            - Urban environment: {urban_scene}
            - **IMPORTANT**: Include specific details about:
            * Materials (leather, metal, fabric, canvas, etc.)
            * Colors (use descriptive terms: jet black, antique gold, midnight blue, etc.)
            * Textures (quilted, smooth, matte, glossy, metallic, etc.)
            * Design features (stitching patterns, hardware, logos, emblems, etc.)
            * Reflections and lighting effects on surfaces

            **Atmosphere:**
            - Lighting (analyzed with Places365 + CV): {lighting_display} (confidence: {lighting_confidence:.2f})
            - Mood: {mood}

            **Brand Detection:**
            - Identified brands: {brands_str}{brand_emphasis}

            **Caption Structure (Required - BE SPECIFIC AND DETAILED):**
            1. Opening hook - Most striking visual element with SPECIFIC details (1-2 sentences)
            {f"- 🚨 MANDATORY: Start with the BRAND NAME '{brands_list[0]}' in the FIRST sentence!" if brands else ""}
            {f"- Example (CORRECT): '這款{brands_list[0]}經典黑色菱格紋皮革包...'" if brands else ""}
            {f"- Example (WRONG): '這款經典黑色菱格紋皮革包...' (missing {brands_list[0]}!)" if brands else ""}
            - Be SPECIFIC: Include material, color, design features WITH the brand name

            2. Visual details - Describe materials, textures, colors, and design elements (2-3 sentences)
            - Be SPECIFIC: mention quilting patterns, metal finishes, chain details, logo placements
            - Describe how light interacts with materials (reflections on leather, gleam of metal)
            - MUST use the EXACT lighting description: "{lighting_display}"

            3. Atmospheric context - How lighting and mood create the scene's character (1-2 sentences)
            - Connect lighting to the overall visual impact
            - Describe depth, shadows, contrasts

            4. Emotional connection & Engagement - How this resonates with viewers + call-to-action (1 sentence)

            **Content Requirements:**
            - Minimum information: 3-4 specific visual details per caption
            - Include material types, color descriptions, design features
            - Describe how lighting affects the appearance
            - Make it vivid and immersive

            Platform style: {platform_config['style']}
            """

        # Language-specific examples with DETAILED visual descriptions AND BRAND NAMES
        if language == 'zh':
            brand_name_zh = brands_list[0] if brands else "Gucci"  # Use detected brand or example
            example_correct = f"""正確範例 - 詳細描述 + 品牌提及 (繁體中文):
            "在{lighting_zh}的映襯下，這款{brand_name_zh}經典黑色菱格紋皮革包展現奢華質感，V字形縫線在柔軟小牛皮上勾勒出精緻的幾何圖案，復古金色雙G標誌在深色背景中熠熠生輝。金屬鏈條肩帶反射著{lighting_zh}，增添層次感與立體效果。皮革表面細膩的光澤與霧面質地形成迷人對比，每個細節都彰顯義大利工藝的極致追求。這樣的{brand_name_zh}單品不只是配件，更是品味與格調的完美詮釋。你的衣櫃裡有哪件經典單品？✨🖤"

            注意：品牌名稱 "{brand_name_zh}" 出現在第一句！這是正確的做法。

            CRITICAL:
            - 必須包含材質描述（皮革、金屬等）
            - 必須包含顏色細節（黑色、復古金色等）
            - 必須包含設計特點（縫線、標誌、鏈條等）
            - 必須使用"{lighting_zh}"來描述光線
            """
        elif language == 'en':
            brand_name_en = brands_list[0] if brands else "Gucci"  # Use detected brand or example
            example_correct = f"""CORRECT EXAMPLE - Detailed Description + Brand Mention (ENGLISH ONLY - NO CHINESE):
                "Under the {lighting}, this {brand_name_en} classic black quilted leather bag showcases luxurious craftsmanship. V-shaped stitching traces intricate geometric patterns across supple calfskin, while the antique gold double-G logo gleams against the dark backdrop. The metal chain strap catches and reflects the {lighting}, adding dimension and depth to the piece. The leather surface presents a captivating contrast between fine sheen and matte texture, with every detail exemplifying Italian artisanship at its finest. This {brand_name_en} piece isn't just an accessory – it's a perfect expression of taste and sophistication. What's your timeless wardrobe essential? ✨🖤"

                NOTE: Brand name "{brand_name_en}" appears in the FIRST sentence! This is the correct approach.

                🚨 ABSOLUTE REQUIREMENT FOR ENGLISH MODE 🚨
                - Output must be 100% ENGLISH - zero Chinese characters allowed
                - MUST include material descriptions (leather, metal, etc.)
                - MUST include color details (black, antique gold, etc.)
                - MUST include design features (stitching, logo, chain, etc.)
                - MUST use "{lighting}" to describe the lighting
                - NO Chinese characters anywhere in the output
                """
        else:  # zh-en bilingual
            brand_name_en = brands_list[0] if brands else "Gucci"
            example_correct = f"""BILINGUAL EXAMPLE - 雙語範例:
                Caption in Traditional Chinese, with English hashtags support.
                (Details omitted for brevity)
                """

        # Language-specific hashtag instructions
        if language == 'zh':
            hashtag_instruction = """
            【CRITICAL HASHTAG REQUIREMENT - 繁體中文】:
            - ALL hashtags MUST be in Traditional Chinese (繁體中文)
            - NEVER use English hashtags when language is 繁體中文
            - Examples of CORRECT hashtags: ["時尚包包", "奢華風格", "皮革工藝", "精品配件"]
            - Examples of WRONG hashtags: ["FashionBlogger", "LuxuryLifestyle"] - DO NOT USE THESE
            """
        elif language == 'en':
            hashtag_instruction = """
            【CRITICAL HASHTAG REQUIREMENT - English】:
            - ALL hashtags MUST be in English
            - NEVER use Chinese characters in hashtags
            - Examples of CORRECT hashtags: ["FashionBlogger", "LuxuryLifestyle", "LeatherCraft"]
            """
        else:  # zh-en
            hashtag_instruction = """
            【CRITICAL HASHTAG REQUIREMENT - Bilingual】:
            - Hashtags should MIX Traditional Chinese and English
            - First half in Chinese, second half in English
            - Example: ["時尚包包", "奢華風格", "FashionBlogger", "LuxuryLifestyle"]
            """

        output_format = f"""
            Generate output in JSON format:
            {{
                "caption": "string (minimum {platform_config['min_length']} chars, maximum {platform_config['max_length']} chars, engaging and descriptive)",
                "hashtags": ["tag1", "tag2", ...] ({platform_config['hashtag_count']} relevant hashtags),
                "tone": "casual|professional|playful",
                "platform": "{platform}"
            }}

            {hashtag_instruction}

            STRICT REQUIREMENTS:
            1. Caption length: {platform_config['min_length']}-{platform_config['max_length']} characters
            2. 🚨 EMOJI REQUIREMENT 🚨 - MUST use EXACTLY {platform_config['emoji_count']} emojis naturally integrated into caption text
            - Professional style: 1-2 emojis (e.g., ✨💼🌟)
            - Creative style: 2-3 emojis (e.g., 🎨✨💫🌙)
            - Authentic style: 2-3 emojis (e.g., 💖👜✨🖤)
            - Place emojis naturally within or at end of sentences
            3. Caption must be pure descriptive text only - absolutely NO hashtags allowed
            4. 🚨 CALL-TO-ACTION REQUIREMENT 🚨 - MUST include an engaging question or CTA at the end
            - Professional: Brief professional question (e.g., "What's your go-to piece?")
            - Creative: Thought-provoking question (e.g., "How does this speak to you?")
            - Authentic: Personal question (e.g., "What's your favorite timeless accessory?")
            5. Write 3-4 complete sentences following the structure above
            6. Be specific and vivid - describe what you see in detail
            7. 【CRITICAL】 MUST use the EXACT lighting description: "{lighting_display}"
            - DO NOT substitute with similar terms
            - DO NOT use "金黃時刻" if the lighting is "{lighting_zh if language == 'zh' else lighting}"
            - DO NOT invent your own lighting description
            8. 🚨 HASHTAG REQUIREMENT 🚨 - Generate {platform_config['hashtag_count']} relevant hashtags
            - Hashtags go ONLY in the 'hashtags' array, NEVER in the caption text
            - Mix of broad and specific tags
            - Include brand name as hashtag if detected
            9. {"🚨 CRITICAL BRAND REQUIREMENT 🚨 - The brand name '" + brands_list[0] + "' MUST appear in the FIRST sentence of your caption. This is MANDATORY and NON-NEGOTIABLE. Example: " + ("'這款" + brands_list[0] + "經典黑色...'" if language == 'zh' else "'This " + brands_list[0] + " classic black...'") if brands else "No brands detected to mention"}
            10. {"🚨 LANGUAGE REQUIREMENT 🚨 - Output must be 100% ENGLISH ONLY. NO Chinese characters allowed anywhere." if language == 'en' else ""}

            WRONG EXAMPLE (DO NOT DO THIS):
            "Lost in the city's towering skyscrapers 🏙️✨ | #UrbanVibes #CityLife"

            {example_correct}
            """

        full_prompt = f"{system_instruction}\n\n{context}\n\n{output_format}"
        return full_prompt

    def generate_captions(self, analysis_results: Dict, image: Image.Image,
                         platform: str = 'instagram', language: str = 'zh') -> List[Dict]:
        """Generate up to 3 captions with distinct styles: Professional, Creative, Authentic.

        One generation pass is run per style. A pass contributes a result only
        when the model output parses to a JSON object with a non-empty string
        'caption'; otherwise that style is skipped.

        Args:
            analysis_results: Analysis dict; 'brands' is read here and the
                whole dict is forwarded to ``construct_prompt``.
            image: Image passed to the model as the visual input.
            platform: Target platform, forwarded to ``construct_prompt``.
            language: 'zh', 'en' or 'zh-en'. For 'zh' and 'zh-en' the result
                is passed through ``_convert_to_traditional``.

        Returns:
            A list of caption dicts, one per successful style, each with
            'tone' forced to the style name. If no style succeeded, a list
            holding the single fallback caption.
        """
        # Extract brands for style instructions
        brands_in_image = analysis_results.get('brands', [])
        brand_names = [b[0] for b in brands_in_image[:3]] if brands_in_image else []
        brand_mention_requirement = f" CRITICAL: Mention {', '.join(brand_names)} brand(s) naturally in the caption." if brand_names else ""

        # Define 3 distinct styles
        # NOTE(review): 'length_modifier' is not read anywhere in this method;
        # kept as-is in case it is meant to scale the length limits - confirm.
        styles = [
            {
                'name': 'professional',
                'temp': 0.6,
                'instruction': f'Professional style: Concise, elegant, sophisticated. Focus on quality and craftsmanship. Use refined language.{brand_mention_requirement}',
                'length_modifier': 0.8  # Shorter, more concise
            },
            {
                'name': 'creative',
                'temp': 0.7,
                'instruction': f'Creative style: Artistic, expressive, imaginative. Use vivid metaphors and sensory descriptions. Balance detail with flair.{brand_mention_requirement}',
                'length_modifier': 1.0  # Medium length
            },
            {
                'name': 'authentic',
                'temp': 0.8,
                'instruction': f'Authentic style: Personal, detailed, storytelling. Share rich observations and genuine feelings. Most descriptive and engaging.{brand_mention_requirement}',
                'length_modifier': 1.2  # Longer, more detailed
            }
        ]

        # The base prompt does not depend on the style, so build it once
        base_prompt = self.construct_prompt(analysis_results, platform, language)
        needs_traditional = language in ('zh', 'zh-en')

        variations = []

        for style in styles:
            # Add style instruction
            style_prompt = f"""{base_prompt}

                **STYLE REQUIREMENT FOR THIS CAPTION:**
                {style['instruction']}

                Adjust tone to be clearly '{style['name']}' - this should be noticeably different from other styles."""

            messages = [{
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": style_prompt}
                ]
            }]

            text = self.processor.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )

            image_inputs, video_inputs = process_vision_info(messages)
            inputs = self.processor(
                text=[text],
                images=image_inputs,
                videos=video_inputs,
                padding=True,
                return_tensors="pt"
            )

            # Follow the model's own device instead of assuming "cuda": the
            # model is loaded with device_map="auto", so this also covers
            # CPU-only hosts.
            inputs = inputs.to(self.model.device)

            # Generate with style-specific temperature
            config = self.generation_config.copy()
            config['temperature'] = style['temp']
            # temperature/top_p only take effect when sampling is enabled;
            # request it explicitly rather than relying on the checkpoint's
            # default generation config.
            config.setdefault('do_sample', True)

            with torch.no_grad():
                generated_ids = self.model.generate(**inputs, **config)

            # Drop the prompt tokens so only newly generated tokens are decoded
            generated_ids_trimmed = [
                out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
            ]

            output_text = self.processor.batch_decode(
                generated_ids_trimmed,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False
            )[0]

            parsed = self._parse_json_output(output_text)
            if not isinstance(parsed, dict) or not isinstance(parsed.get('caption'), str):
                # Unparseable output or no usable caption text: skip this style
                continue

            # Remove any hashtags that leaked into caption
            parsed['caption'] = self._remove_hashtags_from_caption(parsed['caption'])
            if not parsed['caption']:
                continue

            # Force the correct tone
            parsed['tone'] = style['name']

            # Convert Simplified Chinese to Traditional for Chinese output
            if needs_traditional:
                parsed = self._convert_to_traditional(parsed)

            variations.append(parsed)

        return variations if variations else [self._get_fallback_caption(platform, language)]

    def _remove_hashtags_from_caption(self, caption: str) -> str:
        """Remove any hashtags, pipes, and debug info that leaked into caption text"""
        import re

        # CRITICAL FIX: Remove pipe symbol and everything after it (debug info)
        # Example: "Text 🕰️🌉 | SoftDiffusedLight" -> "Text 🕰️🌉"
        if '|' in caption:
            caption = caption.split('|')[0].strip()

        # Remove hashtags (words starting with #)
        caption = re.sub(r'#\w+', '', caption)
        caption = re.sub(r'#[\u4e00-\u9fff]+', '', caption)  # Remove Chinese hashtags

        # Remove standalone weird text patterns (like "BLACKBELT")
        # If there's a suspicious all-caps word at the end without context, remove it
        words = caption.split()
        if len(words) > 0:
            last_word = words[-1].strip('✨💎👗🌟💫🖤')
            # If last "word" is all caps and doesn't look like a normal sentence word, remove it
            if last_word.isupper() and len(last_word) > 3 and not any(char in last_word for char in '.,!?'):
                caption = ' '.join(words[:-1])

        # Remove excessive emojis at the end (more than 3)
        emoji_pattern = r'[\U0001F300-\U0001F9FF]{4,}$'
        caption = re.sub(emoji_pattern, '', caption)

        # Remove multiple spaces
        caption = re.sub(r'\s+', ' ', caption)

        # Remove trailing/leading whitespace
        caption = caption.strip()

        # Final cleanup: if caption ends with weird patterns like "✨X 👗💎", clean it
        if re.search(r'[✨💎👗🌟💫🖤]{2,}\s*$', caption):
            caption = re.sub(r'[✨💎👗🌟💫🖤\s]+$', '', caption).strip()

        return caption

    def _convert_to_traditional(self, caption: Dict) -> Dict:
        """Convert Simplified Chinese to Traditional Chinese"""
        if 'caption' in caption:
            caption['caption'] = self.cc.convert(caption['caption'])
        return caption

    def _parse_json_output(self, text: str) -> Dict:
        """Parse JSON output"""
        try:
            start = text.find('{')
            end = text.rfind('}') + 1
            if start != -1 and end > start:
                json_str = text[start:end]
                return json.loads(json_str)
        except:
            pass
        return None

    def _get_fallback_caption(self, platform: str, language: str) -> Dict:
        """Fallback caption"""
        if language == 'en':
            return {
                'caption': 'Every moment tells a story worth sharing. The world around us is filled with beauty waiting to be discovered. Take a pause and appreciate the details that make life extraordinary. What caught your eye today? ✨',
                'hashtags': ['photography', 'daily', 'lifestyle', 'moment', 'capture'],
                'tone': 'casual',
                'platform': platform
            }
        else:
            return {
                'caption': '每個瞬間都值得被記錄與分享。生活中充滿了等待被發現的美好細節。停下腳步，用心感受周遭的一切。今天什麼畫面觸動了你的心？✨',
                'hashtags': ['攝影', '日常', '生活', '瞬間', '分享'],
                'tone': 'casual',
                'platform': platform
            }

# Module-level confirmation message: printed once when this file is executed or imported.
print("✓ CaptionGenerationManager (with Auto* classes for flexible model support) defined")