import json
import re
import warnings
from typing import Dict, List, Optional

import torch
from opencc import OpenCC
from PIL import Image
from qwen_vl_utils import process_vision_info
from transformers import AutoModelForImageTextToText, AutoProcessor


class CaptionGenerationManager:
    """Generate social-media captions for an image with a Vision-Language Model.

    The model and processor are loaded through the transformers ``Auto*``
    classes, so any image-text-to-text checkpoint accepted by those classes
    (e.g. Qwen2.5-VL, Qwen3-VL) can be passed as ``model_name``.
    """

    # Language codes understood by the prompt builder; anything else is
    # treated as 'zh'.
    _SUPPORTED_LANGUAGES = ('zh', 'en', 'zh-en')

    def __init__(self, model_name: str = "Qwen/Qwen2.5-VL-7B-Instruct"):
        """Load the processor, model, and Chinese converter, and set defaults.

        Args:
            model_name: Vision-Language model name, e.g.:
                - "Qwen/Qwen2.5-VL-7B-Instruct" (default)
                - "Qwen/Qwen3-VL-8B-Instruct"
        """
        print(f"Loading Vision-Language Model: {model_name}...")

        # Suppress processor FutureWarnings raised from transformers.
        # NOTE: this modifies the process-wide warnings filter.
        warnings.filterwarnings("ignore", category=FutureWarning, module="transformers")

        # Auto* classes keep the loader independent of the concrete model family.
        self.processor = AutoProcessor.from_pretrained(model_name, use_fast=False)
        self.model = AutoModelForImageTextToText.from_pretrained(
            model_name,
            dtype=torch.bfloat16,
            device_map="auto",
        )

        # Simplified Chinese -> Traditional Chinese converter.
        self.cc = OpenCC('s2t')

        self.generation_config = {
            # temperature / top_p only take effect when sampling is enabled,
            # so request it explicitly instead of relying on the checkpoint's
            # own generation defaults.
            'do_sample': True,
            'temperature': 0.7,
            'top_p': 0.9,
            'max_new_tokens': 300,
            'repetition_penalty': 1.1,
        }

        # Platform-specific templates consumed by construct_prompt().
        self.platform_templates = {
            'instagram': {
                'style': 'storytelling, aesthetic',
                'emoji_count': '2-3',
                'hashtag_count': '8-10',
                'min_length': 120,
                'max_length': 220,
                'features': ['call-to-action', 'question', 'relatable'],
            },
            'tiktok': {
                'style': 'brief, punchy',
                'emoji_count': '1-2',
                'hashtag_count': '5-8',
                'min_length': 60,
                'max_length': 120,
                'features': ['trending', 'POV', 'relatable'],
            },
            'xiaohongshu': {
                'style': 'structured, informative, detailed',
                'emoji_count': '5-8',
                'hashtag_count': '8-12',
                'min_length': 180,
                'max_length': 500,
                'features': ['tips', 'bullets', 'sharing-tone'],
            },
        }

        print(f"✓ {model_name.split('/')[-1]} loaded successfully (using Auto* classes for flexibility)")

    def _normalize_language(self, language: str) -> str:
        """Return ``language`` if it is a supported code, otherwise ``'zh'``."""
        return language if language in self._SUPPORTED_LANGUAGES else 'zh'

    def construct_prompt(self, analysis_results: Dict, platform: str = 'instagram', language: str = 'zh') -> str:
        """Build the full text prompt sent to the model alongside the image.

        Args:
            analysis_results: Dict of upstream analysis. The keys read here are
                'detections' (items with a 'class_name'), 'brands' (items whose
                first element is the brand name), 'scene_analysis' (with
                optional 'lighting', 'urban', 'mood' sub-dicts exposing 'top'
                and, for lighting, 'confidence') and 'composition'
                (with 'composition_type').
            platform: Key into ``self.platform_templates``; unknown platforms
                use the 'instagram' template.
            language: 'zh' (Traditional Chinese), 'en' (English) or
                'zh-en' (bilingual). Unknown values are treated as 'zh'.

        Returns:
            The system instruction, image context and output-format sections
            joined by blank lines.
        """
        # Normalise once so that the instructions, example and hashtag rules
        # below always agree on the same language.
        language = self._normalize_language(language)
        platform_config = self.platform_templates.get(platform, self.platform_templates['instagram'])

        # Language-specific instructions
        language_instructions = {
            'zh': '請使用繁體中文生成標題和標籤。語言要自然流暢,符合華語社群媒體的表達習慣。避免使用簡體字。當偵測到品牌時,必須在標題中提及品牌名稱。',
            'en': '''🚨 CRITICAL LANGUAGE REQUIREMENT 🚨
Generate captions and hashtags EXCLUSIVELY in English.
- NEVER use Chinese characters (Traditional or Simplified)
- NEVER mix languages
- Use natural, engaging language suitable for international social media
- When brands are detected, mention them naturally in English
- All text output must be 100% English only
This is MANDATORY and NON-NEGOTIABLE.''',
            'zh-en': '''生成雙語內容:標題使用繁體中文,同時提供英文翻譯。標籤混合使用中英文以擴大觸及範圍。當偵測到品牌時,必須在標題中提及品牌名稱。
🚨 重要:雙語一致性要求 🚨
- 中文和英文必須表達相同的核心意義
- 允許表達方式的差異(形容詞、語法不同)
- 但整體訊息、語氣、品牌提及必須一致
- 兩種語言都要朝同一方向詮釋內容''',
        }

        system_instruction = f"""You are a professional social media content strategist.
{language_instructions[language]}
Target platform: {platform}
Content style: Authentic, creative, and optimized for engagement.
CRITICAL RULE: Never include hashtags (symbols starting with #) in the caption text. Hashtags must only appear in the separate 'hashtags' array."""

        # Extract analysis context
        objects = analysis_results.get('detections') or []
        brands = analysis_results.get('brands') or []
        scene_info = analysis_results.get('scene_analysis', {})
        composition = analysis_results.get('composition', {})

        # Lighting is read from the scene analysis, with defaults when absent.
        lighting = scene_info.get('lighting', {}).get('top', 'natural light')
        lighting_confidence = scene_info.get('lighting', {}).get('confidence', 0.7)

        # Explicit Chinese translations so the prompt uses one fixed term.
        lighting_translations_zh = {
            'soft diffused light': '柔和漫射光',
            'overcast atmosphere': '陰天氛圍',
            'natural daylight': '自然日光',
            'warm ambient light': '溫暖環境光',
            'evening light': '傍晚光線',
            'bright sunlight': '明亮陽光',
            'golden hour': '金黃時刻',
            'blue hour': '藍調時刻',
        }

        # Only pure-Chinese output uses the translated term; English and
        # bilingual prompts keep the English label.
        if language == 'zh':
            lighting_zh = lighting_translations_zh.get(lighting, lighting)
        else:
            lighting_zh = lighting
        lighting_display = lighting_zh

        objects_str = ', '.join([obj['class_name'] for obj in objects[:10]])

        # Brands are emphasised repeatedly throughout the prompt.
        brands_list = [b[0] for b in brands[:5]]
        primary_brand = brands_list[0] if brands_list else ""
        if brands_list:
            brands_str = ', '.join(brands_list)
            brand_emphasis = f"""

🚨 CRITICAL BRAND REQUIREMENT 🚨
The following brands were POSITIVELY IDENTIFIED in this image: {brands_str}

YOU ABSOLUTELY MUST:
1. Mention the brand name "{primary_brand}" explicitly in the FIRST sentence
2. Use the exact brand name - do not use generic terms like "bag" or "accessory" without the brand
3. Write naturally as if you're excited to share this {primary_brand} item
4. Example: "在傍晚光線下,這款{primary_brand}經典黑色菱格紋皮革包..." (CORRECT)
5. NOT acceptable: "在傍晚光線下,這款經典黑色菱格紋皮革包..." (WRONG - missing brand name!)

THIS IS MANDATORY - The caption will be rejected if it doesn't mention {brands_str}.
"""
        else:
            brands_str = 'None detected'
            brand_emphasis = ""

        # Enhanced scene description
        urban_scene = scene_info.get('urban', {}).get('top', 'unknown')
        mood = scene_info.get('mood', {}).get('top', 'neutral')
        comp_type = composition.get('composition_type', 'standard')

        context = f"""
Analyze this image and generate an engaging, DETAILED social media caption with rich visual descriptions.

**Visual Elements (Describe in Detail):**
- Detected objects: {objects_str}
- Scene composition: {comp_type}
- Urban environment: {urban_scene}
- **IMPORTANT**: Include specific details about:
  * Materials (leather, metal, fabric, canvas, etc.)
  * Colors (use descriptive terms: jet black, antique gold, midnight blue, etc.)
  * Textures (quilted, smooth, matte, glossy, metallic, etc.)
  * Design features (stitching patterns, hardware, logos, emblems, etc.)
  * Reflections and lighting effects on surfaces

**Atmosphere:**
- Lighting (analyzed with Places365 + CV): {lighting_display} (confidence: {lighting_confidence:.2f})
- Mood: {mood}

**Brand Detection:**
- Identified brands: {brands_str}{brand_emphasis}

**Caption Structure (Required - BE SPECIFIC AND DETAILED):**
1. Opening hook - Most striking visual element with SPECIFIC details (1-2 sentences)
   {f"- 🚨 MANDATORY: Start with the BRAND NAME '{primary_brand}' in the FIRST sentence!" if brands_list else ""}
   {f"- Example (CORRECT): '這款{primary_brand}經典黑色菱格紋皮革包...'" if brands_list else ""}
   {f"- Example (WRONG): '這款經典黑色菱格紋皮革包...' (missing {primary_brand}!)" if brands_list else ""}
   - Be SPECIFIC: Include material, color, design features WITH the brand name
2. Visual details - Describe materials, textures, colors, and design elements (2-3 sentences)
   - Be SPECIFIC: mention quilting patterns, metal finishes, chain details, logo placements
   - Describe how light interacts with materials (reflections on leather, gleam of metal)
   - MUST use the EXACT lighting description: "{lighting_display}"
3. Atmospheric context - How lighting and mood create the scene's character (1-2 sentences)
   - Connect lighting to the overall visual impact
   - Describe depth, shadows, contrasts
4. Emotional connection & Engagement - How this resonates with viewers + call-to-action (1 sentence)

**Content Requirements:**
- Minimum information: 3-4 specific visual details per caption
- Include material types, color descriptions, design features
- Describe how lighting affects the appearance
- Make it vivid and immersive

Platform style: {platform_config['style']}
"""

        # Language-specific examples with detailed visual descriptions and
        # brand names ("Gucci" is only a placeholder when no brand was found).
        if language == 'zh':
            brand_name_zh = primary_brand if brands_list else "Gucci"
            example_correct = f"""正確範例 - 詳細描述 + 品牌提及 (繁體中文):
"在{lighting_zh}的映襯下,這款{brand_name_zh}經典黑色菱格紋皮革包展現奢華質感,V字形縫線在柔軟小牛皮上勾勒出精緻的幾何圖案,復古金色雙G標誌在深色背景中熠熠生輝。金屬鏈條肩帶反射著{lighting_zh},增添層次感與立體效果。皮革表面細膩的光澤與霧面質地形成迷人對比,每個細節都彰顯義大利工藝的極致追求。這樣的{brand_name_zh}單品不只是配件,更是品味與格調的完美詮釋。你的衣櫃裡有哪件經典單品?✨🖤"

注意:品牌名稱 "{brand_name_zh}" 出現在第一句!這是正確的做法。

CRITICAL:
- 必須包含材質描述(皮革、金屬等)
- 必須包含顏色細節(黑色、復古金色等)
- 必須包含設計特點(縫線、標誌、鏈條等)
- 必須使用"{lighting_zh}"來描述光線
"""
        elif language == 'en':
            brand_name_en = primary_brand if brands_list else "Gucci"
            example_correct = f"""CORRECT EXAMPLE - Detailed Description + Brand Mention (ENGLISH ONLY - NO CHINESE):
"Under the {lighting}, this {brand_name_en} classic black quilted leather bag showcases luxurious craftsmanship. V-shaped stitching traces intricate geometric patterns across supple calfskin, while the antique gold double-G logo gleams against the dark backdrop. The metal chain strap catches and reflects the {lighting}, adding dimension and depth to the piece. The leather surface presents a captivating contrast between fine sheen and matte texture, with every detail exemplifying Italian artisanship at its finest. This {brand_name_en} piece isn't just an accessory – it's a perfect expression of taste and sophistication. What's your timeless wardrobe essential? ✨🖤"

NOTE: Brand name "{brand_name_en}" appears in the FIRST sentence! This is the correct approach.

🚨 ABSOLUTE REQUIREMENT FOR ENGLISH MODE 🚨
- Output must be 100% ENGLISH - zero Chinese characters allowed
- MUST include material descriptions (leather, metal, etc.)
- MUST include color details (black, antique gold, etc.)
- MUST include design features (stitching, logo, chain, etc.)
- MUST use "{lighting}" to describe the lighting
- NO Chinese characters anywhere in the output
"""
        else:  # zh-en bilingual
            example_correct = """BILINGUAL EXAMPLE - 雙語範例:
Caption in Traditional Chinese, with English hashtags support.
(Details omitted for brevity)
"""

        # Language-specific hashtag instructions
        if language == 'zh':
            hashtag_instruction = """
【CRITICAL HASHTAG REQUIREMENT - 繁體中文】:
- ALL hashtags MUST be in Traditional Chinese (繁體中文)
- NEVER use English hashtags when language is 繁體中文
- Examples of CORRECT hashtags: ["時尚包包", "奢華風格", "皮革工藝", "精品配件"]
- Examples of WRONG hashtags: ["FashionBlogger", "LuxuryLifestyle"] - DO NOT USE THESE
"""
        elif language == 'en':
            hashtag_instruction = """
【CRITICAL HASHTAG REQUIREMENT - English】:
- ALL hashtags MUST be in English
- NEVER use Chinese characters in hashtags
- Examples of CORRECT hashtags: ["FashionBlogger", "LuxuryLifestyle", "LeatherCraft"]
"""
        else:  # zh-en
            hashtag_instruction = """
【CRITICAL HASHTAG REQUIREMENT - Bilingual】:
- Hashtags should MIX Traditional Chinese and English
- First half in Chinese, second half in English
- Example: ["時尚包包", "奢華風格", "FashionBlogger", "LuxuryLifestyle"]
"""

        # Sub-rules of requirement 7. The "金黃時刻" warning is dropped when the
        # detected lighting actually is golden hour; otherwise the prompt would
        # both require and forbid the same term.
        lighting_rules = ['- DO NOT substitute with similar terms']
        if lighting_display not in ('金黃時刻', 'golden hour'):
            lighting_rules.append(f'- DO NOT use "金黃時刻" if the lighting is "{lighting_display}"')
        lighting_rules.append('- DO NOT invent your own lighting description')
        lighting_rules_str = '\n   '.join(lighting_rules)

        # Requirement 9: brand rule, or an explicit note that none applies.
        if brands_list:
            if language == 'zh':
                brand_example = f"'這款{primary_brand}經典黑色...'"
            else:
                brand_example = f"'This {primary_brand} classic black...'"
            brand_rule = (
                f"🚨 CRITICAL BRAND REQUIREMENT 🚨 - The brand name '{primary_brand}' MUST appear in the "
                f"FIRST sentence of your caption. This is MANDATORY and NON-NEGOTIABLE. Example: {brand_example}"
            )
        else:
            brand_rule = "No brands detected to mention"

        # Requirement 10 is only populated for English-only output.
        if language == 'en':
            language_rule = "🚨 LANGUAGE REQUIREMENT 🚨 - Output must be 100% ENGLISH ONLY. NO Chinese characters allowed anywhere."
        else:
            language_rule = ""

        output_format = f"""
Generate output in JSON format:
{{
  "caption": "string (minimum {platform_config['min_length']} chars, maximum {platform_config['max_length']} chars, engaging and descriptive)",
  "hashtags": ["tag1", "tag2", ...] ({platform_config['hashtag_count']} relevant hashtags),
  "tone": "casual|professional|playful",
  "platform": "{platform}"
}}

{hashtag_instruction}

STRICT REQUIREMENTS:
1. Caption length: {platform_config['min_length']}-{platform_config['max_length']} characters
2. 🚨 EMOJI REQUIREMENT 🚨 - MUST use EXACTLY {platform_config['emoji_count']} emojis naturally integrated into caption text
   - Professional style: 1-2 emojis (e.g., ✨💼🌟)
   - Creative style: 2-3 emojis (e.g., 🎨✨💫🌙)
   - Authentic style: 2-3 emojis (e.g., 💖👜✨🖤)
   - Place emojis naturally within or at end of sentences
3. Caption must be pure descriptive text only - absolutely NO hashtags allowed
4. 🚨 CALL-TO-ACTION REQUIREMENT 🚨 - MUST include an engaging question or CTA at the end
   - Professional: Brief professional question (e.g., "What's your go-to piece?")
   - Creative: Thought-provoking question (e.g., "How does this speak to you?")
   - Authentic: Personal question (e.g., "What's your favorite timeless accessory?")
5. Write 3-4 complete sentences following the structure above
6. Be specific and vivid - describe what you see in detail
7. 【CRITICAL】 MUST use the EXACT lighting description: "{lighting_display}"
   {lighting_rules_str}
8. 🚨 HASHTAG REQUIREMENT 🚨 - Generate {platform_config['hashtag_count']} relevant hashtags
   - Hashtags go ONLY in the 'hashtags' array, NEVER in the caption text
   - Mix of broad and specific tags
   - Include brand name as hashtag if detected
9. {brand_rule}
10. {language_rule}

WRONG EXAMPLE (DO NOT DO THIS):
"Lost in the city's towering skyscrapers 🏙️✨ | #UrbanVibes #CityLife"

{example_correct}
"""

        full_prompt = f"{system_instruction}\n\n{context}\n\n{output_format}"
        return full_prompt

    def generate_captions(self, analysis_results: Dict, image: Image.Image, platform: str = 'instagram', language: str = 'zh') -> List[Dict]:
        """Generate up to 3 captions with distinct styles: Professional, Creative, Authentic.

        One generation pass is run per style. A pass whose output cannot be
        parsed as a JSON object is skipped.

        Args:
            analysis_results: Upstream analysis dict (see construct_prompt()).
            image: The image handed to the model in the chat message.
            platform: Target platform key.
            language: 'zh', 'en' or 'zh-en'; unknown values are treated as 'zh'.

        Returns:
            A list of parsed caption dicts, each with 'tone' forced to the
            style name. If no pass produced parseable output, a one-element
            list holding the static fallback caption.
        """
        language = self._normalize_language(language)

        # Extract brands for style instructions
        brands_in_image = analysis_results.get('brands', [])
        brand_names = [b[0] for b in brands_in_image[:3]] if brands_in_image else []
        brand_mention_requirement = (
            f" CRITICAL: Mention {', '.join(brand_names)} brand(s) naturally in the caption."
            if brand_names else ""
        )

        # Define 3 distinct styles.
        # NOTE: 'length_modifier' is not read anywhere in this class.
        styles = [
            {
                'name': 'professional',
                'temp': 0.6,
                'instruction': f'Professional style: Concise, elegant, sophisticated. Focus on quality and craftsmanship. Use refined language.{brand_mention_requirement}',
                'length_modifier': 0.8,
            },
            {
                'name': 'creative',
                'temp': 0.7,
                'instruction': f'Creative style: Artistic, expressive, imaginative. Use vivid metaphors and sensory descriptions. Balance detail with flair.{brand_mention_requirement}',
                'length_modifier': 1.0,
            },
            {
                'name': 'authentic',
                'temp': 0.8,
                'instruction': f'Authentic style: Personal, detailed, storytelling. Share rich observations and genuine feelings. Most descriptive and engaging.{brand_mention_requirement}',
                'length_modifier': 1.2,
            },
        ]

        # The base prompt does not depend on the style, so build it once.
        base_prompt = self.construct_prompt(analysis_results, platform, language)

        variations = []
        for style in styles:
            # Append the style instruction to the shared prompt.
            style_prompt = f"""{base_prompt}

**STYLE REQUIREMENT FOR THIS CAPTION:**
{style['instruction']}

Adjust tone to be clearly '{style['name']}' - this should be noticeably different from other styles."""

            messages = [{
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": style_prompt},
                ],
            }]

            text = self.processor.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
            image_inputs, video_inputs = process_vision_info(messages)
            inputs = self.processor(
                text=[text],
                images=image_inputs,
                videos=video_inputs,
                padding=True,
                return_tensors="pt",
            )

            if torch.cuda.is_available():
                inputs = inputs.to("cuda")

            # Generate with the style-specific temperature.
            config = self.generation_config.copy()
            config['temperature'] = style['temp']

            with torch.no_grad():
                generated_ids = self.model.generate(**inputs, **config)

            # Drop the prompt tokens so only newly generated text is decoded.
            generated_ids_trimmed = [
                out_ids[len(in_ids):]
                for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
            ]
            output_text = self.processor.batch_decode(
                generated_ids_trimmed,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False,
            )[0]

            parsed = self._parse_json_output(output_text)
            if not parsed:
                continue

            # Force the correct tone regardless of what the model reported.
            parsed['tone'] = style['name']

            # Remove any hashtags that leaked into the caption. The value may
            # be a non-string (e.g. JSON null), which is left untouched.
            if isinstance(parsed.get('caption'), str):
                parsed['caption'] = self._remove_hashtags_from_caption(parsed['caption'])

            # Convert Simplified Chinese to Traditional for Chinese output.
            if language in ('zh', 'zh-en'):
                parsed = self._convert_to_traditional(parsed)

            variations.append(parsed)

        return variations if variations else [self._get_fallback_caption(platform, language)]

    def _remove_hashtags_from_caption(self, caption: str) -> str:
        """Remove any hashtags, pipes, and debug info that leaked into caption text.

        Args:
            caption: Raw caption text from the model.

        Returns:
            The cleaned caption with collapsed whitespace.
        """
        # Drop the pipe symbol and everything after it (debug info).
        # Example: "Text 🕰️🌉 | SoftDiffusedLight" -> "Text 🕰️🌉"
        if '|' in caption:
            caption = caption.split('|')[0].strip()

        # Remove hashtags. \w is Unicode-aware for str patterns, so this also
        # covers Chinese hashtags.
        caption = re.sub(r'#\w+', '', caption)

        # Remove a suspicious all-caps token at the very end (like "BLACKBELT").
        # NOTE: this applies to any trailing all-caps word longer than three
        # characters without punctuation, including an upper-case brand name.
        words = caption.split()
        if words:
            last_word = words[-1].strip('✨💎👗🌟💫🖤')
            if last_word.isupper() and len(last_word) > 3 and not any(char in last_word for char in '.,!?'):
                caption = ' '.join(words[:-1])

        # Remove a run of four or more trailing emojis from this code-point range.
        caption = re.sub(r'[\U0001F300-\U0001F9FF]{4,}$', '', caption)

        # Collapse whitespace runs and trim the ends.
        caption = re.sub(r'\s+', ' ', caption)
        caption = caption.strip()

        # Final cleanup: if the caption ends with two or more of these emojis,
        # strip the whole trailing emoji/whitespace run.
        if re.search(r'[✨💎👗🌟💫🖤]{2,}\s*$', caption):
            caption = re.sub(r'[✨💎👗🌟💫🖤\s]+$', '', caption).strip()

        return caption

    def _convert_to_traditional(self, caption: Dict) -> Dict:
        """Convert Simplified Chinese to Traditional Chinese in a caption dict.

        Both the 'caption' text and any string entries of the 'hashtags' list
        are converted, so hashtags follow the same Traditional-Chinese rule the
        prompt asks for. Non-string values are left untouched.

        Args:
            caption: Parsed caption dict; modified in place.

        Returns:
            The same dict, for chaining.
        """
        if isinstance(caption.get('caption'), str):
            caption['caption'] = self.cc.convert(caption['caption'])

        hashtags = caption.get('hashtags')
        if isinstance(hashtags, list):
            caption['hashtags'] = [
                self.cc.convert(tag) if isinstance(tag, str) else tag
                for tag in hashtags
            ]
        return caption

    def _parse_json_output(self, text: str) -> Optional[Dict]:
        """Extract and parse the JSON object embedded in model output.

        The candidate span runs from the first '{' to the last '}' in ``text``.

        Args:
            text: Decoded model output.

        Returns:
            The parsed dict, or None when no such span exists or it is not a
            valid JSON object.
        """
        if not isinstance(text, str):
            return None

        start = text.find('{')
        end = text.rfind('}') + 1
        if start == -1 or end <= start:
            return None

        try:
            parsed = json.loads(text[start:end])
        except (json.JSONDecodeError, RecursionError):
            # Malformed or pathologically nested output: treat as unparseable.
            return None
        return parsed if isinstance(parsed, dict) else None

    def _get_fallback_caption(self, platform: str, language: str) -> Dict:
        """Return a static caption used when no generation could be parsed.

        Args:
            platform: Echoed back in the 'platform' field.
            language: 'en' selects the English fallback; anything else gets
                the Traditional Chinese one.
        """
        if language == 'en':
            return {
                'caption': 'Every moment tells a story worth sharing. The world around us is filled with beauty waiting to be discovered. Take a pause and appreciate the details that make life extraordinary. What caught your eye today? ✨',
                'hashtags': ['photography', 'daily', 'lifestyle', 'moment', 'capture'],
                'tone': 'casual',
                'platform': platform,
            }
        return {
            'caption': '每個瞬間都值得被記錄與分享。生活中充滿了等待被發現的美好細節。停下腳步,用心感受周遭的一切。今天什麼畫面觸動了你的心?✨',
            'hashtags': ['攝影', '日常', '生活', '瞬間', '分享'],
            'tone': 'casual',
            'platform': platform,
        }


print("✓ CaptionGenerationManager (with Auto* classes for flexible model support) defined")