# File size: 25,639 bytes · revision 6a3bd1f
import torch
from transformers import AutoModelForImageTextToText, AutoProcessor
from qwen_vl_utils import process_vision_info
from PIL import Image
from typing import List, Dict
import json
from opencc import OpenCC
import warnings
class CaptionGenerationManager:
"""Caption generation using Vision-Language Models (supports Qwen2.5-VL, Qwen3-VL, etc.)"""
def __init__(self, model_name: str = "Qwen/Qwen2.5-VL-7B-Instruct"):
    """Load the processor/model pair and set up generation and platform defaults.

    Args:
        model_name: Vision-Language model name, e.g.:
            - "Qwen/Qwen2.5-VL-7B-Instruct" (default)
            - "Qwen/Qwen3-VL-8B-Instruct" (2025 latest)
    """
    print(f"Loading Vision-Language Model: {model_name}...")

    # Silence FutureWarning messages attributed to the transformers package.
    warnings.filterwarnings("ignore", category=FutureWarning, module="transformers")

    # Auto* classes keep this constructor independent of the concrete
    # model family (Qwen2.5-VL, Qwen3-VL, ...).
    self.processor = AutoProcessor.from_pretrained(model_name, use_fast=False)
    load_kwargs = {
        "dtype": torch.bfloat16,  # keyword is `dtype` (previously `torch_dtype`)
        "device_map": "auto",
    }
    self.model = AutoModelForImageTextToText.from_pretrained(model_name, **load_kwargs)

    # OpenCC 's2t' profile: Simplified Chinese -> Traditional Chinese.
    self.cc = OpenCC('s2t')

    # Default sampling settings; generate_captions overrides the temperature
    # per style. max_new_tokens was raised from 200 to avoid truncation.
    self.generation_config = dict(
        temperature=0.7,
        top_p=0.9,
        max_new_tokens=300,
        repetition_penalty=1.1,
    )

    def _template(style, emoji_count, hashtag_count, min_length, max_length, features):
        # Small local factory so every platform entry has the same key layout.
        return {
            'style': style,
            'emoji_count': emoji_count,
            'hashtag_count': hashtag_count,
            'min_length': min_length,
            'max_length': max_length,
            'features': features,
        }

    # Per-platform caption constraints consumed by construct_prompt.
    self.platform_templates = {
        'instagram': _template(
            'storytelling, aesthetic', '2-3', '8-10', 120, 220,
            ['call-to-action', 'question', 'relatable'],
        ),
        'tiktok': _template(
            'brief, punchy', '1-2', '5-8', 60, 120,
            ['trending', 'POV', 'relatable'],
        ),
        'xiaohongshu': _template(
            'structured, informative, detailed', '5-8', '8-12', 180, 500,
            ['tips', 'bullets', 'sharing-tone'],
        ),
    }

    # Report only the part of the name after the last '/'.
    short_name = model_name.rsplit('/', 1)[-1]
    print(f"✓ {short_name} loaded successfully (using Auto* classes for flexibility)")
def construct_prompt(self, analysis_results: Dict, platform: str = 'instagram', language: str = 'zh') -> str:
    """Build the text prompt that accompanies the image for caption generation.

    The prompt has three parts: a system instruction (role, language rules,
    platform), a context section (objects, scene, lighting, brands, required
    caption structure) and an output-format section (JSON schema, strict
    requirements, hashtag rules and a worked example).

    Args:
        analysis_results: Upstream analysis dict. Keys read here are
            'detections' (items carrying a 'class_name'), 'brands' (items
            whose first element is the brand name), 'scene_analysis' (with
            'lighting' / 'urban' / 'mood' sub-dicts) and 'composition'.
            Missing or empty entries fall back to neutral defaults.
        platform: Key into ``self.platform_templates``; unknown platforms
            use the 'instagram' template.
        language: 'zh' (Traditional Chinese), 'en' (English) or 'zh-en'
            (bilingual). Any other value gets the 'zh' language instruction
            together with the bilingual example and hashtag rules.

    Returns:
        The complete prompt string.
    """
    platform_config = self.platform_templates.get(platform, self.platform_templates['instagram'])

    # English-only prompts must not contain Chinese sample text: the prompt
    # itself forbids Chinese output, and Chinese examples marked "CORRECT"
    # would contradict that instruction.
    english_only = language == 'en'

    # Language-specific instructions
    language_instructions = {
        'zh': '請使用繁體中文生成標題和標籤。語言要自然流暢,符合華語社群媒體的表達習慣。避免使用簡體字。當偵測到品牌時,必須在標題中提及品牌名稱。',
        'en': '''🚨 CRITICAL LANGUAGE REQUIREMENT 🚨
Generate captions and hashtags EXCLUSIVELY in English.
- NEVER use Chinese characters (Traditional or Simplified)
- NEVER mix languages
- Use natural, engaging language suitable for international social media
- When brands are detected, mention them naturally in English
- All text output must be 100% English only
This is MANDATORY and NON-NEGOTIABLE.''',
        'zh-en': '''生成雙語內容:標題使用繁體中文,同時提供英文翻譯。標籤混合使用中英文以擴大觸及範圍。當偵測到品牌時,必須在標題中提及品牌名稱。
🚨 重要:雙語一致性要求 🚨
- 中文和英文必須表達相同的核心意義
- 允許表達方式的差異(形容詞、語法不同)
- 但整體訊息、語氣、品牌提及必須一致
- 兩種語言都要朝同一方向詮釋內容'''
    }
    language_rule = language_instructions.get(language, language_instructions['zh'])

    system_instruction = f"""You are a professional social media content strategist.
{language_rule}
Target platform: {platform}
Content style: Authentic, creative, and optimized for engagement.
CRITICAL RULE: Never include hashtags (symbols starting with #) in the caption text. Hashtags must only appear in the separate 'hashtags' array."""

    # Extract analysis context; `or` guards tolerate explicit None values.
    objects = analysis_results.get('detections') or []
    brands = analysis_results.get('brands') or []
    scene_info = analysis_results.get('scene_analysis') or {}
    composition = analysis_results.get('composition') or {}

    # Lighting is read from scene_analysis (the original comment notes it is
    # the fused value written by DetectionFusionManager).
    lighting_info = scene_info.get('lighting') or {}
    lighting = lighting_info.get('top', 'natural light')
    try:
        lighting_confidence = float(lighting_info.get('confidence', 0.7))
    except (TypeError, ValueError):
        # A missing/invalid confidence must not break prompt formatting.
        lighting_confidence = 0.7

    # Explicit Chinese translations keep the lighting wording consistent.
    lighting_translations_zh = {
        'soft diffused light': '柔和漫射光',
        'overcast atmosphere': '陰天氛圍',
        'natural daylight': '自然日光',
        'warm ambient light': '溫暖環境光',
        'evening light': '傍晚光線',
        'bright sunlight': '明亮陽光',
        'golden hour': '金黃時刻',
        'blue hour': '藍調時刻'
    }
    # Only pure-Chinese mode shows the translated label; English and
    # bilingual prompts use the English label.
    if language == 'zh':
        lighting_display = lighting_translations_zh.get(lighting, lighting)
    else:
        lighting_display = lighting

    # Term used in the "do not substitute" rule. It must differ from the
    # actual lighting, otherwise the rule would forbid the very description
    # the prompt demands (the golden-hour case).
    golden_term, blue_term = ('golden hour', 'blue hour') if english_only else ('金黃時刻', '藍調時刻')
    is_golden_hour = lighting in ('golden hour', '金黃時刻')
    lighting_to_avoid = blue_term if is_golden_hour else golden_term

    # Skip detections without a class name instead of raising KeyError.
    object_names = [obj.get('class_name') for obj in objects[:10] if isinstance(obj, dict)]
    objects_str = ', '.join(str(name) for name in object_names if name) or 'None detected'

    # Brand emphasis is deliberately repeated in several prompt sections.
    brands_list = [str(b[0]) for b in brands[:5]]
    primary_brand = brands_list[0] if brands_list else None
    if primary_brand is not None:
        brands_str = ', '.join(brands_list)
        # Sample sentences follow the output language and no longer
        # hard-code a lighting phrase that could conflict with the
        # exact-lighting rule.
        if english_only:
            brand_example_ok = f"This {primary_brand} classic black quilted leather bag..."
            brand_example_bad = "This classic black quilted leather bag..."
        else:
            brand_example_ok = f"這款{primary_brand}經典黑色菱格紋皮革包..."
            brand_example_bad = "這款經典黑色菱格紋皮革包..."
        brand_emphasis = f"""
🚨 CRITICAL BRAND REQUIREMENT 🚨
The following brands were POSITIVELY IDENTIFIED in this image: {brands_str}
YOU ABSOLUTELY MUST:
1. Mention the brand name "{primary_brand}" explicitly in the FIRST sentence
2. Use the exact brand name - do not use generic terms like "bag" or "accessory" without the brand
3. Write naturally as if you're excited to share this {primary_brand} item
4. Example: "{brand_example_ok}" (CORRECT)
5. NOT acceptable: "{brand_example_bad}" (WRONG - missing brand name!)
THIS IS MANDATORY - The caption will be rejected if it doesn't mention {brands_str}.
"""
        brand_hook = "\n".join([
            f"- 🚨 MANDATORY: Start with the BRAND NAME '{primary_brand}' in the FIRST sentence!",
            f"- Example (CORRECT): '{brand_example_ok}'",
            f"- Example (WRONG): '{brand_example_bad}' (missing {primary_brand}!)",
        ])
        # Rule 9 keeps the original language split: Chinese sample only in
        # pure 'zh' mode, English sample otherwise.
        if language == 'zh':
            first_sentence_example = f"'這款{primary_brand}經典黑色...'"
        else:
            first_sentence_example = f"'This {primary_brand} classic black...'"
        brand_rule = (
            f"🚨 CRITICAL BRAND REQUIREMENT 🚨 - The brand name '{primary_brand}' MUST appear in the "
            f"FIRST sentence of your caption. This is MANDATORY and NON-NEGOTIABLE. "
            f"Example: {first_sentence_example}"
        )
    else:
        brands_str = 'None detected'
        brand_emphasis = ""
        brand_hook = ""
        brand_rule = "No brands detected to mention"

    # Rule 10 exists only for English output; emitting a bare "10." for
    # other languages left a dangling, empty requirement in the prompt.
    english_rule = (
        "\n10. 🚨 LANGUAGE REQUIREMENT 🚨 - Output must be 100% ENGLISH ONLY. "
        "NO Chinese characters allowed anywhere."
    ) if english_only else ""

    # Enhanced scene description
    urban_scene = (scene_info.get('urban') or {}).get('top', 'unknown')
    mood = (scene_info.get('mood') or {}).get('top', 'neutral')
    comp_type = composition.get('composition_type', 'standard')

    context = f"""
Analyze this image and generate an engaging, DETAILED social media caption with rich visual descriptions.
**Visual Elements (Describe in Detail):**
- Detected objects: {objects_str}
- Scene composition: {comp_type}
- Urban environment: {urban_scene}
- **IMPORTANT**: Include specific details about:
* Materials (leather, metal, fabric, canvas, etc.)
* Colors (use descriptive terms: jet black, antique gold, midnight blue, etc.)
* Textures (quilted, smooth, matte, glossy, metallic, etc.)
* Design features (stitching patterns, hardware, logos, emblems, etc.)
* Reflections and lighting effects on surfaces
**Atmosphere:**
- Lighting (analyzed with Places365 + CV): {lighting_display} (confidence: {lighting_confidence:.2f})
- Mood: {mood}
**Brand Detection:**
- Identified brands: {brands_str}{brand_emphasis}
**Caption Structure (Required - BE SPECIFIC AND DETAILED):**
1. Opening hook - Most striking visual element with SPECIFIC details (1-2 sentences)
{brand_hook}
- Be SPECIFIC: Include material, color, design features WITH the brand name
2. Visual details - Describe materials, textures, colors, and design elements (2-3 sentences)
- Be SPECIFIC: mention quilting patterns, metal finishes, chain details, logo placements
- Describe how light interacts with materials (reflections on leather, gleam of metal)
- MUST use the EXACT lighting description: "{lighting_display}"
3. Atmospheric context - How lighting and mood create the scene's character (1-2 sentences)
- Connect lighting to the overall visual impact
- Describe depth, shadows, contrasts
4. Emotional connection & Engagement - How this resonates with viewers + call-to-action (1 sentence)
**Content Requirements:**
- Minimum information: 3-4 specific visual details per caption
- Include material types, color descriptions, design features
- Describe how lighting affects the appearance
- Make it vivid and immersive
Platform style: {platform_config['style']}
"""

    # Worked example: uses the detected brand, or "Gucci" as a stand-in.
    example_brand = primary_brand if primary_brand is not None else "Gucci"
    if language == 'zh':
        example_correct = f"""正確範例 - 詳細描述 + 品牌提及 (繁體中文):
"在{lighting_display}的映襯下,這款{example_brand}經典黑色菱格紋皮革包展現奢華質感,V字形縫線在柔軟小牛皮上勾勒出精緻的幾何圖案,復古金色雙G標誌在深色背景中熠熠生輝。金屬鏈條肩帶反射著{lighting_display},增添層次感與立體效果。皮革表面細膩的光澤與霧面質地形成迷人對比,每個細節都彰顯義大利工藝的極致追求。這樣的{example_brand}單品不只是配件,更是品味與格調的完美詮釋。你的衣櫃裡有哪件經典單品?✨🖤"
注意:品牌名稱 "{example_brand}" 出現在第一句!這是正確的做法。
CRITICAL:
- 必須包含材質描述(皮革、金屬等)
- 必須包含顏色細節(黑色、復古金色等)
- 必須包含設計特點(縫線、標誌、鏈條等)
- 必須使用"{lighting_display}"來描述光線
"""
    elif english_only:
        example_correct = f"""CORRECT EXAMPLE - Detailed Description + Brand Mention (ENGLISH ONLY - NO CHINESE):
"Under the {lighting_display}, this {example_brand} classic black quilted leather bag showcases luxurious craftsmanship. V-shaped stitching traces intricate geometric patterns across supple calfskin, while the antique gold double-G logo gleams against the dark backdrop. The metal chain strap catches and reflects the {lighting_display}, adding dimension and depth to the piece. The leather surface presents a captivating contrast between fine sheen and matte texture, with every detail exemplifying Italian artisanship at its finest. This {example_brand} piece isn't just an accessory – it's a perfect expression of taste and sophistication. What's your timeless wardrobe essential? ✨🖤"
NOTE: Brand name "{example_brand}" appears in the FIRST sentence! This is the correct approach.
🚨 ABSOLUTE REQUIREMENT FOR ENGLISH MODE 🚨
- Output must be 100% ENGLISH - zero Chinese characters allowed
- MUST include material descriptions (leather, metal, etc.)
- MUST include color details (black, antique gold, etc.)
- MUST include design features (stitching, logo, chain, etc.)
- MUST use "{lighting_display}" to describe the lighting
- NO Chinese characters anywhere in the output
"""
    else:  # zh-en bilingual
        # NOTE(review): this example is a placeholder that ends up verbatim
        # in the prompt; consider supplying a real bilingual example.
        example_correct = """BILINGUAL EXAMPLE - 雙語範例:
Caption in Traditional Chinese, with English hashtags support.
(Details omitted for brevity)
"""

    # Language-specific hashtag instructions
    if language == 'zh':
        hashtag_instruction = """
【CRITICAL HASHTAG REQUIREMENT - 繁體中文】:
- ALL hashtags MUST be in Traditional Chinese (繁體中文)
- NEVER use English hashtags when language is 繁體中文
- Examples of CORRECT hashtags: ["時尚包包", "奢華風格", "皮革工藝", "精品配件"]
- Examples of WRONG hashtags: ["FashionBlogger", "LuxuryLifestyle"] - DO NOT USE THESE
"""
    elif english_only:
        hashtag_instruction = """
【CRITICAL HASHTAG REQUIREMENT - English】:
- ALL hashtags MUST be in English
- NEVER use Chinese characters in hashtags
- Examples of CORRECT hashtags: ["FashionBlogger", "LuxuryLifestyle", "LeatherCraft"]
"""
    else:  # zh-en
        hashtag_instruction = """
【CRITICAL HASHTAG REQUIREMENT - Bilingual】:
- Hashtags should MIX Traditional Chinese and English
- First half in Chinese, second half in English
- Example: ["時尚包包", "奢華風格", "FashionBlogger", "LuxuryLifestyle"]
"""

    output_format = f"""
Generate output in JSON format:
{{
"caption": "string (minimum {platform_config['min_length']} chars, maximum {platform_config['max_length']} chars, engaging and descriptive)",
"hashtags": ["tag1", "tag2", ...] ({platform_config['hashtag_count']} relevant hashtags),
"tone": "casual|professional|playful",
"platform": "{platform}"
}}
{hashtag_instruction}
STRICT REQUIREMENTS:
1. Caption length: {platform_config['min_length']}-{platform_config['max_length']} characters
2. 🚨 EMOJI REQUIREMENT 🚨 - MUST use EXACTLY {platform_config['emoji_count']} emojis naturally integrated into caption text
- Professional style: 1-2 emojis (e.g., ✨💼🌟)
- Creative style: 2-3 emojis (e.g., 🎨✨💫🌙)
- Authentic style: 2-3 emojis (e.g., 💖👜✨🖤)
- Place emojis naturally within or at end of sentences
3. Caption must be pure descriptive text only - absolutely NO hashtags allowed
4. 🚨 CALL-TO-ACTION REQUIREMENT 🚨 - MUST include an engaging question or CTA at the end
- Professional: Brief professional question (e.g., "What's your go-to piece?")
- Creative: Thought-provoking question (e.g., "How does this speak to you?")
- Authentic: Personal question (e.g., "What's your favorite timeless accessory?")
5. Write 3-4 complete sentences following the structure above
6. Be specific and vivid - describe what you see in detail
7. 【CRITICAL】 MUST use the EXACT lighting description: "{lighting_display}"
- DO NOT substitute with similar terms
- DO NOT use "{lighting_to_avoid}" if the lighting is "{lighting_display}"
- DO NOT invent your own lighting description
8. 🚨 HASHTAG REQUIREMENT 🚨 - Generate {platform_config['hashtag_count']} relevant hashtags
- Hashtags go ONLY in the 'hashtags' array, NEVER in the caption text
- Mix of broad and specific tags
- Include brand name as hashtag if detected
9. {brand_rule}{english_rule}
WRONG EXAMPLE (DO NOT DO THIS):
"Lost in the city's towering skyscrapers 🏙️✨ | #UrbanVibes #CityLife"
{example_correct}
"""

    return f"{system_instruction}\n\n{context}\n\n{output_format}"
def generate_captions(self, analysis_results: Dict, image: Image.Image,
                      platform: str = 'instagram', language: str = 'zh') -> List[Dict]:
    """Generate up to three captions in distinct styles.

    One generation pass is run per style (professional, creative,
    authentic). Each pass sends the image plus a style-specific prompt to
    the model, parses the JSON reply and post-processes the caption.

    Args:
        analysis_results: Upstream analysis dict; forwarded to
            ``construct_prompt`` and read here for 'brands'.
        image: The image shown to the model.
        platform: Target platform key (see ``self.platform_templates``).
        language: 'zh', 'en' or 'zh-en'.

    Returns:
        A list with one dict per style whose reply could be parsed and
        contains a text caption. If no style yields a usable result, a
        single-element list holding the static fallback caption.
    """
    # Brand names are repeated inside each style instruction.
    brands_in_image = analysis_results.get('brands') or []
    brand_names = [str(b[0]) for b in brands_in_image[:3]]
    brand_mention_requirement = (
        f" CRITICAL: Mention {', '.join(brand_names)} brand(s) naturally in the caption."
        if brand_names else ""
    )

    # Three distinct styles, each with its own sampling temperature.
    # NOTE(review): 'length_modifier' is not read anywhere in this method.
    styles = [
        {
            'name': 'professional',
            'temp': 0.6,
            'instruction': f'Professional style: Concise, elegant, sophisticated. Focus on quality and craftsmanship. Use refined language.{brand_mention_requirement}',
            'length_modifier': 0.8  # Shorter, more concise
        },
        {
            'name': 'creative',
            'temp': 0.7,
            'instruction': f'Creative style: Artistic, expressive, imaginative. Use vivid metaphors and sensory descriptions. Balance detail with flair.{brand_mention_requirement}',
            'length_modifier': 1.0  # Medium length
        },
        {
            'name': 'authentic',
            'temp': 0.8,
            'instruction': f'Authentic style: Personal, detailed, storytelling. Share rich observations and genuine feelings. Most descriptive and engaging.{brand_mention_requirement}',
            'length_modifier': 1.2  # Longer, more detailed
        }
    ]

    # The base prompt does not depend on the style, so build it once.
    base_prompt = self.construct_prompt(analysis_results, platform, language)

    variations = []
    for style in styles:
        style_prompt = f"""{base_prompt}
**STYLE REQUIREMENT FOR THIS CAPTION:**
{style['instruction']}
Adjust tone to be clearly '{style['name']}' - this should be noticeably different from other styles."""

        messages = [{
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": style_prompt}
            ]
        }]
        text = self.processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = self.processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt"
        )
        # Follow the device the model was actually placed on (it is loaded
        # with device_map="auto") rather than assuming "cuda".
        inputs = inputs.to(self.model.device)

        # Style-specific temperature on top of the shared defaults.
        config = {**self.generation_config, 'temperature': style['temp']}
        # temperature/top_p are sampling parameters; enable sampling unless
        # the shared config explicitly decided otherwise.
        config.setdefault('do_sample', True)

        with torch.no_grad():
            generated_ids = self.model.generate(**inputs, **config)

        # Drop the prompt tokens so only the newly generated part is decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = self.processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False
        )[0]

        parsed = self._parse_json_output(output_text)
        # A reply without a text caption is useless downstream; skip it so
        # the fallback can kick in if every style fails.
        if not isinstance(parsed, dict) or not isinstance(parsed.get('caption'), str):
            continue

        # Force the tone label to the style that was requested.
        parsed['tone'] = style['name']
        # Remove any hashtags that leaked into the caption text.
        parsed['caption'] = self._remove_hashtags_from_caption(parsed['caption'])
        # Chinese output modes get a Simplified -> Traditional pass.
        if language in ('zh', 'zh-en'):
            parsed = self._convert_to_traditional(parsed)
        variations.append(parsed)

    return variations if variations else [self._get_fallback_caption(platform, language)]
def _remove_hashtags_from_caption(self, caption: str) -> str:
    """Strip hashtags, pipe-separated debug text and trailing noise from a caption.

    Steps, in order:
      1. Keep only the text before the first '|'.
      2. Remove '#tag' tokens (``\\w`` is Unicode-aware, so CJK tags match).
      3. Collapse whitespace and trim.
      4. Drop a trailing all-caps token longer than 3 characters.
      5. Drop a trailing run of 4+ emoji in U+1F300..U+1F9FF.
      6. Drop a trailing cluster of 2+ of the decorative marks ✨💎👗🌟💫🖤.

    Args:
        caption: Raw caption text produced by the model.

    Returns:
        The cleaned caption; '' for empty or missing input.
    """
    import re

    if not caption:
        return ''

    decorative_marks = '✨💎👗🌟💫🖤'

    # Everything after a pipe is treated as leaked debug info,
    # e.g. "Text 🕰️🌉 | SoftDiffusedLight" -> "Text 🕰️🌉".
    if '|' in caption:
        caption = caption.split('|')[0]

    # One pattern covers Latin and CJK hashtags alike.
    caption = re.sub(r'#\w+', '', caption)

    # Normalise whitespace BEFORE the end-anchored checks below. Removing a
    # trailing hashtag leaves a trailing space, which previously stopped the
    # `$`-anchored emoji pattern from ever matching.
    caption = re.sub(r'\s+', ' ', caption).strip()

    # Remove a suspicious stand-alone all-caps word at the end
    # (e.g. "BLACKBELT"); tokens containing sentence punctuation are kept.
    words = caption.split()
    if words:
        last_word = words[-1].strip(decorative_marks)
        looks_like_noise = (
            last_word.isupper()
            and len(last_word) > 3
            and not any(ch in last_word for ch in '.,!?')
        )
        if looks_like_noise:
            caption = ' '.join(words[:-1])

    # Remove an excessive emoji run (4 or more) at the very end.
    caption = re.sub(r'[\U0001F300-\U0001F9FF]{4,}$', '', caption).strip()

    # Final cleanup of trailing decorative clusters such as "✨X 👗💎".
    # NOTE(review): this also strips a legitimate two-emoji ending like
    # "✨🖤"; confirm that is intended.
    if re.search(r'[✨💎👗🌟💫🖤]{2,}\s*$', caption):
        caption = re.sub(r'[✨💎👗🌟💫🖤\s]+$', '', caption).strip()

    return caption
def _convert_to_traditional(self, caption: Dict) -> Dict:
    """Convert Simplified Chinese text in a caption dict to Traditional Chinese.

    Both the 'caption' text and every string entry of 'hashtags' are passed
    through ``self.cc`` so that Simplified characters cannot survive in the
    tags. Non-string values are left untouched instead of being handed to
    the converter.

    Args:
        caption: Parsed caption dict; modified in place.

    Returns:
        The same dict, for call chaining.
    """
    text = caption.get('caption')
    if isinstance(text, str):
        caption['caption'] = self.cc.convert(text)

    tags = caption.get('hashtags')
    if isinstance(tags, list):
        caption['hashtags'] = [
            self.cc.convert(tag) if isinstance(tag, str) else tag
            for tag in tags
        ]
    return caption
def _parse_json_output(self, text: str) -> Dict:
    """Extract the JSON object embedded in the model's reply.

    The reply may wrap the JSON in extra prose or code fences. The span from
    the first '{' to the last '}' is tried first; if that is not valid JSON
    (for instance because later text contains another '}'), the first
    complete JSON value starting at the first '{' is decoded instead.

    Args:
        text: Raw decoded model output.

    Returns:
        The parsed dict, or None when no JSON object can be recovered.
    """
    if not isinstance(text, str):
        return None

    start = text.find('{')
    if start == -1:
        return None
    end = text.rfind('}') + 1

    try:
        parsed = json.loads(text[start:end])
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        try:
            # Decode just the first JSON value and ignore whatever follows.
            parsed, _ = json.JSONDecoder().raw_decode(text[start:])
        except ValueError:
            return None

    return parsed if isinstance(parsed, dict) else None
def _get_fallback_caption(self, platform: str, language: str) -> Dict:
    """Return a static, generic caption for when generation produced nothing.

    Args:
        platform: Echoed back in the 'platform' field of the result.
        language: 'en' selects the English text; every other value selects
            the Traditional Chinese text.

    Returns:
        A dict with 'caption', 'hashtags', 'tone' and 'platform' keys.
    """
    english_text = (
        'Every moment tells a story worth sharing. The world around us is filled with beauty waiting to be discovered. Take a pause and appreciate the details that make life extraordinary. What caught your eye today? ✨',
        ['photography', 'daily', 'lifestyle', 'moment', 'capture'],
    )
    chinese_text = (
        '每個瞬間都值得被記錄與分享。生活中充滿了等待被發現的美好細節。停下腳步,用心感受周遭的一切。今天什麼畫面觸動了你的心?✨',
        ['攝影', '日常', '生活', '瞬間', '分享'],
    )

    body, tags = english_text if language == 'en' else chinese_text
    return {
        'caption': body,
        'hashtags': tags,
        'tone': 'casual',
        'platform': platform,
    }
# Runs when this module (or cell) is executed: reports that the
# CaptionGenerationManager class has been defined.
print("✓ CaptionGenerationManager (with Auto* classes for flexible model support) defined")
|