import sys
import time
import traceback

from PIL import Image
from typing import Dict, List, Callable, Optional

from image_processor_manager import ImageProcessorManager
from yolo_detection_manager import YOLODetectionManager
from saliency_detection_manager import SaliencyDetectionManager
from openclip_semantic_manager import OpenCLIPSemanticManager
from lighting_analysis_manager import LightingAnalysisManager
from ocr_engine_manager import OCREngineManager
from prompt_library_manager import PromptLibraryManager
from brand_recognition_manager import BrandRecognitionManager
from brand_visualization_manager import BrandVisualizationManager
from brand_verification_manager import BrandVerificationManager
from scene_compatibility_manager import SceneCompatibilityManager
from caption_generation_manager import CaptionGenerationManager
from detection_fusion_manager import DetectionFusionManager
from output_processing_manager import OutputProcessingManager
from batch_processing_manager import BatchProcessingManager

class PixcribePipeline:
    """Main Facade coordinating all components (V5, with multi-language support and batch processing)"""

    def __init__(self, yolo_variant='l', vlm_model_name='Qwen/Qwen2.5-VL-7B-Instruct'):
        """
        Args:
            yolo_variant: 'm', 'l' (default), or 'x'
            vlm_model_name: Vision-Language Model name (default: Qwen2.5-VL-7B-Instruct).
                Can be changed to 'Qwen/Qwen3-VL-8B-Instruct' for the latest model.
        """
        print("="*60)
        print("Initializing Pixcribe Pipeline V5...")
        print("="*60)

        start_time = time.time()

        # Core perception components
        self.image_processor = ImageProcessorManager()
        self.yolo_detector = YOLODetectionManager(variant=yolo_variant)
        self.saliency_detector = SaliencyDetectionManager()
        self.clip_semantic = OpenCLIPSemanticManager()
        self.lighting_analyzer = LightingAnalysisManager()
        self.ocr_engine = OCREngineManager()

        # Shared prompt/knowledge library
        self.prompt_library = PromptLibraryManager()

        # Brand recognition combines CLIP semantics, OCR, and the prompt library
        self.brand_recognizer = BrandRecognitionManager(
            self.clip_semantic, self.ocr_engine, self.prompt_library
        )

        # Draws verified brand detections onto the image
        self.brand_visualizer = BrandVisualizationManager()

        # Vision-Language Model used for caption generation
        self.caption_generator = CaptionGenerationManager(model_name=vlm_model_name)

        # Brand verification reuses the caption generator's VLM
        self.brand_verifier = BrandVerificationManager(self.caption_generator)

        # Filters detected brands against the recognized scene
        self.scene_compatibility = SceneCompatibilityManager(self.prompt_library)

        # Fuses YOLO, saliency, scene, and lighting results
        self.fusion_manager = DetectionFusionManager(self.clip_semantic)

        # Output validation and hashtag generation
        self.output_processor = OutputProcessingManager(self.prompt_library)

        # Batch processing delegates back to this pipeline (Facade)
        self.batch_processor = BatchProcessingManager(pipeline=self)

        elapsed = time.time() - start_time
        print("="*60)
        print(f"✓ Pipeline V5 initialized successfully with batch processing (Time: {elapsed:.2f}s)")
        print("="*60)
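    # Construction sketch (commented out; the alternative checkpoint named in the
    # docstring above is assumed to be available in your environment):
    #   pipeline = PixcribePipeline(yolo_variant='x',
    #                               vlm_model_name='Qwen/Qwen3-VL-8B-Instruct')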

    def process_image(self, image, platform='instagram', yolo_variant='l', language='zh') -> Dict:
        """End-to-end image processing pipeline.

        Args:
            image: PIL Image or path
            platform: 'instagram', 'tiktok', or 'xiaohongshu'
            yolo_variant: 'm', 'l' (default), or 'x'
            language: 'zh' (Traditional Chinese), 'en' (English), or 'zh-en' (Bilingual)

        Returns:
            Processing results dictionary with brand visualizations
        """
        print(f"\nProcessing image (Platform: {platform}, Language: {language})...")
        start_time = time.time()
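
        # Stage overview (mirrors the numbered progress prints below): preprocessing ->
        # YOLO detection -> saliency -> unknown-region extraction -> brand recognition ->
        # scene analysis and compatibility filtering -> VLM verification with three-way
        # voting -> brand visualization -> lighting analysis -> detection fusion ->
        # caption generation -> output validation.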
        try:
            print("[1/11] Preprocessing image...")
            processed_img = self.image_processor.load_image(image)
            yolo_input = self.image_processor.preprocess_for_yolo(processed_img)

            print("[2/11] YOLO object detection...")
            yolo_results = self.yolo_detector.detect(yolo_input)
            print(f" Detected {len(yolo_results)} objects")

            print("[3/11] Saliency detection...")
            salient_regions = self.saliency_detector.detect_salient_regions(processed_img)
            print(f" Found {len(salient_regions)} salient regions")

            print("[4/11] Identifying unknown objects...")
            unknown_regions = self.saliency_detector.extract_unknown_regions(
                salient_regions, yolo_results
            )
            print(f" Found {len(unknown_regions)} unknown regions")

            print("[5/11] Brand recognition...")
            brands = []
            brand_detections = []

            # First pass: check YOLO detections that are plausible brand carriers
            brand_relevant = self.yolo_detector.filter_brand_relevant_objects(yolo_results)
            if brand_relevant:
                print(f" Checking {len(brand_relevant)} YOLO brand-relevant objects...")
                for det in brand_relevant[:5]:
                    region = processed_img.crop(det['bbox'])
                    brand_result = self.brand_recognizer.recognize_brand(
                        region, processed_img, region_bbox=det['bbox']
                    )

                    if brand_result:
                        for brand_name, confidence, bbox in brand_result[:2]:
                            brands.append((brand_name, confidence))

                            brand_info = self.prompt_library.get_brand_prompts(brand_name)
                            category = brand_info.get('category', 'default') if brand_info else 'default'

                            brand_detections.append({
                                'name': brand_name,
                                'confidence': confidence,
                                'bbox': bbox,
                                'category': category
                            })

            # Second pass: scan the rest of the image for brands missed by YOLO
            print(" Performing intelligent full-image brand scan...")
            full_image_brands = self.brand_recognizer.scan_full_image_for_brands(
                processed_img,
                exclude_bboxes=[bd['bbox'] for bd in brand_detections if bd.get('bbox')],
                saliency_regions=salient_regions
            )

            if full_image_brands:
                print(f" Full-image scan found {len(full_image_brands)} additional brands")
                for brand_name, confidence, bbox in full_image_brands:
                    if not any(bd['name'] == brand_name for bd in brand_detections):
                        brands.append((brand_name, confidence))

                        brand_info = self.prompt_library.get_brand_prompts(brand_name)
                        category = brand_info.get('category', 'default') if brand_info else 'default'

                        brand_detections.append({
                            'name': brand_name,
                            'confidence': confidence,
                            'bbox': bbox,
                            'category': category
                        })

            print(f" Identified {len(brands)} brand instances (before verification)")
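
            # Each brand_detections entry has the form
            # {'name': str, 'confidence': float, 'bbox': (left, top, right, bottom), 'category': str};
            # the (left, top, right, bottom) layout is an assumption based on the PIL
            # crop() convention used for det['bbox'] above.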

            print("[5.5/11] Scene understanding (CLIP)...")
            scene_analysis = self.clip_semantic.analyze_scene(processed_img)
            print(f" Scene: {scene_analysis.get('urban', {}).get('top', 'unknown')}")

            if brands:
                print("[5.6/11] Checking scene compatibility...")
                brands_with_bbox = [(b[0], b[1], brand_detections[i]['bbox'])
                                    for i, b in enumerate(brands)]
                compatible_brands = self.scene_compatibility.batch_check_compatibility(
                    brands_with_bbox, scene_analysis
                )
                print(f" {len(compatible_brands)} brands passed compatibility check")

                # Keep only brands that are plausible in the recognized scene
                if compatible_brands:
                    brands = [(b[0], b[1]) for b in compatible_brands]
                    brand_detections = []
                    for brand_name, confidence, bbox in compatible_brands:
                        brand_info = self.prompt_library.get_brand_prompts(brand_name)
                        category = brand_info.get('category', 'default') if brand_info else 'default'
                        brand_detections.append({
                            'name': brand_name,
                            'confidence': confidence,
                            'bbox': bbox,
                            'category': category
                        })
                else:
                    brands = []
                    brand_detections = []

            if brand_detections:
                print("[5.7/11] VLM brand verification...")
                vlm_verification = self.brand_verifier.verify_brands(
                    processed_img, [(bd['name'], bd['confidence'], bd['bbox'])
                                    for bd in brand_detections]
                )
                print(f" VLM verified {len(vlm_verification.get('verified_brands', []))} brands")

                # OCR evidence for three-way voting: a default score of 0.5 paired with
                # each detection confidence
                ocr_brands = {}
                for brand_name, conf in brands:
                    if brand_name not in ocr_brands:
                        ocr_brands[brand_name] = (0.5, conf)

                # Three-way voting across detector results, OCR evidence, and VLM verification
                final_brands = self.brand_verifier.three_way_voting(
                    [(bd['name'], bd['confidence'], bd['bbox']) for bd in brand_detections],
                    ocr_brands,
                    vlm_verification
                )
                print(f" Final verified brands: {len(final_brands)}")

                if final_brands:
                    brands = [(b[0], b[1]) for b in final_brands]
                    brand_detections = []
                    for brand_name, confidence, bbox in final_brands:
                        brand_info = self.prompt_library.get_brand_prompts(brand_name)
                        category = brand_info.get('category', 'default') if brand_info else 'default'
                        brand_detections.append({
                            'name': brand_name,
                            'confidence': confidence,
                            'bbox': bbox,
                            'category': category
                        })
                else:
                    brands = []
                    brand_detections = []

            print("[6/11] Generating brand visualization...")
            if brand_detections:
                visualized_image = self.brand_visualizer.draw_brand_detections(
                    processed_img.copy(), brand_detections
                )
            else:
                visualized_image = processed_img

            print("[7/11] Analyzing lighting conditions...")
            cv_lighting = self.lighting_analyzer.analyze_lighting(processed_img)
            print(f" CV Lighting: {cv_lighting['lighting_type']} (confidence: {cv_lighting['confidence']:.2f})")
            print(f" Details: brightness={cv_lighting['cv_features']['brightness']:.1f}, "
                  f"temp_ratio={cv_lighting['cv_features']['color_temp']:.2f}, "
                  f"contrast={cv_lighting['cv_features']['contrast']:.1f}")

            print("[8/11] Additional scene analysis...")
            print(f" CLIP Lighting: {scene_analysis.get('lighting', {}).get('top', 'unknown')}")
            print(f" Mood: {scene_analysis.get('mood', {}).get('top', 'unknown')}")

            print("[9/11] Fusing detection results...")
            fused_results = self.fusion_manager.fuse_detections(
                yolo_results, unknown_regions, scene_analysis, processed_img, cv_lighting
            )
            fused_results['brands'] = brands
            fused_results['scene_analysis'] = scene_analysis

            fused_lighting = fused_results['scene_analysis']['lighting']['top']
            print(f" Fused Lighting: {fused_lighting}")

            print("[10/11] Generating captions...")
            captions = self.caption_generator.generate_captions(
                fused_results, processed_img, platform, language
            )

            print("[11/11] Output processing...")
            validated_captions = []
            for caption in captions:
                # If the VLM produced fewer than three hashtags, generate smart hashtags
                # from the detections, scene, and brands instead
                if not caption.get('hashtags') or len(caption.get('hashtags', [])) < 3:
                    print(f" [DEBUG] Caption has {len(caption.get('hashtags', []))} hashtags, generating smart hashtags...")
                    caption['hashtags'] = self.output_processor.generate_smart_hashtags(
                        fused_results['detections'],
                        scene_analysis,
                        brands,
                        platform,
                        language
                    )
                else:
                    print(f" [DEBUG] Caption has {len(caption['hashtags'])} VLM-generated hashtags")

                is_valid, msg = self.output_processor.validate_output(
                    caption, platform,
                    detections=fused_results['detections'],
                    scene_info=scene_analysis,
                    brands=brands,
                    language=language
                )
                if is_valid:
                    validated_captions.append(caption)
                else:
                    print(f" [DEBUG] Caption validation failed: {msg}")

            elapsed = time.time() - start_time
            print(f"\n✓ Processing complete (Total time: {elapsed:.2f}s)")
            print(f" Generated {len(validated_captions)} caption variations")

            return {
                'captions': validated_captions,
                'detections': fused_results['detections'],
                'brands': brands,
                'brand_detections': brand_detections,
                'visualized_image': visualized_image,
                'scene': scene_analysis,
                'composition': fused_results.get('composition', {}),
                'lighting': cv_lighting,
                'processing_time': elapsed
            }

        except Exception as e:
            print(f"\n✗ Processing error: {str(e)}")
            traceback.print_exc()
            raise
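
    # Typical single-image call (a commented sketch, given a constructed pipeline;
    # 'street.jpg' is a hypothetical path):
    #   result = pipeline.process_image('street.jpg', platform='instagram', language='zh')
    #   for cap in result['captions']:
    #       print(cap)
    #   result['visualized_image'].show()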

    def process_batch(
        self,
        images: List[Image.Image],
        platform: str = 'instagram',
        yolo_variant: str = 'l',
        language: str = 'zh',
        progress_callback: Optional[Callable] = None
    ) -> Dict:
        """
        Process multiple images in batch with progress tracking.

        This method provides a Facade interface to the BatchProcessingManager,
        allowing batch processing through the main Pipeline API.

        Args:
            images: List of PIL Image objects to process (max 10)
            platform: Target social media platform ('instagram', 'tiktok', 'xiaohongshu')
            yolo_variant: YOLO model variant ('m', 'l', 'x')
            language: Caption language ('zh' for Traditional Chinese, 'en' for English)
            progress_callback: Optional callback function for progress updates

        Returns:
            Dictionary containing:
                - results: Dict mapping image index to processing results
                - total_processed: Total number of images processed
                - total_success: Number of successfully processed images
                - total_failed: Number of failed images
                - total_time: Total processing time in seconds
                - average_time_per_image: Average time per image in seconds

        Raises:
            ValueError: If images list is empty or exceeds 10 images

        Example:
            >>> images = [Image.open(f'image{i}.jpg') for i in range(1, 6)]
            >>> results = pipeline.process_batch(images, platform='instagram')
            >>> print(f"Processed {results['total_success']}/{results['total_processed']} images")
        """
        return self.batch_processor.process_batch(
            images=images,
            platform=platform,
            yolo_variant=yolo_variant,
            language=language,
            progress_callback=progress_callback
        )


print("✓ PixcribePipeline V5 (with Batch Processing) defined")
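
# Batch usage sketch (commented out; illustrative only. The image paths are hypothetical,
# and the (done, total) progress-callback signature is an assumption, since only
# Optional[Callable] is declared above):
#
#   pipeline = PixcribePipeline(yolo_variant='l')
#   imgs = [Image.open(p) for p in ('a.jpg', 'b.jpg', 'c.jpg')]
#   batch = pipeline.process_batch(
#       imgs, platform='xiaohongshu', language='zh',
#       progress_callback=lambda done, total: print(f"{done}/{total}")
#   )
#   print(f"{batch['total_success']}/{batch['total_processed']} images succeeded")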