from typing import Dict, List, Optional

import numpy as np


class DetectionFusionManager:
    """Integrate and prioritize detection results with intelligent lighting fusion."""

    def __init__(self, clip_manager):
        self.clip_manager = clip_manager

    def fuse_lighting_analysis(self, cv_lighting: Dict, clip_scene: Dict) -> Dict:
        """Fuse CV+Places365 lighting analysis with CLIP scene understanding."""
        cv_lighting_type = cv_lighting.get('lighting_type', 'soft diffused light')
        cv_confidence = cv_lighting.get('confidence', 0.7)
        cv_features = cv_lighting.get('cv_features', {})

        # Get CLIP's lighting prediction
        clip_lighting_data = clip_scene.get('lighting', {})
        clip_lighting_type = clip_lighting_data.get('top', 'natural light')
        clip_confidence = clip_lighting_data.get('confidence', 0.5)

        # Fusion strategy:
        #   1. If CV has high confidence (> 0.85), trust it.
        #   2. If CV and CLIP semantically agree, boost confidence.
        #   3. Otherwise, fall back to a confidence-weighted average.
        if cv_confidence > 0.85:
            # High confidence from CV+Places365
            final_lighting = cv_lighting_type
            final_confidence = cv_confidence
            fusion_method = 'cv_dominant'
        elif self._lighting_semantically_similar(cv_lighting_type, clip_lighting_type):
            # Semantic agreement between CV and CLIP: prefer CV's more
            # specific description and boost confidence
            final_lighting = cv_lighting_type
            final_confidence = min(cv_confidence * 1.15, 0.95)
            fusion_method = 'consensus'
        else:
            # Weighted fusion based on relative confidence
            cv_weight = cv_confidence / (cv_confidence + clip_confidence)
            clip_weight = 1.0 - cv_weight

            if cv_weight > 0.6:
                # CV clearly outweighs CLIP: use the CV result, slightly discounted
                final_lighting = cv_lighting_type
                final_confidence = cv_confidence * 0.9
                fusion_method = 'cv_weighted'
            else:
                # Genuinely uncertain: fall back to a more generic description
                final_lighting = self._generalize_lighting_description(
                    cv_lighting_type, clip_lighting_type, cv_features
                )
                final_confidence = (cv_confidence * cv_weight
                                    + clip_confidence * clip_weight) * 0.85
                fusion_method = 'generalized'

        return {
            'lighting_type': final_lighting,
            'confidence': min(final_confidence, 0.95),
            'cv_analysis': cv_lighting_type,
            'clip_prediction': clip_lighting_type,
            'fusion_method': fusion_method,
            'cv_confidence': cv_confidence,
            'clip_confidence': clip_confidence,
        }

    def _lighting_semantically_similar(self, cv_type: str, clip_type: str) -> bool:
        """Check whether two lighting descriptions are semantically similar."""
        # Groups of words treated as describing the same kind of lighting
        similarity_groups = [
            {'soft', 'diffused', 'overcast', 'cloudy'},
            {'bright', 'sunny', 'sunlight', 'clear'},
            {'warm', 'golden', 'amber', 'evening'},
            {'natural', 'daylight', 'outdoor'},
            {'cool', 'blue', 'twilight'},
        ]

        cv_words = set(cv_type.lower().split())
        clip_words = set(clip_type.lower().split())

        # Similar if both descriptions draw words from the same semantic group
        for group in similarity_groups:
            if (cv_words & group) and (clip_words & group):
                return True

        # Otherwise, any direct word overlap counts as agreement
        return len(cv_words & clip_words) >= 1
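    # Worked example of the weighted-fusion arithmetic above (hypothetical
    # inputs, not taken from a real run):
    #   cv_confidence = 0.6, clip_confidence = 0.5
    #   cv_weight   = 0.6 / (0.6 + 0.5) ~= 0.545  -> below the 0.6 cutoff
    #   clip_weight = 1.0 - 0.545       ~= 0.455
    #   final_confidence = (0.6 * 0.545 + 0.5 * 0.455) * 0.85 ~= 0.471
    # so the result falls back to a generalized description (see
    # _generalize_lighting_description below) with reduced confidence.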
    def _generalize_lighting_description(self, cv_type: str, clip_type: str,
                                         cv_features: Dict) -> str:
        """Generate a generalized lighting description when CV and CLIP disagree."""
        brightness = cv_features.get('brightness', 128)
        contrast = cv_features.get('contrast', 50)
        color_temp = cv_features.get('color_temp', 1.0)

        # Normalize raw features to [0, 1] instead of using hard pixel thresholds
        brightness_norm = brightness / 255.0
        contrast_norm = min(contrast / 100.0, 1.0)

        # Simple decision tree over the physical features
        if contrast_norm < 0.5:
            # Low contrast: flat, diffused illumination
            if color_temp < 1.0:
                return 'soft diffused light'
            return 'warm ambient light'
        elif brightness_norm > 0.7:
            # High brightness
            return 'natural daylight'
        elif color_temp > 1.1:
            # Warm color temperature
            return 'warm ambient light'
        else:
            # Default safe description
            return 'soft diffused light'

    def analyze_composition(self, image, detections: List[Dict]) -> Dict:
        """Analyze image composition from detected bounding boxes."""
        if not detections:
            return {'composition_type': 'empty', 'vertical_ratio': 0.0}

        # Objects whose bounding box is taller than it is wide
        vertical_objects = [
            d for d in detections
            if (d['bbox'][3] - d['bbox'][1]) > (d['bbox'][2] - d['bbox'][0])
        ]
        vertical_ratio = len(vertical_objects) / len(detections)

        # Classify the composition by its ratio of vertical elements
        if vertical_ratio > 0.6:
            composition_type = 'urban canyon'
        elif vertical_ratio > 0.4:
            composition_type = 'vertical emphasis'
        else:
            composition_type = 'standard street view'

        return {
            'composition_type': composition_type,
            'vertical_ratio': vertical_ratio,
            'vertical_objects_count': len(vertical_objects),
            'total_objects': len(detections),
        }

    def fuse_detections(self, yolo_results: List[Dict], unknown_regions: List[Dict],
                        scene_info: Dict, image=None,
                        cv_lighting: Optional[Dict] = None) -> Dict:
        """Fuse all detection results with intelligent lighting fusion."""
        all_detections = []

        # Score YOLO detections by attention
        for det in yolo_results:
            det['attention_score'] = self._calculate_attention_score(det)
            all_detections.append(det)

        # Classify unknown regions using OpenCLIP
        for region in unknown_regions:
            if 'image' not in region:
                continue
            classification = self.clip_manager.classify_hierarchical(region['image'])
            all_detections.append({
                'class_name': classification['top_prediction'],
                'bbox': region['bbox'],
                'confidence': classification.get('confidence', 0.5),
                'attention_score': region.get('saliency_score', 0.5),
                'source': 'openclip',
            })

        # Rank by attention score, highest first
        ranked_detections = sorted(
            all_detections,
            key=lambda x: x['attention_score'],
            reverse=True,
        )

        # Keep the top 15 detections; beyond that, keep only
        # high-confidence brand detections
        filtered = ranked_detections[:15]
        filtered.extend(
            det for det in ranked_detections[15:]
            if det.get('brand') and det.get('brand_confidence', 0) > 0.45
        )

        # Analyze composition (`image is not None` avoids the ambiguous
        # truth value of a NumPy array)
        composition = self.analyze_composition(image, filtered) if image is not None else {}

        # Intelligent lighting fusion
        if cv_lighting:
            fused_lighting = self.fuse_lighting_analysis(cv_lighting, scene_info)
            # Overwrite scene_info's lighting with the fused result
            scene_info['lighting'] = {
                'top': fused_lighting['lighting_type'],
                'confidence': fused_lighting['confidence'],
                'fusion_details': fused_lighting,
            }

        return {
            'detections': filtered,
            'scene_info': scene_info,
            'composition': composition,
            'total_objects': len(all_detections),
        }

    def _calculate_attention_score(self, detection: Dict) -> float:
        """Calculate an attention score from position, size, and confidence."""
        x1, y1, x2, y2 = detection['bbox']
        center_x = (x1 + x2) / 2
        center_y = (y1 + y2) / 2

        # Heuristic: coordinates above 100 are assumed to be pixels rather
        # than normalized [0, 1] values
        is_pixel_coords = x2 > 100

        if is_pixel_coords:
            # Pixel coordinates: fall back to a neutral position score
            position_score = 0.5
        else:
            # Normalized coordinates: favor objects near the image center
            position_score = 1.0 - (abs(center_x - 0.5) + abs(center_y - 0.5))

        area = abs((x2 - x1) * (y2 - y1))
        if is_pixel_coords:
            # Roughly normalize pixel areas against a 1000x1000 frame
            area = area / (1000 * 1000)
        size_score = min(area, 0.5)

        conf_score = detection.get('confidence', 0.5)

        # Weighted combination: position 30%, size 30%, confidence 40%
        return 0.3 * position_score + 0.3 * size_score + 0.4 * conf_score


print("✓ DetectionFusionManager (V2 with intelligent fusion) defined")
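
# Minimal usage sketch. Assumptions: `StubClipManager` below is a hypothetical
# stand-in for the real CLIP manager, and the lighting dicts are illustrative
# values, not output from an actual pipeline run.
class StubClipManager:
    def classify_hierarchical(self, image):
        return {'top_prediction': 'street sign', 'confidence': 0.62}


fusion_manager = DetectionFusionManager(clip_manager=StubClipManager())

sample_cv_lighting = {
    'lighting_type': 'bright sunlight',
    'confidence': 0.72,
    'cv_features': {'brightness': 190, 'contrast': 65, 'color_temp': 1.05},
}
sample_clip_scene = {'lighting': {'top': 'sunny clear sky', 'confidence': 0.55}}

fused = fusion_manager.fuse_lighting_analysis(sample_cv_lighting, sample_clip_scene)
print(fused['lighting_type'], fused['fusion_method'], round(fused['confidence'], 3))
# 'bright' / 'sunny' share a semantic group, so the 'consensus' branch fires:
# prints "bright sunlight consensus 0.828", i.e. min(0.72 * 1.15, 0.95).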