from typing import Dict, List, Optional

import numpy as np
|
|
class DetectionFusionManager:
    """Integrate and prioritize detection results with intelligent lighting fusion."""

    def __init__(self, clip_manager):
        self.clip_manager = clip_manager
|
    def fuse_lighting_analysis(self, cv_lighting: Dict, clip_scene: Dict) -> Dict:
        """Intelligently fuse CV+Places365 lighting with CLIP scene understanding."""
        cv_lighting_type = cv_lighting.get('lighting_type', 'soft diffused light')
        cv_confidence = cv_lighting.get('confidence', 0.7)
        cv_features = cv_lighting.get('cv_features', {})

        clip_lighting_data = clip_scene.get('lighting', {})
        clip_lighting_type = clip_lighting_data.get('top', 'natural light')
        clip_confidence = clip_lighting_data.get('confidence', 0.5)

        if cv_confidence > 0.85:
            # High-confidence CV analysis stands on its own.
            final_lighting = cv_lighting_type
            final_confidence = cv_confidence
            fusion_method = 'cv_dominant'
        elif self._lighting_semantically_similar(cv_lighting_type, clip_lighting_type):
            # The two sources agree: keep the CV label with a confidence boost.
            final_lighting = cv_lighting_type
            final_confidence = min(cv_confidence * 1.15, 0.95)
            fusion_method = 'consensus'
        else:
            # Disagreement: weight each source by its relative confidence.
            cv_weight = cv_confidence / (cv_confidence + clip_confidence)
            clip_weight = 1.0 - cv_weight

            if cv_weight > 0.6:
                final_lighting = cv_lighting_type
                final_confidence = cv_confidence * 0.9
                fusion_method = 'cv_weighted'
            else:
                # Neither source dominates: fall back to a generalized description.
                final_lighting = self._generalize_lighting_description(
                    cv_lighting_type, clip_lighting_type, cv_features
                )
                final_confidence = (cv_confidence * cv_weight +
                                    clip_confidence * clip_weight) * 0.85
                fusion_method = 'generalized'

        return {
            'lighting_type': final_lighting,
            'confidence': min(final_confidence, 0.95),
            'cv_analysis': cv_lighting_type,
            'clip_prediction': clip_lighting_type,
            'fusion_method': fusion_method,
            'cv_confidence': cv_confidence,
            'clip_confidence': clip_confidence
        }
|
    def _lighting_semantically_similar(self, cv_type: str, clip_type: str) -> bool:
        """Check whether two lighting descriptions are semantically similar."""
        similarity_groups = [
            {'soft', 'diffused', 'overcast', 'cloudy'},
            {'bright', 'sunny', 'sunlight', 'clear'},
            {'warm', 'golden', 'amber', 'evening'},
            {'natural', 'daylight', 'outdoor'},
            {'cool', 'blue', 'twilight'},
        ]

        cv_words = set(cv_type.lower().split())
        clip_words = set(clip_type.lower().split())

        # Similar if both descriptions touch the same semantic group...
        for group in similarity_groups:
            if (cv_words & group) and (clip_words & group):
                return True

        # ...or if they share at least one word outright.
        return len(cv_words & clip_words) >= 1
|
    def _generalize_lighting_description(self, cv_type: str, clip_type: str,
                                         cv_features: Dict) -> str:
        """Generate a generalized lighting description when CV and CLIP disagree."""
        brightness = cv_features.get('brightness', 128)
        contrast = cv_features.get('contrast', 50)
        color_temp = cv_features.get('color_temp', 1.0)

        # Normalize raw features to [0, 1] for thresholding.
        brightness_norm = brightness / 255.0
        contrast_norm = min(contrast / 100.0, 1.0)

        if contrast_norm < 0.5:
            # Low contrast suggests diffuse light; color temperature picks the tone.
            if color_temp < 1.0:
                return 'soft diffused light'
            return 'warm ambient light'
        elif brightness_norm > 0.7:
            return 'natural daylight'
        elif color_temp > 1.1:
            return 'warm ambient light'
        else:
            return 'soft diffused light'
|
    def analyze_composition(self, image, detections: List[Dict]) -> Dict:
        """Analyze image composition from bounding boxes (the image itself is unused)."""
        if not detections:
            return {'composition_type': 'empty', 'vertical_ratio': 0.0}

        # Objects taller than they are wide read as "vertical".
        vertical_objects = [
            d for d in detections
            if (d['bbox'][3] - d['bbox'][1]) > (d['bbox'][2] - d['bbox'][0])
        ]
        vertical_ratio = len(vertical_objects) / max(len(detections), 1)

        if vertical_ratio > 0.6:
            composition_type = 'urban canyon'
        elif vertical_ratio > 0.4:
            composition_type = 'vertical emphasis'
        else:
            composition_type = 'standard street view'

        return {
            'composition_type': composition_type,
            'vertical_ratio': vertical_ratio,
            'vertical_objects_count': len(vertical_objects),
            'total_objects': len(detections)
        }
|
    def fuse_detections(self, yolo_results: List[Dict], unknown_regions: List[Dict],
                        scene_info: Dict, image=None,
                        cv_lighting: Optional[Dict] = None) -> Dict:
        """Fuse all detection results with intelligent lighting fusion."""
        all_detections = []

        # Score and collect YOLO detections.
        for det in yolo_results:
            det['attention_score'] = self._calculate_attention_score(det)
            all_detections.append(det)

        # Classify unknown salient regions with CLIP.
        for region in unknown_regions:
            if 'image' not in region:
                continue

            classification = self.clip_manager.classify_hierarchical(region['image'])
            all_detections.append({
                'class_name': classification['top_prediction'],
                'bbox': region['bbox'],
                'confidence': classification.get('confidence', 0.5),
                'attention_score': region.get('saliency_score', 0.5),
                'source': 'openclip'
            })

        # Rank everything by attention score, highest first.
        ranked_detections = sorted(
            all_detections,
            key=lambda x: x['attention_score'],
            reverse=True
        )

        # Keep the top 15; past the cap, only confident brand detections get
        # through, and the first non-brand detection ends the scan.
        filtered = []
        for det in ranked_detections:
            if len(filtered) >= 15:
                if det.get('brand') and det.get('brand_confidence', 0) > 0.45:
                    filtered.append(det)
                else:
                    break
            else:
                filtered.append(det)

        # `if image` is ambiguous for numpy arrays, so compare against None.
        composition = self.analyze_composition(image, filtered) if image is not None else {}

        # Merge the fused lighting verdict back into the scene info.
        if cv_lighting:
            fused_lighting = self.fuse_lighting_analysis(cv_lighting, scene_info)
            scene_info['lighting'] = {
                'top': fused_lighting['lighting_type'],
                'confidence': fused_lighting['confidence'],
                'fusion_details': fused_lighting
            }

        return {
            'detections': filtered,
            'scene_info': scene_info,
            'composition': composition,
            'total_objects': len(all_detections)
        }
|
    def _calculate_attention_score(self, detection: Dict) -> float:
        """Calculate an attention score from position, size, and confidence."""
        x1, y1, x2, y2 = detection['bbox']

        center_x = (x1 + x2) / 2
        center_y = (y1 + y2) / 2

        # Heuristic: x2 > 100 is taken as a sign the bbox is in pixel
        # coordinates rather than normalized [0, 1] ones; without the image
        # size, centrality can't be computed, so use a neutral score.
        if x2 > 100:
            position_score = 0.5
        else:
            position_score = 1.0 - (abs(center_x - 0.5) + abs(center_y - 0.5))

        area = abs((x2 - x1) * (y2 - y1))
        if x2 > 100:
            # Roughly normalize pixel areas against a nominal 1000x1000 frame.
            area = area / (1000 * 1000)
        size_score = min(area, 0.5)

        conf_score = detection.get('confidence', 0.5)

        # Weighted blend: confidence matters most, then position and size.
        attention = (
            0.3 * position_score +
            0.3 * size_score +
            0.4 * conf_score
        )

        return attention
|
print("✓ DetectionFusionManager (V2 with intelligent fusion) defined") |