# Pixcribe / detection_fusion_manager.py
from typing import Dict, List, Optional
class DetectionFusionManager:
"""Integrate and prioritize detection results with intelligent lighting fusion"""
def __init__(self, clip_manager):
self.clip_manager = clip_manager
def fuse_lighting_analysis(self, cv_lighting: Dict, clip_scene: Dict) -> Dict:
"""Intelligently fuse CV+Places365 lighting with CLIP scene understanding"""
cv_lighting_type = cv_lighting.get('lighting_type', 'soft diffused light')
cv_confidence = cv_lighting.get('confidence', 0.7)
cv_features = cv_lighting.get('cv_features', {})
# Get CLIP's lighting prediction
clip_lighting_data = clip_scene.get('lighting', {})
clip_lighting_type = clip_lighting_data.get('top', 'natural light')
clip_confidence = clip_lighting_data.get('confidence', 0.5)
# Intelligent fusion strategy:
# 1. If CV has high confidence (>0.85), trust it
# 2. If CV and CLIP semantically agree, boost confidence
# 3. Otherwise, weighted average based on confidence
if cv_confidence > 0.85:
# High confidence from CV+Places365
final_lighting = cv_lighting_type
final_confidence = cv_confidence
fusion_method = 'cv_dominant'
elif self._lighting_semantically_similar(cv_lighting_type, clip_lighting_type):
# Semantic agreement between CV and CLIP
final_lighting = cv_lighting_type # Prefer CV's specific description
# Boost confidence when both agree
final_confidence = min(cv_confidence * 1.15, 0.95)
fusion_method = 'consensus'
else:
            # Weighted fusion based on confidence (guarding against a zero denominator)
            total_confidence = cv_confidence + clip_confidence
            cv_weight = cv_confidence / total_confidence if total_confidence > 0 else 0.5
            clip_weight = 1.0 - cv_weight
# If CV weight is higher, use CV result
if cv_weight > 0.6:
final_lighting = cv_lighting_type
final_confidence = cv_confidence * 0.9
fusion_method = 'cv_weighted'
else:
# Use more generic description when uncertain
final_lighting = self._generalize_lighting_description(
cv_lighting_type, clip_lighting_type, cv_features
)
final_confidence = (cv_confidence * cv_weight + clip_confidence * clip_weight) * 0.85
fusion_method = 'generalized'
return {
'lighting_type': final_lighting,
'confidence': min(final_confidence, 0.95),
'cv_analysis': cv_lighting_type,
'clip_prediction': clip_lighting_type,
'fusion_method': fusion_method,
'cv_confidence': cv_confidence,
'clip_confidence': clip_confidence
}
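    # Worked example (hypothetical values): with cv_confidence = 0.7 and a
    # CLIP prediction of 'overcast light' at confidence 0.5, the two
    # descriptions fall in the same semantic group, so fusion_method becomes
    # 'consensus' and the fused confidence is min(0.7 * 1.15, 0.95) = 0.805.
    # Had CV reported 0.9 instead, the 'cv_dominant' branch would have
    # returned the CV result unchanged.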
def _lighting_semantically_similar(self, cv_type: str, clip_type: str) -> bool:
"""Check if two lighting descriptions are semantically similar"""
# Define semantic similarity groups
similarity_groups = [
{'soft', 'diffused', 'overcast', 'cloudy'},
{'bright', 'sunny', 'sunlight', 'clear'},
{'warm', 'golden', 'amber', 'evening'},
{'natural', 'daylight', 'outdoor'},
{'cool', 'blue', 'twilight'},
]
cv_words = set(cv_type.lower().split())
clip_words = set(clip_type.lower().split())
# Check if both descriptions share words from same semantic group
for group in similarity_groups:
cv_match = cv_words & group
clip_match = clip_words & group
if cv_match and clip_match:
return True
        # Direct word overlap, ignoring the generic token 'light', which
        # appears in almost every description and would otherwise make any
        # two lighting types count as similar
        common_words = (cv_words & clip_words) - {'light', 'lighting'}
        return len(common_words) >= 1
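    # Example: 'soft diffused light' vs. 'overcast light' intersect the
    # {'soft', 'diffused', 'overcast', 'cloudy'} group on both sides, so they
    # are treated as similar even though they share no specific word.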
def _generalize_lighting_description(self, cv_type: str, clip_type: str,
cv_features: Dict) -> str:
"""Generate a generalized lighting description when CV and CLIP disagree"""
brightness = cv_features.get('brightness', 128)
contrast = cv_features.get('contrast', 50)
color_temp = cv_features.get('color_temp', 1.0)
        # Normalize raw features before thresholding
        brightness_norm = brightness / 255.0
        contrast_norm = min(contrast / 100.0, 1.0)
        # Simple decision tree over the normalized physical features
if contrast_norm < 0.5:
# Low contrast
if color_temp < 1.0:
return 'soft diffused light'
else:
return 'warm ambient light'
elif brightness_norm > 0.7:
# High brightness
return 'natural daylight'
elif color_temp > 1.1:
# Warm temperature
return 'warm ambient light'
else:
# Default safe description
return 'soft diffused light'
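    # Illustrative mappings (brightness is raw 0-255, contrast roughly 0-100):
    #   contrast 30, color_temp 0.9  -> 'soft diffused light' (flat, cool)
    #   brightness 200, contrast 60  -> 'natural daylight' (bright scene)
    #   brightness 128, contrast 60, color_temp 1.2 -> 'warm ambient light'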
    def analyze_composition(self, image, detections: List[Dict]) -> Dict:
        """Analyze image composition from detection geometry (the image itself is not inspected)"""
if not detections:
return {'composition_type': 'empty', 'vertical_ratio': 0.0}
# Calculate vertical element ratio
vertical_objects = [
d for d in detections
if (d['bbox'][3] - d['bbox'][1]) > (d['bbox'][2] - d['bbox'][0])
]
vertical_ratio = len(vertical_objects) / max(len(detections), 1)
# Determine composition type
if vertical_ratio > 0.6:
composition_type = 'urban canyon'
elif vertical_ratio > 0.4:
composition_type = 'vertical emphasis'
else:
composition_type = 'standard street view'
return {
'composition_type': composition_type,
'vertical_ratio': vertical_ratio,
'vertical_objects_count': len(vertical_objects),
'total_objects': len(detections)
}
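    # Example: with bboxes as [x1, y1, x2, y2], a detection spanning
    # [0.1, 0.0, 0.3, 0.9] is taller (0.9) than wide (0.2) and counts as
    # vertical; if 3 of 4 detections are vertical, the ratio 0.75 exceeds
    # 0.6 and the composition is classified as 'urban canyon'.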
    def fuse_detections(self, yolo_results: List[Dict], unknown_regions: List[Dict],
                        scene_info: Dict, image=None,
                        cv_lighting: Optional[Dict] = None) -> Dict:
"""Fuse all detection results with intelligent lighting fusion"""
all_detections = []
# Process YOLO detections with attention scores
for det in yolo_results:
attention_score = self._calculate_attention_score(det)
det['attention_score'] = attention_score
all_detections.append(det)
# Classify unknown regions using OpenCLIP
for region in unknown_regions:
if 'image' not in region:
continue
classification = self.clip_manager.classify_hierarchical(region['image'])
detection = {
'class_name': classification['top_prediction'],
'bbox': region['bbox'],
'confidence': classification.get('confidence', 0.5),
'attention_score': region.get('saliency_score', 0.5),
'source': 'openclip'
}
all_detections.append(detection)
# Sort by attention score
ranked_detections = sorted(
all_detections,
key=lambda x: x['attention_score'],
reverse=True
)
        # Keep the top 15 by attention score; past the cap, only admit
        # detections with a confidently recognized brand, and stop at the
        # first detection that has none (the list is sorted, so everything
        # after it ranks lower anyway)
        filtered = []
        for det in ranked_detections:
            if len(filtered) >= 15:
                if det.get('brand') and det.get('brand_confidence', 0) > 0.45:
                    filtered.append(det)
                else:
                    break
            else:
                filtered.append(det)
# Analyze composition
composition = self.analyze_composition(image, filtered) if image else {}
# Intelligent lighting fusion
if cv_lighting:
fused_lighting = self.fuse_lighting_analysis(cv_lighting, scene_info)
# Update scene_info with fused lighting
scene_info['lighting'] = {
'top': fused_lighting['lighting_type'],
'confidence': fused_lighting['confidence'],
'fusion_details': fused_lighting
}
return {
'detections': filtered,
'scene_info': scene_info,
'composition': composition,
'total_objects': len(all_detections)
}
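    # Flow summary: YOLO detections get attention scores, CLIP-classified
    # unknown regions are appended, everything is ranked and capped, and
    # scene_info['lighting'] is overwritten in place with the fused CV+CLIP
    # estimate whenever cv_lighting is supplied.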
    def _calculate_attention_score(self, detection: Dict) -> float:
        """Calculate attention score based on position, size, and confidence"""
        bbox = detection['bbox']
        x1, y1, x2, y2 = bbox
        center_x = (x1 + x2) / 2
        center_y = (y1 + y2) / 2
        # Heuristic: coordinates above 100 are assumed to be pixels rather
        # than normalized [0, 1] values; without the image size, centrality
        # cannot be judged, so a neutral position score is used.
        if x2 > 100:
            position_score = 0.5
        else:
            # Closer to the image center -> higher position score
            position_score = 1.0 - (abs(center_x - 0.5) + abs(center_y - 0.5))
        area = abs((x2 - x1) * (y2 - y1))
        if x2 > 100:
            # Normalize a pixel-space area against an assumed ~1000x1000 frame
            area = area / (1000 * 1000)
        size_score = min(area, 0.5)
        conf_score = detection.get('confidence', 0.5)
        # Weighted combination: position and size at 30% each, confidence 40%
        attention = (
            0.3 * position_score +
            0.3 * size_score +
            0.4 * conf_score
        )
        return attention
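    # Worked example with a normalized bbox [0.4, 0.6, 0.7, 0.8] at
    # confidence 0.9: position = 1.0 - (|0.55 - 0.5| + |0.7 - 0.5|) = 0.75,
    # size = min(0.3 * 0.2, 0.5) = 0.06, so
    # attention = 0.3 * 0.75 + 0.3 * 0.06 + 0.4 * 0.9 = 0.603.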
print("✓ DetectionFusionManager (V2 with intelligent fusion) defined")