import logging
import traceback
from typing import Dict, List, Tuple, Optional, Any

from scene_type import SCENE_TYPES


class SceneScoringEngine:
    """
    Scene-scoring logic: per-scene scores derived from YOLO detections,
    fusion of YOLO / CLIP / Places365 scores, and selection of the final
    scene type.
    """

    # Everyday scene types that receive more lenient scoring rules.
    EVERYDAY_SCENE_TYPE_KEYS = [
        "general_indoor_space",
        "generic_street_view",
        "desk_area_workspace",
        "outdoor_gathering_spot",
        "kitchen_counter_or_utility_area",
    ]

    def __init__(self, scene_types: Dict[str, Any], enable_landmark: bool = True):
        """
        Initialise the scoring engine.

        Args:
            scene_types: Scene type definitions keyed by scene type name.
            enable_landmark: Whether landmark scene types may receive a score.
        """
        self.logger = logging.getLogger(__name__)
        self.scene_types = scene_types
        self.enable_landmark = enable_landmark

    @staticmethod
    def _contains_any(text: str, keywords) -> bool:
        """Return True when any of ``keywords`` is a substring of ``text``."""
        return any(keyword in text for keyword in keywords)

    def compute_scene_scores(self, detected_objects: List[Dict],
                             spatial_analysis_results: Optional[Dict] = None) -> Dict[str, float]:
        """
        Compute a confidence score per scene type from the detected objects.

        The score combines the ratio of matched required objects (weight 0.55),
        the ratio of matched optional objects (weight 0.25), a richness bonus
        (at most 0.10), a spatial cohesion bonus (see
        ``_get_object_spatial_cohesion_score``) and a multiple-instance bonus
        (at most 0.075). The result is multiplied by the scene's ``priority``
        when defined and clamped to [0, 1].

        Args:
            detected_objects: Detected objects; each dict must contain
                ``class_id``.
            spatial_analysis_results: Optional spatial analysis output; only
                its ``objects_by_region`` entry is read (by the cohesion
                helper).

        Returns:
            Mapping of scene type to score. Every scene type in
            ``self.scene_types`` is present in the result.
        """
        scene_scores = {}
        if not detected_objects:
            for scene_type_key in self.scene_types:
                scene_scores[scene_type_key] = 0.0
            return scene_scores

        # One pass over the detections: instance count per class and the first
        # detected instance of every class (used for the cohesion check).
        class_counts = {}
        first_instance_by_class = {}
        for obj in detected_objects:
            class_id = obj["class_id"]
            class_counts[class_id] = class_counts.get(class_id, 0) + 1
            first_instance_by_class.setdefault(class_id, obj)
        detected_classes = set(class_counts)

        # Base weights: 55% required objects, 25% optional objects,
        # up to 10% richness; the cohesion helper adds up to 0.1 more.
        required_weight = 0.55
        optional_weight = 0.25
        richness_bonus_max = 0.10

        for scene_type, scene_def in self.scene_types.items():
            required_ids = set(scene_def.get("required_objects", []))
            # Keep the definition order (de-duplicated) so that "the first
            # half of the optional objects" below is well defined; slicing a
            # set would pick an arbitrary half.
            optional_ids_in_order = list(dict.fromkeys(scene_def.get("optional_objects", [])))
            optional_ids = set(optional_ids_in_order)
            min_required = scene_def.get("minimum_required", 0)

            # Actual detection dicts (first instance per class), not just ids.
            required_found = [first_instance_by_class[class_id]
                              for class_id in required_ids
                              if class_id in first_instance_by_class]
            optional_found = [first_instance_by_class[class_id]
                              for class_id in optional_ids
                              if class_id in first_instance_by_class]
            num_required_found = len(required_found)
            num_optional_found = len(optional_found)

            current_scene_score = 0.0
            cohesion_candidates = []

            if num_required_found >= min_required:
                if required_ids:
                    required_ratio = num_required_found / len(required_ids)
                else:
                    # No required objects defined: full credit only when
                    # nothing was required.
                    required_ratio = 1.0 if min_required == 0 else 0.0

                current_scene_score = required_ratio * required_weight
                cohesion_candidates.extend(required_found)

                if optional_ids:
                    optional_ratio = num_optional_found / len(optional_ids)
                    current_scene_score += optional_ratio * optional_weight
                    cohesion_candidates.extend(optional_found)
            elif scene_type in self.EVERYDAY_SCENE_TYPE_KEYS:
                # Everyday scenes stay weak candidates when at least 25% of
                # their optional objects are present, even though the strict
                # minimum_required condition failed.
                if optional_ids and (num_optional_found / len(optional_ids)) >= 0.25:
                    current_scene_score = (
                        (num_optional_found / len(optional_ids))
                        * (required_weight + optional_weight * 0.5)
                    )
                    cohesion_candidates.extend(optional_found)
                else:
                    scene_scores[scene_type] = 0.0
                    continue
            else:
                # Non-everyday scene that misses minimum_required.
                scene_scores[scene_type] = 0.0
                continue

            # Richness bonus: share of the scene's defined classes that were
            # detected, capped at richness_bonus_max.
            relevant_ids = required_ids | optional_ids
            if relevant_ids:
                richness_ratio = len(relevant_ids & detected_classes) / len(relevant_ids)
                current_scene_score += min(richness_bonus_max, richness_ratio * 0.15)

            # Spatial cohesion bonus, only when spatial results are available.
            if spatial_analysis_results and cohesion_candidates:
                current_scene_score += self._get_object_spatial_cohesion_score(
                    cohesion_candidates,
                    spatial_analysis_results
                )

            # Multiple-instance bonus for the scene's key classes. For
            # everyday scenes the first half of the optional objects (in
            # definition order) also counts as key, e.g. several chairs in
            # "general_indoor_space".
            key_ids = required_ids
            if scene_type in self.EVERYDAY_SCENE_TYPE_KEYS and optional_ids:
                half = max(1, len(optional_ids) // 2)
                key_ids = key_ids.union(optional_ids_in_order[:half])

            multiple_instance_bonus = 0.0
            for class_id in key_ids:
                if class_counts.get(class_id, 0) > 1:
                    multiple_instance_bonus += 0.025
            current_scene_score += min(0.075, multiple_instance_bonus)

            # Scene-specific priority multiplier from the scene definition.
            if "priority" in scene_def:
                current_scene_score *= scene_def["priority"]

            scene_scores[scene_type] = min(1.0, max(0.0, current_scene_score))

        # With landmark detection disabled, landmark scene types score zero.
        if not self.enable_landmark:
            for lm_scene_type in ("tourist_landmark", "natural_landmark", "historical_monument"):
                if lm_scene_type in scene_scores:
                    scene_scores[lm_scene_type] = 0.0

        return scene_scores

    def _get_object_spatial_cohesion_score(self, objects_for_scene: List[Dict],
                                           spatial_analysis_results: Optional[Dict]) -> float:
        """
        Heuristic bonus for how concentrated a scene's key objects are.

        The fewer regions contain the key object classes, the higher the bonus.

        Args:
            objects_for_scene: Detected objects relevant to the scene being
                scored; entries without a ``class_id`` are ignored when
                collecting key classes.
            spatial_analysis_results: Dict whose ``objects_by_region`` entry
                maps a region name to a list of object dicts carrying
                ``class_id``.

        Returns:
            0.10, 0.05, 0.02 or 0.0 depending on how many regions hold key
            objects and how many key instances were found in them.
        """
        if (not objects_for_scene or not spatial_analysis_results or
                "objects_by_region" not in spatial_analysis_results or
                not spatial_analysis_results["objects_by_region"]):
            return 0.0

        key_class_ids = {obj.get('class_id') for obj in objects_for_scene
                         if obj.get('class_id') is not None}
        if not key_class_ids:
            return 0.0

        # Regions that contain at least one key object, and the total number
        # of key object instances across all regions.
        regions_with_key_objects = set()
        instances_found = 0
        for region_name, region_objects in spatial_analysis_results["objects_by_region"].items():
            matches = sum(1 for region_obj in region_objects
                          if region_obj.get('class_id') in key_class_ids)
            if matches:
                regions_with_key_objects.add(region_name)
                instances_found += matches

        instances_passed = len(objects_for_scene)
        if not regions_with_key_objects or instances_passed == 0:
            return 0.0

        region_count = len(regions_with_key_objects)
        if region_count == 1 and instances_found >= instances_passed * 0.75:
            return 0.10  # strongest cohesion: a single region
        if region_count <= 2 and instances_found >= instances_passed * 0.60:
            return 0.05  # medium cohesion: at most two regions
        if region_count <= 3 and instances_found >= instances_passed * 0.50:
            return 0.02  # weak cohesion: at most three regions
        return 0.0

    def determine_scene_type(self, scene_scores: Dict[str, float]) -> Tuple[str, float]:
        """
        Pick the most likely scene type from the scores.

        When the summed scores of "tourist_landmark", "historical_monument"
        and "natural_landmark" reach 0.3, "tourist_landmark" is returned with
        that sum (capped at 1.0) as its confidence.

        Args:
            scene_scores: Mapping of scene type to score.

        Returns:
            ``(best_scene_type, confidence)``; ``("unknown", 0.0)`` for empty
            input.
        """
        self.logger.debug("determine_scene_type input scores: %s", scene_scores)
        if not scene_scores:
            return "unknown", 0.0

        landmark_score = (
            scene_scores.get("tourist_landmark", 0.0) +
            scene_scores.get("historical_monument", 0.0) +
            scene_scores.get("natural_landmark", 0.0)
        )
        if landmark_score >= 0.3:
            # The sum of three scores can exceed 1.0; cap it so the returned
            # confidence stays within [0, 1].
            return "tourist_landmark", float(min(1.0, landmark_score))

        best_scene = max(scene_scores, key=scene_scores.get)
        best_score = scene_scores[best_scene]
        self.logger.debug("determine_scene_type result: scene=%s, score=%s", best_scene, best_score)
        return best_scene, float(best_score)

    def fuse_scene_scores(self, yolo_scene_scores: Dict[str, float],
                          clip_scene_scores: Dict[str, float],
                          num_yolo_detections: int = 0,
                          avg_yolo_confidence: float = 0.0,
                          lighting_info: Optional[Dict] = None,
                          places365_info: Optional[Dict] = None) -> Dict[str, float]:
        """
        Fuse scene scores from YOLO, CLIP and Places365.

        Per-scene weights depend on the scene type, on how rich the YOLO
        detections are and on the Places365 confidence. The weighted sum is
        then nudged up or down by the indoor/outdoor decision (lighting
        analysis, overridden by Places365 when its confidence is >= 0.8).

        Args:
            yolo_scene_scores: Scene scores from YOLO detections; ``None`` is
                treated as empty.
            clip_scene_scores: Scene scores from CLIP; ``None`` is treated as
                empty.
            num_yolo_detections: Number of YOLO detections considered.
            avg_yolo_confidence: Average confidence of those detections.
            lighting_info: Optional dict; ``is_indoor`` and ``confidence`` are
                read.
            places365_info: Optional dict; ``mapped_scene_type``,
                ``confidence`` and ``is_indoor`` are read.

        Returns:
            Mapping of scene type to fused score in [0, 1]. With zero or one
            meaningful source, the result is keyed by ``self.scene_types``
            only and contains zeros or that source's raw scores.
        """
        # Callers may pass None for a missing source; normalise so the key
        # union and .get() calls below never fail.
        yolo_scene_scores = yolo_scene_scores or {}
        clip_scene_scores = clip_scene_scores or {}

        # Places365 contributes a single score for its mapped scene type.
        places365_scene_scores_map = {}
        if places365_info and places365_info.get('confidence', 0) > 0.1:
            mapped_scene_type = places365_info.get('mapped_scene_type', 'unknown')
            places365_confidence = places365_info.get('confidence', 0.0)
            if mapped_scene_type in self.scene_types:
                places365_scene_scores_map[mapped_scene_type] = places365_confidence
                self.logger.info("Places365 contributing: %s with confidence %.3f",
                                 mapped_scene_type, places365_confidence)

        # Which sources carry any non-negligible score?
        yolo_has_meaningful_scores = any(s > 1e-5 for s in yolo_scene_scores.values())
        clip_has_meaningful_scores = any(s > 1e-5 for s in clip_scene_scores.values())
        places365_has_meaningful_scores = any(s > 1e-5 for s in places365_scene_scores_map.values())
        meaningful_sources_count = sum([
            yolo_has_meaningful_scores,
            clip_has_meaningful_scores,
            places365_has_meaningful_scores,
        ])

        # No fusion needed with zero or one meaningful source.
        if meaningful_sources_count == 0:
            return {st: 0.0 for st in self.scene_types}
        if meaningful_sources_count == 1:
            if yolo_has_meaningful_scores:
                single_source = yolo_scene_scores
            elif clip_has_meaningful_scores:
                single_source = clip_scene_scores
            else:
                single_source = places365_scene_scores_map
            return {st: single_source.get(st, 0.0) for st in self.scene_types}

        fused_scores = {}
        all_possible_scene_types = set(self.scene_types).union(
            yolo_scene_scores, clip_scene_scores, places365_scene_scores_map
        )

        # Default weights for the three sources.
        default_yolo_weight = 0.5
        default_clip_weight = 0.3
        default_places365_weight = 0.2

        is_lighting_indoor = None
        lighting_analysis_confidence = 0.0
        if lighting_info and isinstance(lighting_info, dict):
            is_lighting_indoor = lighting_info.get("is_indoor")
            # A present-but-None confidence is treated as 0.0.
            lighting_analysis_confidence = lighting_info.get("confidence", 0.0) or 0.0

        # The indoor/outdoor decision does not depend on the scene type, so
        # resolve it once. Places365 overrides lighting at confidence >= 0.8.
        places365_is_indoor = None
        places365_confidence_for_indoor = 0.0
        effective_is_indoor = is_lighting_indoor
        effective_confidence = lighting_analysis_confidence
        places365_overrides_lighting = False
        if places365_info and isinstance(places365_info, dict):
            places365_is_indoor = places365_info.get('is_indoor')
            places365_confidence_for_indoor = places365_info.get('confidence', 0.0)
            if places365_confidence_for_indoor >= 0.8 and places365_is_indoor is not None:
                effective_is_indoor = places365_is_indoor
                effective_confidence = places365_confidence_for_indoor
                places365_overrides_lighting = True

        # Only needed to limit the override debug message to one scene type.
        first_scene_type = min(all_possible_scene_types) if places365_overrides_lighting else None

        apply_indoor_outdoor = effective_is_indoor is not None and effective_confidence >= 0.65
        adjustment = 0.0
        if apply_indoor_outdoor:
            lighting_adjustment_strength = 0.20  # maximum adjustment (20%)
            # Scale by how far the confidence lies above the 0.65 threshold.
            adjustment_scale = (effective_confidence - 0.65) / (1.0 - 0.65)
            adjustment = lighting_adjustment_strength * adjustment_scale
            adjustment = min(lighting_adjustment_strength, max(0, adjustment))

        for scene_type in all_possible_scene_types:
            yolo_score = yolo_scene_scores.get(scene_type, 0.0)
            clip_score = clip_scene_scores.get(scene_type, 0.0)
            places365_score = places365_scene_scores_map.get(scene_type, 0.0)

            current_yolo_weight = default_yolo_weight
            current_clip_weight = default_clip_weight
            current_places365_weight = default_places365_weight

            scene_definition = self.scene_types.get(scene_type, {})
            scene_type_lower = scene_type.lower()
            is_everyday = scene_type in self.EVERYDAY_SCENE_TYPE_KEYS

            # Weight selection by scene nature and YOLO richness.
            if is_everyday:
                if num_yolo_detections >= 5 and avg_yolo_confidence >= 0.45:
                    # Rich YOLO evidence.
                    current_yolo_weight = 0.60
                    current_clip_weight = 0.15
                    current_places365_weight = 0.25
                elif num_yolo_detections >= 3:
                    # Moderate YOLO evidence.
                    current_yolo_weight = 0.50
                    current_clip_weight = 0.20
                    current_places365_weight = 0.30
                else:
                    # Sparse YOLO evidence: lean more on Places365.
                    current_yolo_weight = 0.35
                    current_clip_weight = 0.25
                    current_places365_weight = 0.40
            elif self._contains_any(scene_type_lower,
                                    ["asian", "cultural", "aerial", "landmark", "monument",
                                     "tourist", "natural_landmark", "historical_monument"]):
                # Scenes where CLIP's global understanding is favoured.
                current_yolo_weight = 0.25
                current_clip_weight = 0.65
                current_places365_weight = 0.10
            elif self._contains_any(scene_type_lower,
                                    ["room", "kitchen", "office", "bedroom", "desk_area",
                                     "indoor_space", "professional_kitchen", "cafe", "library",
                                     "gym", "retail_store", "supermarket", "classroom",
                                     "conference_room", "medical_facility",
                                     "educational_setting", "dining_area"]):
                # Common indoor scenes: objects are key, Places365 adds context.
                current_yolo_weight = 0.50
                current_clip_weight = 0.25
                current_places365_weight = 0.25
            elif self._contains_any(scene_type_lower,
                                    ["parking_lot", "park_area", "beach", "harbor", "playground",
                                     "sports_field", "bus_stop", "train_station", "airport"]):
                # Common outdoor scenes: objects still matter.
                current_yolo_weight = 0.50
                current_clip_weight = 0.25
                current_places365_weight = 0.25

            # Landmark detection disabled for this run.
            if not self.enable_landmark:
                if self._contains_any(scene_type_lower, ["landmark", "monument", "tourist"]):
                    yolo_score = 0.0
                    clip_score *= 0.05  # heavy penalty
                    places365_score *= 0.8 if not is_everyday else 1.0
                elif (not is_everyday and
                      not self._contains_any(scene_type_lower, ["asian", "cultural", "aerial"])):
                    # Shift weight from CLIP to YOLO and Places365.
                    weight_boost = 0.05
                    current_yolo_weight = min(0.9, current_yolo_weight + weight_boost)
                    current_places365_weight = min(0.9, current_places365_weight + weight_boost)
                    current_clip_weight = max(0.1, current_clip_weight - weight_boost * 2)

            # Boost Places365 when it is highly confident about this scene
            # type, taking the boost proportionally from the other two weights.
            if places365_score > 0.0 and places365_info:
                places365_original_confidence = places365_info.get('confidence', 0.0)
                if places365_original_confidence > 0.7:
                    boost_factor = min(0.2, (places365_original_confidence - 0.7) * 0.4)
                    current_places365_weight += boost_factor
                    total_other_weight = current_yolo_weight + current_clip_weight
                    if total_other_weight > 0:
                        reduction_factor = boost_factor / total_other_weight
                        current_yolo_weight *= (1 - reduction_factor)
                        current_clip_weight *= (1 - reduction_factor)

            # Normalise the weights so they sum to 1.
            total_weight = current_yolo_weight + current_clip_weight + current_places365_weight
            if total_weight > 0:
                current_yolo_weight /= total_weight
                current_clip_weight /= total_weight
                current_places365_weight /= total_weight
            else:
                current_yolo_weight = 1 / 3
                current_clip_weight = 1 / 3
                current_places365_weight = 1 / 3

            self.logger.debug(
                "Scene %s - yolo_score: %s, clip_score: %s, places365_score: %s, "
                "weights: yolo=%.3f, clip=%.3f, places365=%.3f",
                scene_type, yolo_score, clip_score, places365_score,
                current_yolo_weight, current_clip_weight, current_places365_weight
            )

            fused_score = ((yolo_score * current_yolo_weight)
                           + (clip_score * current_clip_weight)
                           + (places365_score * current_places365_weight))

            # Emit the override message for a single scene type only.
            if places365_overrides_lighting and (
                    scene_type == "intersection" or
                    (scene_type in ["urban_intersection", "street_view"]
                     and scene_type == first_scene_type)):
                self.logger.debug(
                    "Using Places365 indoor/outdoor decision: %s (confidence: %.3f) over lighting analysis",
                    places365_is_indoor, places365_confidence_for_indoor
                )

            if apply_indoor_outdoor:
                # Classify the scene type as indoor/outdoor from its
                # description and name.
                description_lower = scene_definition.get("description", "").lower()
                is_defined_as_indoor = (
                    "indoor" in description_lower or
                    self._contains_any(scene_type_lower,
                                       ["room", "kitchen", "office", "indoor", "library",
                                        "cafe", "gym"])
                )
                is_defined_as_outdoor = (
                    "outdoor" in description_lower or
                    self._contains_any(scene_type_lower,
                                       ["street", "park", "aerial", "beach", "harbor",
                                        "intersection", "crosswalk"])
                )

                # Penalise a mismatch; reward a match with half the strength.
                if effective_is_indoor and is_defined_as_outdoor:
                    fused_score *= (1.0 - adjustment)
                elif not effective_is_indoor and is_defined_as_indoor:
                    fused_score *= (1.0 - adjustment)
                elif effective_is_indoor and is_defined_as_indoor:
                    fused_score = min(1.0, fused_score * (1.0 + adjustment * 0.5))
                elif not effective_is_indoor and is_defined_as_outdoor:
                    fused_score = min(1.0, fused_score * (1.0 + adjustment * 0.5))

            fused_scores[scene_type] = min(1.0, max(0.0, fused_score))

        self.logger.debug("fuse_scene_scores final result: %s", fused_scores)
        return fused_scores

    def update_enable_landmark_status(self, enable_landmark: bool):
        """
        Update whether landmark detection is enabled.

        Args:
            enable_landmark: New value for ``self.enable_landmark``.
        """
        self.enable_landmark = enable_landmark