Upload 31 files
- activity_templates.py +56 -0
- app.py +189 -40
- clip_analyzer.py +316 -62
- clip_prompts.py +1 -1
- clip_zero_shot_classifier.py +1415 -0
- enhance_scene_describer.py +1108 -292
- evaluation_metrics.py +7 -8
- image_processor.py +302 -102
- landmark_activities.py +0 -0
- landmark_data.py +0 -0
- lighting_analyzer.py +0 -0
- lighting_conditions.py +40 -0
- llm_enhancer.py +198 -60
- object_template_fillers.py +7 -2
- places365_model.py +492 -0
- requirements.txt +17 -16
- scene_analyzer.py +0 -0
- scene_description.py +92 -21
- scene_detail_templates.py +15 -0
- scene_type.py +123 -0
- spatial_analyzer.py +589 -138
- video_processor.py +1 -1
activity_templates.py
CHANGED
@@ -320,5 +320,61 @@ ACTIVITY_TEMPLATES = {
         "Chef activities",
         "Commercial food handling",
         "Restaurant meal preparation"
+    ],
+    "tourist_landmark": [
+        "Sightseeing",
+        "Photography",
+        "Guided tours",
+        "Learning about landmark history",
+        "Souvenir shopping",
+        "Cultural appreciation",
+        "Architectural observation"
+    ],
+    "natural_landmark": [
+        "Nature photography",
+        "Scenic viewing",
+        "Hiking",
+        "Nature appreciation",
+        "Wildlife watching",
+        "Outdoor recreation",
+        "Environmental education"
+    ],
+    "historical_monument": [
+        "Historical tours",
+        "Cultural heritage appreciation",
+        "Educational visits",
+        "Historical photography",
+        "Learning about past events",
+        "Architectural study",
+        "Heritage tourism"
+    ],
+    "general_indoor_space": [
+        "Engaging in general indoor activities",
+        "Resting or relaxing in an indoor setting",
+        "Possibly having a conversation or reading"
+    ],
+    "generic_street_view": [
+        "People walking or commuting",
+        "Vehicles driving on the road",
+        "Observing street traffic and urban activity",
+        "Waiting at a crosswalk or bus stop (if applicable objects present)"
+    ],
+    "desk_area_workspace": [
+        "Working on a computer or laptop",
+        "Studying or reading documents",
+        "Writing or taking notes",
+        "Participating in an online meeting (if computer present)"
+    ],
+    "outdoor_gathering_spot": [
+        "People socializing outdoors",
+        "Relaxing on a bench or in a park-like setting",
+        "Engaging in light recreational activities",
+        "Having a picnic (if food items or backpacks are present)"
+    ],
+    "kitchen_counter_or_utility_area": [
+        "Preparing food or drinks",
+        "Using kitchen appliances like a microwave or toaster",
+        "Washing dishes or cleaning",
+        "Storing food items"
     ]
 }
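A minimal usage sketch (not part of the upload) of how a scene-type key might be resolved against the extended ACTIVITY_TEMPLATES dict above; the helper name and the fallback to "general_indoor_space" are illustrative assumptions, not code from the repository.

    # Illustrative only: look up candidate activities for a detected scene type.
    from activity_templates import ACTIVITY_TEMPLATES

    def get_candidate_activities(scene_type: str, limit: int = 3) -> list:
        """Return up to `limit` template activities, falling back to a generic indoor key."""
        activities = ACTIVITY_TEMPLATES.get(scene_type) or ACTIVITY_TEMPLATES.get("general_indoor_space", [])
        return activities[:limit]

    print(get_candidate_activities("tourist_landmark"))
    # e.g. ['Sightseeing', 'Photography', 'Guided tours']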
app.py
CHANGED
@@ -19,8 +19,57 @@ from video_processor import VideoProcessor
 from llm_enhancer import LLMEnhancer

 # Initialize Processors with LLM support
-image_processor =
-video_processor =
+image_processor = None
+video_processor = None
+
+def initialize_processors():
+    global image_processor, video_processor
+
+    try:
+        print("Attempting to initialize ImageProcessor with LLM support...")
+        image_processor = ImageProcessor(use_llm=True, llm_model_path="meta-llama/Llama-3.2-3B-Instruct")
+        print("ImageProcessor initialized successfully with LLM")
+
+        # Diagnostic checks
+        if hasattr(image_processor, 'scene_analyzer'):
+            if image_processor.scene_analyzer is not None:
+                print(f"scene_analyzer initialized: {type(image_processor.scene_analyzer)}")
+                if hasattr(image_processor.scene_analyzer, 'use_llm'):
+                    print(f"scene_analyzer.use_llm available: {image_processor.scene_analyzer.use_llm}")
+            else:
+                print("WARNING: scene_analyzer is None after initialization")
+        else:
+            print("WARNING: scene_analyzer attribute not found in image_processor")
+
+        video_processor = VideoProcessor(image_processor)
+        print("VideoProcessor initialized successfully")
+        return True
+
+    except Exception as e:
+        print(f"Error initializing processors with LLM: {e}")
+        import traceback
+        traceback.print_exc()
+
+        # Create fallback processor without LLM
+        try:
+            print("Attempting fallback initialization without LLM...")
+            image_processor = ImageProcessor(use_llm=False, enable_places365=False)
+            video_processor = VideoProcessor(image_processor)
+            print("Fallback processors initialized successfully without LLM and Places365")
+            return True
+
+        except Exception as fallback_error:
+            print(f"Fatal error: Cannot initialize processors: {fallback_error}")
+            import traceback
+            traceback.print_exc()
+            image_processor = None
+            video_processor = None
+            return False
+
+# Initialize processors
+initialization_success = initialize_processors()
+if not initialization_success:
+    print("WARNING: Failed to initialize processors. Application may not function correctly.")

 # Helper Function
 def get_all_classes():
@@ -58,14 +107,93 @@ def get_all_classes():
     return sorted(default_classes.items())

 @spaces.GPU
-def handle_image_upload(image, model_name, confidence_threshold, filter_classes=None, use_llm=True):
+def handle_image_upload(image, model_name, confidence_threshold, filter_classes=None, use_llm=True, enable_landmark=True):
     """Processes a single uploaded image."""
+    # Enhanced safety check for image_processor
+    if image_processor is None:
+        error_msg = "Image processor is not initialized. Please restart the application or check system dependencies."
+        print(f"ERROR: {error_msg}")
+
+        # Create error plot
+        fig, ax = plt.subplots(figsize=(8, 6))
+        ax.text(0.5, 0.5, "Initialization Error\nProcessor Not Available",
+                color="red", ha="center", va="center", fontsize=14, fontweight="bold")
+        ax.axis('off')
+
+        return (None, error_msg, {}, fig, f"<div style='color: red; font-weight: bold;'>Error: {error_msg}</div>",
+                "<div style='color: red;'>Error: System not initialized</div>",
+                [["System Error"]], [["System Error"]], {}, {"time_of_day": "error", "confidence": 0})
+
+    # Additional safety check for processor attributes
+    if not hasattr(image_processor, 'use_llm'):
+        error_msg = "Image processor is corrupted. Missing required attributes."
+        print(f"ERROR: {error_msg}")
+
+        fig, ax = plt.subplots(figsize=(8, 6))
+        ax.text(0.5, 0.5, "Processor Error\nCorrupted State",
+                color="red", ha="center", va="center", fontsize=14, fontweight="bold")
+        ax.axis('off')
+
+        return (None, error_msg, {}, fig, f"<div style='color: red; font-weight: bold;'>Error: {error_msg}</div>",
+                "<div style='color: red;'>Error: Processor corrupted</div>",
+                [["Processor Error"]], [["Processor Error"]], {}, {"time_of_day": "error", "confidence": 0})
+
+    print(f"DIAGNOSTIC: Image upload handled with enable_landmark={enable_landmark}, use_llm={use_llm}")
+    print(f"Processing image with model: {model_name}, confidence: {confidence_threshold}, use_llm: {use_llm}, enable_landmark: {enable_landmark}")
     try:
         image_processor.use_llm = use_llm
+
+        # Make sure scene_analyzer is not None
+        if hasattr(image_processor, 'scene_analyzer') and image_processor.scene_analyzer is not None:
+            if hasattr(image_processor.scene_analyzer, 'use_llm'):
+                image_processor.scene_analyzer.use_llm = use_llm
+                print(f"Updated existing scene_analyzer use_llm setting to: {use_llm}")
+
+            # Check and set landmark detection
+            if hasattr(image_processor.scene_analyzer, 'use_landmark_detection'):
+                # Set all related flags
+                image_processor.scene_analyzer.use_landmark_detection = enable_landmark
+                image_processor.scene_analyzer.enable_landmark = enable_landmark
+
+                # Make sure the processor itself also carries this option
+                image_processor.enable_landmark = enable_landmark
+
+                # Check and set the deeper components
+                if hasattr(image_processor.scene_analyzer, 'scene_describer') and image_processor.scene_analyzer.scene_describer is not None:
+                    image_processor.scene_analyzer.scene_describer.enable_landmark = enable_landmark
+
+                # Check and set the flag on the CLIP analyzer
+                if hasattr(image_processor.scene_analyzer, 'clip_analyzer') and image_processor.scene_analyzer.clip_analyzer is not None:
+                    if hasattr(image_processor.scene_analyzer.clip_analyzer, 'enable_landmark'):
+                        image_processor.scene_analyzer.clip_analyzer.enable_landmark = enable_landmark
+
+                # Check and set the LLM enhancer
+                if hasattr(image_processor.scene_analyzer, 'llm_enhancer') and image_processor.scene_analyzer.llm_enhancer is not None:
+                    if hasattr(image_processor.scene_analyzer.llm_enhancer, 'enable_landmark'):
+                        image_processor.scene_analyzer.llm_enhancer.enable_landmark = enable_landmark
+                        print(f"Updated LLM enhancer enable_landmark to: {enable_landmark}")
+
+                print(f"Updated all landmark detection settings to: {enable_landmark}")
+        else:
+            print("WARNING: scene_analyzer is None or not available")
+            if hasattr(image_processor, 'enable_landmark'):
+                image_processor.enable_landmark = enable_landmark
+
+            # Set the deeper components as well
+            if hasattr(image_processor.scene_analyzer, 'scene_describer'):
+                image_processor.scene_analyzer.scene_describer.enable_landmark = enable_landmark
+
+            # Set the flag on the CLIP analyzer
+            if hasattr(image_processor.scene_analyzer, 'clip_analyzer'):
+                if hasattr(image_processor.scene_analyzer.clip_analyzer, 'enable_landmark'):
+                    image_processor.scene_analyzer.clip_analyzer.enable_landmark = enable_landmark
+
+            # If there is an LLM enhancer, set it too
+            if hasattr(image_processor.scene_analyzer, 'llm_enhancer'):
+                image_processor.scene_analyzer.llm_enhancer.enable_landmark = enable_landmark
+                print(f"Updated LLM enhancer enable_landmark to: {enable_landmark}")
+
+            print(f"Updated all landmark detection settings to: {enable_landmark}")

         class_ids_to_filter = None
         if filter_classes:
@@ -92,11 +220,13 @@ def handle_image_upload(image, model_name, confidence_threshold, filter_classes=
             print(f"Filtering image results for class IDs: {class_ids_to_filter}")

         # Call the existing image processing logic
+        print(f"DEBUG: app.py passes enable_landmark={enable_landmark} to process_image")
         result_image, result_text, stats = image_processor.process_image(
             image,
             model_name,
             confidence_threshold,
-            class_ids_to_filter
+            class_ids_to_filter,
+            enable_landmark
         )

         # Format stats for JSON display
@@ -191,15 +321,13 @@ def handle_image_upload(image, model_name, confidence_threshold, filter_classes=

         print(f"Original scene description (first 50 chars): {scene_desc[:50]}...")

-        #
+        # determine original description
         clean_scene_desc = clean_description(scene_desc)
         print(f"Cleaned scene description (first 50 chars): {clean_scene_desc[:50]}...")

-        # Even if the cleaned description is empty, make sure the original content is still shown
         if not clean_scene_desc.strip():
             clean_scene_desc = scene_desc

-        # Build the HTML for the original description
         scene_desc_html = f"<div>{clean_scene_desc}</div>"

         # Get the LLM-enhanced description and make sure the default is an empty string rather than None, otherwise a NoneType error occurs
@@ -210,18 +338,18 @@ def handle_image_upload(image, model_name, confidence_threshold, filter_classes=
         if not enhanced_description or not enhanced_description.strip():
             print("WARNING: LLM enhanced description is empty!")

-        #
+        # badge & label
         llm_badge = ""
         description_to_show = ""

+        # Show the original description in the "Original Scene Analysis" accordion
         if use_llm and enhanced_description:
             llm_badge = '<span style="display:inline-block; margin-left:8px; padding:3px 10px; border-radius:12px; background: linear-gradient(90deg, #38b2ac, #4299e1); color:white; font-size:0.7rem; font-weight:bold; box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1); border: 1px solid rgba(255, 255, 255, 0.2);">LLM Enhanced</span>'
             description_to_show = enhanced_description
+
         else:
             llm_badge = '<span style="display:inline-block; margin-left:8px; padding:3px 10px; border-radius:12px; background-color:#718096; color:white; font-size:0.7rem; font-weight:bold;">Basic</span>'
             description_to_show = clean_scene_desc
-            # When LLM is not used, the accordion shows no content

         # A badge is added to the title when the LLM description is shown
         scene_description_html = f'''
@@ -271,7 +399,7 @@ def handle_image_upload(image, model_name, confidence_threshold, filter_classes=
             print("WARNING: LLM enhanced description is empty!")

         return (result_image, result_text, formatted_stats, plot_figure,
-                scene_description_html, original_desc_html,
+                scene_description_html, original_desc_html,
                 activities_list_data, safety_data, zones, lighting)

     except Exception as e:
@@ -471,6 +599,12 @@ def create_interface():
                         info="Provides more detailed and natural language descriptions (may increase processing time)"
                     )

+                    use_landmark_detection = gr.Checkbox(
+                        label="Use CLIP for Landmark Detection",
+                        value=False,
+                        info="Detect famous landmarks, monuments, and tourist attractions that standard object detection cannot recognize (increases processing time)"
+                    )
+
                     with gr.Accordion("Filter Classes", open=False):
                         gr.HTML('<div class="section-heading" style="font-size: 1rem;">Common Categories</div>')
                         with gr.Row():
@@ -490,24 +624,39 @@ def create_interface():
                 with gr.Group(elem_classes="how-to-use"):
                     gr.HTML('<div class="section-heading">How to Use (Image)</div>')
                     gr.Markdown("""
+                    1. Upload an image or use the camera
+                    2. *(Optional)* Adjust settings like confidence threshold or model size (n, m = balanced, x = accurate)
+                    3. In **Analysis Settings**, you can:
+                       * Uncheck **Use LLM** to skip enhanced descriptions (faster)
+                       * Check **Use CLIP for Landmark Detection** to identify famous landmarks like museums, monuments, and tourist attractions *(may take longer)*
+                       * Filter object classes to focus on specific types of objects *(optional)*
+                    4. Click **Analyze Image** button
+
+                    **💡 Tip:** For landmark recognition (e.g. Louvre Museum), make sure to enable **CLIP for Landmark Detection** in the settings above.
                     """)
+
+
                 # Image Examples
                 gr.Examples(
                     examples=[
-                        "room_01.jpg",
-                        "room_02.jpg",
-                        "
-                        "
+                        "/content/drive/Othercomputers/我的 MacBook Pro/Learning/VisionScout/test_images/room_01.jpg",
+                        "/content/drive/Othercomputers/我的 MacBook Pro/Learning/VisionScout/test_images/room_02.jpg",
+                        "/content/drive/Othercomputers/我的 MacBook Pro/Learning/VisionScout/test_images/street_04.jpg",
+                        "/content/drive/Othercomputers/我的 MacBook Pro/Learning/VisionScout/test_images/street_05.jpg",
+                        "/content/drive/Othercomputers/我的 MacBook Pro/Learning/VisionScout/test_images/landmark_Louvre_01.jpg",
                     ],
                     inputs=image_input,
                     label="Example Images"
                 )

+                gr.HTML("""
+                    <div style="text-align: center; margin-top: 8px; padding: 6px; background-color: #f8f9fa; border-radius: 4px; border: 1px solid #e2e8f0;">
+                        <p style="font-size: 12px; color: #718096; margin: 0;">
+                            📷 Sample images sourced from <a href="https://unsplash.com" target="_blank" style="color: #3182ce; text-decoration: underline;">Unsplash</a>
+                        </p>
+                    </div>
+                """)
+
             # Right Column: Image Results
             with gr.Column(scale=6, elem_classes="output-panel"):
                 with gr.Tabs(elem_classes="tabs"):
@@ -540,8 +689,8 @@ def create_interface():
                         </p>
                     </div>
                     ''')
-                    image_scene_description_html = gr.HTML(label=None, elem_id="scene_analysis_description_text")
-
+                    image_scene_description_html = gr.HTML(label=None, elem_id="scene_analysis_description_text")
+
                     # The original description is also shown when the LLM-enhanced description is used
                     with gr.Accordion("Original Scene Analysis", open=False, elem_id="original_scene_analysis_accordion"):
                         image_llm_description = gr.HTML(label=None, elem_id="original_scene_description_text")
@@ -709,7 +858,7 @@ def create_interface():

         image_detect_btn.click(
             fn=handle_image_upload,
-            inputs=[image_input, image_model_dropdown, image_confidence, image_class_filter, use_llm],
+            inputs=[image_input, image_model_dropdown, image_confidence, image_class_filter, use_llm, use_landmark_detection],
             outputs=[
                 image_result_image, image_result_text, image_stats_json, image_plot_output,
                 image_scene_description_html, image_llm_description, image_activities_list, image_safety_list, image_zones_json,
@@ -732,18 +881,18 @@ def create_interface():

         # Footer
         gr.HTML("""
+            <div class="footer" style="padding: 25px 0; text-align: center; background: linear-gradient(to right, #f5f9fc, #e1f5fe); border-top: 1px solid #e2e8f0; margin-top: 30px;">
+                <div style="margin-bottom: 15px;">
+                    <p style="font-size: 14px; color: #4A5568; margin: 5px 0;">Powered by YOLOv8, CLIP, Places365, Meta Llama3.2 and Ultralytics • Created with Gradio</p>
+                </div>
+                <div style="display: flex; align-items: center; justify-content: center; gap: 20px; margin-top: 15px;">
+                    <p style="font-family: 'Arial', sans-serif; font-size: 14px; font-weight: 500; letter-spacing: 2px; background: linear-gradient(90deg, #38b2ac, #4299e1); -webkit-background-clip: text; -webkit-text-fill-color: transparent; margin: 0; text-transform: uppercase; display: inline-block;">EXPLORE THE CODE →</p>
+                    <a href="https://github.com/Eric-Chung-0511/Learning-Record/tree/main/Data%20Science%20Projects/VisionScout" target="_blank" style="text-decoration: none;">
+                        <img src="https://img.shields.io/badge/GitHub-VisionScout-4299e1?logo=github&style=for-the-badge">
+                    </a>
+                </div>
+            </div>
+        """)

     return demo

@@ -751,4 +900,4 @@ def create_interface():
 if __name__ == "__main__":
     demo_interface = create_interface()

-    demo_interface.launch()
+    demo_interface.launch(debug=True)
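The handler above repeats the same hasattr/None pattern to push one flag through several optional sub-components (scene_analyzer, scene_describer, clip_analyzer, llm_enhancer). A minimal sketch of that idea as a generic helper, purely illustrative and not part of this commit; the attribute names in the commented example mirror the ones used in the diff.

    # Illustrative only: walk a chain of optional attributes and set a flag wherever it exists.
    def propagate_flag(root, attr_chain, flag_name, value):
        """Return True if flag_name was set on the object at the end of attr_chain."""
        target = root
        for name in attr_chain:
            target = getattr(target, name, None)
            if target is None:
                return False
        if hasattr(target, flag_name):
            setattr(target, flag_name, value)
            return True
        return False

    # Mirrors: image_processor.scene_analyzer.clip_analyzer.enable_landmark = enable_landmark
    # propagate_flag(image_processor, ["scene_analyzer", "clip_analyzer"], "enable_landmark", True)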
clip_analyzer.py
CHANGED
@@ -20,12 +20,12 @@ class CLIPAnalyzer:
     Use CLIP to integrate scene understanding functionality
     """

-    def __init__(self, model_name: str = "ViT-
+    def __init__(self, model_name: str = "ViT-L/14", device: str = None):
         """
         Initialize the CLIP analyzer.

         Args:
-            model_name: CLIP Model name,
+            model_name: CLIP model name, defaults to "ViT-L/14"
             device: use GPU if available
         """
         # Automatically select the device
@@ -55,49 +55,150 @@ class CLIPAnalyzer:
         self._prepare_text_prompts()

     def _prepare_text_prompts(self):
-        """準備所有文本提示的 CLIP
-        self.
+        """Prepare the CLIP features for all text prompts and store them in self.text_features_cache"""
+        self.text_features_cache = {}
+
+        # Basic scene types (SCENE_TYPE_PROMPTS)
+        if hasattr(self, 'scene_type_prompts') and self.scene_type_prompts:
+            scene_texts = [prompt for scene_type, prompt in self.scene_type_prompts.items()]
+            if scene_texts:
+                self.text_features_cache["scene_type_keys"] = list(self.scene_type_prompts.keys())
+                try:
+                    self.text_features_cache["scene_type_tokens"] = clip.tokenize(scene_texts).to(self.device)
+                except Exception as e:
+                    print(f"Warning: Error tokenizing scene_type_prompts: {e}")
+                    self.text_features_cache["scene_type_tokens"] = None  # mark as failed or empty
+            else:
+                self.text_features_cache["scene_type_keys"] = []
+                self.text_features_cache["scene_type_tokens"] = None
+        else:
+            self.text_features_cache["scene_type_keys"] = []
+            self.text_features_cache["scene_type_tokens"] = None
+
+        # Cultural scenes (CULTURAL_SCENE_PROMPTS)
+        # cultural_tokens_dict stores the tokenized prompts
+        cultural_tokens_dict_val = {}
+        if hasattr(self, 'cultural_scene_prompts') and self.cultural_scene_prompts:
+            for scene_type, prompts in self.cultural_scene_prompts.items():
+                if prompts and isinstance(prompts, list) and all(isinstance(p, str) for p in prompts):
+                    try:
+                        cultural_tokens_dict_val[scene_type] = clip.tokenize(prompts).to(self.device)
+                    except Exception as e:
+                        print(f"Warning: Error tokenizing cultural_scene_prompts for {scene_type}: {e}")
+                        cultural_tokens_dict_val[scene_type] = None  # mark as failed or empty
+                else:
+                    cultural_tokens_dict_val[scene_type] = None  # prompts are not valid
+        self.text_features_cache["cultural_tokens_dict"] = cultural_tokens_dict_val
+
+        # Lighting conditions (LIGHTING_CONDITION_PROMPTS)
+        if hasattr(self, 'lighting_condition_prompts') and self.lighting_condition_prompts:
+            lighting_texts = [prompt for cond, prompt in self.lighting_condition_prompts.items()]
+            if lighting_texts:
+                self.text_features_cache["lighting_condition_keys"] = list(self.lighting_condition_prompts.keys())
+                try:
+                    self.text_features_cache["lighting_tokens"] = clip.tokenize(lighting_texts).to(self.device)
+                except Exception as e:
+                    print(f"Warning: Error tokenizing lighting_condition_prompts: {e}")
+                    self.text_features_cache["lighting_tokens"] = None
+            else:
+                self.text_features_cache["lighting_condition_keys"] = []
+                self.text_features_cache["lighting_tokens"] = None
+        else:
+            self.text_features_cache["lighting_condition_keys"] = []
+            self.text_features_cache["lighting_tokens"] = None
+
+        # Specialized scenes (SPECIALIZED_SCENE_PROMPTS)
+        specialized_tokens_dict_val = {}
+        if hasattr(self, 'specialized_scene_prompts') and self.specialized_scene_prompts:
+            for scene_type, prompts in self.specialized_scene_prompts.items():
+                if prompts and isinstance(prompts, list) and all(isinstance(p, str) for p in prompts):
+                    try:
+                        specialized_tokens_dict_val[scene_type] = clip.tokenize(prompts).to(self.device)
+                    except Exception as e:
+                        print(f"Warning: Error tokenizing specialized_scene_prompts for {scene_type}: {e}")
+                        specialized_tokens_dict_val[scene_type] = None
+                else:
+                    specialized_tokens_dict_val[scene_type] = None
+        self.text_features_cache["specialized_tokens_dict"] = specialized_tokens_dict_val
+
+        # Viewpoints (VIEWPOINT_PROMPTS)
+        if hasattr(self, 'viewpoint_prompts') and self.viewpoint_prompts:
+            viewpoint_texts = [prompt for viewpoint, prompt in self.viewpoint_prompts.items()]
+            if viewpoint_texts:
+                self.text_features_cache["viewpoint_keys"] = list(self.viewpoint_prompts.keys())
+                try:
+                    self.text_features_cache["viewpoint_tokens"] = clip.tokenize(viewpoint_texts).to(self.device)
+                except Exception as e:
+                    print(f"Warning: Error tokenizing viewpoint_prompts: {e}")
+                    self.text_features_cache["viewpoint_tokens"] = None
+            else:
+                self.text_features_cache["viewpoint_keys"] = []
+                self.text_features_cache["viewpoint_tokens"] = None
+        else:
+            self.text_features_cache["viewpoint_keys"] = []
+            self.text_features_cache["viewpoint_tokens"] = None
+
+        # Object combinations (OBJECT_COMBINATION_PROMPTS)
+        if hasattr(self, 'object_combination_prompts') and self.object_combination_prompts:
+            object_combination_texts = [prompt for combo, prompt in self.object_combination_prompts.items()]
+            if object_combination_texts:
+                self.text_features_cache["object_combination_keys"] = list(self.object_combination_prompts.keys())
+                try:
+                    self.text_features_cache["object_combination_tokens"] = clip.tokenize(object_combination_texts).to(self.device)
+                except Exception as e:
+                    print(f"Warning: Error tokenizing object_combination_prompts: {e}")
+                    self.text_features_cache["object_combination_tokens"] = None
+            else:
+                self.text_features_cache["object_combination_keys"] = []
+                self.text_features_cache["object_combination_tokens"] = None
+        else:
+            self.text_features_cache["object_combination_keys"] = []
+            self.text_features_cache["object_combination_tokens"] = None
+
+        # Activities (ACTIVITY_PROMPTS)
+        if hasattr(self, 'activity_prompts') and self.activity_prompts:
+            activity_texts = [prompt for activity, prompt in self.activity_prompts.items()]
+            if activity_texts:
+                self.text_features_cache["activity_keys"] = list(self.activity_prompts.keys())
+                try:
+                    self.text_features_cache["activity_tokens"] = clip.tokenize(activity_texts).to(self.device)
+                except Exception as e:
+                    print(f"Warning: Error tokenizing activity_prompts: {e}")
+                    self.text_features_cache["activity_tokens"] = None
+            else:
+                self.text_features_cache["activity_keys"] = []
+                self.text_features_cache["activity_tokens"] = None
+        else:
+            self.text_features_cache["activity_keys"] = []
+            self.text_features_cache["activity_tokens"] = None
+
+        self.scene_type_tokens = self.text_features_cache["scene_type_tokens"]
+        self.lighting_tokens = self.text_features_cache["lighting_tokens"]
+        self.viewpoint_tokens = self.text_features_cache["viewpoint_tokens"]
+        self.object_combination_tokens = self.text_features_cache["object_combination_tokens"]
+        self.activity_tokens = self.text_features_cache["activity_tokens"]
+        self.cultural_tokens_dict = self.text_features_cache["cultural_tokens_dict"]
+        self.specialized_tokens_dict = self.text_features_cache["specialized_tokens_dict"]
+
+        print("CLIP text_features_cache prepared.")
+
+    def analyze_image(self, image, include_cultural_analysis=True, exclude_categories=None, enable_landmark=True, places365_guidance=None):
         """
         Analyze an image and predict its scene type and lighting conditions.

         Args:
             image: input image (PIL Image or numpy array)
             include_cultural_analysis: whether to include detailed analysis of cultural scenes
+            exclude_categories: list of categories to exclude
+            enable_landmark: whether to enable landmark detection
+            places365_guidance: scene guidance provided by Places365 (optional)
+

         Returns:
             Dict: analysis results containing the scene-type predictions and lighting conditions
         """
         try:
+            self.enable_landmark = enable_landmark  # update the instance's enable_landmark state
             # Make sure the image is in PIL format
             if not isinstance(image, Image.Image):
                 if isinstance(image, np.ndarray):
@@ -113,46 +214,127 @@ class CLIPAnalyzer:
             image_features = self.model.encode_image(image_input)
             image_features = image_features / image_features.norm(dim=-1, keepdim=True)

+            places365_focus_areas = []
+            places365_scene_context = ""  # stores the scene description provided by Places365
+
+            if places365_guidance and isinstance(places365_guidance, dict) and places365_guidance.get('confidence', 0) > 0.4:
+                mapped_scene = places365_guidance.get('mapped_scene_type', '')
+                scene_label = places365_guidance.get('scene_label', '')
+                # is_indoor = places365_guidance.get('is_indoor', None)  # unused, can stay commented out
+                attributes = places365_guidance.get('attributes', [])
+
+                places365_scene_context = f"Scene identified by Places365 as {scene_label}"  # update the context description
+
+                # Adjust CLIP analysis focus based on Places365 scene type
+                if mapped_scene in ['kitchen', 'dining_area', 'restaurant']:
+                    places365_focus_areas.extend(['food preparation', 'dining setup', 'kitchen appliances'])
+                elif mapped_scene in ['office_workspace', 'educational_setting', 'library', 'conference_room']:
+                    places365_focus_areas.extend(['work environment', 'professional setting', 'learning space', 'study area'])
+                elif mapped_scene in ['retail_store', 'shopping_mall', 'market', 'supermarket']:  # broadened matching
+                    places365_focus_areas.extend(['commercial space', 'shopping environment', 'retail display', 'goods for sale'])
+                elif mapped_scene in ['park_area', 'beach', 'natural_outdoor_area', 'playground', 'sports_field']:  # broadened matching
+                    places365_focus_areas.extend(['outdoor recreation', 'natural environment', 'leisure activity', 'open space'])
+
+                # Add more generic focus areas based on the attributes
+                if isinstance(attributes, list):  # make sure attributes is a list
+                    if 'commercial' in attributes:
+                        places365_focus_areas.append('business activity')
+                    if 'recreational' in attributes:
+                        places365_focus_areas.append('entertainment or leisure')
+                    if 'residential' in attributes:
+                        places365_focus_areas.append('living space')
+
+                # Deduplicate
+                places365_focus_areas = list(set(places365_focus_areas))
+
+                if places365_focus_areas:  # only print when there really are focus areas
+                    print(f"CLIP analysis guided by Places365: {places365_scene_context}, focus areas: {places365_focus_areas}")
+
+            # Analyze the scene type, passing the enable_landmark parameter and the Places365 guidance
+            scene_scores = self._analyze_scene_type(image_features,
+                                                    enable_landmark=self.enable_landmark,  # use the updated instance attribute
+                                                    places365_focus=places365_focus_areas)
+
+            # If landmark detection is disabled, make sure landmark-related categories are excluded
+            current_exclude_categories = list(exclude_categories) if exclude_categories is not None else []
+            if not self.enable_landmark:  # use the updated instance attribute
+                landmark_related_terms = ["landmark", "monument", "tower", "tourist", "attraction", "historical", "famous", "iconic"]
+                for term in landmark_related_terms:
+                    if term not in current_exclude_categories:
+                        current_exclude_categories.append(term)
+
+            if current_exclude_categories:
+                filtered_scores = {}
+                for scene, score in scene_scores.items():
+                    # Check whether the scene key (usually English) contains any excluded term
+                    if not any(cat.lower() in scene.lower() for cat in current_exclude_categories):
+                        filtered_scores[scene] = score
+
+                if filtered_scores:
+                    total_score = sum(filtered_scores.values())
+                    if total_score > 1e-5:  # avoid dividing by zero or a very small number
+                        scene_scores = {k: v / total_score for k, v in filtered_scores.items()}
+                    else:  # if the total is close to zero, keep as-is or set to zero
+                        scene_scores = {k: 0.0 for k in filtered_scores.keys()}  # or scene_scores = filtered_scores
+                else:  # if no scenes remain after filtering
+                    scene_scores = {k: (0.0 if any(cat.lower() in k.lower() for cat in current_exclude_categories) else v) for k, v in scene_scores.items()}
+                    if not any(s > 1e-5 for s in scene_scores.values()):  # if everything is still zero
+                        scene_scores = {"unknown": 1.0}  # provide a default to avoid an empty dict

-            # Analyze the lighting conditions
             lighting_scores = self._analyze_lighting_condition(image_features)
-
-            # Enhanced analysis for cultural scenes
             cultural_analysis = {}
-            if include_cultural_analysis:
-                for
+            if include_cultural_analysis and self.enable_landmark:  # use the updated instance attribute
+                for scene_type_cultural_key in self.text_features_cache.get("cultural_tokens_dict", {}).keys():
+                    # Make sure scene_type_cultural_key is a key in SCENE_TYPE_PROMPTS, or that a mapping exists
+                    if scene_type_cultural_key in scene_scores and scene_scores[scene_type_cultural_key] > 0.2:
+                        cultural_analysis[scene_type_cultural_key] = self._analyze_cultural_scene(
+                            image_features, scene_type_cultural_key
                         )

             specialized_analysis = {}
-            for
-                if
-                    specialized_analysis[
-                        image_features,
+            for scene_type_specialized_key in self.text_features_cache.get("specialized_tokens_dict", {}).keys():
+                if scene_type_specialized_key in scene_scores and scene_scores[scene_type_specialized_key] > 0.2:
+                    specialized_analysis[scene_type_specialized_key] = self._analyze_specialized_scene(
+                        image_features, scene_type_specialized_key
                     )

             viewpoint_scores = self._analyze_viewpoint(image_features)
             object_combination_scores = self._analyze_object_combinations(image_features)
             activity_scores = self._analyze_activities(image_features)

+            if scene_scores:  # make sure scene_scores is not empty
+                top_scene = max(scene_scores.items(), key=lambda x: x[1])
+                # If landmark detection is disabled, double-check that top_scene is not landmark-related
+                if not self.enable_landmark and any(cat.lower() in top_scene[0].lower() for cat in current_exclude_categories):
+                    non_excluded_scores = {k: v for k, v in scene_scores.items() if not any(cat.lower() in k.lower() for cat in current_exclude_categories)}
+                    if non_excluded_scores:
+                        top_scene = max(non_excluded_scores.items(), key=lambda x: x[1])
+                    else:
+                        top_scene = ("unknown", 0.0)  # or another suitable default
+            else:
+                top_scene = ("unknown", 0.0)
+
             result = {
                 "scene_scores": scene_scores,
-                "top_scene":
-                "lighting_condition": max(lighting_scores.items(), key=lambda x: x[1]),
-                "embedding": image_features.cpu().numpy().tolist()[0]
-                "viewpoint": max(viewpoint_scores.items(), key=lambda x: x[1]),
-                "object_combinations": sorted(object_combination_scores.items(), key=lambda x: x[1], reverse=True)[:3],
-                "activities": sorted(activity_scores.items(), key=lambda x: x[1], reverse=True)[:3]
+                "top_scene": top_scene,
+                "lighting_condition": max(lighting_scores.items(), key=lambda x: x[1]) if lighting_scores else ("unknown", 0.0),
+                "embedding": image_features.cpu().numpy().tolist()[0],  # simplified
+                "viewpoint": max(viewpoint_scores.items(), key=lambda x: x[1]) if viewpoint_scores else ("unknown", 0.0),
+                "object_combinations": sorted(object_combination_scores.items(), key=lambda x: x[1], reverse=True)[:3] if object_combination_scores else [],
+                "activities": sorted(activity_scores.items(), key=lambda x: x[1], reverse=True)[:3] if activity_scores else []
             }

-            if
+            if places365_guidance and isinstance(places365_guidance, dict) and places365_focus_areas:  # check that places365_focus_areas was populated
+                result["places365_guidance"] = {
+                    "scene_context": places365_scene_context,
+                    "focus_areas": places365_focus_areas,  # now carries the guidance-based content
+                    "guided_analysis": True,
+                    "original_places365_scene": places365_guidance.get('scene_label', 'N/A'),
+                    "original_places365_confidence": places365_guidance.get('confidence', 0.0)
+                }
+
+            if cultural_analysis and self.enable_landmark:
                 result["cultural_analysis"] = cultural_analysis

             if specialized_analysis:
@@ -164,15 +346,49 @@ class CLIPAnalyzer:
             print(f"Error analyzing image with CLIP: {e}")
             import traceback
             traceback.print_exc()
-            return {"error": str(e)}
+            return {"error": str(e), "scene_scores": {}, "top_scene": ("error", 0.0)}

+    def _analyze_scene_type(self, image_features: torch.Tensor, enable_landmark: bool = True, places365_focus: List[str] = None) -> Dict[str, float]:
+        """
+        Compute the similarity between the image features and each scene type, optionally excluding landmark-related scenes.
+
+        Args:
+            image_features: image features encoded by CLIP
+            enable_landmark: whether landmark recognition is enabled
+
+        Returns:
+            Dict[str, float]: similarity score per scene type
+        """
         with torch.no_grad():
             # Compute the scene-type text features
             text_features = self.model.encode_text(self.scene_type_tokens)
             text_features = text_features / text_features.norm(dim=-1, keepdim=True)

+            # Apply Places365 guidance if available
+            if places365_focus and len(places365_focus) > 0:
+                # Create enhanced prompts that incorporate Places365 guidance
+                enhanced_prompts = []
+                for scene_type in self.scene_type_prompts.keys():
+                    base_prompt = self.scene_type_prompts[scene_type]
+
+                    # Check if this scene type should be emphasized based on Places365 guidance
+                    scene_lower = scene_type.lower()
+                    should_enhance = False
+
+                    for focus_area in places365_focus:
+                        if any(keyword in scene_lower for keyword in focus_area.split()):
+                            should_enhance = True
+                            enhanced_prompts.append(f"{base_prompt} with {focus_area}")
+                            break
+
+                    if not should_enhance:
+                        enhanced_prompts.append(base_prompt)
+
+                # Re-tokenize and encode enhanced prompts
+                enhanced_tokens = clip.tokenize(enhanced_prompts).to(self.device)
+                text_features = self.model.encode_text(enhanced_tokens)
+                text_features = text_features / text_features.norm(dim=-1, keepdim=True)
+
             # Compute the similarity scores
             similarity = (100 * image_features @ text_features.T).softmax(dim=-1)
             similarity = similarity.cpu().numpy()[0] if self.device == "cuda" else similarity.numpy()[0]
@@ -180,7 +396,36 @@ class CLIPAnalyzer:
             # Build the scene-score dictionary
             scene_scores = {}
             for i, scene_type in enumerate(self.scene_type_prompts.keys()):
+                # If landmark detection is disabled, skip landmark-related scene types
+                if not enable_landmark and scene_type in ["tourist_landmark", "natural_landmark", "historical_monument"]:
+                    scene_scores[scene_type] = 0.0  # set landmark scene scores to zero
+                else:
+                    base_score = float(similarity[i])
+
+                    # Apply Places365 guidance boost if applicable
+                    if places365_focus:
+                        scene_lower = scene_type.lower()
+                        boost_factor = 1.0
+
+                        for focus_area in places365_focus:
+                            if any(keyword in scene_lower for keyword in focus_area.split()):
+                                boost_factor = 1.15  # 15% boost for matching scenes
+                                break
+
+                        scene_scores[scene_type] = base_score * boost_factor
+                    else:
+                        scene_scores[scene_type] = base_score
+
+            # If landmark detection is disabled, re-normalize the remaining scene scores
+            if not enable_landmark:
+                # Collect all non-zero scores
+                non_zero_scores = {k: v for k, v in scene_scores.items() if v > 0}
+                if non_zero_scores:
+                    # Compute the total and normalize
+                    total_score = sum(non_zero_scores.values())
+                    if total_score > 0:
+                        for scene_type in non_zero_scores:
+                            scene_scores[scene_type] = non_zero_scores[scene_type] / total_score

             return scene_scores

@@ -388,3 +633,12 @@ class CLIPAnalyzer:
             result[query] = float(similarity[i])

         return result
+
+    def get_clip_instance(self):
+        """
+        Return the initialized CLIP model instance so other modules can reuse it.
+
+        Returns:
+            tuple: (model instance, preprocess function, device name)
+        """
+        return self.model, self.preprocess, self.device
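A minimal usage sketch (not part of the upload) of driving the Places365-guided analysis added above. The guidance keys mirror the ones read in the diff (confidence, mapped_scene_type, scene_label, attributes); the image path is illustrative only, and the guidance threshold of 0.4 is taken from the diff.

    # Illustrative only: call the new analyze_image signature with Places365 guidance.
    from PIL import Image
    from clip_analyzer import CLIPAnalyzer

    analyzer = CLIPAnalyzer()  # defaults to ViT-L/14 per the new __init__ signature
    image = Image.open("test_images/room_01.jpg")  # hypothetical path

    guidance = {
        "confidence": 0.62,               # must exceed 0.4 for the guidance branch to run
        "mapped_scene_type": "kitchen",
        "scene_label": "kitchen",
        "attributes": ["residential"],
    }

    result = analyzer.analyze_image(image, enable_landmark=False, places365_guidance=guidance)
    print(result["top_scene"])
    print(result.get("places365_guidance", {}).get("focus_areas"))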
clip_prompts.py
CHANGED
@@ -137,7 +137,7 @@ COMPARATIVE_PROMPTS = {
     "asian_vs_western_commercial": [
         "An Asian shopping street with vertical signage and compact multi-level shops.",
         "A Western commercial street with horizontal storefronts and wider sidewalks.",
-        "An East Asian retail area with dense signage in Asian scripts and narrow walkways."
+        "An East Asian retail area with dense signage in Asian scripts and narrow walkways."
         "A Western shopping district with uniform building heights and Latin alphabetic signs."
     ],
     "daytime_vs_nighttime": [
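The recoverable text of the removed and added line is identical in this extraction, so the exact one-line change is not visible here. What the surrounding lines do show is that the last two prompts have no comma between them, so Python's implicit string-literal concatenation joins them into a single prompt. A short illustrative sketch of that behavior (not code from the repository):

    # Illustrative only: adjacent string literals with no comma become one list element.
    prompts = [
        "An East Asian retail area with dense signage in Asian scripts and narrow walkways."
        "A Western shopping district with uniform building heights and Latin alphabetic signs."
    ]
    print(len(prompts))  # 1 -- the two literals are concatenated into a single string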
clip_zero_shot_classifier.py
ADDED
@@ -0,0 +1,1415 @@
import torch
import clip
from PIL import Image
import numpy as np
from typing import List, Dict, Tuple, Optional, Union, Any

from landmark_data import ALL_LANDMARKS, get_all_landmark_prompts

class CLIPZeroShotClassifier:
    """
    Zero-shot classification with a CLIP model, focused on recognizing world-famous landmarks.
    Complements YOLO detection by handling landmark buildings that standard object detection cannot identify.
    """
    def __init__(self, model_name: str = "ViT-L/14", device: str = None):
        """
        Initialize the CLIP zero-shot classifier.

        Args:
            model_name: CLIP model name, defaults to "ViT-L/14"
            device: device to run on; None selects automatically
        """
        # Select the device
        if device is None:
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
        else:
            self.device = device

        print(f"Initializing CLIP Zero-Shot Landmark Classifier ({model_name}) on {self.device}")
        try:
            self.model, self.preprocess = clip.load(model_name, device=self.device)
            print(f"Successfully loaded CLIP model")
        except Exception as e:
            print(f"Error loading CLIP model: {e}")
            raise

        # Load the landmark data
        try:
            self.landmark_data = ALL_LANDMARKS
            self.landmark_prompts = get_all_landmark_prompts()
            print(f"Loaded {len(self.landmark_prompts)} landmark prompts for classification")

            # Precompute the landmark text features
            self.landmark_text_features = self._precompute_text_features(self.landmark_prompts)

            # Build a landmark-ID-to-index mapping for fast lookup
            self.landmark_id_to_index = {landmark_id: i for i, landmark_id in enumerate(ALL_LANDMARKS.keys())}

            # Initialize batch-processing parameters
            self.batch_size = 16  # default batch size
            self.confidence_threshold_multipliers = {
                "close_up": 0.9,   # standard threshold for close-up views
                "partial": 0.6,    # partially visible landmarks get a lower threshold
                "distant": 0.5,    # distant views get an even lower threshold
                "full_image": 0.7  # whole-image classification needs a higher threshold
            }

            self.landmark_type_thresholds = {
                "tower": 0.5,       # tower-style buildings need a higher threshold
                "skyscraper": 0.4,  # skyscrapers use a lower threshold
                "building": 0.55,   # generic buildings get a slightly lower threshold
                "monument": 0.5,    # monuments
                "natural": 0.6      # natural landmarks can use a lower threshold
            }

            # Initialize the results cache
            self.results_cache = {}    # keyed by an image hash
            self.cache_max_size = 100  # maximum number of cached items

        except ImportError:
            print("Warning: landmark_data.py not found. Landmark classification will be limited")
            self.landmark_data = {}
            self.landmark_prompts = []
            self.landmark_text_features = None
            self.landmark_id_to_index = {}
            self.results_cache = {}

    def _get_image_hash(self, image):
        """
        Generate a simple hash for an image, used for caching.

        Args:
            image: PIL Image or numpy array

        Returns:
            str: hash value of the image
        """
        if isinstance(image, np.ndarray):
            # For numpy arrays, downsample and compute a simple hash
            small_img = image[::10, ::10] if image.ndim == 3 else image
            return hash(small_img.tobytes())
        else:
            # For PIL images, resize and convert to bytes
            small_img = image.resize((32, 32))
            return hash(small_img.tobytes())

    def _manage_cache(self):
        """
        Keep the results cache within its size limit.
        """
        if len(self.results_cache) > self.cache_max_size:
            oldest_key = next(iter(self.results_cache))
            del self.results_cache[oldest_key]

    def set_batch_size(self, batch_size: int):
        """
        Set the batch size.

        Args:
            batch_size: new batch size
        """
        self.batch_size = max(1, batch_size)
        print(f"Batch size set to {self.batch_size}")

    def adjust_confidence_threshold(self, detection_type: str, multiplier: float):
        """
        Adjust the confidence-threshold multiplier for a specific detection type.

        Args:
            detection_type: detection type ('close_up', 'partial', 'distant', 'full_image')
            multiplier: confidence-threshold multiplier
        """
        if detection_type in self.confidence_threshold_multipliers:
            self.confidence_threshold_multipliers[detection_type] = max(0.1, min(1.5, multiplier))
            print(f"Adjusted confidence threshold multiplier for {detection_type} to {multiplier}")
        else:
            print(f"Unknown detection type: {detection_type}")

    def _precompute_text_features(self, text_prompts: List[str]) -> torch.Tensor:
        """
        Precompute the CLIP features for the text prompts to improve batch-processing efficiency.

        Args:
            text_prompts: list of text prompts

        Returns:
            torch.Tensor: precomputed text features
        """
        if not text_prompts:
            return None

        with torch.no_grad():
            # Process in batches to avoid CUDA memory issues
            batch_size = 128  # Adjust based on GPU memory
            features_list = []

            for i in range(0, len(text_prompts), batch_size):
                batch_prompts = text_prompts[i:i+batch_size]
                text_tokens = clip.tokenize(batch_prompts).to(self.device)
                batch_features = self.model.encode_text(text_tokens)
                batch_features = batch_features / batch_features.norm(dim=-1, keepdim=True)
+
features_list.append(batch_features)
|
155 |
+
|
156 |
+
# Concatenate all batches
|
157 |
+
if len(features_list) > 1:
|
158 |
+
text_features = torch.cat(features_list, dim=0)
|
159 |
+
else:
|
160 |
+
text_features = features_list[0]
|
161 |
+
|
162 |
+
return text_features
|
163 |
+
|
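The precomputed text features above are consumed the same way throughout this file (classify_image_region, classify_batch_regions, search_entire_image): both embeddings are L2-normalised, so their matrix product is a cosine similarity, which is scaled by 100 and softmaxed into a distribution over the landmark prompts. A minimal sketch of that scoring step with random stand-in tensors (illustration only, not part of the uploaded file):

    import torch

    # Stand-ins for one image embedding and five landmark text embeddings.
    image_features = torch.randn(1, 768)
    text_features = torch.randn(5, 768)

    # L2-normalise so the dot product equals cosine similarity.
    image_features = image_features / image_features.norm(dim=-1, keepdim=True)
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)

    # 100x temperature scaling followed by softmax, as in the methods below.
    similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
    best_idx = similarity[0].argmax().item()
    print(best_idx, float(similarity[0, best_idx]))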
164 |
+
def _perform_pyramid_analysis(self,
|
165 |
+
image: Union[Image.Image, np.ndarray],
|
166 |
+
levels: int = 4,
|
167 |
+
base_threshold: float = 0.25,
|
168 |
+
aspect_ratios: List[float] = [1.0, 0.75, 1.5]) -> Dict[str, Any]:
|
169 |
+
"""
|
170 |
+
Performs multi-scale pyramid analysis on the image to improve landmark detection.
|
171 |
+
|
172 |
+
Args:
|
173 |
+
image: Input image
|
174 |
+
levels: Number of pyramid levels
|
175 |
+
base_threshold: Base confidence threshold
|
176 |
+
aspect_ratios: Different aspect ratios to try (for tall buildings vs wide landscapes)
|
177 |
+
|
178 |
+
Returns:
|
179 |
+
Dict: Results of pyramid analysis
|
180 |
+
"""
|
181 |
+
# Ensure image is PIL format
|
182 |
+
if not isinstance(image, Image.Image):
|
183 |
+
if isinstance(image, np.ndarray):
|
184 |
+
image = Image.fromarray(image)
|
185 |
+
else:
|
186 |
+
raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
|
187 |
+
|
188 |
+
width, height = image.size
|
189 |
+
pyramid_results = []
|
190 |
+
|
191 |
+
# 對每個縮放和縱橫比組合進行處理
|
192 |
+
for level in range(levels):
|
193 |
+
# 計算縮放因子
|
194 |
+
scale_factor = 1.0 - (level * 0.2)
|
195 |
+
|
196 |
+
for aspect_ratio in aspect_ratios:
|
197 |
+
# 計算新尺寸,保持面積近似不變
|
198 |
+
if aspect_ratio != 1.0:
|
199 |
+
# 保持面積近似不變的情況下調整縱橫比
|
200 |
+
new_width = int(width * scale_factor * (1/aspect_ratio)**0.5)
|
201 |
+
new_height = int(height * scale_factor * aspect_ratio**0.5)
|
202 |
+
else:
|
203 |
+
new_width = int(width * scale_factor)
|
204 |
+
new_height = int(height * scale_factor)
|
205 |
+
|
206 |
+
# 調整圖像大小
|
207 |
+
scaled_image = image.resize((new_width, new_height), Image.LANCZOS)
|
208 |
+
|
209 |
+
# 預處理圖像
|
210 |
+
image_input = self.preprocess(scaled_image).unsqueeze(0).to(self.device)
|
211 |
+
|
212 |
+
# 獲取圖像特徵
|
213 |
+
with torch.no_grad():
|
214 |
+
image_features = self.model.encode_image(image_input)
|
215 |
+
image_features = image_features / image_features.norm(dim=-1, keepdim=True)
|
216 |
+
|
217 |
+
# 計算相似度
|
218 |
+
similarity = (100.0 * image_features @ self.landmark_text_features.T).softmax(dim=-1)
|
219 |
+
similarity = similarity.cpu().numpy()[0] if self.device == "cuda" else similarity.numpy()[0]
|
220 |
+
|
221 |
+
# 找到最佳匹配
|
222 |
+
best_idx = similarity.argmax().item()
|
223 |
+
best_score = similarity[best_idx]
|
224 |
+
|
225 |
+
if best_score >= base_threshold:
|
226 |
+
landmark_id = list(self.landmark_data.keys())[best_idx]
|
227 |
+
landmark_info = self.landmark_data[landmark_id]
|
228 |
+
|
229 |
+
pyramid_results.append({
|
230 |
+
"landmark_id": landmark_id,
|
231 |
+
"landmark_name": landmark_info["name"],
|
232 |
+
"confidence": float(best_score),
|
233 |
+
"scale_factor": scale_factor,
|
234 |
+
"aspect_ratio": aspect_ratio,
|
235 |
+
"location": landmark_info["location"]
|
236 |
+
})
|
237 |
+
|
238 |
+
# 按置信度排序
|
239 |
+
pyramid_results.sort(key=lambda x: x["confidence"], reverse=True)
|
240 |
+
|
241 |
+
return {
|
242 |
+
"is_landmark": len(pyramid_results) > 0,
|
243 |
+
"results": pyramid_results,
|
244 |
+
"best_result": pyramid_results[0] if pyramid_results else None
|
245 |
+
}
|
246 |
+
|
247 |
+
def _enhance_features(self, image: Union[Image.Image, np.ndarray]) -> Image.Image:
|
248 |
+
"""
|
249 |
+
Enhances image features to improve landmark detection.
|
250 |
+
|
251 |
+
Args:
|
252 |
+
image: Input image
|
253 |
+
|
254 |
+
Returns:
|
255 |
+
PIL.Image: Enhanced image
|
256 |
+
"""
|
257 |
+
# Ensure image is PIL format
|
258 |
+
if not isinstance(image, Image.Image):
|
259 |
+
if isinstance(image, np.ndarray):
|
260 |
+
image = Image.fromarray(image)
|
261 |
+
else:
|
262 |
+
raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
|
263 |
+
|
264 |
+
# Convert to numpy for processing
|
265 |
+
img_array = np.array(image)
|
266 |
+
|
267 |
+
# Skip processing for grayscale images
|
268 |
+
if len(img_array.shape) < 3:
|
269 |
+
return image
|
270 |
+
|
271 |
+
# Apply adaptive contrast enhancement
|
272 |
+
# Convert to LAB color space
|
273 |
+
try:
|
274 |
+
from skimage import color, exposure
|
275 |
+
# Convert to LAB color space
|
276 |
+
if img_array.shape[2] == 4: # Handle RGBA
|
277 |
+
img_array = img_array[:,:,:3]
|
278 |
+
|
279 |
+
lab = color.rgb2lab(img_array[:,:,:3] / 255.0)
|
280 |
+
l_channel = lab[:,:,0]
|
281 |
+
|
282 |
+
# Enhance contrast of L channel
|
283 |
+
p2, p98 = np.percentile(l_channel, (2, 98))
|
284 |
+
l_channel_enhanced = exposure.rescale_intensity(l_channel, in_range=(p2, p98))
|
285 |
+
|
286 |
+
# Replace L channel and convert back to RGB
|
287 |
+
lab[:,:,0] = l_channel_enhanced
|
288 |
+
enhanced_img = color.lab2rgb(lab) * 255.0
|
289 |
+
enhanced_img = enhanced_img.astype(np.uint8)
|
290 |
+
|
291 |
+
return Image.fromarray(enhanced_img)
|
292 |
+
except ImportError:
|
293 |
+
print("Warning: skimage not available for feature enhancement")
|
294 |
+
return image
|
295 |
+
except Exception as e:
|
296 |
+
print(f"Error in feature enhancement: {e}")
|
297 |
+
return image
|
298 |
+
|
299 |
+
def _determine_landmark_type(self, landmark_id):
|
300 |
+
"""
|
301 |
+
自動判斷地標類型,基於地標數據和命名
|
302 |
+
|
303 |
+
Returns:
|
304 |
+
str: 地標類型,用於調整閾值
|
305 |
+
"""
|
306 |
+
if not landmark_id:
|
307 |
+
return "building" # 預設類型
|
308 |
+
|
309 |
+
# 獲取地標詳細數據
|
310 |
+
landmark_data = self.landmark_data if hasattr(self, 'landmark_data') else {}
|
311 |
+
landmark_info = landmark_data.get(landmark_id, {})
|
312 |
+
|
313 |
+
# 獲取地標相關文本
|
314 |
+
landmark_id_lower = landmark_id.lower()
|
315 |
+
landmark_name = landmark_info.get("name", "").lower()
|
316 |
+
landmark_location = landmark_info.get("location", "").lower()
|
317 |
+
landmark_aliases = [alias.lower() for alias in landmark_info.get("aliases", [])]
|
318 |
+
|
319 |
+
# 合併所有文本數據用於特徵判斷
|
320 |
+
combined_text = " ".join([landmark_id_lower, landmark_name] + landmark_aliases)
|
321 |
+
|
322 |
+
# 地標類型的特色特徵
|
323 |
+
type_features = {
|
324 |
+
"skyscraper": ["skyscraper", "tall", "tower", "高樓", "摩天", "大厦", "タワー"],
|
325 |
+
"tower": ["tower", "bell", "clock", "塔", "鐘樓", "タワー", "campanile"],
|
326 |
+
"monument": ["monument", "memorial", "statue", "紀念", "雕像", "像", "memorial"],
|
327 |
+
"natural": ["mountain", "lake", "canyon", "falls", "beach", "山", "湖", "峽谷", "瀑布", "海灘"],
|
328 |
+
"temple": ["temple", "shrine", "寺", "神社", "廟"],
|
329 |
+
"palace": ["palace", "castle", "宮", "城", "皇宮", "宫殿"],
|
330 |
+
"distinctive": ["unique", "leaning", "slanted", "傾斜", "斜", "獨特", "傾く"]
|
331 |
+
}
|
332 |
+
|
333 |
+
# 檢查是否位於亞洲地區
|
334 |
+
asian_regions = ["china", "japan", "korea", "taiwan", "singapore", "vietnam", "thailand",
|
335 |
+
"hong kong", "中國", "日本", "韓國", "台灣", "新加坡", "越南", "泰國", "香港"]
|
336 |
+
is_asian = any(region in landmark_location for region in asian_regions)
|
337 |
+
|
338 |
+
# 判斷地標類型
|
339 |
+
best_type = None
|
340 |
+
max_matches = 0
|
341 |
+
|
342 |
+
for type_name, features in type_features.items():
|
343 |
+
# 計算特徵詞匹配數量
|
344 |
+
matches = sum(1 for feature in features if feature in combined_text)
|
345 |
+
if matches > max_matches:
|
346 |
+
max_matches = matches
|
347 |
+
best_type = type_name
|
348 |
+
|
349 |
+
# 處理亞洲地區特例
|
350 |
+
if is_asian and best_type == "tower":
|
351 |
+
best_type = "skyscraper" # 亞洲地區的塔型建築閾值較低
|
352 |
+
|
353 |
+
# 特例處理:檢測傾斜建築
|
354 |
+
if any(term in combined_text for term in ["leaning", "slanted", "tilt", "inclined", "斜", "傾斜"]):
|
355 |
+
return "distinctive" # 傾斜建築需要特殊處理
|
356 |
+
|
357 |
+
return best_type if best_type and max_matches > 0 else "building" # 預設為一般建築
|
358 |
+
|
359 |
+
def classify_image_region(self,
|
360 |
+
image: Union[Image.Image, np.ndarray],
|
361 |
+
box: List[float],
|
362 |
+
threshold: float = 0.25,
|
363 |
+
detection_type: str = "close_up") -> Dict[str, Any]:
|
364 |
+
"""
|
365 |
+
對圖像的特定區域進行地標分類,具有增強的多尺度和部分識別能力
|
366 |
+
|
367 |
+
Args:
|
368 |
+
image: 原始圖像 (PIL Image 或 numpy數組)
|
369 |
+
box: 邊界框 [x1, y1, x2, y2]
|
370 |
+
threshold: 基礎分類置信度閾值
|
371 |
+
detection_type: 檢測類型,影響置信度調整
|
372 |
+
|
373 |
+
Returns:
|
374 |
+
Dict: 地標分類結果
|
375 |
+
"""
|
376 |
+
# 確保圖像是PIL格式
|
377 |
+
if not isinstance(image, Image.Image):
|
378 |
+
if isinstance(image, np.ndarray):
|
379 |
+
image = Image.fromarray(image)
|
380 |
+
else:
|
381 |
+
raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
|
382 |
+
|
383 |
+
# 生成圖像區域的hash用於快取
|
384 |
+
region_key = (self._get_image_hash(image), tuple(box), detection_type)
|
385 |
+
if region_key in self.results_cache:
|
386 |
+
return self.results_cache[region_key]
|
387 |
+
|
388 |
+
# 裁剪區域
|
389 |
+
x1, y1, x2, y2 = map(int, box)
|
390 |
+
cropped_image = image.crop((x1, y1, x2, y2))
|
391 |
+
enhanced_image = self._enhance_features(cropped_image)
|
392 |
+
|
393 |
+
# 分析視角信息
|
394 |
+
viewpoint_info = self._analyze_viewpoint(enhanced_image)
|
395 |
+
dominant_viewpoint = viewpoint_info["dominant_viewpoint"]
|
396 |
+
|
397 |
+
# 計算區域信息
|
398 |
+
region_width = x2 - x1
|
399 |
+
region_height = y2 - y1
|
400 |
+
image_width, image_height = image.size
|
401 |
+
|
402 |
+
# 根據區域大小判斷可能的檢測類型
|
403 |
+
region_area_ratio = (region_width * region_height) / (image_width * image_height)
|
404 |
+
if detection_type == "auto":
|
405 |
+
if region_area_ratio > 0.5:
|
406 |
+
detection_type = "close_up"
|
407 |
+
elif region_area_ratio > 0.2:
|
408 |
+
detection_type = "partial"
|
409 |
+
else:
|
410 |
+
detection_type = "distant"
|
411 |
+
|
412 |
+
# 根據視角調整檢測類型
|
413 |
+
if dominant_viewpoint == "close_up" and detection_type != "close_up":
|
414 |
+
detection_type = "close_up"
|
415 |
+
elif dominant_viewpoint == "distant" and detection_type != "distant":
|
416 |
+
detection_type = "distant"
|
417 |
+
elif dominant_viewpoint == "angled_view":
|
418 |
+
detection_type = "partial" # 角度視圖可能是部分可見
|
419 |
+
|
420 |
+
# 調整置信度閾值
|
421 |
+
base_multiplier = self.confidence_threshold_multipliers.get(detection_type, 1.0)
|
422 |
+
adjusted_threshold = threshold * base_multiplier
|
423 |
+
|
424 |
+
# 調整多尺度處理的尺度範圍和縱橫比 - 增強對傾斜建築的支持
|
425 |
+
scales = [1.0] # 默認尺度
|
426 |
+
|
427 |
+
# 基於視角選擇合適的尺度和縱橫比
|
428 |
+
if detection_type in ["partial", "distant"]:
|
429 |
+
scales = [0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3] # 標準範圍
|
430 |
+
|
431 |
+
# 如果是特殊視角,進一步調整尺度和縱橫比 - 新增
|
432 |
+
if dominant_viewpoint in ["angled_view", "low_angle"]:
|
433 |
+
scales = [0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4] # 更寬的範圍
|
434 |
+
|
435 |
+
# 準備縱橫比 - 同時支持水平和垂直地標
|
436 |
+
aspect_ratios = [1.0, 0.8, 1.2] # 標準縱橫比
|
437 |
+
|
438 |
+
# 針對可能的傾斜建築增加更多縱橫比 - 新增
|
439 |
+
if dominant_viewpoint in ["angled_view", "unique_feature"]:
|
440 |
+
aspect_ratios = [0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.5] # 更多樣的縱橫比
|
441 |
+
|
442 |
+
best_result = {
|
443 |
+
"landmark_id": None,
|
444 |
+
"landmark_name": None,
|
445 |
+
"confidence": 0.0,
|
446 |
+
"is_landmark": False
|
447 |
+
}
|
448 |
+
|
449 |
+
# 多尺度和縱橫比分析
|
450 |
+
for scale in scales:
|
451 |
+
for aspect_ratio in aspect_ratios:
|
452 |
+
# 縮放裁剪區域
|
453 |
+
current_width, current_height = cropped_image.size
|
454 |
+
|
455 |
+
# 計算新尺寸,保持面積不變但調整縱橫比
|
456 |
+
if aspect_ratio != 1.0:
|
457 |
+
new_width = int(current_width * scale * (1/aspect_ratio)**0.5)
|
458 |
+
new_height = int(current_height * scale * aspect_ratio**0.5)
|
459 |
+
else:
|
460 |
+
new_width = int(current_width * scale)
|
461 |
+
new_height = int(current_height * scale)
|
462 |
+
|
463 |
+
# 確保尺寸至少為1像素
|
464 |
+
new_width = max(1, new_width)
|
465 |
+
new_height = max(1, new_height)
|
466 |
+
|
467 |
+
# 縮放圖像
|
468 |
+
try:
|
469 |
+
scaled_image = cropped_image.resize((new_width, new_height), Image.LANCZOS)
|
470 |
+
except Exception as e:
|
471 |
+
print(f"Failed to resize image to {new_width}x{new_height}: {e}")
|
472 |
+
continue
|
473 |
+
|
474 |
+
# 預處理裁剪圖像
|
475 |
+
try:
|
476 |
+
image_input = self.preprocess(scaled_image).unsqueeze(0).to(self.device)
|
477 |
+
except Exception as e:
|
478 |
+
print(f"Failed to preprocess image: {e}")
|
479 |
+
continue
|
480 |
+
|
481 |
+
# 獲取圖像特徵
|
482 |
+
with torch.no_grad():
|
483 |
+
try:
|
484 |
+
image_features = self.model.encode_image(image_input)
|
485 |
+
image_features = image_features / image_features.norm(dim=-1, keepdim=True)
|
486 |
+
|
487 |
+
# 計算與地標提示的相似度
|
488 |
+
similarity = (100.0 * image_features @ self.landmark_text_features.T).softmax(dim=-1)
|
489 |
+
similarity = similarity.cpu().numpy()[0] if self.device == "cuda" else similarity.numpy()[0]
|
490 |
+
|
491 |
+
# 找到最佳匹配
|
492 |
+
best_idx = similarity.argmax().item()
|
493 |
+
best_score = similarity[best_idx]
|
494 |
+
|
495 |
+
# 如果當前尺度結果更好,則更新
|
496 |
+
if best_score > best_result["confidence"]:
|
497 |
+
landmark_id = list(self.landmark_data.keys())[best_idx]
|
498 |
+
landmark_info = self.landmark_data[landmark_id]
|
499 |
+
|
500 |
+
best_result = {
|
501 |
+
"landmark_id": landmark_id,
|
502 |
+
"landmark_name": landmark_info["name"],
|
503 |
+
"location": landmark_info["location"],
|
504 |
+
"confidence": float(best_score),
|
505 |
+
"is_landmark": best_score >= adjusted_threshold,
|
506 |
+
"scale_used": scale,
|
507 |
+
"aspect_ratio_used": aspect_ratio,
|
508 |
+
"viewpoint": dominant_viewpoint
|
509 |
+
}
|
510 |
+
|
511 |
+
# 添加額外可用信息
|
512 |
+
for key in ["year_built", "architectural_style", "significance"]:
|
513 |
+
if key in landmark_info:
|
514 |
+
best_result[key] = landmark_info[key]
|
515 |
+
except Exception as e:
|
516 |
+
print(f"Error in calculating similarity: {e}")
|
517 |
+
continue
|
518 |
+
|
519 |
+
# 只有在有識別出地標ID且信心度足夠高時才應用地標類型閾值調整
|
520 |
+
if best_result["landmark_id"]:
|
521 |
+
landmark_type = self._determine_landmark_type(best_result["landmark_id"])
|
522 |
+
|
523 |
+
# 檢測是否為特殊類型的建築如斜塔
|
524 |
+
if landmark_type == "distinctive":
|
525 |
+
# 特殊建築的閾值降低25%
|
526 |
+
type_multiplier = 0.75
|
527 |
+
else:
|
528 |
+
# 使用已有的類型閾值
|
529 |
+
type_multiplier = self.landmark_type_thresholds.get(landmark_type, 1.0) / 0.5
|
530 |
+
|
531 |
+
# 更新判斷是否為地標的標準
|
532 |
+
final_threshold = adjusted_threshold * type_multiplier
|
533 |
+
best_result["is_landmark"] = best_result["confidence"] >= final_threshold
|
534 |
+
best_result["landmark_type"] = landmark_type # 添加地標類型信息
|
535 |
+
best_result["threshold_applied"] = final_threshold # 記錄應用的閾值
|
536 |
+
|
537 |
+
# 快取結果
|
538 |
+
self.results_cache[region_key] = best_result
|
539 |
+
self._manage_cache()
|
540 |
+
|
541 |
+
return best_result
|
542 |
+
|
543 |
+
def classify_batch_regions(self,
|
544 |
+
image: Union[Image.Image, np.ndarray],
|
545 |
+
boxes: List[List[float]],
|
546 |
+
threshold: float = 0.28) -> List[Dict[str, Any]]:
|
547 |
+
"""
|
548 |
+
批量處理多個圖像區域,提高效率
|
549 |
+
|
550 |
+
Args:
|
551 |
+
image: 原始圖像
|
552 |
+
boxes: 邊界框列表
|
553 |
+
threshold: 置信度閾值
|
554 |
+
|
555 |
+
Returns:
|
556 |
+
List[Dict]: 分類結果列表
|
557 |
+
"""
|
558 |
+
if self.landmark_text_features is None:
|
559 |
+
return [{"is_landmark": False, "confidence": 0.0} for _ in boxes]
|
560 |
+
|
561 |
+
# 確保圖像是PIL格式
|
562 |
+
if not isinstance(image, Image.Image):
|
563 |
+
if isinstance(image, np.ndarray):
|
564 |
+
image = Image.fromarray(image)
|
565 |
+
else:
|
566 |
+
raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
|
567 |
+
|
568 |
+
# 無框可處理時
|
569 |
+
if not boxes:
|
570 |
+
return []
|
571 |
+
|
572 |
+
# 裁剪並預處理所有區域
|
573 |
+
cropped_inputs = []
|
574 |
+
for box in boxes:
|
575 |
+
x1, y1, x2, y2 = map(int, box)
|
576 |
+
cropped_image = image.crop((x1, y1, x2, y2))
|
577 |
+
processed_image = self.preprocess(cropped_image).unsqueeze(0)
|
578 |
+
cropped_inputs.append(processed_image)
|
579 |
+
|
580 |
+
# batch process
|
581 |
+
batch_tensor = torch.cat(cropped_inputs).to(self.device)
|
582 |
+
|
583 |
+
# batch encoding
|
584 |
+
with torch.no_grad():
|
585 |
+
image_features = self.model.encode_image(batch_tensor)
|
586 |
+
image_features = image_features / image_features.norm(dim=-1, keepdim=True)
|
587 |
+
|
588 |
+
# 計算相似度
|
589 |
+
similarity = (100.0 * image_features @ self.landmark_text_features.T).softmax(dim=-1)
|
590 |
+
similarity = similarity.cpu().numpy() if self.device == "cuda" else similarity.numpy()
|
591 |
+
|
592 |
+
# 處理每個區域的結果
|
593 |
+
results = []
|
594 |
+
for i, sim in enumerate(similarity):
|
595 |
+
best_idx = sim.argmax().item()
|
596 |
+
best_score = sim[best_idx]
|
597 |
+
|
598 |
+
if best_score >= threshold:
|
599 |
+
landmark_id = list(self.landmark_data.keys())[best_idx]
|
600 |
+
landmark_info = self.landmark_data[landmark_id]
|
601 |
+
|
602 |
+
results.append({
|
603 |
+
"landmark_id": landmark_id,
|
604 |
+
"landmark_name": landmark_info["name"],
|
605 |
+
"location": landmark_info["location"],
|
606 |
+
"confidence": float(best_score),
|
607 |
+
"is_landmark": True,
|
608 |
+
"box": boxes[i]
|
609 |
+
})
|
610 |
+
else:
|
611 |
+
results.append({
|
612 |
+
"landmark_id": None,
|
613 |
+
"landmark_name": None,
|
614 |
+
"confidence": float(best_score),
|
615 |
+
"is_landmark": False,
|
616 |
+
"box": boxes[i]
|
617 |
+
})
|
618 |
+
|
619 |
+
return results
|
620 |
+
|
621 |
+
def search_entire_image(self,
|
622 |
+
image: Union[Image.Image, np.ndarray],
|
623 |
+
threshold: float = 0.35,
|
624 |
+
detailed_analysis: bool = False) -> Dict[str, Any]:
|
625 |
+
"""
|
626 |
+
檢查整張圖像是否包含地標,具有增強的分析能力
|
627 |
+
|
628 |
+
Args:
|
629 |
+
image: 原始圖像
|
630 |
+
threshold: 置信度閾值
|
631 |
+
detailed_analysis: 是否進行詳細分析,包括多區域檢測
|
632 |
+
|
633 |
+
Returns:
|
634 |
+
Dict: 地標分類結果
|
635 |
+
"""
|
636 |
+
# 確保圖像是PIL格式
|
637 |
+
if not isinstance(image, Image.Image):
|
638 |
+
if isinstance(image, np.ndarray):
|
639 |
+
image = Image.fromarray(image)
|
640 |
+
else:
|
641 |
+
raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
|
642 |
+
|
643 |
+
# 檢查快取
|
644 |
+
image_key = (self._get_image_hash(image), "entire_image", detailed_analysis)
|
645 |
+
if image_key in self.results_cache:
|
646 |
+
return self.results_cache[image_key]
|
647 |
+
|
648 |
+
# 調整閾值
|
649 |
+
adjusted_threshold = threshold * self.confidence_threshold_multipliers.get("full_image", 1.0)
|
650 |
+
|
651 |
+
# 預處理圖像
|
652 |
+
image_input = self.preprocess(image).unsqueeze(0).to(self.device)
|
653 |
+
|
654 |
+
# 獲取圖像特徵
|
655 |
+
with torch.no_grad():
|
656 |
+
image_features = self.model.encode_image(image_input)
|
657 |
+
image_features = image_features / image_features.norm(dim=-1, keepdim=True)
|
658 |
+
|
659 |
+
# 計算與地標提示的相似度
|
660 |
+
similarity = (100.0 * image_features @ self.landmark_text_features.T).softmax(dim=-1)
|
661 |
+
similarity = similarity.cpu().numpy()[0] if self.device == "cuda" else similarity.numpy()[0]
|
662 |
+
|
663 |
+
# 找到最佳匹配
|
664 |
+
best_idx = similarity.argmax().item()
|
665 |
+
best_score = similarity[best_idx]
|
666 |
+
|
667 |
+
# top3 landmark
|
668 |
+
top_indices = similarity.argsort()[-3:][::-1]
|
669 |
+
top_landmarks = []
|
670 |
+
|
671 |
+
for idx in top_indices:
|
672 |
+
score = similarity[idx]
|
673 |
+
landmark_id = list(self.landmark_data.keys())[idx]
|
674 |
+
landmark_info = self.landmark_data[landmark_id]
|
675 |
+
|
676 |
+
landmark_result = {
|
677 |
+
"landmark_id": landmark_id,
|
678 |
+
"landmark_name": landmark_info["name"],
|
679 |
+
"location": landmark_info["location"],
|
680 |
+
"confidence": float(score)
|
681 |
+
}
|
682 |
+
|
683 |
+
# 添加額外可用信息
|
684 |
+
if "year_built" in landmark_info:
|
685 |
+
landmark_result["year_built"] = landmark_info["year_built"]
|
686 |
+
if "architectural_style" in landmark_info:
|
687 |
+
landmark_result["architectural_style"] = landmark_info["architectural_style"]
|
688 |
+
if "significance" in landmark_info:
|
689 |
+
landmark_result["significance"] = landmark_info["significance"]
|
690 |
+
|
691 |
+
top_landmarks.append(landmark_result)
|
692 |
+
|
693 |
+
# main result
|
694 |
+
result = {}
|
695 |
+
if best_score >= adjusted_threshold:
|
696 |
+
landmark_id = list(self.landmark_data.keys())[best_idx]
|
697 |
+
landmark_info = self.landmark_data[landmark_id]
|
698 |
+
|
699 |
+
# 應用地標類型特定閾值
|
700 |
+
landmark_type = self._determine_landmark_type(landmark_id)
|
701 |
+
type_multiplier = self.landmark_type_thresholds.get(landmark_type, 1.0) / 0.5
|
702 |
+
final_threshold = adjusted_threshold * type_multiplier
|
703 |
+
|
704 |
+
if best_score >= final_threshold:
|
705 |
+
result = {
|
706 |
+
"landmark_id": landmark_id,
|
707 |
+
"landmark_name": landmark_info["name"],
|
708 |
+
"location": landmark_info["location"],
|
709 |
+
"confidence": float(best_score),
|
710 |
+
"is_landmark": True,
|
711 |
+
"landmark_type": landmark_type,
|
712 |
+
"top_landmarks": top_landmarks
|
713 |
+
}
|
714 |
+
|
715 |
+
# 添加額外可用信息
|
716 |
+
if "year_built" in landmark_info:
|
717 |
+
result["year_built"] = landmark_info["year_built"]
|
718 |
+
if "architectural_style" in landmark_info:
|
719 |
+
result["architectural_style"] = landmark_info["architectural_style"]
|
720 |
+
if "significance" in landmark_info:
|
721 |
+
result["significance"] = landmark_info["significance"]
|
722 |
+
else:
|
723 |
+
result = {
|
724 |
+
"landmark_id": None,
|
725 |
+
"landmark_name": None,
|
726 |
+
"confidence": float(best_score),
|
727 |
+
"is_landmark": False,
|
728 |
+
"top_landmarks": top_landmarks
|
729 |
+
}
|
730 |
+
|
731 |
+
# 如果請求詳細分析且是地標,進一步分析圖像區域
|
732 |
+
if detailed_analysis and result.get("is_landmark", False):
|
733 |
+
# 創建不同區域進行更深入分析
|
734 |
+
width, height = image.size
|
735 |
+
regions = [
|
736 |
+
# 中心區域
|
737 |
+
[width * 0.25, height * 0.25, width * 0.75, height * 0.75],
|
738 |
+
# 左半部
|
739 |
+
[0, 0, width * 0.5, height],
|
740 |
+
# 右半部
|
741 |
+
[width * 0.5, 0, width, height],
|
742 |
+
# 上半部
|
743 |
+
[0, 0, width, height * 0.5],
|
744 |
+
# 下半部
|
745 |
+
[0, height * 0.5, width, height]
|
746 |
+
]
|
747 |
+
|
748 |
+
region_results = []
|
749 |
+
for i, box in enumerate(regions):
|
750 |
+
region_result = self.classify_image_region(
|
751 |
+
image,
|
752 |
+
box,
|
753 |
+
threshold=threshold * 0.9,
|
754 |
+
detection_type="partial"
|
755 |
+
)
|
756 |
+
if region_result["is_landmark"]:
|
757 |
+
region_result["region_name"] = ["center", "left", "right", "top", "bottom"][i]
|
758 |
+
region_results.append(region_result)
|
759 |
+
|
760 |
+
# 添加區域分析結果
|
761 |
+
if region_results:
|
762 |
+
result["region_analyses"] = region_results
|
763 |
+
|
764 |
+
# 快取結果
|
765 |
+
self.results_cache[image_key] = result
|
766 |
+
self._manage_cache()
|
767 |
+
|
768 |
+
return result
|
769 |
+
|
770 |
+
def enhanced_landmark_detection(self,
|
771 |
+
image: Union[Image.Image, np.ndarray],
|
772 |
+
threshold: float = 0.3) -> Dict[str, Any]:
|
773 |
+
"""
|
774 |
+
Enhanced landmark detection using multiple analysis techniques.
|
775 |
+
|
776 |
+
Args:
|
777 |
+
image: Input image
|
778 |
+
threshold: Base confidence threshold
|
779 |
+
|
780 |
+
Returns:
|
781 |
+
Dict: Comprehensive landmark detection results
|
782 |
+
"""
|
783 |
+
# Ensure image is PIL format
|
784 |
+
if not isinstance(image, Image.Image):
|
785 |
+
if isinstance(image, np.ndarray):
|
786 |
+
image = Image.fromarray(image)
|
787 |
+
else:
|
788 |
+
raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
|
789 |
+
|
790 |
+
# Phase 1: Analyze viewpoint to adjust detection parameters
|
791 |
+
viewpoint_info = self._analyze_viewpoint(image)
|
792 |
+
viewpoint = viewpoint_info["dominant_viewpoint"]
|
793 |
+
|
794 |
+
# Adjust threshold based on viewpoint
|
795 |
+
if viewpoint == "distant":
|
796 |
+
adjusted_threshold = threshold * 0.7 # Lower threshold for distant views
|
797 |
+
elif viewpoint == "close_up":
|
798 |
+
adjusted_threshold = threshold * 1.1 # Higher threshold for close-ups
|
799 |
+
else:
|
800 |
+
adjusted_threshold = threshold
|
801 |
+
|
802 |
+
# Phase 2: Perform multi-scale pyramid analysis
|
803 |
+
pyramid_results = self._perform_pyramid_analysis(image, levels=3, base_threshold=adjusted_threshold)
|
804 |
+
|
805 |
+
# Phase 3: Perform grid-based region analysis
|
806 |
+
grid_results = []
|
807 |
+
width, height = image.size
|
808 |
+
|
809 |
+
# Create adaptive grid based on viewpoint
|
810 |
+
if viewpoint == "distant":
|
811 |
+
grid_size = 3 # Coarser grid for distant views
|
812 |
+
elif viewpoint == "close_up":
|
813 |
+
grid_size = 5 # Finer grid for close-ups
|
814 |
+
else:
|
815 |
+
grid_size = 4 # Default grid size
|
816 |
+
|
817 |
+
# Generate grid regions
|
818 |
+
for i in range(grid_size):
|
819 |
+
for j in range(grid_size):
|
820 |
+
box = [
|
821 |
+
width * (j/grid_size),
|
822 |
+
height * (i/grid_size),
|
823 |
+
width * ((j+1)/grid_size),
|
824 |
+
height * ((i+1)/grid_size)
|
825 |
+
]
|
826 |
+
|
827 |
+
# Apply feature enhancement
|
828 |
+
region_result = self.classify_image_region(
|
829 |
+
image,
|
830 |
+
box,
|
831 |
+
threshold=adjusted_threshold,
|
832 |
+
detection_type="auto"
|
833 |
+
)
|
834 |
+
|
835 |
+
if region_result["is_landmark"]:
|
836 |
+
region_result["grid_position"] = (i, j)
|
837 |
+
grid_results.append(region_result)
|
838 |
+
|
839 |
+
# Phase 4: Cross-validate and combine results
|
840 |
+
all_detections = []
|
841 |
+
|
842 |
+
# Add pyramid results
|
843 |
+
if pyramid_results["is_landmark"] and pyramid_results["best_result"]:
|
844 |
+
all_detections.append({
|
845 |
+
"source": "pyramid",
|
846 |
+
"landmark_id": pyramid_results["best_result"]["landmark_id"],
|
847 |
+
"landmark_name": pyramid_results["best_result"]["landmark_name"],
|
848 |
+
"confidence": pyramid_results["best_result"]["confidence"],
|
849 |
+
"scale_factor": pyramid_results["best_result"].get("scale_factor", 1.0)
|
850 |
+
})
|
851 |
+
|
852 |
+
# Add grid results
|
853 |
+
for result in grid_results:
|
854 |
+
all_detections.append({
|
855 |
+
"source": "grid",
|
856 |
+
"landmark_id": result["landmark_id"],
|
857 |
+
"landmark_name": result["landmark_name"],
|
858 |
+
"confidence": result["confidence"],
|
859 |
+
"grid_position": result.get("grid_position", (0, 0))
|
860 |
+
})
|
861 |
+
|
862 |
+
# Search entire image
|
863 |
+
full_image_result = self.search_entire_image(image, threshold=adjusted_threshold)
|
864 |
+
if full_image_result and full_image_result.get("is_landmark", False):
|
865 |
+
all_detections.append({
|
866 |
+
"source": "full_image",
|
867 |
+
"landmark_id": full_image_result["landmark_id"],
|
868 |
+
"landmark_name": full_image_result["landmark_name"],
|
869 |
+
"confidence": full_image_result["confidence"]
|
870 |
+
})
|
871 |
+
|
872 |
+
# Group by landmark_id and calculate aggregate confidence
|
873 |
+
landmark_groups = {}
|
874 |
+
for detection in all_detections:
|
875 |
+
landmark_id = detection["landmark_id"]
|
876 |
+
if landmark_id not in landmark_groups:
|
877 |
+
landmark_groups[landmark_id] = {
|
878 |
+
"landmark_id": landmark_id,
|
879 |
+
"landmark_name": detection["landmark_name"],
|
880 |
+
"detections": [],
|
881 |
+
"sources": set()
|
882 |
+
}
|
883 |
+
|
884 |
+
landmark_groups[landmark_id]["detections"].append(detection)
|
885 |
+
landmark_groups[landmark_id]["sources"].add(detection["source"])
|
886 |
+
|
887 |
+
# Calculate aggregate confidence for each landmark
|
888 |
+
for landmark_id, group in landmark_groups.items():
|
889 |
+
detections = group["detections"]
|
890 |
+
|
891 |
+
# Base confidence is the maximum confidence from any source
|
892 |
+
max_confidence = max(d["confidence"] for d in detections)
|
893 |
+
|
894 |
+
# Bonus for detection from multiple sources
|
895 |
+
source_count = len(group["sources"])
|
896 |
+
source_bonus = min(0.15, (source_count - 1) * 0.05) # Up to 15% bonus
|
897 |
+
|
898 |
+
# Consistency bonus for multiple detections of the same landmark
|
899 |
+
detection_count = len(detections)
|
900 |
+
consistency_bonus = min(0.1, (detection_count - 1) * 0.02) # Up to 10% bonus
|
901 |
+
|
902 |
+
# Calculate final confidence
|
903 |
+
aggregate_confidence = min(1.0, max_confidence + source_bonus + consistency_bonus)
|
904 |
+
|
905 |
+
group["confidence"] = aggregate_confidence
|
906 |
+
group["detection_count"] = detection_count
|
907 |
+
group["source_count"] = source_count
|
908 |
+
|
909 |
+
# Sort landmarks by confidence
|
910 |
+
sorted_landmarks = sorted(
|
911 |
+
landmark_groups.values(),
|
912 |
+
key=lambda x: x["confidence"],
|
913 |
+
reverse=True
|
914 |
+
)
|
915 |
+
|
916 |
+
return {
|
917 |
+
"is_landmark_scene": len(sorted_landmarks) > 0,
|
918 |
+
"detected_landmarks": sorted_landmarks,
|
919 |
+
"viewpoint_info": viewpoint_info,
|
920 |
+
"primary_landmark": sorted_landmarks[0] if sorted_landmarks else None
|
921 |
+
}
|
922 |
+
|
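To make the aggregation rule above concrete: suppose the same landmark is found by the pyramid, grid, and full-image passes (three sources, four detections in total) with a best single score of 0.62. A quick check of the arithmetic, using the same formulas as the method (toy numbers, not from the uploaded file):

    # Aggregate confidence = best score + multi-source bonus + consistency bonus, capped at 1.0.
    max_confidence = 0.62
    source_count = 3
    detection_count = 4

    source_bonus = min(0.15, (source_count - 1) * 0.05)         # 0.10
    consistency_bonus = min(0.1, (detection_count - 1) * 0.02)   # 0.06
    aggregate = min(1.0, max_confidence + source_bonus + consistency_bonus)
    print(round(aggregate, 2))  # 0.78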
923 |
+
def _analyze_architectural_features(self, image):
|
924 |
+
"""
|
925 |
+
Analyzes the architectural features of a structure in the image without hardcoding specific landmarks.
|
926 |
+
|
927 |
+
Args:
|
928 |
+
image: Input image
|
929 |
+
|
930 |
+
Returns:
|
931 |
+
Dict: Architectural feature analysis results
|
932 |
+
"""
|
933 |
+
# Define universal architectural feature prompts that apply to all types of landmarks
|
934 |
+
architecture_prompts = {
|
935 |
+
"tall_structure": "a tall vertical structure standing alone",
|
936 |
+
"tiered_building": "a building with multiple stacked tiers or segments",
|
937 |
+
"historical_structure": "a building with historical architectural elements",
|
938 |
+
"modern_design": "a modern structure with contemporary architectural design",
|
939 |
+
"segmented_exterior": "a structure with visible segmented or sectioned exterior",
|
940 |
+
"viewing_platform": "a tall structure with observation area at the top",
|
941 |
+
"time_display": "a structure with timepiece features",
|
942 |
+
"glass_facade": "a building with prominent glass exterior surfaces",
|
943 |
+
"memorial_structure": "a monument or memorial structure",
|
944 |
+
"ancient_construction": "ancient constructed elements or archaeological features",
|
945 |
+
"natural_landmark": "a natural geographic formation or landmark",
|
946 |
+
"slanted_design": "a structure with non-vertical or leaning profile"
|
947 |
+
}
|
948 |
+
|
949 |
+
# Calculate similarity scores against universal architectural patterns
|
950 |
+
context_scores = self.calculate_similarity_scores(image, architecture_prompts)
|
951 |
+
|
952 |
+
# Determine most relevant architectural features
|
953 |
+
top_features = sorted(context_scores.items(), key=lambda x: x[1], reverse=True)[:3]
|
954 |
+
|
955 |
+
# Calculate feature confidence
|
956 |
+
context_confidence = sum(score for _, score in top_features) / 3
|
957 |
+
|
958 |
+
# Determine primary architectural category based on top features
|
959 |
+
architectural_categories = {
|
960 |
+
"tower": ["tall_structure", "viewing_platform", "time_display"],
|
961 |
+
"skyscraper": ["tall_structure", "modern_design", "glass_facade"],
|
962 |
+
"historical": ["historical_structure", "ancient_construction", "memorial_structure"],
|
963 |
+
"natural": ["natural_landmark"],
|
964 |
+
"distinctive": ["tiered_building", "segmented_exterior", "slanted_design"]
|
965 |
+
}
|
966 |
+
|
967 |
+
# Score each category based on the top features
|
968 |
+
category_scores = {}
|
969 |
+
for category, features in architectural_categories.items():
|
970 |
+
category_score = 0
|
971 |
+
for feature, score in context_scores.items():
|
972 |
+
if feature in features:
|
973 |
+
category_score += score
|
974 |
+
category_scores[category] = category_score
|
975 |
+
|
976 |
+
primary_category = max(category_scores.items(), key=lambda x: x[1])[0]
|
977 |
+
|
978 |
+
return {
|
979 |
+
"architectural_features": top_features,
|
980 |
+
"context_confidence": context_confidence,
|
981 |
+
"primary_category": primary_category,
|
982 |
+
"category_scores": category_scores
|
983 |
+
}
|
984 |
+
|
985 |
+
def intelligent_landmark_search(self,
|
986 |
+
image: Union[Image.Image, np.ndarray],
|
987 |
+
yolo_boxes: Optional[List[List[float]]] = None,
|
988 |
+
base_threshold: float = 0.25) -> Dict[str, Any]:
|
989 |
+
"""
|
990 |
+
對圖像進行智能地標搜索,綜合整張圖像分析和區域分析
|
991 |
+
|
992 |
+
Args:
|
993 |
+
image: 原始圖像
|
994 |
+
yolo_boxes: YOLO檢測到的邊界框 (可選)
|
995 |
+
base_threshold: 基礎置信度閾值
|
996 |
+
|
997 |
+
Returns:
|
998 |
+
Dict: 包含所有檢測結果的綜合分析
|
999 |
+
"""
|
1000 |
+
# 確保圖像是PIL格式
|
1001 |
+
if not isinstance(image, Image.Image):
|
1002 |
+
if isinstance(image, np.ndarray):
|
1003 |
+
image = Image.fromarray(image)
|
1004 |
+
else:
|
1005 |
+
raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
|
1006 |
+
|
1007 |
+
# No YOLO 框時,可以稍微降低閾值以提高召回率
|
1008 |
+
actual_threshold = base_threshold * 0.85 if yolo_boxes is None or len(yolo_boxes) == 0 else base_threshold
|
1009 |
+
|
1010 |
+
# 首先對整張圖像進行分析
|
1011 |
+
try:
|
1012 |
+
full_image_result = self.search_entire_image(
|
1013 |
+
image,
|
1014 |
+
threshold=actual_threshold,
|
1015 |
+
detailed_analysis=True # 確保詳細分析開啟
|
1016 |
+
)
|
1017 |
+
|
1018 |
+
# No YOLO 框,則進行多尺度分析以提高檢測機會
|
1019 |
+
if (yolo_boxes is None or len(yolo_boxes) == 0) and (not full_image_result or not full_image_result.get("is_landmark", False)):
|
1020 |
+
print("No YOLO boxes provided, attempting multi-scale pyramid analysis")
|
1021 |
+
try:
|
1022 |
+
if hasattr(self, '_perform_pyramid_analysis'):
|
1023 |
+
pyramid_results = self._perform_pyramid_analysis(
|
1024 |
+
image,
|
1025 |
+
levels=4,
|
1026 |
+
base_threshold=actual_threshold,
|
1027 |
+
aspect_ratios=[1.0, 0.75, 1.5, 0.5, 2.0]
|
1028 |
+
)
|
1029 |
+
|
1030 |
+
if pyramid_results and pyramid_results.get("is_landmark", False) and pyramid_results.get("best_result", {}).get("confidence", 0) > actual_threshold:
|
1031 |
+
# 使用金字塔分析結果增強或替代全圖結果
|
1032 |
+
if not full_image_result or not full_image_result.get("is_landmark", False):
|
1033 |
+
full_image_result = {
|
1034 |
+
"is_landmark": True,
|
1035 |
+
"landmark_id": pyramid_results["best_result"]["landmark_id"],
|
1036 |
+
"landmark_name": pyramid_results["best_result"]["landmark_name"],
|
1037 |
+
"confidence": pyramid_results["best_result"]["confidence"],
|
1038 |
+
"location": pyramid_results["best_result"].get("location", "Unknown Location")
|
1039 |
+
}
|
1040 |
+
print(f"Pyramid analysis detected landmark: {pyramid_results['best_result']['landmark_name']} with confidence {pyramid_results['best_result']['confidence']:.3f}")
|
1041 |
+
else:
|
1042 |
+
print("Pyramid analysis not available, skipping multi-scale detection")
|
1043 |
+
except Exception as e:
|
1044 |
+
print(f"Error in pyramid analysis: {e}")
|
1045 |
+
except Exception as e:
|
1046 |
+
print(f"Error in search_entire_image: {e}")
|
1047 |
+
import traceback
|
1048 |
+
traceback.print_exc()
|
1049 |
+
full_image_result = None
|
1050 |
+
|
1051 |
+
# 初始化結果字典
|
1052 |
+
result = {
|
1053 |
+
"full_image_analysis": full_image_result if full_image_result else {},
|
1054 |
+
"is_landmark_scene": False, # 默認值
|
1055 |
+
"detected_landmarks": []
|
1056 |
+
}
|
1057 |
+
|
1058 |
+
# 上下文感知比較,處理接近的排名結果
|
1059 |
+
if full_image_result and "top_landmarks" in full_image_result and len(full_image_result["top_landmarks"]) >= 2:
|
1060 |
+
top_landmarks = full_image_result["top_landmarks"]
|
1061 |
+
|
1062 |
+
# 檢查前兩個結果是否非常接近(信心度差異小於 0.1)
|
1063 |
+
if len(top_landmarks) >= 2 and abs(top_landmarks[0]["confidence"] - top_landmarks[1]["confidence"]) < 0.1:
|
1064 |
+
# 對於接近的結果,使用通用建築特徵分析進行區分
|
1065 |
+
try:
|
1066 |
+
# 分析建築特徵
|
1067 |
+
if hasattr(self, '_analyze_architectural_features'):
|
1068 |
+
architectural_analysis = self._analyze_architectural_features(image)
|
1069 |
+
top_features = architectural_analysis.get("architectural_features", [])
|
1070 |
+
primary_category = architectural_analysis.get("primary_category", "")
|
1071 |
+
|
1072 |
+
# 根據建築特徵調整地標置信度
|
1073 |
+
for i, landmark in enumerate(top_landmarks[:2]):
|
1074 |
+
if i >= len(top_landmarks):
|
1075 |
+
continue
|
1076 |
+
|
1077 |
+
landmark_id = landmark.get("landmark_id", "").lower()
|
1078 |
+
confidence_boost = 0
|
1079 |
+
|
1080 |
+
# 使用主要建築類別來調整置信度,使用通用條件而非特定地標名稱
|
1081 |
+
if primary_category == "tower" and any(term in landmark_id for term in ["tower", "spire", "needle"]):
|
1082 |
+
confidence_boost += 0.05
|
1083 |
+
elif primary_category == "skyscraper" and any(term in landmark_id for term in ["building", "skyscraper", "tall"]):
|
1084 |
+
confidence_boost += 0.05
|
1085 |
+
elif primary_category == "historical" and any(term in landmark_id for term in ["monument", "castle", "palace", "temple"]):
|
1086 |
+
confidence_boost += 0.05
|
1087 |
+
elif primary_category == "distinctive" and any(term in landmark_id for term in ["unusual", "unique", "special", "famous"]):
|
1088 |
+
confidence_boost += 0.05
|
1089 |
+
|
1090 |
+
# 根據特定特徵進一步微調,使用通用特徵描述而非特定地標
|
1091 |
+
for feature, score in top_features:
|
1092 |
+
if feature == "time_display" and "clock" in landmark_id:
|
1093 |
+
confidence_boost += 0.03
|
1094 |
+
elif feature == "segmented_exterior" and "segmented" in landmark_id:
|
1095 |
+
confidence_boost += 0.03
|
1096 |
+
elif feature == "slanted_design" and "leaning" in landmark_id:
|
1097 |
+
confidence_boost += 0.03
|
1098 |
+
|
1099 |
+
# 應用信心度調整
|
1100 |
+
if confidence_boost > 0 and i < len(top_landmarks):
|
1101 |
+
top_landmarks[i]["confidence"] += confidence_boost
|
1102 |
+
print(f"Boosted {landmark['landmark_name']} confidence by {confidence_boost:.2f} based on architectural features ({primary_category})")
|
1103 |
+
|
1104 |
+
# 重新排序
|
1105 |
+
top_landmarks.sort(key=lambda x: x["confidence"], reverse=True)
|
1106 |
+
full_image_result["top_landmarks"] = top_landmarks
|
1107 |
+
if top_landmarks:
|
1108 |
+
full_image_result["landmark_id"] = top_landmarks[0]["landmark_id"]
|
1109 |
+
full_image_result["landmark_name"] = top_landmarks[0]["landmark_name"]
|
1110 |
+
full_image_result["confidence"] = top_landmarks[0]["confidence"]
|
1111 |
+
full_image_result["location"] = top_landmarks[0].get("location", "Unknown Location")
|
1112 |
+
except Exception as e:
|
1113 |
+
print(f"Error in architectural feature analysis: {e}")
|
1114 |
+
import traceback
|
1115 |
+
traceback.print_exc()
|
1116 |
+
|
1117 |
+
if full_image_result and full_image_result.get("is_landmark", False):
|
1118 |
+
result["is_landmark_scene"] = True
|
1119 |
+
landmark_id = full_image_result.get("landmark_id", "unknown")
|
1120 |
+
|
1121 |
+
# extract landmark info
|
1122 |
+
landmark_specific_info = self._extract_landmark_specific_info(landmark_id)
|
1123 |
+
|
1124 |
+
landmark_info = {
|
1125 |
+
"landmark_id": landmark_id,
|
1126 |
+
"landmark_name": full_image_result.get("landmark_name", "Unknown Landmark"),
|
1127 |
+
"confidence": full_image_result.get("confidence", 0.0),
|
1128 |
+
"location": full_image_result.get("location", "Unknown Location"),
|
1129 |
+
"region_type": "full_image",
|
1130 |
+
"box": [0, 0, getattr(image, 'width', 0), getattr(image, 'height', 0)]
|
1131 |
+
}
|
1132 |
+
|
1133 |
+
# 整合地標特定info,確保正確的名稱被使用
|
1134 |
+
landmark_info.update(landmark_specific_info)
|
1135 |
+
|
1136 |
+
# 如果特定信息中有更準確的地標名稱,使用它
|
1137 |
+
if landmark_specific_info.get("landmark_name"):
|
1138 |
+
landmark_info["landmark_name"] = landmark_specific_info["landmark_name"]
|
1139 |
+
|
1140 |
+
result["detected_landmarks"].append(landmark_info)
|
1141 |
+
|
1142 |
+
# 確保地標特定活動被正確設置為主要結果
|
1143 |
+
if landmark_specific_info.get("has_specific_activities", False):
|
1144 |
+
result["primary_landmark_activities"] = landmark_specific_info.get("landmark_specific_activities", [])
|
1145 |
+
print(f"Set primary landmark activities: {len(result['primary_landmark_activities'])} activities for {landmark_info['landmark_name']}")
|
1146 |
+
|
1147 |
+
# 如果提供了YOLO邊界框,分析這些區域
|
1148 |
+
if yolo_boxes and len(yolo_boxes) > 0:
|
1149 |
+
for box in yolo_boxes:
|
1150 |
+
try:
|
1151 |
+
if hasattr(self, 'classify_image_region'):
|
1152 |
+
box_result = self.classify_image_region(
|
1153 |
+
image,
|
1154 |
+
box,
|
1155 |
+
threshold=base_threshold,
|
1156 |
+
detection_type="auto"
|
1157 |
+
)
|
1158 |
+
|
1159 |
+
# 如果檢測到地標
|
1160 |
+
if box_result and box_result.get("is_landmark", False):
|
1161 |
+
# 檢查是否與已檢測的地標重複
|
1162 |
+
is_duplicate = False
|
1163 |
+
for existing in result["detected_landmarks"]:
|
1164 |
+
if existing.get("landmark_id") == box_result.get("landmark_id"):
|
1165 |
+
# 如果新的置信度更高,則更新
|
1166 |
+
if box_result.get("confidence", 0) > existing.get("confidence", 0):
|
1167 |
+
existing.update({
|
1168 |
+
"confidence": box_result.get("confidence", 0),
|
1169 |
+
"region_type": "yolo_box",
|
1170 |
+
"box": box
|
1171 |
+
})
|
1172 |
+
is_duplicate = True
|
1173 |
+
break
|
1174 |
+
|
1175 |
+
# 如果不是重複的,添加到列表
|
1176 |
+
if not is_duplicate:
|
1177 |
+
result["detected_landmarks"].append({
|
1178 |
+
"landmark_id": box_result.get("landmark_id", "unknown"),
|
1179 |
+
"landmark_name": box_result.get("landmark_name", "Unknown Landmark"),
|
1180 |
+
"confidence": box_result.get("confidence", 0.0),
|
1181 |
+
"location": box_result.get("location", "Unknown Location"),
|
1182 |
+
"region_type": "yolo_box",
|
1183 |
+
"box": box
|
1184 |
+
})
|
1185 |
+
except Exception as e:
|
1186 |
+
print(f"Error in analyzing YOLO box: {e}")
|
1187 |
+
continue
|
1188 |
+
|
1189 |
+
# 最後,執行額外的網格搜索以捕獲可能被遺漏的地標
|
1190 |
+
# 但只有在尚未發現地標或僅發現低置信度地標時
|
1191 |
+
should_do_grid_search = (
|
1192 |
+
len(result["detected_landmarks"]) == 0 or
|
1193 |
+
max([landmark.get("confidence", 0) for landmark in result["detected_landmarks"]], default=0) < 0.5
|
1194 |
+
)
|
1195 |
+
|
1196 |
+
if should_do_grid_search and hasattr(self, 'classify_image_region'):
|
1197 |
+
try:
|
1198 |
+
# 創建5x5網格
|
1199 |
+
width, height = getattr(image, 'size', (getattr(image, 'width', 0), getattr(image, 'height', 0)))
|
1200 |
+
if not isinstance(width, (int, float)) or width <= 0:
|
1201 |
+
width = getattr(image, 'width', 0)
|
1202 |
+
if not isinstance(height, (int, float)) or height <= 0:
|
1203 |
+
height = getattr(image, 'height', 0)
|
1204 |
+
|
1205 |
+
if width > 0 and height > 0:
|
1206 |
+
grid_boxes = []
|
1207 |
+
for i in range(5):
|
1208 |
+
for j in range(5):
|
1209 |
+
grid_boxes.append([
|
1210 |
+
width * (j/5), height * (i/5),
|
1211 |
+
width * ((j+1)/5), height * ((i+1)/5)
|
1212 |
+
])
|
1213 |
+
|
1214 |
+
# 分析每個網格區域
|
1215 |
+
for box in grid_boxes:
|
1216 |
+
try:
|
1217 |
+
grid_result = self.classify_image_region(
|
1218 |
+
image,
|
1219 |
+
box,
|
1220 |
+
threshold=base_threshold * 0.9, # 稍微降低網格搜索閾值
|
1221 |
+
detection_type="partial"
|
1222 |
+
)
|
1223 |
+
|
1224 |
+
# 如果檢測到地標
|
1225 |
+
if grid_result and grid_result.get("is_landmark", False):
|
1226 |
+
# 檢查是否與已檢測的地標重複
|
1227 |
+
is_duplicate = False
|
1228 |
+
for existing in result["detected_landmarks"]:
|
1229 |
+
if existing.get("landmark_id") == grid_result.get("landmark_id"):
|
1230 |
+
is_duplicate = True
|
1231 |
+
break
|
1232 |
+
|
1233 |
+
# 如果不是重複的,添加到列表
|
1234 |
+
if not is_duplicate:
|
1235 |
+
result["detected_landmarks"].append({
|
1236 |
+
"landmark_id": grid_result.get("landmark_id", "unknown"),
|
1237 |
+
"landmark_name": grid_result.get("landmark_name", "Unknown Landmark"),
|
1238 |
+
"confidence": grid_result.get("confidence", 0.0),
|
1239 |
+
"location": grid_result.get("location", "Unknown Location"),
|
1240 |
+
"region_type": "grid",
|
1241 |
+
"box": box
|
1242 |
+
})
|
1243 |
+
except Exception as e:
|
1244 |
+
print(f"Error in analyzing grid region: {e}")
|
1245 |
+
continue
|
1246 |
+
except Exception as e:
|
1247 |
+
print(f"Error in grid search: {e}")
|
1248 |
+
import traceback
|
1249 |
+
traceback.print_exc()
|
1250 |
+
|
1251 |
+
# 按置信度排序檢測結果
|
1252 |
+
result["detected_landmarks"].sort(key=lambda x: x.get("confidence", 0), reverse=True)
|
1253 |
+
|
1254 |
+
# 更新整體場景類型判斷
|
1255 |
+
if len(result["detected_landmarks"]) > 0:
|
1256 |
+
result["is_landmark_scene"] = True
|
1257 |
+
result["primary_landmark"] = result["detected_landmarks"][0]
|
1258 |
+
|
1259 |
+
# 添加 clip_analysis_on_full_image 結果,以便給 LLM 提供更多上下文
|
1260 |
+
if full_image_result and "clip_analysis" in full_image_result:
|
1261 |
+
result["clip_analysis_on_full_image"] = full_image_result["clip_analysis"]
|
1262 |
+
|
1263 |
+
return result
|
1264 |
+
|
1265 |
+
def _extract_landmark_specific_info(self, landmark_id: str) -> Dict[str, Any]:
|
1266 |
+
"""
|
1267 |
+
提取特定地標的詳細信息,包括特色模板和活動建議
|
1268 |
+
|
1269 |
+
Args:
|
1270 |
+
landmark_id: 地標ID
|
1271 |
+
|
1272 |
+
Returns:
|
1273 |
+
Dict: 地標特定信息
|
1274 |
+
"""
|
1275 |
+
if not landmark_id or landmark_id == "unknown":
|
1276 |
+
return {"has_specific_activities": False}
|
1277 |
+
|
1278 |
+
specific_info = {"has_specific_activities": False}
|
1279 |
+
|
1280 |
+
# 從 ALL_LANDMARKS 或 self.landmark_data 中提取基本信息
|
1281 |
+
landmark_data_source = None
|
1282 |
+
|
1283 |
+
# 優先嘗試從類屬性獲取
|
1284 |
+
if hasattr(self, 'landmark_data') and self.landmark_data and landmark_id in self.landmark_data:
|
1285 |
+
landmark_data_source = self.landmark_data[landmark_id]
|
1286 |
+
print(f"Using landmark data from class attribute for {landmark_id}")
|
1287 |
+
else:
|
1288 |
+
try:
|
1289 |
+
if landmark_id in ALL_LANDMARKS:
|
1290 |
+
landmark_data_source = ALL_LANDMARKS[landmark_id]
|
1291 |
+
print(f"Using landmark data from ALL_LANDMARKS for {landmark_id}")
|
1292 |
+
except ImportError:
|
1293 |
+
print("Warning: Could not import ALL_LANDMARKS from landmark_data")
|
1294 |
+
except Exception as e:
|
1295 |
+
print(f"Error accessing ALL_LANDMARKS: {e}")
|
1296 |
+
|
1297 |
+
# 處理地標基本數據
|
1298 |
+
if landmark_data_source:
|
1299 |
+
# 提取正確的地標名稱
|
1300 |
+
if "name" in landmark_data_source:
|
1301 |
+
specific_info["landmark_name"] = landmark_data_source["name"]
|
1302 |
+
|
1303 |
+
# 提取所有可用的 prompts 作為特色模板
|
1304 |
+
if "prompts" in landmark_data_source:
|
1305 |
+
specific_info["feature_templates"] = landmark_data_source["prompts"][:5]
|
1306 |
+
specific_info["primary_template"] = landmark_data_source["prompts"][0]
|
1307 |
+
|
1308 |
+
# 提取別名info
|
1309 |
+
if "aliases" in landmark_data_source:
|
1310 |
+
specific_info["aliases"] = landmark_data_source["aliases"]
|
1311 |
+
|
1312 |
+
# 提取位置信息
|
1313 |
+
if "location" in landmark_data_source:
|
1314 |
+
specific_info["location"] = landmark_data_source["location"]
|
1315 |
+
|
1316 |
+
# 提取其他相關信息
|
1317 |
+
for key in ["year_built", "architectural_style", "significance", "description"]:
|
1318 |
+
if key in landmark_data_source:
|
1319 |
+
specific_info[key] = landmark_data_source[key]
|
1320 |
+
|
1321 |
+
# 嘗試從 LANDMARK_ACTIVITIES 中提取活動建議
|
1322 |
+
try:
|
+
from landmark_activities import LANDMARK_ACTIVITIES
|
1323 |
+
if landmark_id in LANDMARK_ACTIVITIES:
|
1324 |
+
activities = LANDMARK_ACTIVITIES[landmark_id]
|
1325 |
+
specific_info["landmark_specific_activities"] = activities
|
1326 |
+
specific_info["has_specific_activities"] = True
|
1327 |
+
print(f"Found {len(activities)} specific activities for landmark {landmark_id}")
|
1328 |
+
else:
|
1329 |
+
print(f"No specific activities found for landmark {landmark_id} in LANDMARK_ACTIVITIES")
|
1330 |
+
specific_info["has_specific_activities"] = False
|
1331 |
+
except ImportError:
|
1332 |
+
print("Warning: Could not import LANDMARK_ACTIVITIES from landmark_activities")
|
1333 |
+
specific_info["has_specific_activities"] = False
|
1334 |
+
except Exception as e:
|
1335 |
+
print(f"Error loading landmark activities for {landmark_id}: {e}")
|
1336 |
+
specific_info["has_specific_activities"] = False
|
1337 |
+
|
1338 |
+
return specific_info
|
1339 |
+
|
1340 |
+
def _analyze_viewpoint(self, image: Union[Image.Image, np.ndarray]) -> Dict[str, float]:
|
1341 |
+
"""
|
1342 |
+
Analyzes the image viewpoint to adjust detection parameters.
|
1343 |
+
|
1344 |
+
Args:
|
1345 |
+
image: Input image
|
1346 |
+
|
1347 |
+
Returns:
|
1348 |
+
Dict: Viewpoint analysis results
|
1349 |
+
"""
|
1350 |
+
viewpoint_prompts = {
|
1351 |
+
"aerial_view": "an aerial view from above looking down",
|
1352 |
+
"street_level": "a street level view looking up at a tall structure",
|
1353 |
+
"eye_level": "an eye-level horizontal view of a landmark",
|
1354 |
+
"distant": "a distant view of a landmark on the horizon",
|
1355 |
+
"close_up": "a close-up detailed view of architectural features",
|
1356 |
+
"interior": "an interior view inside a structure"
|
1357 |
+
}
|
1358 |
+
|
1359 |
+
# Calculate similarity scores
|
1360 |
+
viewpoint_scores = self.calculate_similarity_scores(image, viewpoint_prompts)
|
1361 |
+
|
1362 |
+
# Find dominant viewpoint
|
1363 |
+
dominant_viewpoint = max(viewpoint_scores.items(), key=lambda x: x[1])
|
1364 |
+
|
1365 |
+
return {
|
1366 |
+
"viewpoint_scores": viewpoint_scores,
|
1367 |
+
"dominant_viewpoint": dominant_viewpoint[0],
|
1368 |
+
"confidence": dominant_viewpoint[1]
|
1369 |
+
}
|
1370 |
+
|
1371 |
+
def calculate_similarity_scores(self, image: Union[Image.Image, np.ndarray],
|
1372 |
+
prompts: Dict[str, str]) -> Dict[str, float]:
|
1373 |
+
"""
|
1374 |
+
計算圖像與一組特定提示之間的相似度分數
|
1375 |
+
|
1376 |
+
Args:
|
1377 |
+
image: 輸入圖像
|
1378 |
+
prompts: 提示詞字典 {名稱: 提示文本}
|
1379 |
+
|
1380 |
+
Returns:
|
1381 |
+
Dict[str, float]: 每個提示的相似度分數
|
1382 |
+
"""
|
1383 |
+
# 確保圖像是PIL格式
|
1384 |
+
if not isinstance(image, Image.Image):
|
1385 |
+
if isinstance(image, np.ndarray):
|
1386 |
+
image = Image.fromarray(image)
|
1387 |
+
else:
|
1388 |
+
raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
|
1389 |
+
|
1390 |
+
# 預處理圖像
|
1391 |
+
image_input = self.preprocess(image).unsqueeze(0).to(self.device)
|
1392 |
+
|
1393 |
+
# 獲取圖像特徵
|
1394 |
+
with torch.no_grad():
|
1395 |
+
image_features = self.model.encode_image(image_input)
|
1396 |
+
image_features = image_features / image_features.norm(dim=-1, keepdim=True)
|
1397 |
+
|
1398 |
+
# 計算與每個提示的相似度
|
1399 |
+
scores = {}
|
1400 |
+
prompt_texts = list(prompts.values())
|
1401 |
+
prompt_tokens = clip.tokenize(prompt_texts).to(self.device)
|
1402 |
+
|
1403 |
+
with torch.no_grad():
|
1404 |
+
prompt_features = self.model.encode_text(prompt_tokens)
|
1405 |
+
prompt_features = prompt_features / prompt_features.norm(dim=-1, keepdim=True)
|
1406 |
+
|
1407 |
+
# calculate similarity
|
1408 |
+
similarity = (100.0 * image_features @ prompt_features.T).softmax(dim=-1)
|
1409 |
+
similarity = similarity.cpu().numpy()[0] if self.device == "cuda" else similarity.numpy()[0]
|
1410 |
+
|
1411 |
+
# 填充結果字典
|
1412 |
+
for i, (name, _) in enumerate(prompts.items()):
|
1413 |
+
scores[name] = float(similarity[i])
|
1414 |
+
|
1415 |
+
return scores
|
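A minimal usage sketch for the new classifier, assuming the OpenAI clip package, PyTorch, and the bundled landmark_data.py are importable in the Space; the image path and the empty YOLO box list are placeholders:

    from PIL import Image
    from clip_zero_shot_classifier import CLIPZeroShotClassifier

    image = Image.open("street_scene.jpg").convert("RGB")  # placeholder path

    # Loads ViT-L/14 by default and precomputes the landmark prompt features once.
    classifier = CLIPZeroShotClassifier()

    # Whole-image check with per-region follow-up analysis.
    full_result = classifier.search_entire_image(image, threshold=0.35, detailed_analysis=True)
    print(full_result.get("landmark_name"), full_result.get("confidence"))

    # Combined search; yolo_boxes would normally come from the YOLO detector as [x1, y1, x2, y2] lists.
    combined = classifier.intelligent_landmark_search(image, yolo_boxes=None, base_threshold=0.25)
    if combined["is_landmark_scene"]:
        top = combined["primary_landmark"]
        print(f"{top['landmark_name']} ({top['confidence']:.2f}) via {top['region_type']}")

As the class docstring notes, the classifier is meant to complement YOLO detection rather than replace it; passing detected boxes through yolo_boxes lets it re-score those regions as landmark candidates.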
enhance_scene_describer.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
import os
|
2 |
import re
|
3 |
import json
|
|
|
4 |
import random
|
5 |
import numpy as np
|
6 |
from typing import Dict, List, Tuple, Any, Optional
|
@@ -12,6 +13,7 @@ from lighting_conditions import LIGHTING_CONDITIONS
|
|
12 |
from viewpoint_templates import VIEWPOINT_TEMPLATES
|
13 |
from cultural_templates import CULTURAL_TEMPLATES
|
14 |
from confifence_templates import CONFIDENCE_TEMPLATES
|
|
|
15 |
|
16 |
class EnhancedSceneDescriber:
|
17 |
"""
|
@@ -21,7 +23,7 @@ class EnhancedSceneDescriber:
|
|
21 |
detection results and scene classification.
|
22 |
"""
|
23 |
|
24 |
-
def __init__(self, templates_db: Optional[Dict] = None, scene_types: Optional[Dict] = None):
|
25 |
"""
|
26 |
Initialize the enhanced scene describer.
|
27 |
|
@@ -29,6 +31,15 @@ class EnhancedSceneDescriber:
|
|
29 |
templates_db: Optional custom templates database
|
30 |
scene_types: Dictionary of scene type definitions
|
31 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
# Load or use provided scene types
|
33 |
self.scene_types = scene_types or self._load_default_scene_types()
|
34 |
|
@@ -57,7 +68,7 @@ class EnhancedSceneDescriber:
|
|
57 |
"""
|
58 |
templates = {}
|
59 |
|
60 |
-
#
|
61 |
templates["scene_detail_templates"] = SCENE_DETAIL_TEMPLATES
|
62 |
templates["object_template_fillers"] = OBJECT_TEMPLATE_FILLERS
|
63 |
templates["viewpoint_templates"] = VIEWPOINT_TEMPLATES
|
@@ -100,19 +111,19 @@ class EnhancedSceneDescriber:
|
|
100 |
"low": "This might be {description}, but the confidence is low. {details}"
|
101 |
}
|
102 |
|
103 |
-
# 場景細節模板
|
104 |
if "scene_detail_templates" not in templates:
|
105 |
templates["scene_detail_templates"] = {
|
106 |
"default": ["A space with various objects."]
|
107 |
}
|
108 |
|
109 |
-
#
|
110 |
if "object_template_fillers" not in templates:
|
111 |
templates["object_template_fillers"] = {
|
112 |
"default": ["various items"]
|
113 |
}
|
114 |
|
115 |
-
#
|
116 |
if "viewpoint_templates" not in templates:
|
117 |
# 使用簡化版的默認視角模板
|
118 |
templates["viewpoint_templates"] = {
|
@@ -147,6 +158,7 @@ class EnhancedSceneDescriber:
|
|
147 |
"unknown": "The lighting conditions are not easily determined."
|
148 |
}
|
149 |
|
|
|
150 |
def _initialize_viewpoint_parameters(self):
|
151 |
"""
|
152 |
Initialize parameters used for viewpoint detection.
|
@@ -165,232 +177,444 @@ class EnhancedSceneDescriber:
            "elevated_top_threshold": 0.3  # Few objects at top of frame
        }

-        """
-        and additional contextual information.
-        This is the main entry point that replaces the original _generate_scene_description.
-
-        Args:
-            scene_type:
-            detected_objects:
-            confidence:
-            lighting_info:
-            functional_zones:
-
-        Returns:
-            str:
-        """
-        elif confidence > 0.5:
-            confidence_level = "medium"
-        else:
-            confidence_level = "low"
-            base_description = "An aerial view showing the layout and movement patterns from above"
-        elif scene_type in self.scene_types:
-            base_description = self.scene_types[scene_type].get("description", "A scene")
-        else:
-            base_description = "A scene"
-        if people_objs:
-            people_count = len(people_objs)
-            if people_count > 5:
-                people_phrase = f"numerous people ({people_count})"
-            else:
-                people_phrase = f"{people_count} {'people' if people_count > 1 else 'person'}"
-            description = self._smart_append(description, f"The scene includes {people_phrase}")
-        if scene_details:
-            # Use smart_append to ensure proper formatting between base description and details
-            description = self._smart_append(description, scene_details)
-        lighting_description = ""
        if lighting_info and "time_of_day" in lighting_info:
            lighting_type = lighting_info["time_of_day"]
            if lighting_type in self.templates.get("lighting_templates", {}):
                lighting_description = self.templates["lighting_templates"][lighting_type]
-        if lighting_description and lighting_description not in description:
-            description = self._smart_append(description, lighting_description)
-        # Process viewpoint information
        if viewpoint != "eye_level" and viewpoint in self.templates.get("viewpoint_templates", {}):
            viewpoint_template = self.templates["viewpoint_templates"][viewpoint]

            prefix = viewpoint_template.get('prefix', '')
            if prefix and not description.startswith(prefix):
                if description and description[0].isupper():
-                    # Maintain the flow by lowercasing the first letter after the prefix
                    description = prefix + description[0].lower() + description[1:]
                else:
                    description = prefix + description

-            if viewpoint == "aerial":
-                scene_elements = "the crossing patterns and pedestrian movement"
-            else:
-                scene_elements = "objects and layout"
            viewpoint_desc = viewpoint_template.get("observation", "").format(
            )

-            # Add viewpoint observation if not already included
            if viewpoint_desc and viewpoint_desc not in description:
                description = self._smart_append(description, viewpoint_desc)

        if functional_zones and len(functional_zones) > 0:
            zones_desc = self._describe_functional_zones(functional_zones)
            if zones_desc:
                description = self._smart_append(description, zones_desc)

-        for match in matches:
-            # Extract the number from the match
-            number_match = re.search(r'\d+', match)
-            if number_match:
-                try:
-                    people_mentioned = int(number_match.group())
-                    # If the mentioned count is less than total, remove the entire sentence
-                    if people_mentioned < people_count:
-                        # Split description into sentences
-                        sentences = re.split(r'(?<=[.!?])\s+', filtered_description)
-                        # Remove sentences containing the match
-                        filtered_sentences = []
-                        for sentence in sentences:
-                            if match not in sentence:
-                                filtered_sentences.append(sentence)
-                        # Recombine the description
-                        filtered_description = " ".join(filtered_sentences)
-                except ValueError:
-                    # Failed number conversion, continue processing
-                    continue

-        if skip_block and line.strip() == "":
-            skip_block = False

-        # If the line does not need to be skipped, add it to the result
-        if not skip_block:
-            clean_description.append(line)

    def _smart_append(self, current_text: str, new_fragment: str) -> str:
        """
@@ -424,13 +648,17 @@ class EnhancedSceneDescriber:
            (new_fragment.startswith("A ") or new_fragment.startswith("An ")):
            return current_text + ". " + new_fragment

        # Decide how to join the texts
        if ends_with_sentence:
            # After a sentence, start with uppercase and add proper spacing
            joined_text = current_text + " " + (new_fragment[0].upper() + new_fragment[1:])
        elif ends_with_comma:
            # After a comma, maintain flow with lowercase unless it's a proper noun or special case
-            if new_fragment.startswith(('I ', 'I\'', 'A ', 'An ', 'The ')) or new_fragment[0].isupper():
                joined_text = current_text + " " + new_fragment
            else:
                joined_text = current_text + " " + new_fragment[0].lower() + new_fragment[1:]
@@ -440,7 +668,7 @@ class EnhancedSceneDescriber:
        else:
            # For other cases, decide based on the content
            if self._is_related_phrases(current_text, new_fragment):
-                if new_fragment.startswith(('I ', 'I\'', 'A ', 'An ', 'The ')) or new_fragment[0].isupper():
                    joined_text = current_text + ", " + new_fragment
                else:
                    joined_text = current_text + ", " + new_fragment[0].lower() + new_fragment[1:]
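The _smart_append hunks above adjust how a new fragment is cased and joined depending on the punctuation that ends the current text. As a rough illustration of that joining rule, a simplified standalone sketch (not the class's exact logic):

def smart_append(current: str, fragment: str) -> str:
    # Join two description fragments with punctuation-aware casing (simplified sketch)
    if not current:
        return fragment[0].upper() + fragment[1:] if fragment else fragment
    if not fragment:
        return current
    if current.rstrip().endswith((".", "!", "?")):
        # After a full sentence, start the fragment with a capital letter
        return current + " " + fragment[0].upper() + fragment[1:]
    if current.rstrip().endswith(","):
        # After a comma, keep the flow lowercase unless the fragment starts a proper noun
        keep_case = fragment.startswith(("I ", "A ", "An ", "The ")) or fragment[0].isupper()
        return current + " " + (fragment if keep_case else fragment[0].lower() + fragment[1:])
    # Otherwise link the two phrases with a comma
    return current + ", " + fragment[0].lower() + fragment[1:]

print(smart_append("A city street at dusk.", "several pedestrians are crossing"))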
@@ -489,88 +717,78 @@ class EnhancedSceneDescriber:

        return False

    def _format_final_description(self, text: str) -> str:
        """
        Format the final description text to ensure correct punctuation,
        capitalization, and spacing.
-
-        Args:
-            text: The text to format
-
-        Returns:
-            str: The properly formatted text
        """
-        if not text:
            return ""

-        text = re.sub(r'(An\s[^.!?]+?)\s+(An?\s)', r'\1. \2', text, flags=re.IGNORECASE)
-        text = re.sub(r'([a-zA-Z])with', r'\1 with', text)  # add a space between a word and "with"
-        text = re.sub(r'plants(and|with|or)', r'plants \1', text)  # fix run-ons such as "plantsand"

        def fix_capitalization_after_comma(match):
-        text = re.sub(r'(?<=[.!?]\s)' + phrase, replacement, text)
-        # Adjust the term mid-sentence while keeping sentence-initial capitalization
-        text = re.sub(r'(?<=,\s)' + phrase, replacement, text)
-        text = re.sub(r',{2,}', ',', text)  # collapse multiple commas into one
-        if text and not text[-1] in '.!?':
-            text += '.'
-        return text

    def _is_intersection(self, detected_objects: List[Dict]) -> bool:
        """
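The removed formatter relied on a series of regex passes, and the rewritten version later in this diff does the same kind of cleanup. A compact sketch of that normalization style (the rules are simplified for illustration and are not the module's exact patterns):

import re

def tidy_description(text: str) -> str:
    # Normalize spacing, punctuation, and capitalization of a generated sentence
    if not text or not text.strip():
        return ""
    text = re.sub(r"\s{2,}", " ", text.strip())            # collapse repeated whitespace
    text = re.sub(r"\s*([.,;:!?])\s*", r"\1 ", text).strip()  # one space after punctuation, none before
    text = re.sub(r"[.!?]{2,}", ".", text)                  # collapse doubled sentence enders
    text = re.sub(r",{2,}", ",", text)                      # collapse doubled commas
    text = re.sub(r"([.!?]\s+)([a-z])",                     # capitalize after a sentence end
                  lambda m: m.group(1) + m.group(2).upper(), text)
    if text and text[-1] not in ".!?":
        text += "."
    return text[0].upper() + text[1:]

print(tidy_description("a quiet street ,with several cars  parked nearby"))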
@@ -652,65 +870,585 @@ class EnhancedSceneDescriber:

        return base_desc

    def _generate_scene_details(self,
        """
        Generate detailed description based on scene type and detected objects.

        Args:
-            scene_type: Identified scene type
-            detected_objects: List of detected objects
-            lighting_info: Optional lighting condition information
-            viewpoint: Detected viewpoint (aerial, eye_level, etc.)

        Returns:
-            str: Detailed scene description
        """
-        # Get scene-specific templates
        scene_details = ""
        scene_templates = self.templates.get("scene_detail_templates", {})

        else:
-        # Select a random template from the list
        if templates_list:
            detail_template = random.choice(templates_list)
-            # Fill the template with object information
            scene_details = self._fill_detail_template(
                detail_template,
                detected_objects,
-                scene_type
-            # Use default templates if specific ones aren't available
-            if "default" in scene_templates:
-                detail_template = random.choice(scene_templates["default"])
-                scene_details = self._fill_detail_template(
-                    detail_template,
-                    detected_objects,
-                    "default"
            )
        else:

-        return scene_details

-    def _fill_detail_template(self, template: str, detected_objects: List[Dict], scene_type: str) -> str:
        """
        Fill a template with specific details based on detected objects.

@@ -731,6 +1469,41 @@ class EnhancedSceneDescriber:
        # Get object template fillers
        fillers = self.templates.get("object_template_fillers", {})

        # Set default values for every variable that can appear in a template
        default_replacements = {
            # Indoor-related
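The filler logic around this hunk substitutes placeholders such as {furniture} with phrases drawn from filler lists, falling back to defaults when a key has no fillers. A small sketch of that mechanism (the placeholder names and filler values below are illustrative, not the module's full tables):

import random
import re

def fill_template(template: str, fillers: dict, defaults: dict) -> str:
    # Replace each {placeholder} with a random filler, or a default if none is defined
    def replace(match: re.Match) -> str:
        key = match.group(1)
        options = fillers.get(key)
        if options:
            return random.choice(options)
        return defaults.get(key, "various items")
    return re.sub(r"\{(\w+)\}", replace, template)

template = "A room with {furniture} and {electronics}."
fillers = {"furniture": ["a sofa and a coffee table", "two armchairs"]}
defaults = {"electronics": "a television"}
print(fill_template(template, fillers, defaults))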
@@ -910,6 +1683,36 @@ class EnhancedSceneDescriber:
            "knowledge_transfer": "learning exchanges"
        }

        # For each placeholder, try to fill with appropriate content
        for placeholder in placeholders:
            if placeholder in fillers:
@@ -1137,7 +1940,7 @@ class EnhancedSceneDescriber:
        if not detected_objects:
            return "eye_level"  # default

        top_region_count = 0
        bottom_region_count = 0
        total_objects = len(detected_objects)
@@ -1153,29 +1956,29 @@ class EnhancedSceneDescriber:
        crosswalk_pattern_detected = False

        for obj in detected_objects:
            region = obj["region"]
            if "top" in region:
                top_region_count += 1
            elif "bottom" in region:
                bottom_region_count += 1

            if "normalized_area" in obj:
                sizes.append(obj["normalized_area"])

            if "normalized_size" in obj:
                width, height = obj["normalized_size"]
                if width > 0:
                    height_width_ratios.append(height / width)

            if obj["class_id"] == 0:  # person
                if "normalized_center" in obj:
                    people_positions.append(obj["normalized_center"])

        # Check for a clear vertical and horizontal distribution of pedestrians
        people_objs = [obj for obj in detected_objects if obj["class_id"] == 0]  # person
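These counts feed a simple viewpoint heuristic: when most detections sit toward the bottom of the frame and little is at the top, the camera is likely elevated or aerial. A rough sketch of that kind of rule (the thresholds below are illustrative, not the module's tuned values):

from typing import Dict, List

def estimate_viewpoint(objects: List[Dict]) -> str:
    # Classify the camera viewpoint from where detections fall in the frame
    if not objects:
        return "eye_level"
    top = sum(1 for o in objects if "top" in o.get("region", ""))
    bottom = sum(1 for o in objects if "bottom" in o.get("region", ""))
    total = len(objects)
    if bottom / total > 0.65 and top / total < 0.2:
        return "elevated"
    if top / total > 0.65 and bottom / total < 0.2:
        return "low_angle"
    return "eye_level"

sample = [{"region": "bottom_left"}, {"region": "bottom_center"}, {"region": "middle_right"}]
print(estimate_viewpoint(sample))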
@@ -1194,7 +1997,7 @@ class EnhancedSceneDescriber:
            y_range = max(y_coords) - min(y_coords)

            # Try to detect a cross-shaped distribution
            if x_range > 0.5 and y_range > 0.5 and 0.7 < (x_range / y_range) < 1.3:

                # Compute distances to the center point
@@ -1391,7 +2194,6 @@ class EnhancedSceneDescriber:
        description = description.replace("a bed in the room", "a bed")

        # Handle repeated object lists
-        # Look for patterns of the form "item, item, item"
        object_lists = re.findall(r'with ([^\.]+?)(?:\.|\band\b)', description)

        for obj_list in object_lists:
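The pass above scans "with X, Y, Z" style lists in the generated sentence and removes duplicated items. A tiny sketch of that dedup step (regex and phrasing simplified for illustration):

import re

def dedupe_object_list(description: str) -> str:
    # Collapse repeated items inside "with a, b, a" style lists
    def clean(match: re.Match) -> str:
        items = [i.strip() for i in match.group(1).split(",")]
        seen, unique = set(), []
        for item in items:
            if item and item not in seen:
                seen.add(item)
                unique.append(item)
        return "with " + ", ".join(unique)
    return re.sub(r"with ([^\.]+)", clean, description)

print(dedupe_object_list("A room with a chair, a lamp, a chair."))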
@@ -1441,6 +2243,20 @@ class EnhancedSceneDescriber:
        if not functional_zones:
            return ""

        # Count the total number of people in the scene
        total_people_count = 0
        people_by_zone = {}
@@ -1480,12 +2296,12 @@ class EnhancedSceneDescriber:

        # Generate the summary description
        summary = ""
-        max_mentioned_people = 0

        # If the total people count is significant and not yet mentioned in the main description, add it
        if total_people_count > 5:
            summary = f"The scene contains a significant number of pedestrians ({total_people_count} people). "
-            max_mentioned_people = total_people_count

        # Process each zone's description, keeping people-count information consistent
        processed_zones = []
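The zone summary keeps the people counts it reports consistent with the scene-wide total. A minimal sketch of that aggregation step (the zone structure and field names are assumptions for illustration):

from typing import Dict, List

def summarize_people_by_zone(zones: Dict[str, Dict]) -> str:
    # Count people per zone, then report a total plus the busiest zone
    people_by_zone: Dict[str, int] = {}
    for name, info in zones.items():
        objects: List[str] = info.get("objects", [])
        people_by_zone[name] = sum(1 for o in objects if o == "person")
    total = sum(people_by_zone.values())
    if total == 0:
        return "No people are visible in the identified zones."
    busiest = max(people_by_zone, key=people_by_zone.get)
    return (f"The scene contains {total} people overall, "
            f"with most of them in the {busiest.replace('_', ' ')}.")

zones = {"crossing_area": {"objects": ["person", "person", "car"]},
         "waiting_area": {"objects": ["person", "bench"]}}
print(summarize_people_by_zone(zones))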
@@ -1494,7 +2310,7 @@ class EnhancedSceneDescriber:
            zone_desc = zone_info.get("description", "a functional zone")
            zone_people_count = people_by_zone.get(zone_name, 0)

            contains_people_info = "with" in zone_desc and ("person" in zone_desc.lower() or "people" in zone_desc.lower())

            # If the description contains people-count information and the count is smaller than the largest count already mentioned, adjust the description

 import os
 import re
 import json
+import logging
 import random
 import numpy as np
 from typing import Dict, List, Tuple, Any, Optional

 from viewpoint_templates import VIEWPOINT_TEMPLATES
 from cultural_templates import CULTURAL_TEMPLATES
 from confifence_templates import CONFIDENCE_TEMPLATES
+from landmark_data import ALL_LANDMARKS

 class EnhancedSceneDescriber:
     """

     detection results and scene classification.
     """

+    def __init__(self, templates_db: Optional[Dict] = None, scene_types: Optional[Dict] = None, spatial_analyzer_instance: Optional[Any] = None):
         """
         Initialize the enhanced scene describer.

             templates_db: Optional custom templates database
             scene_types: Dictionary of scene type definitions
         """
+        self.logger = logging.getLogger(self.__class__.__name__)  # Use class name for logger
+        self.logger.setLevel(logging.INFO)  # Or your desired logging level
+        # Optional: Add a handler if not configured globally
+        if not self.logger.hasHandlers():
+            handler = logging.StreamHandler()
+            formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+            handler.setFormatter(formatter)
+            self.logger.addHandler(handler)
+
         # Load or use provided scene types
         self.scene_types = scene_types or self._load_default_scene_types()

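For orientation, a hypothetical way this class might be driven from calling code; the constructor arguments and detection fields below are assumptions pieced together from the signatures visible in this diff, not a documented API:

# Hypothetical usage sketch of EnhancedSceneDescriber (field names assumed from this diff)
from enhance_scene_describer import EnhancedSceneDescriber

describer = EnhancedSceneDescriber()

detected_objects = [
    {"class_id": 0, "class_name": "person", "confidence": 0.91,
     "region": "bottom_center", "normalized_area": 0.04},
    {"class_id": 2, "class_name": "car", "confidence": 0.88,
     "region": "middle_left", "normalized_area": 0.12},
]

description = describer.generate_description(
    scene_type="generic_street_view",
    detected_objects=detected_objects,
    confidence=0.72,
    lighting_info={"time_of_day": "day_clear", "is_indoor": False},
    functional_zones={},
    enable_landmark=False,
)
print(description)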
69 |
templates = {}
|
70 |
|
71 |
+
# 載入事先準備的模板
|
72 |
templates["scene_detail_templates"] = SCENE_DETAIL_TEMPLATES
|
73 |
templates["object_template_fillers"] = OBJECT_TEMPLATE_FILLERS
|
74 |
templates["viewpoint_templates"] = VIEWPOINT_TEMPLATES
|
|
|
111 |
"low": "This might be {description}, but the confidence is low. {details}"
|
112 |
}
|
113 |
|
114 |
+
# 場景細節模板
|
115 |
if "scene_detail_templates" not in templates:
|
116 |
templates["scene_detail_templates"] = {
|
117 |
"default": ["A space with various objects."]
|
118 |
}
|
119 |
|
120 |
+
# 物體填充模板,用於生成物體描述
|
121 |
if "object_template_fillers" not in templates:
|
122 |
templates["object_template_fillers"] = {
|
123 |
"default": ["various items"]
|
124 |
}
|
125 |
|
126 |
+
# 視角模板,雖然現在從專門模組導入,但可作為備份
|
127 |
if "viewpoint_templates" not in templates:
|
128 |
# 使用簡化版的默認視角模板
|
129 |
templates["viewpoint_templates"] = {
|
|
|
158 |
"unknown": "The lighting conditions are not easily determined."
|
159 |
}
|
160 |
|
161 |
+
|
162 |
def _initialize_viewpoint_parameters(self):
|
163 |
"""
|
164 |
Initialize parameters used for viewpoint detection.
|
|
|
177 |
"elevated_top_threshold": 0.3 # Few objects at top of frame
|
178 |
}
|
179 |
|
180 |
+
def _generate_landmark_description(self,
|
181 |
+
scene_type: str,
|
182 |
+
detected_objects: List[Dict],
|
183 |
+
confidence: float,
|
184 |
+
lighting_info: Optional[Dict] = None,
|
185 |
+
functional_zones: Optional[Dict] = None,
|
186 |
+
landmark_objects: Optional[List[Dict]] = None) -> str:
|
187 |
"""
|
188 |
+
生成包含地標信息的場景描述
|
|
|
|
|
|
|
189 |
|
190 |
Args:
|
191 |
+
scene_type: 識別的場景類型
|
192 |
+
detected_objects: 檢測到的物體列表
|
193 |
+
confidence: 場景分類置信度
|
194 |
+
lighting_info: 照明條件信息(可選)
|
195 |
+
functional_zones: 功能區域信息(可選)
|
196 |
+
landmark_objects: 識別為地標的物體列表(可選)
|
197 |
|
198 |
Returns:
|
199 |
+
str: 包含地標信息的自然語言場景描述
|
200 |
"""
|
201 |
+
# 如果沒有提供地標物體,則從檢測物體中篩選
|
202 |
+
if landmark_objects is None:
|
203 |
+
landmark_objects = [obj for obj in detected_objects if obj.get("is_landmark", False)]
|
204 |
+
|
205 |
+
# 如果沒有地標,退回到標準描述
|
206 |
+
if not landmark_objects:
|
207 |
+
if scene_type in ["tourist_landmark", "natural_landmark", "historical_monument"]:
|
208 |
+
# 場景類型是地標但沒有具體地標物體
|
209 |
+
base_description = "A scenic area that appears to be a tourist destination, though specific landmarks are not clearly identifiable."
|
210 |
+
else:
|
211 |
+
# 使用標準方法生成基本描述
|
212 |
+
return self._format_final_description(self._generate_scene_details(
|
213 |
+
scene_type,
|
214 |
+
detected_objects,
|
215 |
+
lighting_info,
|
216 |
+
self._detect_viewpoint(detected_objects)
|
217 |
+
))
|
218 |
+
else:
|
219 |
+
# 獲取主要地標(信心度最高的)
|
220 |
+
primary_landmark = max(landmark_objects, key=lambda x: x.get("confidence", 0))
|
221 |
+
landmark_name = primary_landmark.get("class_name", "landmark")
|
222 |
+
landmark_location = primary_landmark.get("location", "")
|
223 |
+
|
224 |
+
# 根據地標類型選擇適當的描述模板
|
225 |
+
if scene_type == "natural_landmark" or primary_landmark.get("landmark_type") == "natural":
|
226 |
+
base_description = f"A natural landmark scene featuring {landmark_name} in {landmark_location}."
|
227 |
+
elif scene_type == "historical_monument" or primary_landmark.get("landmark_type") == "monument":
|
228 |
+
base_description = f"A historical monument scene showcasing {landmark_name}, a significant landmark in {landmark_location}."
|
229 |
else:
|
230 |
+
base_description = f"A tourist landmark scene centered around {landmark_name}, an iconic structure in {landmark_location}."
|
231 |
|
232 |
+
# 加地標的額外信息
|
233 |
+
landmark_details = []
|
234 |
+
for landmark in landmark_objects:
|
235 |
+
details = []
|
236 |
|
237 |
+
# 加建造年份
|
238 |
+
if "year_built" in landmark:
|
239 |
+
details.append(f"built in {landmark['year_built']}")
|
|
|
|
|
|
|
|
|
240 |
|
241 |
+
# 加建築風格
|
242 |
+
if "architectural_style" in landmark:
|
243 |
+
details.append(f"featuring {landmark['architectural_style']} architectural style")
|
|
|
|
|
|
|
|
|
|
|
244 |
|
245 |
+
# 加重要性
|
246 |
+
if "significance" in landmark:
|
247 |
+
details.append(landmark["significance"])
|
|
|
|
|
|
|
|
|
248 |
|
249 |
+
# 如果有詳細信息,加到描述中
|
250 |
+
if details:
|
251 |
+
landmark_details.append(f"{landmark['class_name']} ({', '.join(details)})")
|
252 |
|
253 |
+
# 將詳細信息添加到基本描述中
|
254 |
+
if landmark_details:
|
255 |
+
description = base_description + " " + "The scene features " + ", ".join(landmark_details) + "."
|
256 |
+
else:
|
257 |
+
description = base_description
|
258 |
|
259 |
+
# 獲取視角
|
260 |
+
viewpoint = self._detect_viewpoint(detected_objects)
|
|
|
|
|
|
|
|
|
|
|
|
|
261 |
|
262 |
+
# 生成人員活動描述
|
263 |
+
people_count = len([obj for obj in detected_objects if obj["class_id"] == 0]) # 人的類別ID通常為0
|
|
|
264 |
|
265 |
+
if people_count > 0:
|
266 |
+
if people_count == 1:
|
267 |
+
people_description = "There is one person in the scene, likely a tourist or visitor."
|
268 |
+
elif people_count < 5:
|
269 |
+
people_description = f"There are {people_count} people in the scene, possibly tourists visiting the landmark."
|
270 |
+
else:
|
271 |
+
people_description = f"The scene includes a group of {people_count} people, indicating this is a popular tourist destination."
|
272 |
|
273 |
+
description = self._smart_append(description, people_description)
|
|
|
|
|
|
|
274 |
|
275 |
+
# 添加照明信息
|
|
|
276 |
if lighting_info and "time_of_day" in lighting_info:
|
277 |
lighting_type = lighting_info["time_of_day"]
|
278 |
if lighting_type in self.templates.get("lighting_templates", {}):
|
279 |
lighting_description = self.templates["lighting_templates"][lighting_type]
|
280 |
+
description = self._smart_append(description, lighting_description)
|
281 |
|
282 |
+
# 添加視角描述
|
|
|
|
|
|
|
|
|
283 |
if viewpoint != "eye_level" and viewpoint in self.templates.get("viewpoint_templates", {}):
|
284 |
viewpoint_template = self.templates["viewpoint_templates"][viewpoint]
|
285 |
|
286 |
+
# 添加視角前綴
|
287 |
prefix = viewpoint_template.get('prefix', '')
|
288 |
if prefix and not description.startswith(prefix):
|
289 |
+
# 保持句子流暢性
|
290 |
if description and description[0].isupper():
|
|
|
291 |
description = prefix + description[0].lower() + description[1:]
|
292 |
else:
|
293 |
description = prefix + description
|
294 |
|
295 |
+
# 添加視角觀察描述
|
|
|
|
|
|
|
|
|
|
|
296 |
viewpoint_desc = viewpoint_template.get("observation", "").format(
|
297 |
+
scene_elements="the landmark and surrounding area"
|
298 |
)
|
299 |
|
|
|
300 |
if viewpoint_desc and viewpoint_desc not in description:
|
301 |
description = self._smart_append(description, viewpoint_desc)
|
302 |
|
303 |
+
# 添加功能區域描述
|
304 |
if functional_zones and len(functional_zones) > 0:
|
305 |
zones_desc = self._describe_functional_zones(functional_zones)
|
306 |
if zones_desc:
|
307 |
description = self._smart_append(description, zones_desc)
|
308 |
|
309 |
+
# 描述可能的活動
|
310 |
+
landmark_activities = []
|
311 |
+
|
312 |
+
# 根據地標類型生成通用活動
|
313 |
+
if scene_type == "natural_landmark" or any(obj.get("landmark_type") == "natural" for obj in landmark_objects):
|
314 |
+
landmark_activities = [
|
315 |
+
"nature photography",
|
316 |
+
"scenic viewing",
|
317 |
+
"hiking or walking",
|
318 |
+
"guided nature tours",
|
319 |
+
"outdoor appreciation"
|
320 |
+
]
|
321 |
+
elif scene_type == "historical_monument" or any(obj.get("landmark_type") == "monument" for obj in landmark_objects):
|
322 |
+
landmark_activities = [
|
323 |
+
"historical sightseeing",
|
324 |
+
"educational tours",
|
325 |
+
"cultural appreciation",
|
326 |
+
"photography of historical architecture",
|
327 |
+
"learning about historical significance"
|
328 |
+
]
|
329 |
+
else:
|
330 |
+
landmark_activities = [
|
331 |
+
"sightseeing",
|
332 |
+
"taking photographs",
|
333 |
+
"guided tours",
|
334 |
+
"cultural tourism",
|
335 |
+
"souvenir shopping"
|
336 |
]
|
337 |
|
338 |
+
# 添加活動描述
|
339 |
+
if landmark_activities:
|
340 |
+
activities_text = "Common activities at this location include " + ", ".join(landmark_activities[:3]) + "."
|
341 |
+
description = self._smart_append(description, activities_text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
342 |
|
343 |
+
# 最後格式化描述
|
344 |
+
return self._format_final_description(description)
|
345 |
|
346 |
+
def filter_landmark_references(self, text, enable_landmark=True):
|
347 |
+
"""
|
348 |
+
動態過濾文本中的地標引用
|
349 |
|
350 |
+
Args:
|
351 |
+
text: 需要過濾的文本
|
352 |
+
enable_landmark: 是否啟用地標功能
|
353 |
|
354 |
+
Returns:
|
355 |
+
str: 過濾後的文本
|
356 |
+
"""
|
357 |
+
if enable_landmark or not text:
|
358 |
+
return text
|
359 |
+
|
360 |
+
try:
|
361 |
+
# 動態收集所有地標名稱和位置
|
362 |
+
landmark_names = []
|
363 |
+
locations = []
|
364 |
+
|
365 |
+
for landmark_id, info in ALL_LANDMARKS.items():
|
366 |
+
# 收集地標名稱及其別名
|
367 |
+
landmark_names.append(info["name"])
|
368 |
+
landmark_names.extend(info.get("aliases", []))
|
369 |
+
|
370 |
+
# 收集地理位置
|
371 |
+
if "location" in info:
|
372 |
+
location = info["location"]
|
373 |
+
locations.append(location)
|
374 |
+
|
375 |
+
# 處理分離的城市和國家名稱
|
376 |
+
parts = location.split(",")
|
377 |
+
if len(parts) >= 1:
|
378 |
+
locations.append(parts[0].strip())
|
379 |
+
if len(parts) >= 2:
|
380 |
+
locations.append(parts[1].strip())
|
381 |
+
|
382 |
+
# 使用正則表達式動態替換所有地標名稱
|
383 |
+
import re
|
384 |
+
for name in landmark_names:
|
385 |
+
if name and len(name) > 2: # 避免過短的名稱
|
386 |
+
text = re.sub(r'\b' + re.escape(name) + r'\b', "tall structure", text, flags=re.IGNORECASE)
|
387 |
+
|
388 |
+
# 動態替換所有位置引用
|
389 |
+
for location in locations:
|
390 |
+
if location and len(location) > 2:
|
391 |
+
# 替換常見位置表述模式
|
392 |
+
text = re.sub(r'in ' + re.escape(location), "in the urban area", text, flags=re.IGNORECASE)
|
393 |
+
text = re.sub(r'of ' + re.escape(location), "of the urban area", text, flags=re.IGNORECASE)
|
394 |
+
text = re.sub(r'\b' + re.escape(location) + r'\b', "the urban area", text, flags=re.IGNORECASE)
|
395 |
+
|
396 |
+
except ImportError:
|
397 |
+
# 如果無法導入,使用基本模式
|
398 |
+
pass
|
399 |
+
|
400 |
+
# 通用地標描述模式替換
|
401 |
+
landmark_patterns = [
|
402 |
+
(r'a (tourist|popular|famous) landmark', r'an urban structure'),
|
403 |
+
(r'an iconic structure in ([A-Z][a-zA-Z\s,]+)', r'an urban structure in the area'),
|
404 |
+
(r'a famous (monument|tower|landmark) in ([A-Z][a-zA-Z\s,]+)', r'an urban structure in the area'),
|
405 |
+
(r'(centered|built|located|positioned) around the ([A-Z][a-zA-Z\s]+? (Tower|Monument|Landmark))', r'located in this area'),
|
406 |
+
(r'(sightseeing|guided tours|cultural tourism) (at|around|near) (this landmark|the [A-Z][a-zA-Z\s]+)', r'\1 in this area'),
|
407 |
+
(r'this (famous|iconic|historic|well-known) (landmark|monument|tower|structure)', r'this urban structure'),
|
408 |
+
(r'([A-Z][a-zA-Z\s]+) Tower', r'tall structure'),
|
409 |
+
(r'a (tower|structure) in ([A-Z][a-zA-Z\s,]+)', r'a \1 in the area'),
|
410 |
+
(r'landmark scene', r'urban scene'),
|
411 |
+
(r'tourist destination', r'urban area'),
|
412 |
+
(r'tourist attraction', r'urban area')
|
413 |
+
]
|
414 |
+
|
415 |
+
for pattern, replacement in landmark_patterns:
|
416 |
+
text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
|
417 |
|
418 |
+
return text
|
|
|
|
|
419 |
|
|
|
|
|
|
|
420 |
|
421 |
+
def generate_description(self, scene_type: str, detected_objects: List[Dict], confidence: float,
|
422 |
+
lighting_info: Dict, functional_zones: List[str], enable_landmark: bool = True,
|
423 |
+
scene_scores: Optional[Dict] = None, spatial_analysis: Optional[Dict] = None,
|
424 |
+
image_dimensions: Optional[Dict] = None, places365_info: Optional[Dict] = None,
|
425 |
+
object_statistics: Optional[Dict] = None) -> str:
|
426 |
+
"""
|
427 |
+
Generate enhanced scene description based on detection results, scene type,
|
428 |
+
and additional contextual information.
|
429 |
+
This version ensures that the main scene_details (from the first call)
|
430 |
+
is properly integrated and not overwritten by a simplified second call.
|
431 |
+
"""
|
432 |
+
# Handle unknown scene type or very low confidence as an early exit
|
433 |
+
if scene_type == "unknown" or confidence < 0.4:
|
434 |
+
# _generate_generic_description should also ideally use image_dimensions if it does spatial reasoning
|
435 |
+
generic_desc = self._generate_generic_description(detected_objects, lighting_info)
|
436 |
+
return self._format_final_description(generic_desc)
|
437 |
+
|
438 |
+
# Filter out landmark objects if landmark detection is disabled for this run
|
439 |
+
current_detected_objects = detected_objects
|
440 |
+
if not enable_landmark:
|
441 |
+
current_detected_objects = [obj for obj in detected_objects if not obj.get("is_landmark", False)]
|
442 |
+
|
443 |
+
# Log Places365 context if available
|
444 |
+
places365_context = ""
|
445 |
+
if places365_info and places365_info.get('confidence', 0) > 0.3:
|
446 |
+
scene_label = places365_info.get('scene_label', '')
|
447 |
+
attributes = places365_info.get('attributes', [])
|
448 |
+
is_indoor = places365_info.get('is_indoor', None)
|
449 |
+
|
450 |
+
if scene_label:
|
451 |
+
places365_context = f"Scene context: {scene_label}"
|
452 |
+
if attributes:
|
453 |
+
places365_context += f" with characteristics: {', '.join(attributes[:3])}"
|
454 |
+
if is_indoor is not None:
|
455 |
+
indoor_outdoor = "indoor" if is_indoor else "outdoor"
|
456 |
+
places365_context += f" ({indoor_outdoor} environment)"
|
457 |
+
|
458 |
+
print(f"Enhanced description incorporating Places365 context: {places365_context}")
|
459 |
+
|
460 |
+
landmark_objects_in_scene = [obj for obj in current_detected_objects if obj.get("is_landmark", False)]
|
461 |
+
has_landmark_in_scene = len(landmark_objects_in_scene) > 0
|
462 |
+
|
463 |
+
# If landmark processing is enabled and it's a landmark scene or landmarks are detected
|
464 |
+
if enable_landmark and (scene_type in ["tourist_landmark", "natural_landmark", "historical_monument"] or has_landmark_in_scene):
|
465 |
+
landmark_desc = self._generate_landmark_description(
|
466 |
+
scene_type,
|
467 |
+
current_detected_objects, # Pass potentially filtered list
|
468 |
+
confidence,
|
469 |
+
lighting_info,
|
470 |
+
functional_zones,
|
471 |
+
landmark_objects_in_scene # Pass the explicitly filtered landmark objects
|
472 |
+
)
|
473 |
+
return self._format_final_description(landmark_desc)
|
474 |
+
|
475 |
+
# **[Start of main description construction for non-landmark or landmark-disabled everyday scenes]**
|
476 |
+
|
477 |
+
# Detect viewpoint based on current (potentially filtered) objects
|
478 |
+
viewpoint = self._detect_viewpoint(current_detected_objects)
|
479 |
+
current_scene_type = scene_type # Use a mutable variable for scene_type if it can change
|
480 |
+
|
481 |
+
# Process aerial viewpoint scene types (may re-assign current_scene_type)
|
482 |
+
if viewpoint == "aerial":
|
483 |
+
if "intersection" in current_scene_type.lower() or self._is_intersection(current_detected_objects): # Use lower for robustness
|
484 |
+
current_scene_type = "aerial_view_intersection"
|
485 |
+
elif any(keyword in current_scene_type.lower() for keyword in ["commercial", "shopping", "retail"]):
|
486 |
+
current_scene_type = "aerial_view_commercial_area"
|
487 |
+
elif any(keyword in current_scene_type.lower() for keyword in ["plaza", "square"]):
|
488 |
+
current_scene_type = "aerial_view_plaza"
|
489 |
+
else: # Default aerial if specific not matched
|
490 |
+
current_scene_type = "aerial_view_general" # Or use a specific default like aerial_view_intersection
|
491 |
+
|
492 |
+
# Detect cultural context (only for non-aerial viewpoints)
|
493 |
+
cultural_context = None
|
494 |
+
if viewpoint != "aerial":
|
495 |
+
cultural_context = self._detect_cultural_context(current_scene_type, current_detected_objects)
|
496 |
+
|
497 |
+
# Get base description for the (potentially updated) scene type
|
498 |
+
base_description = "A scene" # Default initialization
|
499 |
+
if viewpoint == "aerial":
|
500 |
+
# Check if current_scene_type (which might be an aerial type) has a base description
|
501 |
+
if current_scene_type in self.scene_types:
|
502 |
+
base_description = self.scene_types[current_scene_type].get("description", "An aerial view showing the layout and movement patterns from above")
|
503 |
+
else:
|
504 |
+
base_description = "An aerial view showing the layout and movement patterns from above"
|
505 |
+
elif current_scene_type in self.scene_types:
|
506 |
+
base_description = self.scene_types[current_scene_type].get("description", "A scene")
|
507 |
+
|
508 |
+
# spatial analysis, and image dimensions. This is where dynamic description or template filling happens.
|
509 |
+
core_scene_details = self._generate_scene_details(
|
510 |
+
current_scene_type, # Use the potentially updated scene_type
|
511 |
+
current_detected_objects,
|
512 |
+
lighting_info,
|
513 |
+
viewpoint,
|
514 |
+
spatial_analysis=spatial_analysis, # Pass this through
|
515 |
+
image_dimensions=image_dimensions, # Pass this through
|
516 |
+
places365_info=places365_info, # Pass Places365 info
|
517 |
+
object_statistics=object_statistics # Pass object statistics
|
518 |
+
)
|
519 |
+
|
520 |
+
# Start with the base description derived from SCENE_TYPES or a default.
|
521 |
+
description = base_description
|
522 |
+
if core_scene_details and core_scene_details.strip() != "": # Ensure core_scene_details is not empty
|
523 |
+
# If base_description is generic like "A scene", consider replacing it or appending smartly.
|
524 |
+
if base_description.lower() == "a scene" and len(core_scene_details) > len(base_description):
|
525 |
+
description = core_scene_details # Prioritize dynamic/template-filled details if base is too generic
|
526 |
+
else:
|
527 |
+
description = self._smart_append(description, core_scene_details)
|
528 |
+
elif not core_scene_details and not description: # If both are empty, use a generic fallback
|
529 |
+
description = self._generate_generic_description(current_detected_objects, lighting_info)
|
530 |
+
|
531 |
+
|
532 |
+
# Append secondary description from scene type template, if any
|
533 |
+
if current_scene_type in self.scene_types and "secondary_description" in self.scene_types[current_scene_type]:
|
534 |
+
secondary_desc = self.scene_types[current_scene_type]["secondary_description"]
|
535 |
+
if secondary_desc:
|
536 |
+
description = self._smart_append(description, secondary_desc)
|
537 |
+
|
538 |
+
# Append people count information
|
539 |
+
people_objs = [obj for obj in current_detected_objects if obj.get("class_id") == 0]
|
540 |
+
if people_objs:
|
541 |
+
people_count = len(people_objs)
|
542 |
+
|
543 |
+
if people_count == 1: people_phrase = "a single person"
|
544 |
+
elif people_count > 1 and people_count <= 3: people_phrase = f"{people_count} people" # Accurate for small counts
|
545 |
+
elif people_count > 3 and people_count <=7: people_phrase = "several people"
|
546 |
+
else: people_phrase = "multiple people" # For larger counts, or use "numerous"
|
547 |
+
|
548 |
+
# Only add if not already well covered in core_scene_details or base_description
|
549 |
+
if "person" not in description.lower() and "people" not in description.lower() and "pedestrian" not in description.lower():
|
550 |
+
description = self._smart_append(description, f"The scene includes {people_phrase}.")
|
551 |
+
|
552 |
+
# Append cultural context
|
553 |
+
if cultural_context and viewpoint != "aerial": # Already checked viewpoint
|
554 |
+
cultural_elements = self._generate_cultural_elements(cultural_context)
|
555 |
+
if cultural_elements:
|
556 |
+
description = self._smart_append(description, cultural_elements)
|
557 |
+
|
558 |
+
# Append lighting information
|
559 |
+
lighting_description_text = ""
|
560 |
+
if lighting_info and "time_of_day" in lighting_info:
|
561 |
+
lighting_type = lighting_info["time_of_day"]
|
562 |
+
lighting_desc_template = self.templates.get("lighting_templates", {}).get(lighting_type)
|
563 |
+
if lighting_desc_template:
|
564 |
+
lighting_description_text = lighting_desc_template
|
565 |
+
if lighting_description_text and lighting_description_text.lower() not in description.lower():
|
566 |
+
description = self._smart_append(description, lighting_description_text)
|
567 |
+
|
568 |
+
# Append viewpoint information (if not eye-level)
|
569 |
+
if viewpoint != "eye_level" and viewpoint in self.templates.get("viewpoint_templates", {}):
|
570 |
+
viewpoint_template = self.templates["viewpoint_templates"][viewpoint]
|
571 |
+
prefix = viewpoint_template.get('prefix', '')
|
572 |
+
observation_template = viewpoint_template.get("observation", "")
|
573 |
+
|
574 |
+
# Determine scene_elements for the observation template
|
575 |
+
scene_elements_for_vp = "the overall layout and objects" # Generic default
|
576 |
+
if viewpoint == "aerial":
|
577 |
+
scene_elements_for_vp = "crossing patterns and general layout"
|
578 |
+
|
579 |
+
viewpoint_observation_text = observation_template.format(scene_elements=scene_elements_for_vp)
|
580 |
+
|
581 |
+
# Combine prefix and observation carefully
|
582 |
+
full_viewpoint_text = ""
|
583 |
+
if prefix:
|
584 |
+
full_viewpoint_text = prefix.strip() + " "
|
585 |
+
if viewpoint_observation_text and viewpoint_observation_text[0].islower():
|
586 |
+
full_viewpoint_text += viewpoint_observation_text
|
587 |
+
elif viewpoint_observation_text:
|
588 |
+
full_viewpoint_text = prefix + viewpoint_observation_text[0].lower() + viewpoint_observation_text[1:] if description else prefix + viewpoint_observation_text
|
589 |
+
|
590 |
+
elif viewpoint_observation_text: # No prefix, but observation exists
|
591 |
+
full_viewpoint_text = viewpoint_observation_text[0].upper() + viewpoint_observation_text[1:]
|
592 |
+
|
593 |
+
|
594 |
+
if full_viewpoint_text and full_viewpoint_text.lower() not in description.lower():
|
595 |
+
description = self._smart_append(description, full_viewpoint_text)
|
596 |
+
|
597 |
+
|
598 |
+
# Append functional zones information
|
599 |
+
if functional_zones and len(functional_zones) > 0:
|
600 |
+
zones_desc_text = self._describe_functional_zones(functional_zones)
|
601 |
+
if zones_desc_text:
|
602 |
+
description = self._smart_append(description, zones_desc_text)
|
603 |
+
|
604 |
+
final_formatted_description = self._format_final_description(description)
|
605 |
+
|
606 |
+
if not enable_landmark:
|
607 |
+
final_formatted_description = self.filter_landmark_references(final_formatted_description, enable_landmark=False)
|
608 |
+
|
609 |
+
# If after all processing, description is empty, fallback to a very generic one.
|
610 |
+
if not final_formatted_description.strip() or final_formatted_description.strip() == ".":
|
611 |
+
self.logger.warning(f"Description for scene_type '{current_scene_type}' became empty after processing. Falling back.")
|
612 |
+
final_formatted_description = self._format_final_description(
|
613 |
+
self._generate_generic_description(current_detected_objects, lighting_info)
|
614 |
+
)
|
615 |
+
|
616 |
+
return final_formatted_description
|
617 |
+
|
618 |
|
619 |
def _smart_append(self, current_text: str, new_fragment: str) -> str:
|
620 |
"""
|
|
|
648 |
(new_fragment.startswith("A ") or new_fragment.startswith("An ")):
|
649 |
return current_text + ". " + new_fragment
|
650 |
|
651 |
+
# 檢查新片段是否包含地標名稱(通常為專有名詞)
|
652 |
+
has_landmark_name = any(word[0].isupper() for word in new_fragment.split()
|
653 |
+
if len(word) > 2 and not word.startswith(("A ", "An ", "The ")))
|
654 |
+
|
655 |
# Decide how to join the texts
|
656 |
if ends_with_sentence:
|
657 |
# After a sentence, start with uppercase and add proper spacing
|
658 |
joined_text = current_text + " " + (new_fragment[0].upper() + new_fragment[1:])
|
659 |
elif ends_with_comma:
|
660 |
# After a comma, maintain flow with lowercase unless it's a proper noun or special case
|
661 |
+
if new_fragment.startswith(('I ', 'I\'', 'A ', 'An ', 'The ')) or new_fragment[0].isupper() or has_landmark_name:
|
662 |
joined_text = current_text + " " + new_fragment
|
663 |
else:
|
664 |
joined_text = current_text + " " + new_fragment[0].lower() + new_fragment[1:]
|
|
|
668 |
else:
|
669 |
# For other cases, decide based on the content
|
670 |
if self._is_related_phrases(current_text, new_fragment):
|
671 |
+
if new_fragment.startswith(('I ', 'I\'', 'A ', 'An ', 'The ')) or new_fragment[0].isupper() or has_landmark_name:
|
672 |
joined_text = current_text + ", " + new_fragment
|
673 |
else:
|
674 |
joined_text = current_text + ", " + new_fragment[0].lower() + new_fragment[1:]
|
|
|
717 |
|
718 |
return False
|
719 |
|
720 |
+
|
721 |
def _format_final_description(self, text: str) -> str:
|
722 |
"""
|
723 |
Format the final description text to ensure correct punctuation,
|
724 |
capitalization, and spacing.
|
|
|
|
|
|
|
|
|
|
|
|
|
725 |
"""
|
726 |
+
if not text or not text.strip(): # Also check if text is just whitespace
|
|
|
|
|
727 |
return ""
|
728 |
|
729 |
+
# Trim leading/trailing whitespace first
|
730 |
+
text = text.strip()
|
|
|
731 |
|
732 |
+
# 1. Handle consecutive "A/An" segments (potentially split them into sentences)
|
733 |
+
text = re.sub(r'(A\s+[^.!?]+?[\w\.])\s+(A\s+)', r'\1. \2', text, flags=re.IGNORECASE)
|
734 |
+
text = re.sub(r'(An\s+[^.!?]+?[\w\.])\s+(An?\s+)', r'\1. \2', text, flags=re.IGNORECASE)
|
735 |
|
736 |
+
# 2. Ensure first character of the entire text is uppercase
|
737 |
+
if text:
|
738 |
+
text = text[0].upper() + text[1:]
|
739 |
|
740 |
+
# 3. Normalize whitespace: multiple spaces to one
|
741 |
+
text = re.sub(r'\s{2,}', ' ', text)
|
|
|
|
|
742 |
|
743 |
+
# 4. Capitalize after sentence-ending punctuation (. ! ?)
|
744 |
+
def capitalize_after_punctuation(match):
|
745 |
+
return match.group(1) + match.group(2).upper()
|
746 |
+
text = re.sub(r'([.!?]\s+)([a-z])', capitalize_after_punctuation, text)
|
747 |
|
748 |
+
# 5. Handle capitalization after commas (your existing robust logic is good)
|
749 |
def fix_capitalization_after_comma(match):
|
750 |
+
leading_comma_space = match.group(1) # (,\s+)
|
751 |
+
word_after_comma = match.group(2) # ([A-Z][a-zA-Z]*)
|
752 |
+
|
753 |
+
proper_nouns_exceptions = ["I", "I'm", "I've", "I'd", "I'll",
|
754 |
+
"Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday",
|
755 |
+
"January", "February", "March", "April", "May", "June", "July",
|
756 |
+
"August", "September", "October", "November", "December"]
|
757 |
+
|
758 |
+
if word_after_comma in proper_nouns_exceptions:
|
759 |
+
return match.group(0)
|
760 |
+
# If the word looks like a proper noun (e.g., multi-word capitalized, or a known location/brand)
|
761 |
+
# This heuristic can be tricky. For simplicity, if it's already capitalized and not a common word, keep it.
|
762 |
+
if len(word_after_comma) > 2 and word_after_comma[0].isupper() and word_after_comma.lower() not in ["this", "that", "these", "those", "they", "their", "then", "thus"]:
|
763 |
+
return match.group(0) # Keep it if it looks like a proper noun already
|
764 |
+
|
765 |
+
return leading_comma_space + word_after_comma[0].lower() + word_after_comma[1:]
|
766 |
+
text = re.sub(r'(,\s+)([A-Z][a-zA-Z\'\-]+)', fix_capitalization_after_comma, text) # Added hyphen and apostrophe to word
|
767 |
+
|
768 |
+
# 6. Correct spacing around punctuation
|
769 |
+
text = re.sub(r'\s*([.,;:!?])\s*', r'\1 ', text) # Ensures one space AFTER punctuation, none before
|
770 |
+
text = text.replace(' .', '.').replace(' ,', ',') # Clean up potential space before period/comma from previous rule
|
771 |
+
|
772 |
+
# 7. Consolidate multiple sentence-ending punctuations (e.g., "!!", "?.", ".?")
|
773 |
+
text = re.sub(r'[.!?]{2,}', '.', text) # Convert multiple to a single period
|
774 |
+
text = re.sub(r',+', ',', text) # Multiple commas to one
|
775 |
+
|
776 |
+
# 8. Ensure text ends with a single sentence-ending punctuation mark
|
777 |
+
text = text.strip() # Remove trailing whitespace before checking last char
|
778 |
+
if text and not text[-1] in '.!?':
|
779 |
+
text += '.'
|
780 |
|
781 |
+
# 9. Remove any leading punctuation or extra spaces that might have been introduced
|
782 |
+
text = re.sub(r'^[.,;:!?\s]+', '', text)
|
|
|
|
|
|
|
783 |
|
784 |
+
# 10. Final check for first letter capitalization
|
785 |
+
if text:
|
786 |
+
text = text[0].upper() + text[1:]
|
787 |
|
788 |
+
# 11. Remove space before final punctuation mark if accidentally added by rule 7
|
789 |
+
text = re.sub(r'\s+([.!?])$', r'\1', text)
|
|
|
790 |
|
791 |
+
return text.strip() # Final strip
|
|
|
|
|
|
|
|
|
792 |
|
793 |
def _is_intersection(self, detected_objects: List[Dict]) -> bool:
|
794 |
"""
|
|
|
870 |
|
871 |
return base_desc
|
872 |
|
873 |
+
def _get_prominent_objects(self, detected_objects: List[Dict], min_prominence_score: float = 0.1, max_categories_to_return: int = 5, max_total_objects: int = 7) -> List[Dict]:
|
874 |
+
"""
|
875 |
+
Helper function to get the most prominent objects.
|
876 |
+
Prioritizes high-confidence, large objects, and ensures a diversity of object types.
|
877 |
+
|
878 |
+
Args:
|
879 |
+
detected_objects: List of detected objects.
|
880 |
+
min_prominence_score: Minimum score for an object to be considered initially.
|
881 |
+
max_categories_to_return: Max number of different object categories to prioritize.
|
882 |
+
max_total_objects: Overall cap on the number of prominent objects returned.
|
883 |
+
|
884 |
+
Returns:
|
885 |
+
List of prominent detected objects.
|
886 |
+
"""
|
887 |
+
if not detected_objects:
|
888 |
+
return []
|
889 |
+
|
890 |
+
scored_objects = []
|
891 |
+
for obj in detected_objects:
|
892 |
+
area = obj.get("normalized_area", 0.0) + 1e-6
|
893 |
+
confidence = obj.get("confidence", 0.0)
|
894 |
+
|
895 |
+
# Base score: area and confidence are key
|
896 |
+
score = (area * 0.65) + (confidence * 0.35) # Slightly more weight to area
|
897 |
+
|
898 |
+
# Bonus for generally important object classes (in a generic way)
|
899 |
+
# This is a simple heuristic. More advanced would be context-dependent.
|
900 |
+
# For example, 'person' is often more salient.
|
901 |
+
# Avoid hardcoding specific class_ids here if possible, or use broad categories if available.
|
902 |
+
# For simplicity, we'll keep the landmark bonus for now.
|
903 |
+
if obj.get("class_name") == "person": # Example: person is generally prominent
|
904 |
+
score += 0.1
|
905 |
+
if obj.get("is_landmark"): # Landmarks are always prominent
|
906 |
+
score += 0.5
|
907 |
+
|
908 |
+
if score >= min_prominence_score:
|
909 |
+
scored_objects.append((obj, score))
|
910 |
+
|
911 |
+
if not scored_objects:
|
912 |
+
return []
|
913 |
+
|
914 |
+
# Sort by score in descending order
|
915 |
+
scored_objects.sort(key=lambda x: x[1], reverse=True)
|
916 |
+
|
917 |
+
# Prioritize diversity of object categories first
|
918 |
+
prominent_by_category = {}
|
919 |
+
final_prominent_objects = []
|
920 |
+
|
921 |
+
for obj, score in scored_objects:
|
922 |
+
category = obj.get("class_name", "unknown")
|
923 |
+
if category not in prominent_by_category:
|
924 |
+
if len(prominent_by_category) < max_categories_to_return:
|
925 |
+
prominent_by_category[category] = obj
|
926 |
+
final_prominent_objects.append(obj)
|
927 |
+
|
928 |
+
elif len(final_prominent_objects) < max_total_objects and obj not in final_prominent_objects:
|
929 |
+
if score > 0.3:
|
930 |
+
final_prominent_objects.append(obj)
|
931 |
+
|
932 |
+
# If still under max_total_objects, fill with highest scored remaining objects regardless of category
|
933 |
+
if len(final_prominent_objects) < max_total_objects:
|
934 |
+
for obj, score in scored_objects:
|
935 |
+
if len(final_prominent_objects) >= max_total_objects:
|
936 |
+
break
|
937 |
+
if obj not in final_prominent_objects:
|
938 |
+
final_prominent_objects.append(obj)
|
939 |
+
|
940 |
+
# Re-sort the final list by original prominence score to maintain order
|
941 |
+
final_prominent_objects_with_scores = []
|
942 |
+
for obj in final_prominent_objects:
|
943 |
+
for original_obj, original_score in scored_objects:
|
944 |
+
if obj is original_obj: # Check for object identity
|
945 |
+
final_prominent_objects_with_scores.append((obj, original_score))
|
946 |
+
break
|
947 |
+
|
948 |
+
final_prominent_objects_with_scores.sort(key=lambda x: x[1], reverse=True)
|
949 |
+
|
950 |
+
return [obj for obj, score in final_prominent_objects_with_scores[:max_total_objects]]
|
951 |
+
|
952 |
+
|
953 |
+
def _format_object_list_for_description(self,
|
954 |
+
objects: List[Dict],
|
955 |
+
use_indefinite_article_for_one: bool = False,
|
956 |
+
count_threshold_for_generalization: int = -1, # Default to -1 for precise counts
|
957 |
+
max_types_to_list: int = 5
|
958 |
+
) -> str:
|
959 |
+
"""
|
960 |
+
Formats a list of detected objects into a human-readable string with counts.
|
961 |
+
Args:
|
962 |
+
objects: List of object dictionaries, each expected to have 'class_name'.
|
963 |
+
use_indefinite_article_for_one: If True, uses "a/an" for single items. If False, uses "one".
|
964 |
+
count_threshold_for_generalization: If count exceeds this, use general terms. -1 means precise counts.
|
965 |
+
max_types_to_list: Maximum number of different object types to include in the list.
|
966 |
+
"""
|
967 |
+
if not objects:
|
968 |
+
return "no specific objects clearly identified"
|
969 |
+
|
970 |
+
counts: Dict[str, int] = {}
|
971 |
+
for obj in objects:
|
972 |
+
name = obj.get("class_name", "unknown object")
|
973 |
+
if name == "unknown object" or not name: # Skip unknown or empty names
|
974 |
+
continue
|
975 |
+
counts[name] = counts.get(name, 0) + 1
|
976 |
+
|
977 |
+
if not counts:
|
978 |
+
return "no specific objects clearly identified"
|
979 |
+
|
980 |
+
descriptions = []
|
981 |
+
# Sort by count (desc) then name (asc) for consistent output order
|
982 |
+
# Limit the number of distinct object types being listed
|
983 |
+
sorted_counts = sorted(counts.items(), key=lambda item: (-item[1], item[0]))[:max_types_to_list]
|
984 |
+
|
985 |
+
|
986 |
+
for name, count in sorted_counts:
|
987 |
+
if count == 1:
|
988 |
+
if use_indefinite_article_for_one:
|
989 |
+
if name[0].lower() in 'aeiou':
|
990 |
+
descriptions.append(f"an {name}")
|
991 |
+
else:
|
992 |
+
descriptions.append(f"a {name}")
|
993 |
+
else:
|
994 |
+
descriptions.append(f"one {name}") # Output "one car" instead of "a car"
|
995 |
+
else: # count > 1
|
996 |
+
plural_name = name
|
997 |
+
if name.endswith("y") and not name.lower().endswith(("ay", "ey", "iy", "oy", "uy")):
|
998 |
+
plural_name = name[:-1] + "ies"
|
999 |
+
elif name.endswith(("s", "sh", "ch", "x", "z")):
|
1000 |
+
plural_name = name + "es"
|
1001 |
+
elif not name.endswith("s"): # Avoid double 's' like "buss"
|
1002 |
+
plural_name = name + "s"
|
1003 |
+
|
1004 |
+
if count_threshold_for_generalization != -1 and count > count_threshold_for_generalization:
|
1005 |
+
if count <= count_threshold_for_generalization + 3:
|
1006 |
+
descriptions.append(f"several {plural_name}")
|
1007 |
+
else:
|
1008 |
+
descriptions.append(f"many {plural_name}")
|
1009 |
+
else: # Use exact count (e.g., "6 cars")
|
1010 |
+
descriptions.append(f"{count} {plural_name}")
|
1011 |
+
|
1012 |
+
if not descriptions:
|
1013 |
+
return "no specific objects clearly identified"
|
1014 |
+
|
1015 |
+
if len(descriptions) == 1:
|
1016 |
+
return descriptions[0]
|
1017 |
+
elif len(descriptions) == 2:
|
1018 |
+
return f"{descriptions[0]} and {descriptions[1]}"
|
1019 |
+
else:
|
1020 |
+
# Oxford comma for lists of 3 or more.
|
1021 |
+
return ", ".join(descriptions[:-1]) + f", and {descriptions[-1]}"
|
1022 |
+
|
1023 |
+
def _get_spatial_description(self, obj: Dict, image_width: Optional[int] = None, image_height: Optional[int] = None) -> str:
|
1024 |
+
"""
|
1025 |
+
Generates a brief spatial description for an object.
|
1026 |
+
(This is a new helper function)
|
1027 |
+
"""
|
1028 |
+
region = obj.get("region")
|
1029 |
+
if region:
|
1030 |
+
# Convert region name to more descriptive terms
|
1031 |
+
region_map = {
|
1032 |
+
"top_left": "in the top-left", "top_center": "at the top-center", "top_right": "in the top-right",
|
1033 |
+
"middle_left": "on the middle-left side", "middle_center": "in the center", "middle_right": "on the middle-right side",
|
1034 |
+
"bottom_left": "in the bottom-left", "bottom_center": "at the bottom-center", "bottom_right": "in the bottom-right"
|
1035 |
+
}
|
1036 |
+
# More general terms if exact region is not critical
|
1037 |
+
if "top" in region: general_v_pos = "towards the top"
|
1038 |
+
elif "bottom" in region: general_v_pos = "towards the bottom"
|
1039 |
+
else: general_v_pos = "in the middle vertically"
|
1040 |
+
|
1041 |
+
if "left" in region: general_h_pos = "towards the left"
|
1042 |
+
elif "right" in region: general_h_pos = "towards the right"
|
1043 |
+
else: general_h_pos = "in the center horizontally"
|
1044 |
+
|
1045 |
+
# Prioritize specific region if available, else use general
|
1046 |
+
specific_desc = region_map.get(region, "")
|
1047 |
+
if specific_desc:
|
1048 |
+
return f"{specific_desc} of the frame"
|
1049 |
+
else:
|
1050 |
+
return f"{general_v_pos} and {general_h_pos} of the frame"
|
1051 |
+
|
1052 |
+
# Fallback if region info is not detailed enough or missing
|
1053 |
+
# We can use normalized_center if available
|
1054 |
+
norm_center = obj.get("normalized_center")
|
1055 |
+
if norm_center and image_width and image_height: # Check if image_width/height are provided
|
1056 |
+
x_norm, y_norm = norm_center
|
1057 |
+
h_pos = "left" if x_norm < 0.4 else "right" if x_norm > 0.6 else "center"
|
1058 |
+
v_pos = "top" if y_norm < 0.4 else "bottom" if y_norm > 0.6 else "middle"
|
1059 |
+
|
1060 |
+
if h_pos == "center" and v_pos == "middle":
|
1061 |
+
return "near the center of the image"
|
1062 |
+
return f"in the {v_pos}-{h_pos} area of the image"
|
1063 |
+
|
1064 |
+
return "in the scene" # Generic fallback
|
1065 |
+
|
1066 |
+
|
1067 |
+
def _generate_dynamic_everyday_description(self,
|
1068 |
+
detected_objects: List[Dict],
|
1069 |
+
lighting_info: Optional[Dict] = None,
|
1070 |
+
viewpoint: str = "eye_level",
|
1071 |
+
spatial_analysis: Optional[Dict] = None,
|
1072 |
+
image_dimensions: Optional[Tuple[int, int]] = None,
|
1073 |
+
places365_info: Optional[Dict] = None,
|
1074 |
+
object_statistics: Optional[Dict] = None
|
1075 |
+
) -> str:
|
1076 |
+
"""
|
1077 |
+
Dynamically generates a description for everyday scenes based on ALL relevant detected_objects,
|
1078 |
+
their counts, and context.
|
1079 |
+
It aims to describe the overall scene first, then details of object groups including accurate counts.
|
1080 |
+
"""
|
1081 |
+
description_segments = []
|
1082 |
+
image_width, image_height = image_dimensions if image_dimensions else (None, None)
|
1083 |
+
|
1084 |
+
if hasattr(self, 'logger'):
|
1085 |
+
self.logger.info(f"DynamicDesc: Start. Total Raw Objects: {len(detected_objects)}, View: {viewpoint}, Light: {lighting_info is not None}")
|
1086 |
+
|
1087 |
+
# 1. Overall Ambiance (Lighting and Viewpoint)
|
1088 |
+
ambiance_parts = []
|
1089 |
+
if lighting_info:
|
1090 |
+
time_of_day = lighting_info.get("time_of_day", "unknown lighting")
|
1091 |
+
is_indoor = lighting_info.get("is_indoor")
|
1092 |
+
ambiance_statement = "This is"
|
1093 |
+
if is_indoor is True: ambiance_statement += " an indoor scene"
|
1094 |
+
elif is_indoor is False: ambiance_statement += " an outdoor scene"
|
1095 |
+
else: ambiance_statement += " a scene"
|
1096 |
+
lighting_map = self.templates.get("lighting_templates", {})
|
1097 |
+
readable_lighting_base = lighting_map.get(time_of_day, f"with {time_of_day.replace('_', ' ')} lighting conditions")
|
1098 |
+
readable_lighting = readable_lighting_base.lower().replace("the scene is captured", "").replace("the scene has", "").strip()
|
1099 |
+
ambiance_statement += f", likely {readable_lighting}."
|
1100 |
+
ambiance_parts.append(ambiance_statement)
|
1101 |
+
|
1102 |
+
if viewpoint and viewpoint != "eye_level":
|
1103 |
+
vp_templates = self.templates.get("viewpoint_templates", {})
|
1104 |
+
if viewpoint in vp_templates:
|
1105 |
+
vp_prefix = vp_templates[viewpoint].get("prefix", "").strip()
|
1106 |
+
if vp_prefix:
|
1107 |
+
if not ambiance_parts:
|
1108 |
+
ambiance_parts.append(f"{vp_prefix.capitalize()} the general layout of the scene is observed.")
|
1109 |
+
else:
|
1110 |
+
ambiance_parts[-1] = ambiance_parts[-1].rstrip('.') + f", viewed {vp_templates[viewpoint].get('short_desc', viewpoint)}."
|
1111 |
+
|
1112 |
+
if ambiance_parts:
|
1113 |
+
description_segments.append(" ".join(ambiance_parts))
|
1114 |
+
|
1115 |
+
# 2. Describe ALL detected objects, grouped by class, with accurate counts and locations
|
1116 |
+
if not detected_objects:
|
1117 |
+
# This part remains, but the conditions to reach here might change based on confident_objects check
|
1118 |
+
if not description_segments:
|
1119 |
+
description_segments.append("A general scene is visible, but no specific objects were clearly identified.")
|
1120 |
+
else:
|
1121 |
+
description_segments.append("Within this setting, no specific objects were clearly identified.")
|
1122 |
+
else:
|
1123 |
+
objects_by_class: Dict[str, List[Dict]] = {}
|
1124 |
+
|
1125 |
+
# keeping 0.25 as a placeholder
|
1126 |
+
confidence_filter_threshold = getattr(self, 'confidence_threshold_for_description', 0.25)
|
1127 |
+
confident_objects = [obj for obj in detected_objects if obj.get("confidence", 0) >= confidence_filter_threshold]
|
1128 |
+
|
1129 |
+
if not confident_objects:
|
1130 |
+
# This message is more appropriate if objects existed but none met confidence
|
1131 |
+
no_confident_obj_msg = "While some elements might be present, no objects were identified with sufficient confidence for a detailed description."
|
1132 |
+
if not description_segments: description_segments.append(no_confident_obj_msg)
|
1133 |
+
else: description_segments.append(no_confident_obj_msg.lower().capitalize()) # Append as a new sentence
|
1134 |
+
else:
|
1135 |
+
if object_statistics:
|
1136 |
+
# Use precomputed statistics with a dynamic confidence-threshold strategy
|
1137 |
+
for class_name, stats in object_statistics.items():
|
1138 |
+
count = stats.get("count", 0)
|
1139 |
+
avg_confidence = stats.get("avg_confidence", 0)
|
1140 |
+
|
1141 |
+
# Dynamically adjust the confidence threshold: decorative items use a lower threshold
|
1142 |
+
dynamic_threshold = confidence_filter_threshold
|
1143 |
+
if class_name in ["potted plant", "vase", "clock", "book"]:
|
1144 |
+
dynamic_threshold = max(0.15, confidence_filter_threshold * 0.6)
|
1145 |
+
elif count >= 3: # lower the threshold for items detected in large numbers
|
1146 |
+
dynamic_threshold = max(0.2, confidence_filter_threshold * 0.8)
|
1147 |
+
|
1148 |
+
if count > 0 and avg_confidence >= dynamic_threshold:
|
1149 |
+
matching_objects = [obj for obj in confident_objects if obj.get("class_name") == class_name]
|
1150 |
+
if not matching_objects:
|
1151 |
+
# If none remain among the high-confidence objects, search the original list
|
1152 |
+
matching_objects = [obj for obj in detected_objects
|
1153 |
+
if obj.get("class_name") == class_name and obj.get("confidence", 0) >= dynamic_threshold]
|
1154 |
+
|
1155 |
+
if matching_objects:
|
1156 |
+
actual_count = min(stats["count"], len(matching_objects))
|
1157 |
+
objects_by_class[class_name] = matching_objects[:actual_count]
|
1158 |
+
else:
|
1159 |
+
# The fallback logic also uses the dynamic threshold
|
1160 |
+
for obj in confident_objects:
|
1161 |
+
name = obj.get("class_name", "unknown object")
|
1162 |
+
if name == "unknown object" or not name: continue
|
1163 |
+
if name not in objects_by_class:
|
1164 |
+
objects_by_class[name] = []
|
1165 |
+
objects_by_class[name].append(obj)
|
1166 |
+
|
1167 |
+
if not objects_by_class: # Should be rare if confident_objects was not empty and had valid names
|
1168 |
+
description_segments.append("No common objects were confidently identified for detailed description.")
|
1169 |
+
else:
|
1170 |
+
def sort_key_object_groups(item_tuple: Tuple[str, List[Dict]]):
|
1171 |
+
class_name_key, obj_group_list = item_tuple
|
1172 |
+
priority = 3 # default priority
|
1173 |
+
count = len(obj_group_list)
|
1174 |
+
|
1175 |
+
# Dynamic priority: based on scene relevance and object count
|
1176 |
+
if class_name_key == "person":
|
1177 |
+
priority = 0
|
1178 |
+
elif class_name_key in ["dining table", "chair", "sofa", "bed"]:
|
1179 |
+
priority = 1 # primary indoor furniture
|
1180 |
+
elif class_name_key in ["car", "bus", "truck", "traffic light"]:
|
1181 |
+
priority = 2 # traffic-related objects
|
1182 |
+
elif count >= 3: # raise the priority of classes detected in large numbers
|
1183 |
+
priority = max(1, priority - 1)
|
1184 |
+
elif class_name_key in ["potted plant", "vase", "clock", "book"] and count >= 2:
|
1185 |
+
priority = 2 # raise the priority of decorative items once there are several of them
|
1186 |
+
|
1187 |
+
avg_area = sum(o.get("normalized_area", 0.0) for o in obj_group_list) / len(obj_group_list) if obj_group_list else 0
|
1188 |
+
|
1189 |
+
# Add a quantity weight: multiple objects of the same class matter more
|
1190 |
+
quantity_bonus = min(count / 5.0, 1.0) # bonus capped at 1.0
|
1191 |
+
|
1192 |
+
return (priority, -len(obj_group_list), -avg_area, -quantity_bonus)
|
1193 |
+
|
1194 |
+
# Remove duplicate detections
|
1195 |
+
deduplicated_objects_by_class = {}
|
1196 |
+
processed_positions = []
|
1197 |
+
|
1198 |
+
for class_name, group_of_objects in objects_by_class.items():
|
1199 |
+
unique_objects = []
|
1200 |
+
|
1201 |
+
for obj in group_of_objects:
|
1202 |
+
obj_position = obj.get("normalized_center", [0.5, 0.5])
|
1203 |
+
is_duplicate = False
|
1204 |
+
|
1205 |
+
# Check whether this object overlaps the position of an already-processed object
|
1206 |
+
for processed_pos in processed_positions:
|
1207 |
+
position_distance = abs(obj_position[0] - processed_pos[0]) + abs(obj_position[1] - processed_pos[1])
|
1208 |
+
if position_distance < 0.15: # position-overlap threshold
|
1209 |
+
is_duplicate = True
|
1210 |
+
break
|
1211 |
+
|
1212 |
+
if not is_duplicate:
|
1213 |
+
unique_objects.append(obj)
|
1214 |
+
processed_positions.append(obj_position)
|
1215 |
+
|
1216 |
+
if unique_objects:
|
1217 |
+
deduplicated_objects_by_class[class_name] = unique_objects
|
1218 |
+
|
1219 |
+
objects_by_class = deduplicated_objects_by_class
|
1220 |
+
|
1221 |
+
sorted_object_groups = sorted(objects_by_class.items(), key=sort_key_object_groups)
|
1222 |
+
|
1223 |
+
object_clauses = [] # Stores individual object group descriptions
|
1224 |
+
|
1225 |
+
for class_name, group_of_objects in sorted_object_groups:
|
1226 |
+
count = len(group_of_objects)
|
1227 |
+
if count == 0: continue
|
1228 |
+
|
1229 |
+
# Use the statistics to ensure accurate count descriptions
|
1230 |
+
if object_statistics and class_name in object_statistics:
|
1231 |
+
actual_count = object_statistics[class_name]["count"]
|
1232 |
+
# Generate the description from the actual counted quantity
|
1233 |
+
if actual_count == 1:
|
1234 |
+
formatted_name_with_exact_count = f"one {class_name}"
|
1235 |
+
else:
|
1236 |
+
plural_form = f"{class_name}s" if not class_name.endswith('s') else class_name
|
1237 |
+
formatted_name_with_exact_count = f"{actual_count} {plural_form}"
|
1238 |
+
else:
|
1239 |
+
# Fall back to the original formatting logic
|
1240 |
+
formatted_name_with_exact_count = self._format_object_list_for_description(
|
1241 |
+
[group_of_objects[0]] * count,
|
1242 |
+
use_indefinite_article_for_one=False,
|
1243 |
+
count_threshold_for_generalization=-1
|
1244 |
+
)
|
1245 |
+
|
1246 |
+
if formatted_name_with_exact_count == "no specific objects clearly identified" or not formatted_name_with_exact_count:
|
1247 |
+
continue
|
1248 |
+
|
1249 |
+
# Determine collective location for the group
|
1250 |
+
location_description_suffix = "" # e.g., "is in the center" or "are in the west area"
|
1251 |
+
if count == 1:
|
1252 |
+
location_description_suffix = f"is {self._get_spatial_description(group_of_objects[0], image_width, image_height)}"
|
1253 |
+
else:
|
1254 |
+
distinct_regions = sorted(list(set(obj.get("region", "unknown_region") for obj in group_of_objects)))
|
1255 |
+
known_regions = [r for r in distinct_regions if r != "unknown_region"]
|
1256 |
+
if not known_regions and "unknown_region" in distinct_regions:
|
1257 |
+
location_description_suffix = "are visible in the scene"
|
1258 |
+
elif len(known_regions) == 1:
|
1259 |
+
location_description_suffix = f"are primarily in the {known_regions[0].replace('_', ' ')} area"
|
1260 |
+
elif len(known_regions) == 2:
|
1261 |
+
location_description_suffix = f"are mainly across the {known_regions[0].replace('_',' ')} and {known_regions[1].replace('_',' ')} areas"
|
1262 |
+
elif len(known_regions) > 2:
|
1263 |
+
location_description_suffix = "are distributed in various parts of the scene"
|
1264 |
+
else:
|
1265 |
+
location_description_suffix = "are visible in the scene"
|
1266 |
+
|
1267 |
+
# Capitalize the object description (e.g., "Six cars")
|
1268 |
+
formatted_name_capitalized = formatted_name_with_exact_count[0].upper() + formatted_name_with_exact_count[1:]
|
1269 |
+
object_clauses.append(f"{formatted_name_capitalized} {location_description_suffix}")
|
1270 |
+
|
1271 |
+
if object_clauses:
|
1272 |
+
# Join object clauses into one or more sentences.
|
1273 |
+
if not description_segments: # If no ambiance, start with the first object clause.
|
1274 |
+
if object_clauses:
|
1275 |
+
first_clause = object_clauses.pop(0) # Take the first one out
|
1276 |
+
description_segments.append(first_clause + ".")
|
1277 |
+
else: # Ambiance exists, prepend with "The scene features..." or similar
|
1278 |
+
if object_clauses:
|
1279 |
+
description_segments.append("The scene features:") # Or "Key elements include:"
|
1280 |
+
|
1281 |
+
# Add remaining object clauses as separate points or a continuous sentence
|
1282 |
+
# For now, let's join them into a single continuous sentence string to be added.
|
1283 |
+
if object_clauses: # If there are more clauses after the first (or after "The scene features:")
|
1284 |
+
joined_object_clauses = ". ".join(object_clauses)
|
1285 |
+
if joined_object_clauses and not joined_object_clauses.endswith("."):
|
1286 |
+
joined_object_clauses += "."
|
1287 |
+
description_segments.append(joined_object_clauses)
|
1288 |
+
|
1289 |
+
elif not description_segments : # No ambiance and no describable objects after filtering
|
1290 |
+
return "The image depicts a scene, but specific objects could not be described with confidence or detail."
|
1291 |
+
|
1292 |
+
# --- Final assembly and formatting ---
|
1293 |
+
# Join all collected segments. _smart_append might be better if parts are not full sentences.
|
1294 |
+
# Since we aim for full sentences in segments, simple join then format.
|
1295 |
+
raw_description = ""
|
1296 |
+
for i, segment in enumerate(filter(None, description_segments)):
|
1297 |
+
segment = segment.strip()
|
1298 |
+
if not segment: continue
|
1299 |
+
|
1300 |
+
if not raw_description: # First non-empty segment
|
1301 |
+
raw_description = segment
|
1302 |
+
else:
|
1303 |
+
if not raw_description.endswith(('.', '!', '?')):
|
1304 |
+
raw_description += "."
|
1305 |
+
raw_description += " " + (segment[0].upper() + segment[1:] if len(segment) > 1 else segment.upper())
|
1306 |
+
|
1307 |
+
if raw_description and not raw_description.endswith(('.', '!', '?')):
|
1308 |
+
raw_description += "."
|
1309 |
+
|
1310 |
+
final_description = self._format_final_description(raw_description) # Crucial for final polish
|
1311 |
+
|
1312 |
+
if not final_description or len(final_description.strip()) < 20:
|
1313 |
+
# Fallback if description is too short or empty after processing
|
1314 |
+
# Use a more informative fallback if confident_objects existed
|
1315 |
+
if 'confident_objects' in locals() and confident_objects:
|
1316 |
+
return "The scene contains several detected objects, but a detailed textual description could not be fully constructed."
|
1317 |
+
else:
|
1318 |
+
return "A general scene is depicted with no objects identified with high confidence."
|
1319 |
+
|
1320 |
+
return final_description
|
1321 |
+
|
1322 |
+
|
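The position-based deduplication inside the method above amounts to the following sketch (Manhattan distance on normalized centers with the 0.15 threshold used above; note that in the original code the seen positions are shared across classes):

from typing import Dict, List, Tuple

def dedup_by_position(objects: List[Dict], threshold: float = 0.15) -> List[Dict]:
    kept: List[Dict] = []
    seen: List[Tuple[float, float]] = []
    for obj in objects:
        x, y = obj.get("normalized_center", (0.5, 0.5))
        # Skip detections whose center is too close to one we already kept.
        if any(abs(x - px) + abs(y - py) < threshold for px, py in seen):
            continue
        kept.append(obj)
        seen.append((x, y))
    return kept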
1323 |
def _generate_scene_details(self,
|
1324 |
+
scene_type: str,
|
1325 |
+
detected_objects: List[Dict],
|
1326 |
+
lighting_info: Optional[Dict] = None,
|
1327 |
+
viewpoint: str = "eye_level",
|
1328 |
+
spatial_analysis: Optional[Dict] = None,
|
1329 |
+
image_dimensions: Optional[Tuple[int, int]] = None,
|
1330 |
+
places365_info: Optional[Dict] = None,
|
1331 |
+
object_statistics: Optional[Dict] = None
|
1332 |
+
) -> str:
|
1333 |
"""
|
1334 |
Generate detailed description based on scene type and detected objects.
|
1335 |
+
Enhanced to handle everyday scenes dynamically with accurate object counting.
|
1336 |
|
1337 |
Args:
|
1338 |
+
scene_type: Identified scene type.
|
1339 |
+
detected_objects: List of detected objects.
|
1340 |
+
lighting_info: Optional lighting condition information.
|
1341 |
+
viewpoint: Detected viewpoint (aerial, eye_level, etc.).
|
1342 |
+
spatial_analysis: Optional results from SpatialAnalyzer.
|
1343 |
+
image_dimensions: Optional tuple of (image_width, image_height).
|
1344 |
+
places365_info: Optional Places365 scene classification results.
|
1345 |
+
object_statistics: Optional detailed object statistics with counts and confidence.
|
1346 |
|
1347 |
Returns:
|
1348 |
+
str: Detailed scene description.
|
1349 |
"""
|
|
|
1350 |
scene_details = ""
|
1351 |
scene_templates = self.templates.get("scene_detail_templates", {})
|
1352 |
|
1353 |
+
# List of scene types considered "everyday" or generic
|
1354 |
+
everyday_scene_types = [
|
1355 |
+
"general_indoor_space", "generic_street_view",
|
1356 |
+
"desk_area_workspace", "outdoor_gathering_spot",
|
1357 |
+
"kitchen_counter_or_utility_area", "unknown"
|
1358 |
+
]
|
1359 |
|
1360 |
+
# Extract Places365 attributes for enhanced description
|
1361 |
+
places365_attributes = []
|
1362 |
+
scene_specific_details = ""
|
1363 |
+
|
1364 |
+
if places365_info and places365_info.get('confidence', 0) > 0.4:
|
1365 |
+
attributes = places365_info.get('attributes', [])
|
1366 |
+
scene_label = places365_info.get('scene_label', '')
|
1367 |
+
|
1368 |
+
# Filter relevant attributes for description enhancement
|
1369 |
+
relevant_attributes = [attr for attr in attributes if attr in [
|
1370 |
+
'natural_lighting', 'artificial_lighting', 'commercial', 'residential',
|
1371 |
+
'workplace', 'recreational', 'educational', 'open_space', 'enclosed_space'
|
1372 |
+
]]
|
1373 |
+
places365_attributes = relevant_attributes[:2]
|
1374 |
+
|
1375 |
+
# Generate scene-specific contextual details using object statistics
|
1376 |
+
if object_statistics:
|
1377 |
+
if 'commercial' in attributes and object_statistics.get('person', {}).get('count', 0) > 0:
|
1378 |
+
person_count = object_statistics['person']['count']
|
1379 |
+
if person_count == 1:
|
1380 |
+
scene_specific_details = "This appears to be an active commercial environment with a customer present."
|
1381 |
+
else:
|
1382 |
+
scene_specific_details = f"This appears to be an active commercial environment with {person_count} people present."
|
1383 |
+
elif 'residential' in attributes and scene_type in ['living_room', 'bedroom', 'kitchen']:
|
1384 |
+
scene_specific_details = "The setting suggests a comfortable residential living space."
|
1385 |
+
elif 'workplace' in attributes and any(object_statistics.get(obj, {}).get('count', 0) > 0
|
1386 |
+
for obj in ['laptop', 'keyboard', 'monitor']):
|
1387 |
+
scene_specific_details = "The environment indicates an active workspace or office setting."
|
1388 |
else:
|
1389 |
+
# Fallback to original logic if object_statistics not available
|
1390 |
+
if 'commercial' in attributes and any(obj['class_name'] in ['person', 'chair', 'table'] for obj in detected_objects):
|
1391 |
+
scene_specific_details = "This appears to be an active commercial environment with customer activity."
|
1392 |
+
elif 'residential' in attributes and scene_type in ['living_room', 'bedroom', 'kitchen']:
|
1393 |
+
scene_specific_details = "The setting suggests a comfortable residential living space."
|
1394 |
+
elif 'workplace' in attributes and any(obj['class_name'] in ['laptop', 'keyboard', 'monitor'] for obj in detected_objects):
|
1395 |
+
scene_specific_details = "The environment indicates an active workspace or office setting."
|
1396 |
+
|
1397 |
+
# Determine scene description approach
|
1398 |
+
is_confident_specific_scene = scene_type not in everyday_scene_types and scene_type in scene_templates
|
1399 |
+
treat_as_everyday = scene_type in everyday_scene_types
|
1400 |
+
|
1401 |
+
if hasattr(self, 'enable_landmark') and not self.enable_landmark:
|
1402 |
+
if scene_type not in ["kitchen", "bedroom", "living_room", "office_workspace", "dining_area", "professional_kitchen"]:
|
1403 |
+
treat_as_everyday = True
|
1404 |
+
|
1405 |
+
if treat_as_everyday or not is_confident_specific_scene:
|
1406 |
+
# Generate dynamic description for everyday scenes with object statistics
|
1407 |
+
self.logger.info(f"Generating dynamic description for scene_type: {scene_type}")
|
1408 |
+
scene_details = self._generate_dynamic_everyday_description(
|
1409 |
+
detected_objects,
|
1410 |
+
lighting_info,
|
1411 |
+
viewpoint,
|
1412 |
+
spatial_analysis,
|
1413 |
+
image_dimensions,
|
1414 |
+
places365_info,
|
1415 |
+
object_statistics # Pass object statistics to dynamic description
|
1416 |
+
)
|
1417 |
+
elif scene_type in scene_templates:
|
1418 |
+
# Use template-based description with enhanced object information
|
1419 |
+
self.logger.info(f"Using template for scene_type: {scene_type}")
|
1420 |
+
viewpoint_key = f"{scene_type}_{viewpoint}"
|
1421 |
+
templates_list = scene_templates.get(viewpoint_key, scene_templates.get(scene_type, []))
|
1422 |
|
|
|
1423 |
if templates_list:
|
1424 |
detail_template = random.choice(templates_list)
|
|
|
|
|
1425 |
scene_details = self._fill_detail_template(
|
1426 |
detail_template,
|
1427 |
detected_objects,
|
1428 |
+
scene_type,
|
1429 |
+
places365_info,
|
1430 |
+
object_statistics # Pass object statistics to template filling
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1431 |
)
|
1432 |
else:
|
1433 |
+
scene_details = self._generate_dynamic_everyday_description(
|
1434 |
+
detected_objects, lighting_info, viewpoint, spatial_analysis,
|
1435 |
+
image_dimensions, places365_info, object_statistics
|
1436 |
+
)
|
1437 |
+
else:
|
1438 |
+
# Fallback to dynamic description with object statistics
|
1439 |
+
self.logger.info(f"No specific template for {scene_type}, generating dynamic description.")
|
1440 |
+
scene_details = self._generate_dynamic_everyday_description(
|
1441 |
+
detected_objects, lighting_info, viewpoint, spatial_analysis,
|
1442 |
+
image_dimensions, places365_info, object_statistics
|
1443 |
+
)
|
1444 |
+
|
1445 |
+
# Filter out landmark references if landmark detection is disabled
|
1446 |
+
if hasattr(self, 'enable_landmark') and not self.enable_landmark:
|
1447 |
+
scene_details = self.filter_landmark_references(scene_details, enable_landmark=False)
|
1448 |
|
1449 |
+
return scene_details if scene_details else "A scene with some visual elements."
|
1450 |
|
1451 |
+
def _fill_detail_template(self, template: str, detected_objects: List[Dict], scene_type: str, places365_info: Optional[Dict] = None, object_statistics: Optional[Dict] = None) -> str:
|
1452 |
"""
|
1453 |
Fill a template with specific details based on detected objects.
|
1454 |
|
|
|
1469 |
# Get object template fillers
|
1470 |
fillers = self.templates.get("object_template_fillers", {})
|
1471 |
|
1472 |
+
# Build more accurate template fillers from the per-object statistics
|
1473 |
+
statistics_based_replacements = {}
|
1474 |
+
if object_statistics:
|
1475 |
+
# Generate concrete object descriptions from the statistics
|
1476 |
+
for class_name, stats in object_statistics.items():
|
1477 |
+
count = stats.get("count", 0)
|
1478 |
+
if count > 0:
|
1479 |
+
# Generate statistics-based descriptions for common object classes
|
1480 |
+
if class_name == "potted plant":
|
1481 |
+
if count == 1:
|
1482 |
+
statistics_based_replacements["plant_elements"] = "a potted plant"
|
1483 |
+
elif count <= 3:
|
1484 |
+
statistics_based_replacements["plant_elements"] = f"{count} potted plants"
|
1485 |
+
else:
|
1486 |
+
statistics_based_replacements["plant_elements"] = f"multiple potted plants ({count} total)"
|
1487 |
+
|
1488 |
+
elif class_name == "chair":
|
1489 |
+
if count == 1:
|
1490 |
+
statistics_based_replacements["seating"] = "a chair"
|
1491 |
+
elif count <= 4:
|
1492 |
+
statistics_based_replacements["seating"] = f"{count} chairs"
|
1493 |
+
else:
|
1494 |
+
statistics_based_replacements["seating"] = f"numerous chairs ({count} total)"
|
1495 |
+
|
1496 |
+
elif class_name == "person":
|
1497 |
+
if count == 1:
|
1498 |
+
statistics_based_replacements["people_and_vehicles"] = "a person"
|
1499 |
+
statistics_based_replacements["pedestrian_flow"] = "an individual walking"
|
1500 |
+
elif count <= 5:
|
1501 |
+
statistics_based_replacements["people_and_vehicles"] = f"{count} people"
|
1502 |
+
statistics_based_replacements["pedestrian_flow"] = f"{count} people walking"
|
1503 |
+
else:
|
1504 |
+
statistics_based_replacements["people_and_vehicles"] = f"many people ({count} individuals)"
|
1505 |
+
statistics_based_replacements["pedestrian_flow"] = f"a crowd of {count} people"
|
1506 |
+
|
1507 |
# Set default values for every possible template variable
|
1508 |
default_replacements = {
|
1509 |
# Indoor-related defaults
|
|
|
1683 |
"knowledge_transfer": "learning exchanges"
|
1684 |
}
|
1685 |
|
1686 |
+
# Merge the statistics-based replacements into the defaults
|
1687 |
+
default_replacements.update(statistics_based_replacements)
|
1688 |
+
|
1689 |
+
# Add Places365-specific template variables
|
1690 |
+
places365_scene_context = ""
|
1691 |
+
places365_atmosphere = ""
|
1692 |
+
|
1693 |
+
if places365_info and places365_info.get('confidence', 0) > 0.35:
|
1694 |
+
scene_label = places365_info.get('scene_label', '').replace('_', ' ')
|
1695 |
+
attributes = places365_info.get('attributes', [])
|
1696 |
+
|
1697 |
+
if scene_label and scene_label != scene_type:
|
1698 |
+
places365_scene_context = f"characteristic of a {scene_label}"
|
1699 |
+
|
1700 |
+
if 'natural_lighting' in attributes:
|
1701 |
+
places365_atmosphere = "with natural illumination"
|
1702 |
+
elif 'artificial_lighting' in attributes:
|
1703 |
+
places365_atmosphere = "under artificial lighting"
|
1704 |
+
|
1705 |
+
# Update default_replacements with Places365 context
|
1706 |
+
if places365_scene_context:
|
1707 |
+
default_replacements["places365_context"] = places365_scene_context
|
1708 |
+
else:
|
1709 |
+
default_replacements["places365_context"] = ""
|
1710 |
+
|
1711 |
+
if places365_atmosphere:
|
1712 |
+
default_replacements["places365_atmosphere"] = places365_atmosphere
|
1713 |
+
else:
|
1714 |
+
default_replacements["places365_atmosphere"] = ""
|
1715 |
+
|
1716 |
# For each placeholder, try to fill with appropriate content
|
1717 |
for placeholder in placeholders:
|
1718 |
if placeholder in fillers:
|
|
|
1940 |
if not detected_objects:
|
1941 |
return "eye_level" # default
|
1942 |
|
1943 |
+
# Extract spatial position and size information
|
1944 |
top_region_count = 0
|
1945 |
bottom_region_count = 0
|
1946 |
total_objects = len(detected_objects)
|
|
|
1956 |
crosswalk_pattern_detected = False
|
1957 |
|
1958 |
for obj in detected_objects:
|
1959 |
+
# Count objects in the top or bottom regions
|
1960 |
region = obj["region"]
|
1961 |
if "top" in region:
|
1962 |
top_region_count += 1
|
1963 |
elif "bottom" in region:
|
1964 |
bottom_region_count += 1
|
1965 |
|
1966 |
+
# Compute the normalized size (area)
|
1967 |
if "normalized_area" in obj:
|
1968 |
sizes.append(obj["normalized_area"])
|
1969 |
|
1970 |
+
# Compute the height-to-width ratio
|
1971 |
if "normalized_size" in obj:
|
1972 |
width, height = obj["normalized_size"]
|
1973 |
if width > 0:
|
1974 |
height_width_ratios.append(height / width)
|
1975 |
|
1976 |
+
# Collect the positions of people
|
1977 |
if obj["class_id"] == 0: # 人
|
1978 |
if "normalized_center" in obj:
|
1979 |
people_positions.append(obj["normalized_center"])
|
1980 |
|
1981 |
+
# Dedicated detection logic for crosswalk intersections
|
1982 |
# Check for a clear vertical and horizontal distribution of pedestrians
|
1983 |
people_objs = [obj for obj in detected_objects if obj["class_id"] == 0] # people
|
1984 |
|
|
|
1997 |
y_range = max(y_coords) - min(y_coords)
|
1998 |
|
1999 |
# Try to detect a cross-shaped distribution
|
2000 |
+
# If both the x and y ranges are large and of similar size, this is likely an intersection
|
2001 |
if x_range > 0.5 and y_range > 0.5 and 0.7 < (x_range / y_range) < 1.3:
|
2002 |
|
2003 |
# Compute distances to the center point
|
|
|
2194 |
description = description.replace("a bed in the room", "a bed")
|
2195 |
|
2196 |
# Handle repeated object lists
|
|
|
2197 |
object_lists = re.findall(r'with ([^\.]+?)(?:\.|\band\b)', description)
|
2198 |
|
2199 |
for obj_list in object_lists:
|
|
|
2243 |
if not functional_zones:
|
2244 |
return ""
|
2245 |
|
2246 |
+
# Handle the different possible types of the functional_zones parameter
|
2247 |
+
if isinstance(functional_zones, list):
|
2248 |
+
# If it is a list, convert it to dictionary format
|
2249 |
+
zones_dict = {}
|
2250 |
+
for i, zone in enumerate(functional_zones):
|
2251 |
+
if isinstance(zone, dict) and 'name' in zone:
|
2252 |
+
zone_name = zone['name']
|
2253 |
+
else:
|
2254 |
+
zone_name = f"zone_{i}"
|
2255 |
+
zones_dict[zone_name] = zone if isinstance(zone, dict) else {"description": str(zone)}
|
2256 |
+
functional_zones = zones_dict
|
2257 |
+
elif not isinstance(functional_zones, dict):
|
2258 |
+
return ""
|
2259 |
+
|
2260 |
# Count the total number of people in the scene
|
2261 |
total_people_count = 0
|
2262 |
people_by_zone = {}
|
|
|
2296 |
|
2297 |
# Generate the summary description
|
2298 |
summary = ""
|
2299 |
+
max_mentioned_people = 0 # track the largest people count mentioned so far
|
2300 |
|
2301 |
# If the total people count is significant and not yet mentioned in the main description, add it
|
2302 |
if total_people_count > 5:
|
2303 |
summary = f"The scene contains a significant number of pedestrians ({total_people_count} people). "
|
2304 |
+
max_mentioned_people = total_people_count # update the largest people count mentioned so far
|
2305 |
|
2306 |
# Process each zone's description, keeping the people counts consistent
|
2307 |
processed_zones = []
|
|
|
2310 |
zone_desc = zone_info.get("description", "a functional zone")
|
2311 |
zone_people_count = people_by_zone.get(zone_name, 0)
|
2312 |
|
2313 |
+
# Check whether the description already contains people-count information
|
2314 |
contains_people_info = "with" in zone_desc and ("person" in zone_desc.lower() or "people" in zone_desc.lower())
|
2315 |
|
2316 |
# If the description contains a people count smaller than the largest count already mentioned, adjust the description
|
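For clarity, the list-to-dict normalization of functional_zones shown above behaves roughly like this stand-alone sketch (the input shapes are assumptions based on the code):

def normalize_zones(functional_zones):
    if isinstance(functional_zones, dict):
        return functional_zones
    if isinstance(functional_zones, list):
        zones = {}
        for i, zone in enumerate(functional_zones):
            name = zone["name"] if isinstance(zone, dict) and "name" in zone else f"zone_{i}"
            zones[name] = zone if isinstance(zone, dict) else {"description": str(zone)}
        return zones
    return {}

# normalize_zones(["seating area"]) -> {"zone_0": {"description": "seating area"}}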
evaluation_metrics.py
CHANGED
@@ -138,7 +138,7 @@ class EvaluationMetrics:
|
|
138 |
# Create empty plot if error
|
139 |
fig, ax = plt.subplots(figsize=figsize)
|
140 |
ax.text(0.5, 0.5, viz_data["error"],
|
141 |
-
ha='center', va='center', fontsize=14
|
142 |
ax.set_xlim(0, 1)
|
143 |
ax.set_ylim(0, 1)
|
144 |
ax.axis('off')
|
@@ -148,7 +148,7 @@ class EvaluationMetrics:
|
|
148 |
# Create empty plot if no data
|
149 |
fig, ax = plt.subplots(figsize=figsize)
|
150 |
ax.text(0.5, 0.5, "No detection data available",
|
151 |
-
ha='center', va='center', fontsize=14
|
152 |
ax.set_xlim(0, 1)
|
153 |
ax.set_ylim(0, 1)
|
154 |
ax.axis('off')
|
@@ -163,7 +163,6 @@ class EvaluationMetrics:
|
|
163 |
colors = [item["color"] for item in class_data]
|
164 |
|
165 |
# Create figure and horizontal bar chart with improved styling
|
166 |
-
plt.rcParams['font.family'] = 'Arial'
|
167 |
fig, ax = plt.subplots(figsize=figsize)
|
168 |
|
169 |
# Set background color to white
|
@@ -181,15 +180,15 @@ class EvaluationMetrics:
|
|
181 |
conf = class_data[i]["average_confidence"]
|
182 |
ax.text(width + 0.3, bar.get_y() + bar.get_height()/2,
|
183 |
f"{width:.0f} (conf: {conf:.2f})",
|
184 |
-
va='center', fontsize=12
|
185 |
|
186 |
# Customize axis and labels with larger fonts
|
187 |
ax.set_yticks(y_pos)
|
188 |
-
ax.set_yticklabels(class_names, fontsize=14
|
189 |
ax.invert_yaxis() # Labels read top-to-bottom
|
190 |
-
ax.set_xlabel('Count', fontsize=14
|
191 |
ax.set_title(f'Objects Detected: {viz_data["total_objects"]} Total',
|
192 |
-
fontsize=16,
|
193 |
|
194 |
# Add grid for better readability
|
195 |
ax.set_axisbelow(True)
|
@@ -204,7 +203,7 @@ class EvaluationMetrics:
|
|
204 |
f"Average Confidence: {viz_data['average_confidence']:.2f}\n"
|
205 |
f"Unique Classes: {len(viz_data['class_data'])}"
|
206 |
)
|
207 |
-
plt.figtext(0.02, 0.02, summary_text, fontsize=12,
|
208 |
bbox=dict(facecolor='white', alpha=0.9, boxstyle='round,pad=0.5',
|
209 |
edgecolor='#E5E7EB'))
|
210 |
|
|
|
138 |
# Create empty plot if error
|
139 |
fig, ax = plt.subplots(figsize=figsize)
|
140 |
ax.text(0.5, 0.5, viz_data["error"],
|
141 |
+
ha='center', va='center', fontsize=14)
|
142 |
ax.set_xlim(0, 1)
|
143 |
ax.set_ylim(0, 1)
|
144 |
ax.axis('off')
|
|
|
148 |
# Create empty plot if no data
|
149 |
fig, ax = plt.subplots(figsize=figsize)
|
150 |
ax.text(0.5, 0.5, "No detection data available",
|
151 |
+
ha='center', va='center', fontsize=14)
|
152 |
ax.set_xlim(0, 1)
|
153 |
ax.set_ylim(0, 1)
|
154 |
ax.axis('off')
|
|
|
163 |
colors = [item["color"] for item in class_data]
|
164 |
|
165 |
# Create figure and horizontal bar chart with improved styling
|
|
|
166 |
fig, ax = plt.subplots(figsize=figsize)
|
167 |
|
168 |
# Set background color to white
|
|
|
180 |
conf = class_data[i]["average_confidence"]
|
181 |
ax.text(width + 0.3, bar.get_y() + bar.get_height()/2,
|
182 |
f"{width:.0f} (conf: {conf:.2f})",
|
183 |
+
va='center', fontsize=12)
|
184 |
|
185 |
# Customize axis and labels with larger fonts
|
186 |
ax.set_yticks(y_pos)
|
187 |
+
ax.set_yticklabels(class_names, fontsize=14)
|
188 |
ax.invert_yaxis() # Labels read top-to-bottom
|
189 |
+
ax.set_xlabel('Count', fontsize=14)
|
190 |
ax.set_title(f'Objects Detected: {viz_data["total_objects"]} Total',
|
191 |
+
fontsize=16, fontweight='bold')
|
192 |
|
193 |
# Add grid for better readability
|
194 |
ax.set_axisbelow(True)
|
|
|
203 |
f"Average Confidence: {viz_data['average_confidence']:.2f}\n"
|
204 |
f"Unique Classes: {len(viz_data['class_data'])}"
|
205 |
)
|
206 |
+
plt.figtext(0.02, 0.02, summary_text, fontsize=12,
|
207 |
bbox=dict(facecolor='white', alpha=0.9, boxstyle='round,pad=0.5',
|
208 |
edgecolor='#E5E7EB'))
|
209 |
|
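The evaluation_metrics.py hunks above mainly close matplotlib calls that previously lacked their closing parentheses; the corrected call pattern looks like this minimal, self-contained example (the figure content and output filename are illustrative):

import matplotlib
matplotlib.use("Agg")  # headless backend so the example runs without a display
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(8, 6))
ax.text(0.5, 0.5, "No detection data available",
        ha='center', va='center', fontsize=14)  # closing parenthesis restored
ax.set_xlabel('Count', fontsize=14)
ax.set_title('Objects Detected: 0 Total', fontsize=16, fontweight='bold')
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)
fig.savefig("detection_summary.png")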
image_processor.py
CHANGED
@@ -13,6 +13,7 @@ from visualization_helper import VisualizationHelper
|
|
13 |
from evaluation_metrics import EvaluationMetrics
|
14 |
from lighting_analyzer import LightingAnalyzer
|
15 |
from scene_analyzer import SceneAnalyzer
|
|
|
16 |
|
17 |
class ImageProcessor:
|
18 |
"""
|
@@ -20,13 +21,76 @@ class ImageProcessor:
|
|
20 |
Separates processing logic from UI components
|
21 |
"""
|
22 |
|
23 |
-
def __init__(self, use_llm=True, llm_model_path=None):
|
24 |
"""Initialize the image processor with required components"""
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
|
31 |
def get_model_instance(self, model_name: str, confidence: float = 0.25, iou: float = 0.25) -> DetectionModel:
|
32 |
"""
|
@@ -53,48 +117,74 @@ class ImageProcessor:
|
|
53 |
|
54 |
return self.model_instances[model_name]
|
55 |
|
56 |
-
def analyze_scene(self, detection_result: Any, lighting_info: Optional[Dict] = None) -> Dict:
|
57 |
"""
|
58 |
Perform scene analysis on detection results
|
59 |
|
60 |
Args:
|
61 |
detection_result: Object detection result from YOLOv8
|
62 |
lighting_info: Lighting condition analysis results (optional)
|
|
|
|
|
63 |
|
64 |
Returns:
|
65 |
Dictionary containing scene analysis results
|
66 |
"""
|
|
|
67 |
try:
|
68 |
-
#
|
69 |
-
|
|
|
|
|
|
|
|
|
70 |
self.scene_analyzer = SceneAnalyzer(
|
71 |
-
class_names=
|
72 |
use_llm=self.use_llm,
|
|
|
|
|
73 |
llm_model_path=self.llm_model_path
|
74 |
)
|
75 |
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
80 |
|
81 |
-
# Perform scene analysis with lighting info
|
82 |
scene_analysis = self.scene_analyzer.analyze(
|
83 |
detection_result=detection_result,
|
84 |
lighting_info=lighting_info,
|
85 |
class_confidence_threshold=0.35,
|
86 |
-
scene_confidence_threshold=0.6
|
|
|
|
|
87 |
)
|
88 |
|
89 |
return scene_analysis
|
|
|
90 |
except Exception as e:
|
91 |
print(f"Error in scene analysis: {str(e)}")
|
92 |
import traceback
|
93 |
traceback.print_exc()
|
|
|
|
|
94 |
return {
|
95 |
"scene_type": "unknown",
|
96 |
"confidence": 0.0,
|
97 |
"description": f"Error during scene analysis: {str(e)}",
|
|
|
98 |
"objects_present": [],
|
99 |
"object_count": 0,
|
100 |
"regions": {},
|
@@ -103,146 +193,256 @@ class ImageProcessor:
|
|
103 |
"lighting_conditions": lighting_info or {"time_of_day": "unknown", "confidence": 0.0}
|
104 |
}
|
105 |
|
106 |
-
def analyze_lighting_conditions(self, image):
|
107 |
"""
|
108 |
-
|
109 |
|
110 |
Args:
|
111 |
image: Input image
|
|
|
112 |
|
113 |
Returns:
|
114 |
Dict: Lighting analysis results
|
115 |
"""
|
116 |
-
return self.lighting_analyzer.analyze(image)
|
117 |
|
118 |
-
def
|
119 |
"""
|
120 |
-
|
121 |
|
122 |
Args:
|
123 |
-
image: Input image (
|
124 |
-
model_name: Name of the model to use
|
125 |
-
confidence_threshold: Confidence threshold for detection
|
126 |
-
filter_classes: Optional list of classes to filter results
|
127 |
|
128 |
Returns:
|
129 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
130 |
"""
|
131 |
-
# Get model instance
|
132 |
model_instance = self.get_model_instance(model_name, confidence_threshold)
|
|
|
|
|
133 |
|
134 |
-
# Initialize key variables
|
135 |
result = None
|
136 |
-
|
137 |
temp_path = None
|
|
|
138 |
|
139 |
try:
|
140 |
-
# Processing input image
|
141 |
if isinstance(image, np.ndarray):
|
142 |
-
|
143 |
-
|
144 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
145 |
else:
|
146 |
-
|
147 |
-
|
|
|
148 |
elif image is None:
|
149 |
return None, "No image provided. Please upload an image.", {}
|
150 |
else:
|
151 |
-
|
|
|
|
|
|
|
152 |
|
153 |
-
#
|
154 |
-
|
155 |
|
156 |
-
|
157 |
-
|
|
|
158 |
temp_filename = f"temp_{uuid.uuid4().hex}.jpg"
|
159 |
temp_path = os.path.join(temp_dir, temp_filename)
|
160 |
-
|
161 |
|
162 |
-
# Object detection
|
163 |
result = model_instance.detect(temp_path)
|
164 |
|
165 |
-
if result is None:
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
|
|
|
|
170 |
|
171 |
-
#
|
|
|
172 |
spatial_metrics = EvaluationMetrics.calculate_distance_metrics(result)
|
173 |
-
|
|
|
|
|
|
|
174 |
|
175 |
-
# Add lighting information
|
176 |
-
stats["lighting_conditions"] = lighting_info
|
177 |
-
|
178 |
-
# Apply filter if specified
|
179 |
if filter_classes and len(filter_classes) > 0:
|
180 |
-
# Get classes, boxes, confidence
|
181 |
classes = result.boxes.cls.cpu().numpy().astype(int)
|
182 |
confs = result.boxes.conf.cpu().numpy()
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
filtered_stats = {
|
190 |
-
"total_objects": int(np.sum(mask)),
|
191 |
-
"class_statistics": {},
|
192 |
-
"average_confidence": float(np.mean(confs[mask])) if np.any(mask) else 0,
|
193 |
-
"spatial_metrics": stats["spatial_metrics"],
|
194 |
"lighting_conditions": lighting_info
|
195 |
}
|
196 |
-
|
197 |
-
|
198 |
names = result.names
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
self.color_mapper
|
|
|
215 |
)
|
216 |
|
217 |
-
|
218 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
219 |
)
|
220 |
|
221 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
222 |
|
223 |
-
|
224 |
-
|
225 |
-
|
|
|
|
|
226 |
|
227 |
-
|
228 |
-
|
|
|
229 |
|
230 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
231 |
|
232 |
except Exception as e:
|
233 |
-
error_message = f"Error
|
234 |
import traceback
|
235 |
traceback.print_exc()
|
236 |
-
|
237 |
-
return None, error_message, {}
|
238 |
-
|
239 |
finally:
|
240 |
if temp_path and os.path.exists(temp_path):
|
241 |
-
try:
|
242 |
-
|
243 |
-
except Exception as e:
|
244 |
-
print(f"Cannot delete temp files {temp_path}: {str(e)}")
|
245 |
-
|
246 |
|
247 |
def format_result_text(self, stats: Dict) -> str:
|
248 |
"""
|
@@ -281,7 +481,7 @@ class ImageProcessor:
|
|
281 |
else:
|
282 |
lines.append("No class information available.")
|
283 |
|
284 |
-
#
|
285 |
if "spatial_metrics" in stats and "spatial_distribution" in stats["spatial_metrics"]:
|
286 |
lines.append("Object Distribution:")
|
287 |
|
|
|
13 |
from evaluation_metrics import EvaluationMetrics
|
14 |
from lighting_analyzer import LightingAnalyzer
|
15 |
from scene_analyzer import SceneAnalyzer
|
16 |
+
from places365_model import Places365Model
|
17 |
|
18 |
class ImageProcessor:
|
19 |
"""
|
|
|
21 |
Separates processing logic from UI components
|
22 |
"""
|
23 |
|
24 |
+
def __init__(self, use_llm=True, llm_model_path=None, enable_places365=True, places365_model_name='resnet50_places365'):
|
25 |
"""Initialize the image processor with required components"""
|
26 |
+
print(f"Initializing ImageProcessor with use_llm={use_llm}, enable_places365={enable_places365}")
|
27 |
+
|
28 |
+
try:
|
29 |
+
# Initialize basic components first
|
30 |
+
self.use_llm = use_llm
|
31 |
+
self.llm_model_path = llm_model_path
|
32 |
+
self.enable_places365 = enable_places365
|
33 |
+
self.model_instances = {}
|
34 |
+
|
35 |
+
# Initialize ColorMapper
|
36 |
+
self.color_mapper = ColorMapper()
|
37 |
+
print("ColorMapper initialized successfully")
|
38 |
+
|
39 |
+
# Initialize LightingAnalyzer
|
40 |
+
self.lighting_analyzer = LightingAnalyzer()
|
41 |
+
print("LightingAnalyzer initialized successfully")
|
42 |
+
|
43 |
+
# Initialize Places365 model if enabled
|
44 |
+
self.places365_model = None
|
45 |
+
if self.enable_places365:
|
46 |
+
try:
|
47 |
+
self.places365_model = Places365Model(
|
48 |
+
model_name=places365_model_name,
|
49 |
+
device=None
|
50 |
+
)
|
51 |
+
print(f"Places365 model initialized successfully with {places365_model_name}")
|
52 |
+
except Exception as e:
|
53 |
+
print(f"Warning: Failed to initialize Places365 model: {e}")
|
54 |
+
print("Continuing without Places365 analysis")
|
55 |
+
self.enable_places365 = False
|
56 |
+
self.places365_model = None
|
57 |
+
|
58 |
+
# Initialize SceneAnalyzer with error handling
|
59 |
+
self.scene_analyzer = None
|
60 |
+
self.class_names = None # Will be set when first model is loaded
|
61 |
+
|
62 |
+
try:
|
63 |
+
# Initialize SceneAnalyzer without class_names (will be set later)
|
64 |
+
self.scene_analyzer = SceneAnalyzer(
|
65 |
+
class_names=None,
|
66 |
+
use_llm=self.use_llm,
|
67 |
+
use_clip=True,
|
68 |
+
enable_landmark=True,
|
69 |
+
llm_model_path=self.llm_model_path
|
70 |
+
)
|
71 |
+
print("SceneAnalyzer initialized successfully")
|
72 |
+
|
73 |
+
# Verify critical components
|
74 |
+
if self.scene_analyzer is not None:
|
75 |
+
print(f"SceneAnalyzer status - spatial_analyzer: {hasattr(self.scene_analyzer, 'spatial_analyzer')}, "
|
76 |
+
f"descriptor: {hasattr(self.scene_analyzer, 'descriptor')}, "
|
77 |
+
f"scene_describer: {hasattr(self.scene_analyzer, 'scene_describer')}")
|
78 |
+
else:
|
79 |
+
print("WARNING: scene_analyzer is None after initialization")
|
80 |
+
|
81 |
+
except Exception as e:
|
82 |
+
print(f"Error initializing SceneAnalyzer: {e}")
|
83 |
+
import traceback
|
84 |
+
traceback.print_exc()
|
85 |
+
self.scene_analyzer = None
|
86 |
+
|
87 |
+
print("ImageProcessor initialization completed successfully")
|
88 |
+
|
89 |
+
except Exception as e:
|
90 |
+
print(f"Critical error during ImageProcessor initialization: {e}")
|
91 |
+
import traceback
|
92 |
+
traceback.print_exc()
|
93 |
+
raise RuntimeError(f"Failed to initialize ImageProcessor: {str(e)}")
|
94 |
|
95 |
def get_model_instance(self, model_name: str, confidence: float = 0.25, iou: float = 0.25) -> DetectionModel:
|
96 |
"""
|
|
|
117 |
|
118 |
return self.model_instances[model_name]
|
119 |
|
120 |
+
def analyze_scene(self, detection_result: Any, lighting_info: Optional[Dict] = None, enable_landmark=True, places365_info=None) -> Dict:
|
121 |
"""
|
122 |
Perform scene analysis on detection results
|
123 |
|
124 |
Args:
|
125 |
detection_result: Object detection result from YOLOv8
|
126 |
lighting_info: Lighting condition analysis results (optional)
|
127 |
+
enable_landmark: Whether to enable landmark detection
|
128 |
+
places365_info: Places365 analysis results (optional)
|
129 |
|
130 |
Returns:
|
131 |
Dictionary containing scene analysis results
|
132 |
"""
|
133 |
+
print(f"DEBUG: analyze_scene received enable_landmark={enable_landmark}")
|
134 |
try:
|
135 |
+
# Check if detection_result has valid names
|
136 |
+
class_names = getattr(detection_result, 'names', None) if detection_result else None
|
137 |
+
|
138 |
+
# Initialize or reinitialize scene analyzer if needed
|
139 |
+
if self.scene_analyzer is None:
|
140 |
+
print("Scene analyzer not initialized, creating new instance")
|
141 |
self.scene_analyzer = SceneAnalyzer(
|
142 |
+
class_names=class_names,
|
143 |
use_llm=self.use_llm,
|
144 |
+
use_clip=True,
|
145 |
+
enable_landmark=enable_landmark,
|
146 |
llm_model_path=self.llm_model_path
|
147 |
)
|
148 |
|
149 |
+
if self.scene_analyzer is None:
|
150 |
+
raise ValueError("Failed to create SceneAnalyzer instance")
|
151 |
+
else:
|
152 |
+
# Update existing scene analyzer settings
|
153 |
+
self.scene_analyzer.enable_landmark = enable_landmark
|
154 |
+
|
155 |
+
# Update class names if available and different
|
156 |
+
if class_names and self.scene_analyzer.class_names != class_names:
|
157 |
+
self.scene_analyzer.class_names = class_names
|
158 |
+
if hasattr(self.scene_analyzer, 'spatial_analyzer') and self.scene_analyzer.spatial_analyzer:
|
159 |
+
self.scene_analyzer.spatial_analyzer.class_names = class_names
|
160 |
+
|
161 |
+
# Update landmark detection settings in child components
|
162 |
+
if hasattr(self.scene_analyzer, 'spatial_analyzer') and self.scene_analyzer.spatial_analyzer:
|
163 |
+
self.scene_analyzer.spatial_analyzer.enable_landmark = enable_landmark
|
164 |
|
165 |
+
# Perform scene analysis with lighting info and Places365 context
|
166 |
scene_analysis = self.scene_analyzer.analyze(
|
167 |
detection_result=detection_result,
|
168 |
lighting_info=lighting_info,
|
169 |
class_confidence_threshold=0.35,
|
170 |
+
scene_confidence_threshold=0.6,
|
171 |
+
enable_landmark=enable_landmark,
|
172 |
+
places365_info=places365_info
|
173 |
)
|
174 |
|
175 |
return scene_analysis
|
176 |
+
|
177 |
except Exception as e:
|
178 |
print(f"Error in scene analysis: {str(e)}")
|
179 |
import traceback
|
180 |
traceback.print_exc()
|
181 |
+
|
182 |
+
# Return a valid default result
|
183 |
return {
|
184 |
"scene_type": "unknown",
|
185 |
"confidence": 0.0,
|
186 |
"description": f"Error during scene analysis: {str(e)}",
|
187 |
+
"enhanced_description": "Scene analysis could not be completed due to an error.",
|
188 |
"objects_present": [],
|
189 |
"object_count": 0,
|
190 |
"regions": {},
|
|
|
193 |
"lighting_conditions": lighting_info or {"time_of_day": "unknown", "confidence": 0.0}
|
194 |
}
|
195 |
|
196 |
+
def analyze_lighting_conditions(self, image, places365_info: Optional[Dict] = None):
|
197 |
"""
|
198 |
+
Analyze lighting conditions, taking Places365 scene information into account.
|
199 |
|
200 |
Args:
|
201 |
image: Input image
|
202 |
+
places365_info: Places365 scene analysis results, used for the override logic
|
203 |
|
204 |
Returns:
|
205 |
Dict: Lighting analysis results
|
206 |
"""
|
207 |
+
return self.lighting_analyzer.analyze(image, places365_info=places365_info)
|
208 |
|
209 |
+
def analyze_places365_scene(self, image):
|
210 |
"""
|
211 |
+
Analyze scene using Places365 model.
|
212 |
|
213 |
Args:
|
214 |
+
image: Input image (PIL Image)
|
|
|
|
|
|
|
215 |
|
216 |
Returns:
|
217 |
+
Dict: Places365 analysis results or None if disabled/failed
|
218 |
+
"""
|
219 |
+
if not self.enable_places365 or self.places365_model is None:
|
220 |
+
return None
|
221 |
+
|
222 |
+
try:
|
223 |
+
if not isinstance(image, Image.Image):
|
224 |
+
if isinstance(image, np.ndarray):
|
225 |
+
image = Image.fromarray(image)
|
226 |
+
else:
|
227 |
+
print(f"Warning: Cannot process image of type {type(image)} for Places365")
|
228 |
+
return None
|
229 |
+
|
230 |
+
places365_result = self.places365_model.predict(image)
|
231 |
+
|
232 |
+
if places365_result and places365_result.get('confidence', 0) > 0.1:
|
233 |
+
print(f"Places365 detected: {places365_result['scene_label']} "
|
234 |
+
f"(mapped: {places365_result['mapped_scene_type']}) "
|
235 |
+
f"confidence: {places365_result['confidence']:.3f}")
|
236 |
+
return places365_result
|
237 |
+
else:
|
238 |
+
print("Places365 analysis failed or low confidence")
|
239 |
+
return None
|
240 |
+
|
241 |
+
except Exception as e:
|
242 |
+
print(f"Error in Places365 analysis: {str(e)}")
|
243 |
+
return None
|
244 |
+
|
245 |
+
def process_image(self, image: Any, model_name: str, confidence_threshold: float, filter_classes: Optional[List[int]] = None, enable_landmark: bool = True) -> Tuple[Any, str, Dict]:
|
246 |
+
"""
|
247 |
+
Process an image for object detection and scene analysis.
|
248 |
+
Args:
|
249 |
+
image: Input image (numpy array or PIL Image).
|
250 |
+
model_name: Name of the model to use.
|
251 |
+
confidence_threshold: Confidence threshold for detection.
|
252 |
+
filter_classes: Optional list of classes to filter results.
|
253 |
+
enable_landmark: Whether to enable landmark detection for this run.
|
254 |
+
Returns:
|
255 |
+
Tuple of (result_image_pil, result_text, stats_data_with_scene_analysis).
|
256 |
"""
|
|
|
257 |
model_instance = self.get_model_instance(model_name, confidence_threshold)
|
258 |
+
if model_instance is None:
|
259 |
+
return None, f"Failed to load model: {model_name}. Please check model configuration.", {}
|
260 |
|
|
|
261 |
result = None
|
262 |
+
stats_data = {}
|
263 |
temp_path = None
|
264 |
+
pil_image_for_processing = None # Use this to store the consistently processed PIL image
|
265 |
|
266 |
try:
|
|
|
267 |
if isinstance(image, np.ndarray):
|
268 |
+
if image.ndim == 3 and image.shape[2] == 3: # RGB or BGR
|
269 |
+
# Assuming BGR from OpenCV, convert to RGB for PIL standard
|
270 |
+
image_rgb_np = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
|
271 |
+
pil_image_for_processing = Image.fromarray(image_rgb_np)
|
272 |
+
elif image.ndim == 3 and image.shape[2] == 4: # RGBA or BGRA
|
273 |
+
image_rgba_np = cv2.cvtColor(image, cv2.COLOR_BGRA2RGBA) # Ensure RGBA
|
274 |
+
pil_image_for_processing = Image.fromarray(image_rgba_np).convert("RGB") # Convert to RGB
|
275 |
+
elif image.ndim == 2: # Grayscale
|
276 |
+
pil_image_for_processing = Image.fromarray(image).convert("RGB")
|
277 |
else:
|
278 |
+
pil_image_for_processing = Image.fromarray(image) # Hope for the best
|
279 |
+
elif isinstance(image, Image.Image):
|
280 |
+
pil_image_for_processing = image.copy() # Use a copy
|
281 |
elif image is None:
|
282 |
return None, "No image provided. Please upload an image.", {}
|
283 |
else:
|
284 |
+
return None, f"Unsupported image type: {type(image)}. Please provide a NumPy array or PIL Image.", {}
|
285 |
+
|
286 |
+
if pil_image_for_processing.mode != "RGB": # Ensure final image is RGB
|
287 |
+
pil_image_for_processing = pil_image_for_processing.convert("RGB")
|
288 |
|
289 |
+
# Add Places365 scene analysis parallel to lighting analysis
|
290 |
+
places365_info = self.analyze_places365_scene(pil_image_for_processing)
|
291 |
|
292 |
+
lighting_info = self.analyze_lighting_conditions(pil_image_for_processing, places365_info=places365_info)
|
293 |
+
|
294 |
+
temp_dir = tempfile.gettempdir()
|
295 |
temp_filename = f"temp_{uuid.uuid4().hex}.jpg"
|
296 |
temp_path = os.path.join(temp_dir, temp_filename)
|
297 |
+
pil_image_for_processing.save(temp_path, format="JPEG")
|
298 |
|
|
|
299 |
result = model_instance.detect(temp_path)
|
300 |
|
301 |
+
if result is None or not hasattr(result, 'boxes'):
|
302 |
+
scene_analysis_no_yolo = self.analyze_scene(result, lighting_info, enable_landmark=enable_landmark, places365_info=places365_info)
|
303 |
+
desc_no_yolo = scene_analysis_no_yolo.get("enhanced_description", scene_analysis_no_yolo.get("description", "Detection failed, scene context analysis attempted."))
|
304 |
+
stats_data["scene_analysis"] = scene_analysis_no_yolo
|
305 |
+
if places365_info:
|
306 |
+
stats_data["places365_analysis"] = places365_info
|
307 |
+
return pil_image_for_processing, desc_no_yolo, stats_data
|
308 |
|
309 |
+
# Basic statistics
|
310 |
+
stats_data = EvaluationMetrics.calculate_basic_stats(result)
|
311 |
spatial_metrics = EvaluationMetrics.calculate_distance_metrics(result)
|
312 |
+
stats_data["spatial_metrics"] = spatial_metrics
|
313 |
+
stats_data["lighting_conditions"] = lighting_info
|
314 |
+
if places365_info:
|
315 |
+
stats_data["places365_analysis"] = places365_info
|
316 |
|
|
|
|
|
|
|
|
|
317 |
if filter_classes and len(filter_classes) > 0:
|
|
|
318 |
classes = result.boxes.cls.cpu().numpy().astype(int)
|
319 |
confs = result.boxes.conf.cpu().numpy()
|
320 |
+
mask = np.isin(classes, filter_classes)
|
321 |
+
filtered_stats_data = {
|
322 |
+
"total_objects": int(np.sum(mask)), "class_statistics": {},
|
323 |
+
"average_confidence": float(np.mean(confs[mask])) if np.any(mask) else 0.0,
|
324 |
+
"spatial_metrics": stats_data.get("spatial_metrics",{}),
|
|
|
|
|
|
|
|
|
|
|
|
|
325 |
"lighting_conditions": lighting_info
|
326 |
}
|
327 |
+
if places365_info:
|
328 |
+
filtered_stats_data["places365_analysis"] = places365_info
|
329 |
names = result.names
|
330 |
+
class_conf_sums = {}
|
331 |
+
for cls_id_int, conf_val in zip(classes[mask], confs[mask]):
|
332 |
+
cls_name = names[cls_id_int]
|
333 |
+
if cls_name not in filtered_stats_data["class_statistics"]:
|
334 |
+
filtered_stats_data["class_statistics"][cls_name] = {"count": 0}
|
335 |
+
class_conf_sums[cls_name] = 0.0
|
336 |
+
filtered_stats_data["class_statistics"][cls_name]["count"] += 1 # 累計統計資訊
|
337 |
+
class_conf_sums[cls_name] += conf_val
|
338 |
+
for cls_name_stat, data_stat in filtered_stats_data["class_statistics"].items():
|
339 |
+
data_stat["average_confidence"] = round(class_conf_sums[cls_name_stat] / data_stat["count"] if data_stat["count"] > 0 else 0.0, 4)
|
340 |
+
stats_data = filtered_stats_data
|
341 |
+
|
342 |
+
viz_data = EvaluationMetrics.generate_visualization_data(result, self.color_mapper.get_all_colors())
|
343 |
+
|
344 |
+
result_image_pil = VisualizationHelper.visualize_detection(
|
345 |
+
temp_path, result, color_mapper=self.color_mapper,
|
346 |
+
figsize=(12, 12), return_pil=True, filter_classes=filter_classes
|
347 |
)
|
348 |
|
349 |
+
result_text_summary = EvaluationMetrics.format_detection_summary(viz_data)
|
350 |
+
|
351 |
+
# Pass the enable_landmark parameter from function signature
|
352 |
+
# Initialize or update scene analyzer if needed
|
353 |
+
if self.scene_analyzer is None:
|
354 |
+
print("Creating SceneAnalyzer in process_image")
|
355 |
+
self.scene_analyzer = SceneAnalyzer(
|
356 |
+
class_names=result.names if result else None,
|
357 |
+
use_llm=self.use_llm,
|
358 |
+
use_clip=True,
|
359 |
+
enable_landmark=enable_landmark,
|
360 |
+
llm_model_path=self.llm_model_path
|
361 |
+
)
|
362 |
+
|
363 |
+
if self.scene_analyzer is None:
|
364 |
+
print("ERROR: Failed to create SceneAnalyzer in process_image")
|
365 |
+
else:
|
366 |
+
# Update existing scene analyzer with current settings
|
367 |
+
if result and hasattr(result, 'names'):
|
368 |
+
self.scene_analyzer.class_names = result.names
|
369 |
+
if hasattr(self.scene_analyzer, 'spatial_analyzer') and self.scene_analyzer.spatial_analyzer:
|
370 |
+
self.scene_analyzer.spatial_analyzer.class_names = result.names
|
371 |
+
|
372 |
+
self.scene_analyzer.enable_landmark = enable_landmark
|
373 |
+
if hasattr(self.scene_analyzer, 'spatial_analyzer') and self.scene_analyzer.spatial_analyzer:
|
374 |
+
self.scene_analyzer.spatial_analyzer.enable_landmark = enable_landmark
|
375 |
+
|
376 |
+
# Perform scene analysis using the existing analyze_scene method
|
377 |
+
scene_analysis_result = self.analyze_scene(
|
378 |
+
detection_result=result,
|
379 |
+
lighting_info=lighting_info,
|
380 |
+
enable_landmark=enable_landmark,
|
381 |
+
places365_info=places365_info
|
382 |
)
|
383 |
|
384 |
+
stats_data["scene_analysis"] = scene_analysis_result
|
385 |
+
|
386 |
+
final_result_text = result_text_summary
|
387 |
+
|
388 |
+
# Use enable_landmark parameter for landmark block
|
389 |
+
if enable_landmark and "detected_landmarks" in scene_analysis_result:
|
390 |
+
landmarks_detected = scene_analysis_result.get("detected_landmarks", [])
|
391 |
+
if not landmarks_detected and scene_analysis_result.get("primary_landmark"):
|
392 |
+
primary_lm = scene_analysis_result.get("primary_landmark")
|
393 |
+
if isinstance(primary_lm, dict): landmarks_detected = [primary_lm]
|
394 |
+
|
395 |
+
if landmarks_detected:
|
396 |
+
final_result_text += "\n\n--- Detected Landmarks ---\n"
|
397 |
+
# Ensure drawing on the correct PIL image
|
398 |
+
img_to_draw_on = result_image_pil.copy() # Draw on a copy
|
399 |
+
img_for_drawing_cv2 = cv2.cvtColor(np.array(img_to_draw_on), cv2.COLOR_RGB2BGR)
|
400 |
+
|
401 |
+
for landmark_item in landmarks_detected:
|
402 |
+
if not isinstance(landmark_item, dict): continue
|
403 |
+
|
404 |
+
# Use .get() for all potentially missing keys, to be safe
|
405 |
+
landmark_name_disp = landmark_item.get("class_name", landmark_item.get("name", "N/A"))
|
406 |
+
landmark_loc_disp = landmark_item.get("location", "N/A")
|
407 |
+
landmark_conf_disp = landmark_item.get("confidence", 0.0)
|
408 |
+
|
409 |
+
final_result_text += f"• {landmark_name_disp} ({landmark_loc_disp}, confidence: {landmark_conf_disp:.2f})\n"
|
410 |
|
411 |
+
if "box" in landmark_item:
|
412 |
+
box = landmark_item["box"]
|
413 |
+
pt1 = (int(box[0]), int(box[1])); pt2 = (int(box[2]), int(box[3]))
|
414 |
+
color_lm = (255, 0, 255); thickness_lm = 3 # Magenta BGR
|
415 |
+
cv2.rectangle(img_for_drawing_cv2, pt1, pt2, color_lm, thickness_lm)
|
416 |
|
417 |
+
label_lm = f"{landmark_name_disp} ({landmark_conf_disp:.2f})"
|
418 |
+
font_scale_lm = 0.6; font_thickness_lm = 1
|
419 |
+
(w_text, h_text), baseline = cv2.getTextSize(label_lm, cv2.FONT_HERSHEY_SIMPLEX, font_scale_lm, font_thickness_lm)
|
420 |
|
421 |
+
# Label position logic (simplified for brevity)
|
422 |
+
label_y_pos = pt1[1] - baseline - 3
|
423 |
+
if label_y_pos < h_text : # If label goes above image, put it below box
|
424 |
+
label_y_pos = pt2[1] + h_text + baseline + 3
|
425 |
+
|
426 |
+
label_bg_pt1 = (pt1[0], label_y_pos - h_text - baseline)
|
427 |
+
label_bg_pt2 = (pt1[0] + w_text, label_y_pos + baseline)
|
428 |
+
|
429 |
+
cv2.rectangle(img_for_drawing_cv2, label_bg_pt1, label_bg_pt2, color_lm, -1)
|
430 |
+
cv2.putText(img_for_drawing_cv2, label_lm, (pt1[0], label_y_pos),
|
431 |
+
cv2.FONT_HERSHEY_SIMPLEX, font_scale_lm, (255,255,255), font_thickness_lm, cv2.LINE_AA)
|
432 |
+
|
433 |
+
result_image_pil = Image.fromarray(cv2.cvtColor(img_for_drawing_cv2, cv2.COLOR_BGR2RGB))
|
434 |
+
|
435 |
+
return result_image_pil, final_result_text, stats_data
|
436 |
|
437 |
except Exception as e:
|
438 |
+
error_message = f"Error in ImageProcessor.process_image: {str(e)}"
|
439 |
import traceback
|
440 |
traceback.print_exc()
|
441 |
+
return pil_image_for_processing if pil_image_for_processing else None, error_message, {}
|
|
|
|
|
442 |
finally:
|
443 |
if temp_path and os.path.exists(temp_path):
|
444 |
+
try: os.remove(temp_path)
|
445 |
+
except Exception as e: print(f"Warning: Cannot delete temp file {temp_path}: {str(e)}")
|
|
|
|
|
|
|
446 |
|
447 |
def format_result_text(self, stats: Dict) -> str:
|
448 |
"""
|
|
|
481 |
else:
|
482 |
lines.append("No class information available.")
|
483 |
|
484 |
+
# 添加空間資訊
|
485 |
if "spatial_metrics" in stats and "spatial_distribution" in stats["spatial_metrics"]:
|
486 |
lines.append("Object Distribution:")
|
487 |
|
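The landmark-label placement above can be read as one small, reusable pattern: measure the text with cv2.getTextSize, flip the label below the box when it would leave the top of the image, and draw a filled background before the white text. The sketch below is a minimal, self-contained illustration of that pattern; the helper name and default arguments are illustrative and not part of the repository.

import cv2
import numpy as np

def draw_landmark_label(img_bgr: np.ndarray, box, text: str,
                        color=(255, 0, 255), font_scale=0.6, thickness=1) -> None:
    """Draw a box plus a filled label that stays inside the image (illustrative helper)."""
    pt1, pt2 = (int(box[0]), int(box[1])), (int(box[2]), int(box[3]))
    cv2.rectangle(img_bgr, pt1, pt2, color, 3)

    (w_text, h_text), baseline = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, font_scale, thickness)
    y = pt1[1] - baseline - 3                    # default: label above the box
    if y < h_text:                               # would fall off the top edge
        y = pt2[1] + h_text + baseline + 3       # place it below the box instead

    cv2.rectangle(img_bgr, (pt1[0], y - h_text - baseline), (pt1[0] + w_text, y + baseline), color, -1)
    cv2.putText(img_bgr, text, (pt1[0], y), cv2.FONT_HERSHEY_SIMPLEX,
                font_scale, (255, 255, 255), thickness, cv2.LINE_AA)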
landmark_activities.py
ADDED
The diff for this file is too large to render. See raw diff.

landmark_data.py
ADDED
The diff for this file is too large to render. See raw diff.

lighting_analyzer.py
CHANGED
The diff for this file is too large to render. See raw diff.
lighting_conditions.py
CHANGED
@@ -12,6 +12,36 @@ LIGHTING_CONDITIONS = {
         "bright": "The scene has the diffused bright lighting of an overcast day.",
         "medium": "The scene has even, soft lighting typical of a cloudy day.",
         "dim": "The scene has the muted lighting of a heavily overcast day."
     },
+    "day_cloudy_gray": {
+        "general": "The scene is captured during an overcast day with muted gray lighting.",
+        "bright": "The scene has bright but diffused gray daylight from heavy cloud cover.",
+        "medium": "The scene has even, muted lighting typical of a gray, overcast day.",
+        "dim": "The scene has subdued lighting under thick gray clouds."
+    },
+    "indoor_residential_natural": {
+        "general": "The scene is captured in a residential setting with natural window lighting.",
+        "bright": "The residential space is brightly lit with abundant natural light from windows.",
+        "medium": "The home interior has comfortable natural lighting complemented by artificial sources.",
+        "dim": "The residential space has soft natural lighting creating a cozy atmosphere."
+    },
+    "indoor_designer_residential": {
+        "general": "The scene is captured in a well-designed residential space with curated lighting.",
+        "bright": "The residential interior features bright, designer lighting creating an elegant atmosphere.",
+        "medium": "The home space has thoughtfully planned lighting balancing aesthetics and functionality.",
+        "dim": "The residential area has sophisticated mood lighting enhancing the design elements."
+    },
+    "indoor_bright_natural_mix": {
+        "general": "The scene is captured indoors with a blend of natural and artificial lighting.",
+        "bright": "The indoor space combines bright natural window light with artificial illumination.",
+        "medium": "The interior has balanced mixed lighting from windows and electric sources.",
+        "dim": "The indoor area has gentle mixed lighting creating comfortable illumination."
+    },
+    "indoor_restaurant_bar": {
+        "general": "The scene is captured inside a restaurant or bar with characteristic warm lighting.",
+        "bright": "The dining establishment is well-lit with warm illumination emphasizing ambiance.",
+        "medium": "The restaurant/bar has moderate warm lighting creating a comfortable social atmosphere.",
+        "dim": "The establishment features soft, warm lighting creating an intimate dining or social atmosphere."
+    },
     "sunset/sunrise": {
         "general": "The scene is captured during golden hour with warm lighting.",
@@ -81,6 +111,10 @@
         "beach_lighting": "sun-drenched",
         "sports_venue_lighting": "arena-lit",
         "professional_kitchen_lighting": "kitchen-task lit",
+        "day_cloudy_gray": "gray-lit",
+        "indoor_residential_natural": "naturally-lit residential",
+        "indoor_designer_residential": "designer-lit residential",
+        "indoor_bright_natural_mix": "mixed-lit indoor",
         "unknown": ""
     },
     "activity_modifiers": {
@@ -127,5 +161,11 @@
         "bright": "The space blends bright natural and artificial light sources across indoor-outdoor boundaries.",
         "medium": "The area combines moderate indoor lighting with outdoor illumination in a balanced way.",
         "dim": "The transition space features subtle lighting gradients between indoor and outdoor zones."
+    },
+    "stadium_or_floodlit_area": {
+        "general": "The scene is captured under powerful floodlights creating uniform bright illumination.",
+        "bright": "The area is intensely illuminated by floodlights, similar to stadium conditions.",
+        "medium": "The space has even, powerful lighting typical of sports facilities or outdoor events.",
+        "dim": "The area has moderate floodlight illumination providing consistent lighting across the space."
     }
 }
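These templates are keyed by lighting condition and then by brightness level. A minimal sketch of how a consumer such as the lighting analyzer might resolve a sentence from this table is shown below; the helper name, the fallback order, and the assumption that the condition dicts sit at the top level of LIGHTING_CONDITIONS (rather than nested under a section key) are all illustrative, not code from this commit.

from lighting_conditions import LIGHTING_CONDITIONS

def describe_lighting(condition: str, brightness: str = "general",
                      table: dict = LIGHTING_CONDITIONS) -> str:
    """Resolve a lighting sentence, falling back to the condition's 'general' entry."""
    levels = table.get(condition, {})
    if not isinstance(levels, dict):
        return ""
    return levels.get(brightness) or levels.get("general", "")

# describe_lighting("day_cloudy_gray", "dim")
# -> "The scene has subdued lighting under thick gray clouds."  (given the top-level layout assumed above)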
llm_enhancer.py
CHANGED
@@ -19,7 +19,6 @@ class LLMEnhancer:
                  top_p: float = 0.85):
         """
         Initialize the LLM enhancer.
         Args:
             model_path: Path or Hugging Face ID of the LLM; defaults to Llama 3.2
             tokenizer_path: Path of the tokenizer, usually the same as model_path
@@ -38,7 +37,7 @@ class LLMEnhancer:
         self.model_path = model_path or "meta-llama/Llama-3.2-3B-Instruct"
         self.tokenizer_path = tokenizer_path or self.model_path

+        # check device
         self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
         self.logger.info(f"Using device: {self.device}")
@@ -50,7 +49,7 @@ class LLMEnhancer:
         self.model = None
         self.tokenizer = None

+        # track the number of model calls
         self.call_count = 0

         self._initialize_prompts()
@@ -124,17 +123,12 @@ class LLMEnhancer:
         self.enhance_description_template = """
 <|system|>
 You are an expert visual analyst. Your task is to improve the readability and fluency of scene descriptions using STRICT factual accuracy.
 Your **top priority is to avoid hallucination** or fabrication. You are working in a computer vision pipeline using object detection (YOLO) and image embeddings. You MUST treat the input object list as a whitelist. Do not speculate beyond this list.
 </|system|>
 <|user|>
 Rewrite the following scene description to be fluent and clear. DO NOT add any objects, events, or spatial relationships that are not explicitly present in the original or object list.
 ORIGINAL:
 {original_description}
 CRITICAL RULES:
 1. NEVER assume room type, object function, or scene purpose unless directly stated.
 2. NEVER invent object types. You are limited to: {object_list}
@@ -143,60 +137,51 @@ class LLMEnhancer:
 5. You MAY describe confirmed materials, colors, and composition style if visually obvious and non-speculative.
 6. Write 2–4 complete, well-structured sentences with punctuation.
 7. Final output MUST be a single fluent paragraph of 60–200 words (not longer).
+8. Begin your response directly with the scene description. Do NOT include any introductory phrases, explanations, or formatting indicators.
+9. Ensure grammatical completeness in all sentences. Each sentence must have a complete subject and predicate structure.
+10. Vary sentence structures naturally while maintaining grammatical accuracy. Avoid incomplete phrases or dangling modifiers.
+11. Limit repetition of descriptive verbs and spatial indicators to maintain text diversity and readability.
+12. Create natural spatial flow by connecting object descriptions organically rather than listing positions mechanically.
+13. Use transitional phrases to connect ideas smoothly, varying expression patterns throughout the description.
+14. End with a conclusive observation about atmosphere, style, or overall impression rather than restating layout information.
+15. When describing quantities or arrangements, use only information explicitly confirmed by the object detection system.
 </|user|>
 <|assistant|>
 """

         # prompt for error detection
         self.verify_detection_template = """
 Task: You are an advanced vision system that verifies computer vision detections for accuracy.
 Analyze the following detection results and identify any potential errors or inconsistencies:
 SCENE TYPE: {scene_type}
 SCENE NAME: {scene_name}
 CONFIDENCE: {confidence:.2f}
 DETECTED OBJECTS: {detected_objects}
 CLIP ANALYSIS RESULTS:
 {clip_analysis}
 Possible Errors to Check:
 1. Objects misidentified (e.g., architectural elements labeled as vehicles)
 2. Cultural elements misunderstood (e.g., Asian temple structures labeled as boats)
 3. Objects that seem out of place for this type of scene
 4. Inconsistencies between different detection systems
 If you find potential errors, list them clearly with explanations. If the detections seem reasonable, state that they appear accurate.
 Verification Results:
 """

         # prompt for handling the no-detection case
         self.no_detection_template = """
 Task: You are an advanced scene understanding system analyzing an image where standard object detection failed to identify specific objects.
 Based on advanced image embeddings (CLIP analysis), we have the following information:
 MOST LIKELY SCENE: {top_scene} (confidence: {top_confidence:.2f})
 VIEWPOINT: {viewpoint}
 LIGHTING: {lighting_condition}
 CULTURAL ANALYSIS: {cultural_analysis}
 Create a detailed description of what might be in this scene, considering:
 1. The most likely type of location or setting
 2. Possible architectural or natural elements present
 3. The lighting and atmosphere
 4. Potential cultural or regional characteristics
 Your description should be natural, flowing, and offer insights into what the image likely contains despite the lack of specific object detection.
 Scene Description:
 """
@@ -300,7 +285,7 @@ class LLMEnhancer:
         self.logger.info("Model not loaded, no context to reset")

     def _remove_introduction_sentences(self, response: str) -> str:
+        """Remove introductory sentences."""
         # identify common introductory patterns
         intro_patterns = [
             r'^Here is the (?:rewritten|enhanced) .*?description:',
@@ -318,7 +303,7 @@ class LLMEnhancer:
         return response

     def enhance_description(self, scene_data: Dict[str, Any]) -> str:
+        """Scene description enhancer: handles all scene types, preserves viewpoint and lighting information, and acts as the main entry point reusable by other classes."""
         try:
             # reset the context
             self.reset_context()
@@ -332,7 +317,7 @@ class LLMEnhancer:
             if not original_desc:
                 return "No original description provided."

+            # get the scene type and normalize it
             scene_type = scene_data.get("scene_type", "unknown scene")
             scene_type = self._clean_scene_type(scene_type)
@@ -357,16 +342,28 @@ class LLMEnhancer:
                 if confidence >= high_confidence_threshold:
                     filtered_objects.append(obj)

+            # prefer the object statistics passed in; compute them only if missing
+            object_statistics = scene_data.get("object_statistics", {})
             object_counts = {}

+            if object_statistics:
+                # use the pre-computed statistics to keep counts accurate
+                for class_name, stats in object_statistics.items():
+                    if stats.get("count", 0) > 0 and stats.get("avg_confidence", 0) >= high_confidence_threshold:
+                        object_counts[class_name] = stats["count"]
+            else:
+                # fall back to the original counting approach
+                for obj in filtered_objects:
+                    class_name = obj.get("class_name", "")
+                    if class_name not in object_counts:
+                        object_counts[class_name] = 0
+                    object_counts[class_name] += 1
+
+            # format the objects into a more precise description
+            high_confidence_objects = ", ".join([
+                f"{count} {obj}{'s' if count > 1 else ''}"
+                for obj, count in object_counts.items()
+            ])

             # if there are no high-confidence objects, fall back to keywords from the original description
             if not high_confidence_objects:
@@ -399,6 +396,29 @@ class LLMEnhancer:
             response = self._generate_llm_response(prompt)

+            is_landmark_only = (
+                scene_data.get("scene_type") in ["tourist_landmark", "natural_landmark", "historical_monument"] and
+                (not scene_data.get("detected_objects") or len(scene_data.get("detected_objects", [])) <= 1)
+            )
+
+            # adjust the logic for the landmark-only case
+            if is_landmark_only:
+                # make sure the original description is not empty
+                original_desc = scene_data.get("original_description", "")
+                if not original_desc or len(original_desc.strip()) < 10:
+                    # generate a basic description from the scene type and landmark info
+                    scene_type = scene_data.get("scene_type", "unknown")
+                    scene_name = scene_data.get("scene_name", "Unknown")
+                    if "primary_landmark" in scene_data:
+                        landmark_name = scene_data["primary_landmark"].get("name", "unnamed landmark")
+                        original_desc = f"A {scene_type.replace('_', ' ')} scene featuring {landmark_name}."
+                    else:
+                        original_desc = f"A {scene_type.replace('_', ' ')} scene."
+
+                # update the scene data
+                scene_data["original_description"] = original_desc
+
+            # stricter criteria for checking response completeness (unchanged)
             is_incomplete = (
                 len(response) < 100 or  # too short
                 (len(response) < 200 and "." not in response[-30:]) or  # no proper punctuation at the end
@@ -442,7 +462,15 @@ class LLMEnhancer:
             if perspective and perspective.lower() not in result.lower():
                 result = f"{perspective}, {result[0].lower()}{result[1:]}"

+            final_result = str(result)
+            if not final_result or len(final_result.strip()) < 20:
+                self.logger.warning(f"WARNING: LLM enhanced description is empty or too short!")
+                self.logger.info(f"Original description: {original_desc[:50]}...")
+                self.logger.info(f"Input data: scene_type={scene_data.get('scene_type')}, objects={len(scene_data.get('detected_objects', []))}")
+            else:
+                self.logger.info(f"LLM enhanced description generated successfully ({len(final_result)} chars)")
+
+            return final_result

         except Exception as e:
             self.logger.error(f"Enhancement failed: {str(e)}")
@@ -451,7 +479,7 @@ class LLMEnhancer:
             return original_desc  # return the original description on any error

     def _verify_factual_accuracy(self, original: str, generated: str, object_list: str) -> str:
+        """Verify that the generated description contains no information absent from the original description or object list, and detect repetitive wording."""

         # combine the original description and the object list into the authorized vocabulary source
         authorized_content = original.lower() + " " + object_list.lower()
@@ -475,6 +503,55 @@ class LLMEnhancer:
             pattern = re.compile(r'\b' + term + r'\b', re.IGNORECASE)
             generated = pattern.sub(replacement, generated)

+        # check for repeated descriptive vocabulary
+        repetitive_patterns = [
+            (r'\b(visible)\b.*?\b(visible)\b', 'Multiple uses of "visible" detected'),
+            (r'\b(positioned)\b.*?\b(positioned)\b', 'Multiple uses of "positioned" detected'),
+            (r'\b(located)\b.*?\b(located)\b', 'Multiple uses of "located" detected'),
+            (r'\b(situated)\b.*?\b(situated)\b', 'Multiple uses of "situated" detected'),
+            (r'\b(appears)\b.*?\b(appears)\b', 'Multiple uses of "appears" detected'),
+            (r'\b(features)\b.*?\b(features)\b', 'Multiple uses of "features" detected'),
+            (r'\bThis\s+(\w+)\s+.*?\bThis\s+\1\b', 'Repetitive sentence structure detected')
+        ]
+
+        # replacement dictionary offering varied expressions
+        replacement_dict = {
+            'visible': ['present', 'evident', 'apparent', 'observable'],
+            'positioned': ['arranged', 'placed', 'set', 'organized'],
+            'located': ['found', 'placed', 'situated', 'established'],
+            'situated': ['placed', 'positioned', 'arranged', 'set'],
+            'appears': ['seems', 'looks', 'presents', 'exhibits'],
+            'features': ['includes', 'contains', 'displays', 'showcases']
+        }
+
+        for pattern, issue in repetitive_patterns:
+            matches = list(re.finditer(pattern, generated, re.IGNORECASE | re.DOTALL))
+            if matches:
+                self.logger.warning(f"Text quality issue detected: {issue}")
+
+                # substitute the specific repeated word
+                for word in replacement_dict.keys():
+                    if word in issue.lower():
+                        word_pattern = re.compile(r'\b' + word + r'\b', re.IGNORECASE)
+                        word_matches = list(word_pattern.finditer(generated))
+
+                        # keep the first occurrence, replace subsequent ones
+                        for i, match in enumerate(word_matches[1:], 1):
+                            if i <= len(replacement_dict[word]):
+                                replacement = replacement_dict[word][(i-1) % len(replacement_dict[word])]
+
+                                # preserve the original capitalization
+                                if match.group().isupper():
+                                    replacement = replacement.upper()
+                                elif match.group().istitle():
+                                    replacement = replacement.capitalize()
+
+                                # perform the replacement
+                                generated = generated[:match.start()] + replacement + generated[match.end():]
+                                # recompute the positions of the remaining matches
+                                word_matches = list(word_pattern.finditer(generated))
+                        break
+
         return generated
@@ -486,14 +563,12 @@ class LLMEnhancer:
                               confidence: float) -> Dict[str, Any]:
         """
         Verify and possibly correct the YOLO detection results.
         Args: detected_objects (objects detected by YOLO), clip_analysis (CLIP analysis results),
             scene_type (identified scene type), scene_name, confidence (scene classification confidence)
         Returns:
             Dict: dictionary containing verification results and suggestions
         """
@@ -520,7 +595,7 @@ class LLMEnhancer:
         result = {
             "verification_text": verification_result,
             "has_errors": "appear accurate" not in verification_result.lower(),
+            "corrected_objects": None
         }

         return result
@@ -567,10 +642,8 @@ class LLMEnhancer:
     def handle_no_detection(self, clip_analysis: Dict[str, Any]) -> str:
         """
         Handle the case where YOLO detects no objects.
         Args: clip_analysis (CLIP analysis results)
         Returns:
             str: generated scene description
         """
@@ -603,10 +676,8 @@ class LLMEnhancer:
     def _clean_input_text(self, text: str) -> str:
         """
         Perform general formatting cleanup on the input text, handling common formatting issues.
         Args: text (input text)
         Returns: the cleaned text
         """
@@ -635,13 +706,11 @@ class LLMEnhancer:
     def _fact_check_description(self, original_desc: str, enhanced_desc: str, scene_type: str, detected_objects: List[str]) -> str:
         """
         Verify and possibly correct the enhanced description to make sure factual accuracy is preserved.
         Args: original_desc (original scene description), enhanced_desc (enhanced description to verify),
             scene_type, detected_objects (list of detected object names)
         Returns: the fact-checked description
         """
@@ -842,13 +911,14 @@ class LLMEnhancer:
         # set model-specific parameters for Llama models
         if "llama" in self.model_path.lower():
             generation_params.update({
+                "temperature": 0.35,  # keep this low so the model does not become too opinionated
                 "max_new_tokens": 600,
                 "do_sample": True,
+                "top_p": 0.75,
+                "repetition_penalty": 1.5,  # penalty weight that discourages repeated wording
+                "num_beams": 5,
+                "length_penalty": 1,
+                "no_repeat_ngram_size": 3
             })

         else:
@@ -885,9 +955,9 @@ class LLMEnhancer:
             if response.startswith(input_text):
                 response = response[len(input_text):].strip()

+            # make sure an empty response is never returned
             if not response or len(response.strip()) < 10:
+                self.logger.warning("response is too short or empty")
                 return "No detailed description could be generated."

             return response
@@ -902,10 +972,8 @@ class LLMEnhancer:
         """
         Clean the LLM response to ensure the output contains only clean descriptive text.
         Sometimes it will not only display the description but display tags, notes...etc
         Args: response (original response from the LLM)
         Returns: cleaned description text
         """
@@ -939,13 +1007,27 @@ class LLMEnhancer:
         for marker in section_markers:
             response = re.sub(marker, '', response, flags=re.IGNORECASE)

+        # 2.5. Deal with "Here is..." style prefixes
+        intro_prefixes = [
+            r'^Here\s+is\s+(?:a\s+|the\s+)?(?:rewritten\s+|enhanced\s+)?scene\s+description.*?:\s*',
+            r'^The\s+(?:rewritten\s+|enhanced\s+)?(?:scene\s+)?description\s+is.*?:\s*',
+            r'^Here\'s\s+(?:a\s+|the\s+)?(?:rewritten\s+|enhanced\s+)?description.*?:\s*'
+        ]
+
+        for prefix_pattern in intro_prefixes:
+            response = re.sub(prefix_pattern, '', response, flags=re.IGNORECASE)
+
         # 3. Remove common prefixes and suffixes
         prefixes_to_remove = [
             "Enhanced Description:",
             "Scene Description:",
             "Description:",
             "Here is the enhanced description:",
+            "Here's the enhanced description:",
+            "Here is a rewritten scene description that adheres to the provided critical rules:",
+            "Here is the rewritten scene description:",
+            "Here's a rewritten scene description:",
+            "The rewritten scene description is as follows:"
         ]

         for prefix in prefixes_to_remove:
@@ -1004,6 +1086,49 @@ class LLMEnhancer:
         # Recombine unique sentences
         response = ' '.join(unique_sentences)

+        # 9.5. Advanced repetition detection and replacement
+        repetitive_descriptors = ['visible', 'positioned', 'located', 'situated', 'appears', 'features', 'shows', 'displays']
+        word_usage_count = {}
+
+        # Count occurrences of each repetitive descriptor
+        for word in repetitive_descriptors:
+            count = len(re.findall(r'\b' + word + r'\b', response, re.IGNORECASE))
+            if count > 1:
+                word_usage_count[word] = count
+
+        # Replace excessive repetitions with varied alternatives
+        replacement_alternatives = {
+            'visible': ['present', 'evident', 'apparent', 'observable'],
+            'positioned': ['arranged', 'placed', 'set', 'organized'],
+            'located': ['found', 'placed', 'situated', 'established'],
+            'situated': ['placed', 'positioned', 'arranged', 'set'],
+            'appears': ['seems', 'looks', 'presents', 'exhibits'],
+            'features': ['includes', 'contains', 'displays', 'showcases'],
+            'shows': ['reveals', 'presents', 'exhibits', 'demonstrates'],
+            'displays': ['presents', 'exhibits', 'shows', 'reveals']
+        }
+
+        for word, count in word_usage_count.items():
+            if count > 1 and word in replacement_alternatives:
+                # Find all occurrences
+                pattern = re.compile(r'\b' + word + r'\b', re.IGNORECASE)
+                matches = list(pattern.finditer(response))
+
+                # Replace subsequent occurrences (keep the first one)
+                for i, match in enumerate(matches[1:], 1):
+                    if i <= len(replacement_alternatives[word]):
+                        replacement = replacement_alternatives[word][(i-1) % len(replacement_alternatives[word])]
+                        # Maintain the original case pattern
+                        if match.group().isupper():
+                            replacement = replacement.upper()
+                        elif match.group().istitle():
+                            replacement = replacement.capitalize()
+
+                        response = response[:match.start()] + replacement + response[match.end():]
+                        # Update the remaining match positions
+                        offset = len(replacement) - len(match.group())
+                        matches = list(pattern.finditer(response))
+
         # 10. Ensure word count is within limits (50-150 words)
         words = response.split()
         if len(words) > 200:
@@ -1035,7 +1160,20 @@ class LLMEnhancer:
             # Remove the last preposition or conjunction
             response = " ".join(words[:-1]) + "."

+        # 12. Grammar completeness check
+        incomplete_patterns = [
+            r'\b(fine|the)\s+(the\s+)?(?:urban|area|scene)\b(?!\s+\w)',  # detect incomplete phrases
+            r'\b(and|or|but|with|from|in|at|on)\s*[.!?]',  # a preposition immediately followed by the end of the sentence
+            r'\b\w+\s+\1\b'  # repeated-word detection
+        ]
+
+        for pattern in incomplete_patterns:
+            if re.search(pattern, response, re.IGNORECASE):
+                # remove or correct the problematic fragment
+                response = re.sub(pattern, '', response, flags=re.IGNORECASE)
+                response = re.sub(r'\s{2,}', ' ', response)  # clean up extra spaces
+
+        # 13. Ensure we haven't over-filtered
         if not response or len(response) < 40:
             # Try to get the first meaningful paragraph from the original response
             paragraphs = [p for p in original_response.split('\n\n') if p.strip()]
@@ -1052,7 +1190,7 @@ class LLMEnhancer:
             # If still no good content, return a simple message
             return "Unable to generate a valid enhanced description."

+        # 14. Final cleaning - catch any missed special cases
         response = re.sub(r'</?\|.*?\|>', '', response)  # Any remaining tags
         response = re.sub(r'\(.*?\)', '', response)  # Any remaining parenthetical content
         response = re.sub(r'Note:.*?(?=\n|$)', '', response, flags=re.IGNORECASE)  # Any remaining notes
@@ -1064,7 +1202,7 @@ class LLMEnhancer:
         if response and response[0].islower():
             response = response[0].upper() + response[1:]

+        # 15. Unify formatting - make sure the output is always a single paragraph
         response = re.sub(r'\s*\n\s*', ' ', response)  # replace all newlines with spaces
         response = ' '.join(response.split())
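For reference, the Llama-specific parameters added above map directly onto the transformers generate API. The following is a minimal, self-contained sketch of a call using the same values; the model name matches the default used by LLMEnhancer, but the loading code around it is illustrative rather than copied from the class.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Llama-3.2-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")

inputs = tokenizer("Rewrite the following scene description ...", return_tensors="pt").to(model.device)
outputs = model.generate(
    **inputs,
    max_new_tokens=600,
    do_sample=True,
    temperature=0.35,          # low temperature keeps the model close to the source facts
    top_p=0.75,
    repetition_penalty=1.5,    # discourages repeated wording
    num_beams=5,
    length_penalty=1.0,
    no_repeat_ngram_size=3,    # forbids any 3-gram from repeating verbatim
)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))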
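The repetition handling in _verify_factual_accuracy and _clean_response keeps the first occurrence of an overused descriptor and rewrites later ones with rotating alternatives. A compact way to express the same idea in a single regex pass is sketched below; this illustrates the technique only and is not the code the module actually runs.

import re

ALTERNATIVES = {
    "visible": ["present", "evident", "apparent", "observable"],
    "positioned": ["arranged", "placed", "set", "organized"],
}

def vary_descriptor(text: str, word: str) -> str:
    """Keep the first occurrence of `word`; replace later ones with rotating alternatives."""
    seen = 0
    def swap(match: re.Match) -> str:
        nonlocal seen
        seen += 1
        if seen == 1:
            return match.group(0)                      # leave the first occurrence untouched
        repl = ALTERNATIVES[word][(seen - 2) % len(ALTERNATIVES[word])]
        src = match.group(0)
        return repl.upper() if src.isupper() else repl.capitalize() if src.istitle() else repl
    return re.compile(r"\b" + word + r"\b", re.IGNORECASE).sub(swap, text)

# vary_descriptor("A table is visible and a chair is visible.", "visible")
# -> "A table is visible and a chair is present."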
object_template_fillers.py
CHANGED
@@ -74,5 +74,10 @@ OBJECT_TEMPLATE_FILLERS = {
     "playing_surface": ["marked court boundaries", "manicured field turf", "running tracks", "competition equipment", "sports field markers"],
     "construction_equipment": ["tower cranes", "excavators", "cement mixers", "scaffolding structures", "construction barriers"],
     "medical_elements": ["examination furniture", "monitoring equipment", "sanitation stations", "privacy screens", "medical supply carts"],
-    "educational_furniture": ["student desks", "lecture podiums", "laboratory benches", "learning stations", "collaborative workspace tables"]
-}
+    "educational_furniture": ["student desks", "lecture podiums", "laboratory benches", "learning stations", "collaborative workspace tables"],
+
+    "landmark_features": ["distinctive architecture", "iconic structural elements", "famous design features", "recognized silhouette", "impressive proportions"],
+    "tourist_activities": ["sightseeing", "guided tours", "photography", "cultural exploration", "souvenir shopping"],
+    "outdoor_activities": ["nature photography", "hiking", "scenic viewing", "wildlife observation", "outdoor exploration"],
+    "historical_elements": ["cultural heritage", "historical events", "architectural periods", "traditional craftsmanship", "significant achievements"]
+}
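These filler lists are meant to be dropped into sentence templates with named placeholders. A minimal sketch of that substitution is shown below; the template string and the helper are illustrative, since the actual templates live in the scene-description modules, and it assumes every placeholder in a template has a matching key in OBJECT_TEMPLATE_FILLERS.

import random
from object_template_fillers import OBJECT_TEMPLATE_FILLERS

def fill_template(template: str) -> str:
    """Replace each {placeholder} with a random filler phrase from OBJECT_TEMPLATE_FILLERS."""
    fillers = {key: random.choice(options) for key, options in OBJECT_TEMPLATE_FILLERS.items()}
    return template.format_map(fillers)

# fill_template("The site features {landmark_features} popular for {tourist_activities}.")
# -> "The site features iconic structural elements popular for guided tours."  (one possible output)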
places365_model.py
ADDED
@@ -0,0 +1,492 @@
+import torch
+import torch.nn as nn
+import torchvision.transforms as transforms
+import numpy as np
+from PIL import Image
+from typing import Dict, List, Tuple, Optional, Any
+import logging
+
+class Places365Model:
+    """
+    Places365 scene classification model wrapper for scene understanding integration.
+    Provides scene classification and scene attribute prediction capabilities.
+    """
+
+    def __init__(self, model_name: str = 'resnet50_places365', device: Optional[str] = None):
+        """
+        Initialize the Places365 model with a configurable architecture and device.
+        Args: model_name (model architecture name, defaults to resnet50), device (target device for inference, auto-detected if None)
+        """
+        self.logger = logging.getLogger(self.__class__.__name__)
+
+        # Device configuration with fallback logic
+        if device is None:
+            self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        else:
+            self.device = device
+
+        self.model_name = model_name
+        self.model = None
+        self.scene_classes = []
+        self.scene_attributes = []
+
+        # Model configuration mapping
+        self.model_configs = {
+            'resnet18_places365': {'arch': 'resnet18', 'num_classes': 365,
+                                   'url': 'http://places2.csail.mit.edu/models_places365/resnet18_places365.pth.tar'},
+            'resnet50_places365': {'arch': 'resnet50', 'num_classes': 365,
+                                   'url': 'http://places2.csail.mit.edu/models_places365/resnet50_places365.pth.tar'},
+            'densenet161_places365': {'arch': 'densenet161', 'num_classes': 365,
+                                      'url': 'http://places2.csail.mit.edu/models_places365/densenet161_places365.pth.tar'}
+        }
+
+        self._load_model()
+        self._load_class_names()
+        self._setup_scene_mapping()
+
+    def _load_model(self):
+        """Load and initialize the Places365 model."""
+        try:
+            if self.model_name not in self.model_configs:
+                raise ValueError(f"Unsupported model name: {self.model_name}")
+
+            config = self.model_configs[self.model_name]
+
+            # Import model architecture
+            if config['arch'].startswith('resnet'):
+                import torchvision.models as models
+                if config['arch'] == 'resnet18':
+                    self.model = models.resnet18(num_classes=config['num_classes'])
+                elif config['arch'] == 'resnet50':
+                    self.model = models.resnet50(num_classes=config['num_classes'])
+            elif config['arch'] == 'densenet161':
+                import torchvision.models as models
+                self.model = models.densenet161(num_classes=config['num_classes'])
+
+            # Load pretrained weights
+            checkpoint = torch.hub.load_state_dict_from_url(
+                config['url'],
+                map_location=self.device,
+                progress=True
+            )
+
+            # Handle different checkpoint formats
+            if 'state_dict' in checkpoint:
+                state_dict = checkpoint['state_dict']
+                # Remove 'module.' prefix if present
+                state_dict = {k.replace('module.', ''): v for k, v in state_dict.items()}
+            else:
+                state_dict = checkpoint
+
+            self.model.load_state_dict(state_dict)
+            self.model.to(self.device)
+            self.model.eval()
+
+            self.logger.info(f"Places365 model {self.model_name} loaded successfully on {self.device}")
+
+        except Exception as e:
+            self.logger.error(f"Error loading Places365 model: {str(e)}")
+            raise
+
+    def _load_class_names(self):
+        """Load Places365 class names and scene attributes."""
+        try:
+            # Load scene class names (365 categories)
+            import urllib.request
+
+            class_url = 'https://raw.githubusercontent.com/csailvision/places365/master/categories_places365.txt'
+            class_file = urllib.request.urlopen(class_url)
+
+            self.scene_classes = []
+            for line in class_file:
+                class_name = line.decode('utf-8').strip().split(' ')[0][3:]  # Remove /x/ prefix
+                self.scene_classes.append(class_name)
+
+            # Load scene attributes (optional, for enhanced description)
+            attr_url = 'https://raw.githubusercontent.com/csailvision/places365/master/labels_sunattribute.txt'
+            try:
+                attr_file = urllib.request.urlopen(attr_url)
+                self.scene_attributes = []
+                for line in attr_file:
+                    attr_name = line.decode('utf-8').strip()
+                    self.scene_attributes.append(attr_name)
+            except:
+                self.logger.warning("Scene attributes not loaded, continuing with basic classification")
+                self.scene_attributes = []
+
+            self.logger.info(f"Loaded {len(self.scene_classes)} scene classes and {len(self.scene_attributes)} attributes")
+
+        except Exception as e:
+            self.logger.error(f"Error loading class names: {str(e)}")
+            # Fallback to basic class names if download fails
+            self.scene_classes = [f"scene_class_{i}" for i in range(365)]
+            self.scene_attributes = []
+
+    def _setup_scene_mapping(self):
+        """Set up the mapping from Places365 classes to common scene types."""
+        # Map Places365 categories to the generic scene types used by the system
+        self.scene_type_mapping = {
+            # Indoor scenes
+            'living_room': 'living_room', 'bedroom': 'bedroom', 'kitchen': 'kitchen',
+            'dining_room': 'dining_area', 'bathroom': 'bathroom', 'office': 'office_workspace',
+            'conference_room': 'office_workspace', 'classroom': 'educational_setting',
+            'library': 'library', 'restaurant': 'restaurant', 'cafe': 'cafe', 'bar': 'bar',
+            'hotel_room': 'hotel_room', 'hospital_room': 'medical_facility', 'gym': 'gym',
+            'supermarket': 'retail_store', 'clothing_store': 'retail_store',
+
+            # Outdoor urban scenes
+            'street': 'city_street', 'crosswalk': 'intersection', 'parking_lot': 'parking_lot',
+            'gas_station': 'gas_station', 'bus_station': 'bus_stop', 'train_station': 'train_station',
+            'airport_terminal': 'airport', 'subway_station': 'subway_station', 'bridge': 'bridge',
+            'highway': 'highway', 'downtown': 'commercial_district', 'shopping_mall': 'shopping_mall',
+
+            # Natural outdoor scenes
+            'park': 'park_area', 'beach': 'beach', 'forest': 'forest', 'mountain': 'mountain',
+            'lake': 'lake', 'river': 'river', 'ocean': 'ocean', 'desert': 'desert',
+            'field': 'field', 'garden': 'garden',
+
+            # Landmark and tourist areas
+            'castle': 'historical_monument', 'palace': 'historical_monument', 'temple': 'temple',
+            'church': 'church', 'mosque': 'mosque', 'museum': 'museum', 'art_gallery': 'art_gallery',
+            'tower': 'tourist_landmark', 'monument': 'historical_monument',
+
+            # Sports and entertainment
+            'stadium': 'stadium', 'basketball_court': 'sports_field', 'tennis_court': 'sports_field',
+            'swimming_pool': 'swimming_pool', 'playground': 'playground',
+            'amusement_park': 'amusement_park', 'theater': 'theater', 'concert_hall': 'concert_hall',
+
+            # Transportation
+            'airplane_cabin': 'airplane_cabin', 'train_interior': 'train_interior', 'car_interior': 'car_interior',
+
+            # Construction and industrial
+            'construction_site': 'construction_site', 'factory': 'factory', 'warehouse': 'warehouse'
+        }
+
+        # Indoor/outdoor classification helper
+        self.indoor_classes = {
+            'living_room', 'bedroom', 'kitchen', 'dining_room', 'bathroom', 'office',
+            'conference_room', 'classroom', 'library', 'restaurant', 'cafe', 'bar',
+            'hotel_room', 'hospital_room', 'gym', 'supermarket', 'clothing_store',
+            'airplane_cabin', 'train_interior', 'car_interior', 'theater', 'concert_hall',
+            'museum', 'art_gallery', 'shopping_mall'
+        }
+
+        self.outdoor_classes = {
+            'street', 'crosswalk', 'parking_lot', 'gas_station', 'bus_station',
+            'train_station', 'airport_terminal', 'bridge', 'highway', 'downtown',
+            'park', 'beach', 'forest', 'mountain', 'lake', 'river', 'ocean',
+            'desert', 'field', 'garden', 'stadium', 'basketball_court', 'tennis_court',
+            'swimming_pool', 'playground', 'amusement_park', 'construction_site',
+            'factory', 'warehouse', 'castle', 'palace', 'temple', 'church', 'mosque',
+            'tower', 'monument'
+        }
+
+    def preprocess(self, image_pil: Image.Image) -> torch.Tensor:
+        """Preprocess a PIL image for Places365 model inference and return the image tensor."""
+        # Places365 standard preprocessing
+        transform = transforms.Compose([
+            transforms.Resize((256, 256)),
+            transforms.CenterCrop(224),
+            transforms.ToTensor(),
+            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+        ])
+
+        # Convert to RGB if needed
+        if image_pil.mode != 'RGB':
+            image_pil = image_pil.convert('RGB')
+
+        # Apply preprocessing
+        input_tensor = transform(image_pil).unsqueeze(0)
+        return input_tensor.to(self.device)
+
+    def predict(self, image_pil: Image.Image) -> Dict[str, Any]:
+        """Predict scene classification and attributes for the input image; returns a dict of predictions and confidence scores."""
+        try:
+            # Preprocess image
+            input_tensor = self.preprocess(image_pil)
+
+            # Model inference
+            with torch.no_grad():
+                outputs = self.model(input_tensor)
+                probabilities = torch.nn.functional.softmax(outputs, dim=1)
+
+            # Return the most likely entries
+            top_k = min(10, len(self.scene_classes))  # Configurable top-k
+            top_probs, top_indices = torch.topk(probabilities, top_k, dim=1)
+
+            # Extract results
+            top_probs = top_probs.cpu().numpy()[0]
+            top_indices = top_indices.cpu().numpy()[0]
+
+            # Build prediction results
+            predictions = []
+            for i in range(top_k):
+                class_idx = top_indices[i]
+                confidence = float(top_probs[i])
+                scene_class = self.scene_classes[class_idx]
+
+                predictions.append({
+                    'class_name': scene_class,
+                    'class_index': class_idx,
+                    'confidence': confidence
+                })
+
+            # Get primary prediction
+            primary_prediction = predictions[0]
+            primary_class = primary_prediction['class_name']
+
+            # Determine whether the scene is indoor or outdoor
+            is_indoor = self._classify_indoor_outdoor(primary_class)
+
+            # Map to common scene type
+            mapped_scene_type = self._map_places365_to_scene_types(primary_class)
+
+            # Determine scene attributes (basic inference based on class)
+            scene_attributes = self._infer_scene_attributes(primary_class)
+
+            result = {
+                'scene_label': primary_class,
+                'mapped_scene_type': mapped_scene_type,
+                'confidence': primary_prediction['confidence'],
+                'is_indoor': is_indoor,
+                'attributes': scene_attributes,
+                'top_predictions': predictions,
+                'all_probabilities': probabilities.cpu().numpy()[0].tolist()
+            }
+
+            return result
+
+        except Exception as e:
+            self.logger.error(f"Error in Places365 prediction: {str(e)}")
+            return {
+                'scene_label': 'unknown',
+                'mapped_scene_type': 'unknown',
+                'confidence': 0.0,
+                'is_indoor': None,
+                'attributes': [],
+                'top_predictions': [],
+                'error': str(e)
+            }
+
+    def _classify_indoor_outdoor(self, scene_class: str) -> Optional[bool]:
+        """Classify a Places365 class as indoor (True), outdoor (False), or uncertain (None)."""
+        if scene_class in self.indoor_classes:
+            return True
+        elif scene_class in self.outdoor_classes:
+            return False
+        else:
+            # For ambiguous classes, use heuristics
+            indoor_keywords = ['room', 'office', 'store', 'shop', 'hall', 'interior', 'indoor']
+            outdoor_keywords = ['street', 'road', 'park', 'field', 'beach', 'mountain', 'outdoor']
+
+            scene_lower = scene_class.lower()
+            if any(keyword in scene_lower for keyword in indoor_keywords):
+                return True
+            elif any(keyword in scene_lower for keyword in outdoor_keywords):
+                return False
+            else:
+                return None
+
+    def _map_places365_to_scene_types(self, places365_class: str) -> str:
+        """Map a Places365 class to the common scene type used by the system."""
+        # Direct mapping lookup
+        if places365_class in self.scene_type_mapping:
+            return self.scene_type_mapping[places365_class]
+
+        # Fuzzy matching for similar classes
+        places365_lower = places365_class.lower()
+
+        # Indoor fuzzy matching
+        if any(keyword in places365_lower for keyword in ['living', 'bedroom', 'kitchen']):
+            return 'general_indoor_space'
+        elif any(keyword in places365_lower for keyword in ['office', 'conference', 'meeting']):
+            return 'office_workspace'
+        elif any(keyword in places365_lower for keyword in ['dining', 'restaurant', 'cafe']):
+            return 'dining_area'
+        elif any(keyword in places365_lower for keyword in ['store', 'shop', 'market']):
+            return 'retail_store'
+        elif any(keyword in places365_lower for keyword in ['school', 'class', 'library']):
+            return 'educational_setting'
+
+        # Outdoor fuzzy matching
+        elif any(keyword in places365_lower for keyword in ['street', 'road', 'crosswalk']):
+            return 'city_street'
+        elif any(keyword in places365_lower for keyword in ['park', 'garden', 'plaza']):
+            return 'park_area'
+        elif any(keyword in places365_lower for keyword in ['beach', 'ocean', 'lake']):
+            return 'beach'
+        elif any(keyword in places365_lower for keyword in ['mountain', 'forest', 'desert']):
+            return 'natural_outdoor_area'
+        elif any(keyword in places365_lower for keyword in ['parking', 'garage']):
+            return 'parking_lot'
+        elif any(keyword in places365_lower for keyword in ['station', 'terminal', 'airport']):
+            return 'transportation_hub'
+
+        # Landmark fuzzy matching
+        elif any(keyword in places365_lower for keyword in ['castle', 'palace', 'monument', 'temple']):
+            return 'historical_monument'
+        elif any(keyword in places365_lower for keyword in ['tower', 'landmark']):
+            return 'tourist_landmark'
+        elif any(keyword in places365_lower for keyword in ['museum', 'gallery']):
+            return 'cultural_venue'
+
+        # Default fallback based on indoor/outdoor
+        is_indoor = self._classify_indoor_outdoor(places365_class)
+        if is_indoor is True:
+            return 'general_indoor_space'
+        elif is_indoor is False:
+            return 'generic_street_view'
+        else:
+            return 'unknown'
+
+    def _infer_scene_attributes(self, scene_class: str) -> List[str]:
+        """Infer basic scene attributes from a Places365 class name."""
+        attributes = []
+        scene_lower = scene_class.lower()
+
+        # Lighting attributes
+        if any(keyword in scene_lower for keyword in ['outdoor', 'street', 'park', 'beach']):
+            attributes.append('natural_lighting')
+        elif any(keyword in scene_lower for keyword in ['indoor', 'room', 'office']):
+            attributes.append('artificial_lighting')
+
+        # Functional attributes
+        if any(keyword in scene_lower for keyword in ['commercial', 'store', 'shop', 'restaurant']):
+            attributes.append('commercial')
+        elif any(keyword in scene_lower for keyword in ['residential', 'home', 'living', 'bedroom']):
+            attributes.append('residential')
+        elif any(keyword in scene_lower for keyword in ['office', 'conference', 'meeting']):
+            attributes.append('workplace')
+        elif any(keyword in scene_lower for keyword in ['recreation', 'park', 'playground', 'stadium']):
+            attributes.append('recreational')
+        elif any(keyword in scene_lower for keyword in ['educational', 'school', 'library', 'classroom']):
+            attributes.append('educational')
+
+        # Spatial attributes
+        if any(keyword in scene_lower for keyword in ['open', 'field', 'plaza', 'stadium']):
+            attributes.append('open_space')
+        elif any(keyword in scene_lower for keyword in ['enclosed', 'room', 'interior']):
+            attributes.append('enclosed_space')
+
+        return attributes
+
+    def get_scene_probabilities(self, image_pil: Image.Image) -> Dict[str, float]:
+        """
+        Get probability distribution over all scene classes.
|
469 |
+
|
470 |
+
Args:
|
471 |
+
image_pil: Input PIL image
|
472 |
+
|
473 |
+
Returns:
|
474 |
+
Dict mapping scene class names to probabilities
|
475 |
+
"""
|
476 |
+
try:
|
477 |
+
input_tensor = self.preprocess(image_pil)
|
478 |
+
|
479 |
+
with torch.no_grad():
|
480 |
+
outputs = self.model(input_tensor)
|
481 |
+
probabilities = torch.nn.functional.softmax(outputs, dim=1)
|
482 |
+
|
483 |
+
probs = probabilities.cpu().numpy()[0]
|
484 |
+
|
485 |
+
return {
|
486 |
+
self.scene_classes[i]: float(probs[i])
|
487 |
+
for i in range(len(self.scene_classes))
|
488 |
+
}
|
489 |
+
|
490 |
+
except Exception as e:
|
491 |
+
self.logger.error(f"Error getting scene probabilities: {str(e)}")
|
492 |
+
return {}
|
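For orientation, here is a minimal sketch of how the prediction dictionary built above might be consumed downstream. The class name `Places365Model`, its zero-argument constructor, and the image path are assumptions for illustration; only the result keys come from the code above.

# Hypothetical usage sketch; class name, constructor, and image path are assumed.
from PIL import Image
from places365_model import Places365Model  # assumed class name

model = Places365Model()
result = model.predict_scene(Image.open("example.jpg"))

if result.get("confidence", 0.0) >= 0.5:
    # is_indoor is True, False, or None when uncertain
    setting = {True: "indoor", False: "outdoor", None: "uncertain"}[result["is_indoor"]]
    print(f"{result['scene_label']} -> {result['mapped_scene_type']} ({setting})")
    print("attributes:", ", ".join(result["attributes"]) or "none")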
requirements.txt
CHANGED
@@ -1,16 +1,17 @@
-torch>=2.0.0
-torchvision>=0.15.0
-ultralytics>=8.0.0
-opencv-python>=4.7.0
-pillow>=9.4.0
-numpy>=1.23.5
-matplotlib>=3.7.0
-gradio>=3.32.0
-git+https://github.com/openai/CLIP.git
-yt-dlp>=2023.3.4
-requests>=2.28.1
-transformers
-accelerate
-bitsandbytes
-sentencepiece
-huggingface_hub>=0.19.0
+# torch>=2.0.0
+# torchvision>=0.15.0
+# ultralytics>=8.0.0
+# opencv-python>=4.7.0
+# pillow>=9.4.0
+# numpy>=1.23.5
+# matplotlib>=3.7.0
+# gradio>=3.32.0
+# git+https://github.com/openai/CLIP.git
+# yt-dlp>=2023.3.4
+# requests>=2.28.1
+# transformers
+# accelerate
+# bitsandbytes
+# sentencepiece
+# huggingface_hub>=0.19.0
+# urllib3>=1.26.0
scene_analyzer.py
CHANGED
The diff for this file is too large to render. See raw diff.
scene_description.py
CHANGED
@@ -59,7 +59,7 @@ class SceneDescriptor:
             "low": "This might be {description}, but the confidence is low. {details}"
         }

-        #
+        # Provide only the most basic templates as a fallback
         self.scene_detail_templates = {
             "default": ["A space with various objects."]
         }
@@ -105,53 +105,90 @@ class SceneDescriptor:
         return alternatives

-    def _infer_possible_activities(self, scene_type: str, detected_objects: List[Dict]) -> List[str]:
+    def _infer_possible_activities(self, scene_type: str, detected_objects: List[Dict], enable_landmark: bool = True, scene_scores: Optional[Dict] = None) -> List[str]:
         """
         Infer possible activities based on scene type and detected objects.

         Args:
             scene_type: Identified scene type
             detected_objects: List of detected objects
+            enable_landmark: Whether landmark detection is enabled
+            scene_scores: Optional dictionary of scene type scores

         Returns:
             List of possible activities
         """
         activities = []

+        # Dynamically replace landmark scene types when landmark detection is disabled
+        if not enable_landmark and scene_type in ["tourist_landmark", "natural_landmark", "historical_monument"]:
+            alternative_scene_type = self._get_alternative_scene_type(scene_type, detected_objects, scene_scores)
+            print(f"Replacing landmark scene type '{scene_type}' with '{alternative_scene_type}' for activity inference")
+            scene_type = alternative_scene_type
+
+        # Process aerial view scenes
         if scene_type.startswith("aerial_view_"):
             if scene_type == "aerial_view_intersection":
+                # Use predefined intersection activities
                 activities.extend(self.activity_templates.get("aerial_view_intersection", []))
+
+                # Add pedestrian and vehicle specific activities
                 pedestrians = [obj for obj in detected_objects if obj["class_id"] == 0]
                 vehicles = [obj for obj in detected_objects if obj["class_id"] in [2, 5, 7]]  # Car, bus, truck
+
                 if pedestrians and vehicles:
                     activities.append("Waiting for an opportunity to cross the street")
                     activities.append("Obeying traffic signals")
+
             elif scene_type == "aerial_view_commercial_area":
                 activities.extend(self.activity_templates.get("aerial_view_commercial_area", []))
+
             elif scene_type == "aerial_view_plaza":
                 activities.extend(self.activity_templates.get("aerial_view_plaza", []))
+
             else:
+                # Handle other undefined aerial view scenes
                 aerial_activities = [
+                    "Street crossing",
+                    "Waiting for signals",
+                    "Following traffic rules",
                     "Pedestrian movement"
                 ]
                 activities.extend(aerial_activities)

+        # Add scene-specific activities from templates
         if scene_type in self.activity_templates:
             activities.extend(self.activity_templates[scene_type])
         elif "default" in self.activity_templates:
             activities.extend(self.activity_templates["default"])

+        # Filter out landmark-related activities when landmark detection is disabled
+        if not enable_landmark:
+            filtered_activities = []
+            landmark_keywords = ["sightseeing", "landmark", "tourist", "monument", "historical",
+                                 "guided tour", "photography", "cultural tourism", "heritage"]
+
+            for activity in activities:
+                if not any(keyword in activity.lower() for keyword in landmark_keywords):
+                    filtered_activities.append(activity)
+
+            activities = filtered_activities
+
+            # If we filtered out all activities, add some generic ones based on scene type
+            if not activities:
+                generic_activities = {
+                    "city_street": ["Walking", "Commuting", "Shopping"],
+                    "intersection": ["Crossing the street", "Waiting for traffic signals"],
+                    "commercial_district": ["Shopping", "Walking", "Dining"],
+                    "pedestrian_area": ["Walking", "Socializing", "Shopping"],
+                    "park_area": ["Relaxing", "Walking", "Exercise"],
+                    "outdoor_natural_area": ["Walking", "Nature observation", "Relaxation"],
+                    "urban_architecture": ["Walking", "Urban exploration", "Photography"]
+                }
+
+                activities.extend(generic_activities.get(scene_type, ["Walking", "Observing surroundings"]))
+
+        # Add activities based on detected objects
         detected_class_ids = [obj["class_id"] for obj in detected_objects]

         # Add activities based on specific object combinations
@@ -181,8 +218,48 @@ class SceneDescriptor:
         if 24 in detected_class_ids or 26 in detected_class_ids:  # Backpack or handbag
             activities.append("Carrying personal items")

+        # Add more person count-dependent activities
+        person_count = detected_class_ids.count(0)
+        if person_count > 3:
+            activities.append("Group gathering")
+        elif person_count > 1:
+            activities.append("Social interaction")
+
+        # Add additional activities based on significant objects
+        if 43 in detected_class_ids:  # cup
+            activities.append("Drinking beverages")
+
+        if 32 in detected_class_ids:  # sports ball
+            activities.append("Playing sports")
+
+        if 25 in detected_class_ids:  # umbrella
+            activities.append("Sheltering from weather")
+
+        # Add location-specific activities based on environment objects
+        if any(furniture in detected_class_ids for furniture in [56, 57, 58, 59, 60]):  # furniture items
+            activities.append("Using indoor facilities")
+
+        if any(outdoor_item in detected_class_ids for outdoor_item in [13, 14, 15]):  # bench, outdoor items
+            activities.append("Enjoying outdoor spaces")
+
+        # Remove duplicates and ensure reasonable number of activities
+        unique_activities = list(set(activities))
+
+        # Limit to reasonable number (maximum 8 activities)
+        if len(unique_activities) > 8:
+            # Prioritize more specific activities over general ones
+            general_activities = ["Walking", "Observing surroundings", "Commuting", "Using indoor facilities"]
+            specific_activities = [a for a in unique_activities if a not in general_activities]
+
+            # Take all specific activities first, then fill with general ones if needed
+            if len(specific_activities) <= 8:
+                result = specific_activities + general_activities[:8-len(specific_activities)]
+            else:
+                result = specific_activities[:8]
+        else:
+            result = unique_activities
+
+        return result

     def _identify_safety_concerns(self, detected_objects: List[Dict], scene_type: str) -> List[str]:
         """
@@ -198,8 +275,6 @@ class SceneDescriptor:
         concerns = []
         detected_class_ids = [obj["class_id"] for obj in detected_objects]

-        # ORIGINAL SAFETY CONCERNS LOGIC
-
         # General safety concerns
         if 42 in detected_class_ids or 43 in detected_class_ids:  # Fork or knife
             concerns.append("Sharp utensils present")
@@ -232,8 +307,6 @@ class SceneDescriptor:
             if obj["region"] in ["top_left", "top_center", "top_right"] and obj["normalized_area"] > 0.05:
                 concerns.append(f"Elevated {obj['class_name']} might be unstable")

-        # NEW SAFETY CONCERNS LOGIC FOR ADDITIONAL SCENE TYPES
-
         # Upscale dining safety concerns
         if scene_type == "upscale_dining":
             # Check for fragile items
@@ -295,7 +368,6 @@ class SceneDescriptor:
             concerns.append("Two-wheeled vehicles in pedestrian areas")

         # Check for potential trip hazards
-        # We can't directly detect this, but can infer from context
         if scene_type == "asian_commercial_street" and "bottom" in " ".join([obj["region"] for obj in detected_objects if obj["class_id"] == 0]):
             # If people are in bottom regions, they might be walking on uneven surfaces
             concerns.append("Potential uneven walking surfaces in commercial area")
@@ -324,7 +396,6 @@ class SceneDescriptor:
             concerns.append("Busy traffic area potentially without visible traffic signals in view")

         # Time of day considerations
-        # We don't have direct time data, but can infer from vehicle lights
         vehicle_objs = [obj for obj in detected_objects if obj["class_id"] in [2, 5, 7]]
         if vehicle_objs and any("lighting_conditions" in obj for obj in detected_objects):
             # If vehicles are present and it might be evening/night
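A short illustration of the new `enable_landmark` switch above: with landmark detection disabled, landmark scene types are remapped and sightseeing/tourism activities are filtered out before the list is capped at eight entries. The constructor call and the detection dictionaries below are assumptions for illustration.

# Illustration only; SceneDescriptor's constructor arguments are assumed.
from scene_description import SceneDescriptor

descriptor = SceneDescriptor()
detections = [
    {"class_id": 0, "class_name": "person", "confidence": 0.81, "region": "bottom_center"},
    {"class_id": 2, "class_name": "car", "confidence": 0.77, "region": "middle_right"},
]
activities = descriptor._infer_possible_activities(
    "tourist_landmark", detections, enable_landmark=False
)
print(activities)  # generic street-level activities, no sightseeing entries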
scene_detail_templates.py
CHANGED
@@ -200,4 +200,19 @@ SCENE_DETAIL_TEMPLATES = {
         "This professional culinary area contains {kitchen_equipment} arranged in stations for {food_preparation}.",
         "An industrial kitchen featuring {kitchen_equipment} designed for efficient {food_preparation}."
     ],
+    "tourist_landmark": [
+        "This notable landmark attracts visitors who come to see {landmark_features} and experience {tourist_activities}.",
+        "A famous landmark site where tourists can observe {landmark_features} and engage in {tourist_activities}.",
+        "This iconic landmark showcases {landmark_features} and is a popular destination for {tourist_activities}."
+    ],
+    "natural_landmark": [
+        "This natural landmark features {landmark_features} and offers opportunities for {outdoor_activities}.",
+        "A scenic natural formation with {landmark_features} where visitors enjoy {outdoor_activities}.",
+        "This impressive natural landmark displays {landmark_features} and attracts nature enthusiasts for {outdoor_activities}."
+    ],
+    "historical_monument": [
+        "This historical monument exhibits {landmark_features} and has significance related to {historical_elements}.",
+        "An important historical site featuring {landmark_features} and representing {historical_elements}.",
+        "This heritage monument showcases {landmark_features} and commemorates {historical_elements}."
+    ]
 }
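These templates are plain `str.format` strings, so filling one only requires supplying the named placeholders. A minimal sketch follows; the filler phrases are invented examples, not values produced by the system.

# Sketch of filling a landmark template; the filler phrases are invented for illustration.
import random
from scene_detail_templates import SCENE_DETAIL_TEMPLATES

template = random.choice(SCENE_DETAIL_TEMPLATES["tourist_landmark"])
print(template.format(
    landmark_features="its wrought-iron lattice tower and observation decks",
    tourist_activities="guided tours and photography",
))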
scene_type.py
CHANGED
@@ -384,4 +384,127 @@ SCENE_TYPES = {
         "minimum_required": 3,
         "description": "A commercial kitchen with professional cooking equipment and food preparation areas"
     },
+    "tourist_landmark": {
+        "name": "Tourist Landmark",
+        "required_objects": [0],  # person
+        "optional_objects": [24, 26, 67],  # backpack, handbag, cell phone
+        "minimum_required": 0,  # people may be absent, but it is still a landmark
+        "description": "A location featuring a famous landmark with tourist activity",
+        "priority": 1.2  # raised priority
+    },
+    "natural_landmark": {
+        "name": "Natural Landmark",
+        "required_objects": [0],  # person
+        "optional_objects": [24, 26, 67],  # backpack, handbag, cell phone
+        "minimum_required": 0,
+        "description": "A natural landmark site with scenic views",
+        "priority": 1.2
+    },
+    "historical_monument": {
+        "name": "Historical Monument",
+        "required_objects": [0],  # person
+        "optional_objects": [24, 26, 67],  # backpack, handbag, cell phone
+        "minimum_required": 0,
+        "description": "A historical monument or heritage site",
+        "priority": 1.2
+    },
+    "general_indoor_space": {
+        "name": "General Indoor Space",
+        "required_objects": [],  # No strict required objects, depends on combination
+        "optional_objects": [
+            56,  # chair
+            57,  # couch
+            58,  # potted plant
+            59,  # bed
+            60,  # dining table
+            61,  # toilet
+            62,  # tv
+            63,  # laptop
+            66,  # keyboard
+            67,  # cell phone
+            73,  # book
+            74,  # clock
+            75,  # vase
+            39,  # bottle
+            41,  # cup
+        ],
+        "minimum_required": 2,  # Needs at least a few common indoor items
+        "description": "An indoor area with various common household or functional items.",
+        "priority": 0.8  # Lower priority than more specific scenes
+    },
+    "generic_street_view": {
+        "name": "Generic Street View",
+        "required_objects": [],  # More about the combination
+        "optional_objects": [
+            0,   # person
+            1,   # bicycle
+            2,   # car
+            3,   # motorcycle
+            5,   # bus
+            7,   # truck
+            9,   # traffic light
+            10,  # fire hydrant
+            11,  # stop sign
+            13,  # bench
+            # Consider adding building if YOLO detects it (not a standard COCO class for YOLOv8, but some custom models might)
+        ],
+        "minimum_required": 2,  # e.g., a car and a person, or multiple vehicles
+        "description": "An outdoor street view, likely in an urban or suburban setting, with vehicles and/or pedestrians.",
+        "priority": 0.85
+    },
+    "desk_area_workspace": {
+        "name": "Desk Area / Workspace",
+        "required_objects": [
+            63,  # laptop or 62 (tv as monitor) or 66 (keyboard)
+        ],
+        "optional_objects": [
+            56,  # chair
+            60,  # dining table (often used as a desk)
+            64,  # mouse
+            66,  # keyboard
+            73,  # book
+            41,  # cup
+            67,  # cell phone
+            74,  # clock
+        ],
+        "minimum_required": 2,  # e.g., laptop and chair, or table and keyboard
+        "description": "A workspace or desk area, typically featuring a computer and related accessories.",
+        "priority": 0.9
+    },
+    "outdoor_gathering_spot": {
+        "name": "Outdoor Gathering Spot",
+        "required_objects": [
+            0,  # person
+        ],
+        "optional_objects": [
+            13,  # bench
+            32,  # sports ball
+            24,  # backpack
+            25,  # umbrella
+            29,  # frisbee
+            33,  # kite
+            58,  # potted plant (if in a more structured park area)
+        ],
+        "minimum_required": 2,  # e.g., person and bench, or multiple people
+        "description": "An outdoor area where people might gather for leisure or activity.",
+        "priority": 0.8
+    },
+    "kitchen_counter_or_utility_area": {
+        "name": "Kitchen Counter or Utility Area",
+        "required_objects": [],
+        "optional_objects": [
+            39,  # bottle
+            41,  # cup
+            44,  # spoon
+            45,  # bowl
+            68,  # microwave
+            69,  # oven
+            70,  # toaster
+            71,  # sink
+            72,  # refrigerator
+        ],
+        "minimum_required": 2,  # e.g., sink and microwave, or refrigerator and bottles
+        "description": "An area likely used for food preparation or kitchen utilities.",
+        "priority": 0.9
+    }
 }
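Each entry combines `required_objects`, `optional_objects`, `minimum_required`, and an optional `priority` multiplier. The sketch below shows one naive way such a specification could be scored against a set of detected COCO class IDs; it is illustrative only, since the real scoring logic lives in scene_analyzer.py, whose diff is not rendered above, and the helper function here is hypothetical.

# Illustrative only; naive_scene_score is a hypothetical helper, not the project's scorer.
from scene_type import SCENE_TYPES

def naive_scene_score(scene_key: str, detected_class_ids: set) -> float:
    spec = SCENE_TYPES[scene_key]
    relevant = set(spec["required_objects"]) | set(spec["optional_objects"])
    matched = relevant & detected_class_ids
    if len(matched) < spec["minimum_required"]:
        return 0.0
    base = len(matched) / max(len(relevant), 1)   # fraction of relevant objects seen
    return base * spec.get("priority", 1.0)       # landmark types get a 1.2x boost

# Laptop, keyboard, chair, and table strongly suggest a desk area.
print(naive_scene_score("desk_area_workspace", {56, 60, 63, 66}))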
spatial_analyzer.py
CHANGED
@@ -282,19 +282,29 @@ class SpatialAnalyzer:
         # Group objects by category and region
         category_regions = {}

+        if not getattr(self, 'enable_landmark', True):
+            detected_objects = [obj for obj in detected_objects if not obj.get("is_landmark", False)]
+
+            # Filter landmark-related scene types
+            if scene_type in ["tourist_landmark", "natural_landmark", "historical_monument"]:
+                scene_type = "city_street"
+
+        # MODIFIED: Smart threshold evaluation instead of fixed values
+        should_identify = self._evaluate_zone_identification_feasibility(detected_objects, scene_type)
+
+        if not should_identify:
+            return {}
+
+        # MODIFIED: Build category_regions mapping (was missing in original)
         for obj in detected_objects:
+            category = self._categorize_object(obj)
+            if not category:
+                continue
+
             if category not in category_regions:
                 category_regions[category] = {}

+            region = obj.get("region", "center")
             if region not in category_regions[category]:
                 category_regions[category][region] = []

@@ -328,156 +338,470 @@ class SpatialAnalyzer:
         elif scene_type == "upscale_dining":
             # Upscale dining specific logic
             zones.update(self._identify_upscale_dining_zones(category_regions, detected_objects))
+        elif scene_type == "tourist_landmark" or "landmark" in scene_type:
+            # Handle landmark scene types
+            landmark_objects = [obj for obj in detected_objects if obj.get("is_landmark", False)]
+            if landmark_objects:
+                landmark_zones = self._identify_landmark_zones(landmark_objects)
+                zones.update(landmark_zones)
         else:
             # Default zone identification for other scene types
             zones.update(self._identify_default_zones(category_regions, detected_objects))

+        # Check for landmark objects even when the scene type is not a landmark type
+        if scene_type != "tourist_landmark" and "landmark" not in scene_type:
+            landmark_objects = [obj for obj in detected_objects if obj.get("is_landmark", False)]
+            if landmark_objects:
+                # Add landmark zones without overwriting existing zones
+                landmark_zones = self._identify_landmark_zones(landmark_objects)
+                # Make sure landmark zones do not overwrite other important zones already identified
+                for zone_id, zone_info in landmark_zones.items():
+                    if zone_id not in zones:
+                        zones[zone_id] = zone_info
+
+        # MODIFIED: Enhanced fallback strategy - try simplified identification if no zones found
         if not zones:
             zones.update(self._identify_default_zones(category_regions, detected_objects))

+            # Final fallback: create basic zones from high-confidence objects
+            if not zones:
+                zones.update(self._create_basic_zones_from_objects(detected_objects, scene_type))
+
         return zones

+    def _identify_core_objects_for_scene(self, detected_objects: List[Dict], scene_type: str) -> List[Dict]:
+        """
+        Identify core objects that define a particular scene type.
+
+        Args:
+            detected_objects: List of detected objects
+            scene_type: Scene type
+
+        Returns:
+            List of core objects for the scene
+        """
+        core_objects = []
+
+        scene_core_mapping = {
+            "bedroom": [59],  # bed
+            "kitchen": [68, 69, 71, 72],  # microwave, oven, sink, refrigerator
+            "living_room": [57, 58, 62],  # sofa, chair, tv
+            "dining_area": [60, 46, 47],  # dining table, fork, knife
+            "office_workspace": [63, 64, 66, 73]  # laptop, mouse, keyboard, book
+        }
+
+        if scene_type in scene_core_mapping:
+            core_class_ids = scene_core_mapping[scene_type]
+            for obj in detected_objects:
+                if obj["class_id"] in core_class_ids and obj.get("confidence", 0) >= 0.4:
+                    core_objects.append(obj)
+
+        return core_objects
+
+    def _get_object_categories(self, detected_objects: List[Dict]) -> set:
+        """Get unique object categories from detected objects."""
+        object_categories = set()
+        for obj in detected_objects:
+            category = self._categorize_object(obj)
+            if category:
+                object_categories.add(category)
+        return object_categories
+
+    def _create_basic_zones_from_objects(self, detected_objects: List[Dict], scene_type: str) -> Dict:
+        """
+        Create basic functional zones from individual high-confidence objects.
+        This is a fallback when standard zone identification fails.
+
+        Args:
+            detected_objects: List of detected objects
+            scene_type: Scene type
+
+        Returns:
+            Dictionary of basic zones
+        """
+        zones = {}
+
+        # Focus on high-confidence objects
+        high_conf_objects = [obj for obj in detected_objects if obj.get("confidence", 0) >= 0.6]
+
+        if not high_conf_objects:
+            high_conf_objects = detected_objects  # Fallback to all objects
+
+        # Create zones based on individual important objects
+        for i, obj in enumerate(high_conf_objects[:3]):  # Limit to top 3 objects
+            class_name = obj["class_name"]
+            region = obj.get("region", "center")
+
+            # Create descriptive zone based on object type
+            zone_description = self._get_basic_zone_description(class_name, scene_type)
+
+            if zone_description:
+                zones[f"functional_area_{i+1}"] = {
+                    "region": region,
+                    "objects": [class_name],
+                    "description": zone_description
+                }
+
+        return zones
+
+    def _get_basic_zone_description(self, class_name: str, scene_type: str) -> str:
+        """Generate basic zone description based on object and scene type."""
+
+        # Object-specific descriptions
+        descriptions = {
+            "bed": "Sleeping and rest area",
+            "sofa": "Seating and relaxation area",
+            "chair": "Seating area",
+            "dining table": "Dining and meal area",
+            "tv": "Entertainment and media area",
+            "laptop": "Work and computing area",
+            "potted plant": "Decorative and green space area",
+            "refrigerator": "Food storage and kitchen area",
+            "car": "Vehicle and transportation area",
+            "person": "Activity and social area"
+        }
+
+        return descriptions.get(class_name, f"Functional area with {class_name}")
+
+    def _categorize_object(self, obj: Dict) -> str:
+        """
+        Categorize detected objects into functional categories for zone identification.
+        """
+        class_id = obj.get("class_id", -1)
+        class_name = obj.get("class_name", "").lower()
+
+        # Use existing category mapping if available
+        if hasattr(self, 'OBJECT_CATEGORIES') and self.OBJECT_CATEGORIES:
+            for category, ids in self.OBJECT_CATEGORIES.items():
+                if class_id in ids:
+                    return category
+
+        # Fallback categorization based on class names for common COCO classes
+        furniture_items = ["chair", "couch", "bed", "dining table", "toilet"]
+        plant_items = ["potted plant"]
+        electronic_items = ["tv", "laptop", "mouse", "remote", "keyboard", "cell phone"]
+        vehicle_items = ["bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat"]
+        person_items = ["person"]
+        kitchen_items = ["bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl",
+                         "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog",
+                         "pizza", "donut", "cake", "refrigerator", "oven", "toaster", "sink", "microwave"]
+        sports_items = ["frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat",
+                        "baseball glove", "skateboard", "surfboard", "tennis racket"]
+        personal_items = ["handbag", "tie", "suitcase", "umbrella", "backpack"]
+
+        if any(item in class_name for item in furniture_items):
+            return "furniture"
+        elif any(item in class_name for item in plant_items):
+            return "plant"
+        elif any(item in class_name for item in electronic_items):
+            return "electronics"
+        elif any(item in class_name for item in vehicle_items):
+            return "vehicle"
+        elif any(item in class_name for item in person_items):
+            return "person"
+        elif any(item in class_name for item in kitchen_items):
+            return "kitchen_items"
+        elif any(item in class_name for item in sports_items):
+            return "sports"
+        elif any(item in class_name for item in personal_items):
+            return "personal_items"
+        else:
+            return "misc"
+
+    def _evaluate_zone_identification_feasibility(self, detected_objects: List[Dict], scene_type: str) -> bool:
+        """
+        Flexible feasibility assessment based on object relationships and spatial distribution.
+        """
+        if len(detected_objects) < 2:
+            return False
+
+        # Count objects at different confidence levels
+        high_conf_objects = [obj for obj in detected_objects if obj.get("confidence", 0) >= 0.6]
+        medium_conf_objects = [obj for obj in detected_objects if obj.get("confidence", 0) >= 0.4]
+
+        # Basic condition: require a minimum number of reasonably confident objects
+        if len(medium_conf_objects) < 2:
+            return False
+
+        # Evaluate functional relationships between objects
+        functional_relationships = self._calculate_functional_relationships(detected_objects)
+
+        # Evaluate spatial distribution diversity
+        spatial_diversity = self._calculate_spatial_diversity(detected_objects)
+
+        # Combined scoring mechanism
+        feasibility_score = 0
+
+        # Object count contribution (30% weight)
+        object_count_score = min(len(detected_objects) / 5.0, 1.0) * 0.3
+
+        # Confidence quality contribution (25% weight)
+        confidence_score = len(high_conf_objects) / max(len(detected_objects), 1) * 0.25
+
+        # Functional relationship contribution (25% weight)
+        relationship_score = functional_relationships * 0.25
+
+        # Spatial diversity contribution (20% weight)
+        diversity_score = spatial_diversity * 0.20
+
+        feasibility_score = object_count_score + confidence_score + relationship_score + diversity_score
+
+        # Dynamic threshold adjusted by scene complexity
+        complexity_threshold = self._get_complexity_threshold(scene_type)
+
+        return feasibility_score >= complexity_threshold
+
+    def _calculate_functional_relationships(self, detected_objects: List[Dict]) -> float:
+        """
+        Compute a functional-relationship score between objects,
+        based on common object combination patterns.
+        """
+        relationship_pairs = {
+            # Furniture combinations
+            frozenset([56, 60]): 1.0,  # chair + dining table (dining/work area)
+            frozenset([57, 62]): 0.9,  # sofa + tv (living area)
+            frozenset([59, 58]): 0.7,  # bed + potted plant (bedroom decor)
+
+            # Work-related combinations
+            frozenset([63, 66]): 0.9,  # laptop + keyboard (workspace)
+            frozenset([63, 64]): 0.8,  # laptop + mouse (workspace)
+            frozenset([60, 63]): 0.8,  # dining table + laptop (workspace)
+
+            # Kitchen-related combinations
+            frozenset([68, 72]): 0.9,  # microwave + refrigerator (kitchen)
+            frozenset([69, 71]): 0.8,  # oven + sink (kitchen)
+
+            # Dining-related combinations
+            frozenset([60, 40]): 0.8,  # dining table + wine glass (dining)
+            frozenset([60, 41]): 0.8,  # dining table + cup (dining)
+            frozenset([56, 40]): 0.7,  # chair + wine glass (dining)
+
+            # Traffic-related combinations
+            frozenset([2, 9]): 0.8,    # car + traffic light (traffic)
+            frozenset([0, 9]): 0.7,    # person + traffic light (crosswalk)
+        }
+
+        detected_class_ids = set(obj["class_id"] for obj in detected_objects)
+        max_possible_score = 0
+        actual_score = 0
+
+        for pair, score in relationship_pairs.items():
+            max_possible_score += score
+            if pair.issubset(detected_class_ids):
+                actual_score += score
+
+        return actual_score / max_possible_score if max_possible_score > 0 else 0
+
+    def _calculate_spatial_diversity(self, detected_objects: List[Dict]) -> float:
+        """
+        Compute the diversity of the objects' spatial distribution.
+        Checks whether objects are spread over different regions rather than
+        concentrated in a single one.
+        """
+        regions = set(obj.get("region", "center") for obj in detected_objects)
+        unique_regions = len(regions)
+
+        return min(unique_regions / 2.0, 1.0)
+
+    def _get_complexity_threshold(self, scene_type: str) -> float:
+        """
+        Return an appropriate complexity threshold for the scene type,
+        balancing zone-splitting needs across different scenes.
+        """
+        # Simpler scenes need a higher score before zones are split
+        simple_scenes = ["bedroom", "bathroom", "closet"]
+        # More complex scenes may split zones at a lower score
+        complex_scenes = ["living_room", "kitchen", "office_workspace", "dining_area"]
+
+        if scene_type in simple_scenes:
+            return 0.65  # Higher threshold to avoid over-segmentation
+        elif scene_type in complex_scenes:
+            return 0.45  # Lower threshold to allow reasonable segmentation
+        else:
+            return 0.55  # Medium threshold as a balanced strategy
+
+    def _identify_indoor_zones(self, category_regions: Dict, detected_objects: List[Dict], scene_type: str) -> Dict:
+        """
+        Balanced indoor functional zone identification.
+        Uses generic object-relationship analysis and avoids scene-specific hard coding.
+        """
+        zones = {}
+
+        # Identify the primary functional area (based on object relationships rather than scene type)
+        primary_zone = self._identify_primary_functional_area(detected_objects)
+        if primary_zone:
+            zones["primary_area"] = primary_zone
+
+        # Create a secondary functional area only with clear evidence and enough objects
+        if len(zones) >= 1 and len(detected_objects) >= 6:
+            secondary_zone = self._identify_secondary_functional_area(detected_objects, zones)
+            if secondary_zone:
+                zones["secondary_area"] = secondary_zone

         return zones

+    def _identify_primary_functional_area(self, detected_objects: List[Dict]) -> Dict:
+        """
+        Identify the primary functional area based on the strongest object combination.
+        Uses generic logic to handle a variety of indoor scenes.
+        """
+        # Dining area detection (table and chair combination)
+        dining_area = self._detect_functional_combination(
+            detected_objects,
+            primary_objects=[60],  # dining table
+            supporting_objects=[56, 40, 41, 42, 43],  # chair, wine glass, cup, fork, knife
+            min_supporting=2,
+            description_template="Dining area with table and seating arrangement"
+        )
+        if dining_area:
+            return dining_area
+
+        # Rest area detection (sofa/tv combination or bed)
+        seating_area = self._detect_functional_combination(
+            detected_objects,
+            primary_objects=[57, 59],  # sofa, bed
+            supporting_objects=[62, 58, 56],  # tv, potted plant, chair
+            min_supporting=1,
+            description_template="Seating and relaxation area"
+        )
+        if seating_area:
+            return seating_area
+
+        # Work area detection (electronics and furniture combination)
+        work_area = self._detect_functional_combination(
+            detected_objects,
+            primary_objects=[63, 66],  # laptop, keyboard
+            supporting_objects=[60, 56, 64],  # dining table, chair, mouse
+            min_supporting=2,
+            description_template="Workspace area with electronics and furniture"
+        )
+        if work_area:
+            return work_area
+
+        return None
+
+    def _identify_secondary_functional_area(self, detected_objects: List[Dict], existing_zones: Dict) -> Dict:
+        """
+        Identify a secondary functional area while avoiding overlap with the primary one.
+        """
+        # Collect regions already in use
+        used_regions = set(zone["region"] for zone in existing_zones.values())
+
+        # Decorative area detection (clusters of plants)
+        decorative_area = self._detect_functional_combination(
+            detected_objects,
+            primary_objects=[58],  # potted plant
+            supporting_objects=[75],  # vase
+            min_supporting=0,
+            min_primary=3,  # At least 3 plants required
+            description_template="Decorative area with plants and ornamental items",
+            exclude_regions=used_regions
+        )
+        if decorative_area:
+            return decorative_area
+
+        # Storage area detection (kitchen appliance combination)
+        storage_area = self._detect_functional_combination(
+            detected_objects,
+            primary_objects=[72, 68, 69],  # refrigerator, microwave, oven
+            supporting_objects=[71],  # sink
+            min_supporting=0,
+            min_primary=2,
+            description_template="Kitchen appliance and storage area",
+            exclude_regions=used_regions
+        )
+        if storage_area:
+            return storage_area
+
+        return None
+
+    def _detect_functional_combination(self, detected_objects: List[Dict], primary_objects: List[int],
+                                       supporting_objects: List[int], min_supporting: int,
+                                       description_template: str, min_primary: int = 1,
+                                       exclude_regions: set = None) -> Dict:
+        """
+        Generic functional-combination detection.
+        Judges functional areas from combinations of primary and supporting objects.
+
+        Args:
+            detected_objects: List of detected objects
+            primary_objects: class_id list of primary objects
+            supporting_objects: class_id list of supporting objects
+            min_supporting: Minimum number of supporting objects required
+            description_template: Description template
+            min_primary: Minimum number of primary objects required
+            exclude_regions: Set of regions to exclude
+
+        Returns:
+            Dict: Functional area info, or None if the requirements are not met
+        """
+        if exclude_regions is None:
+            exclude_regions = set()
+
+        # Collect primary objects
+        primary_objs = [obj for obj in detected_objects
+                        if obj["class_id"] in primary_objects and obj.get("confidence", 0) >= 0.4]
+
+        # Collect supporting objects
+        supporting_objs = [obj for obj in detected_objects
+                           if obj["class_id"] in supporting_objects and obj.get("confidence", 0) >= 0.4]
+
+        # Check the minimum count requirements
+        if len(primary_objs) < min_primary or len(supporting_objs) < min_supporting:
+            return None
+
+        # Organize objects by region
+        region_combinations = {}
+        all_relevant_objs = primary_objs + supporting_objs
+
+        for obj in all_relevant_objs:
+            region = obj["region"]
+
+            # Skip excluded regions
+            if region in exclude_regions:
+                continue
+
+            if region not in region_combinations:
+                region_combinations[region] = {"primary": [], "supporting": [], "all": []}
+
+            region_combinations[region]["all"].append(obj)
+
+            if obj["class_id"] in primary_objects:
+                region_combinations[region]["primary"].append(obj)
+            else:
+                region_combinations[region]["supporting"].append(obj)
+
+        # Find the best region combination
+        best_region = None
+        best_score = 0
+
+        for region, objs in region_combinations.items():
+            # Score this region
+            primary_count = len(objs["primary"])
+            supporting_count = len(objs["supporting"])
+
+            # Must satisfy the minimum requirements
+            if primary_count < min_primary or supporting_count < min_supporting:
+                continue
+
+            # Combination score (primary objects carry more weight)
+            score = primary_count * 2 + supporting_count
+
+            if score > best_score:
+                best_score = score
+                best_region = region
+
+        if best_region is None:
+            return None
+
+        best_combination = region_combinations[best_region]
+        all_objects = [obj["class_name"] for obj in best_combination["all"]]
+
+        return {
+            "region": best_region,
+            "objects": all_objects,
+            "description": description_template
+        }
+
     def _identify_intersection_zones(self, category_regions: Dict, detected_objects: List[Dict], viewpoint: str) -> Dict:
         """
         Identify functional zones for urban intersections with enhanced spatial awareness.
@@ -532,6 +856,142 @@ class SpatialAnalyzer:

         return zones

+    def _identify_landmark_zones(self, landmark_objects: List[Dict]) -> Dict:
+        """
+        Identify functional zones related to landmarks.
+
+        Args:
+            landmark_objects: List of objects identified as landmarks
+
+        Returns:
+            Dict: Landmark-related functional zones
+        """
+        landmark_zones = {}
+
+        if not landmark_objects:
+            print("Warning: No landmark objects provided to _identify_landmark_zones")
+            return landmark_zones
+
+        try:
+            for i, landmark in enumerate(landmark_objects):
+                if not isinstance(landmark, dict):
+                    print(f"Warning: Landmark object at index {i} is not a dictionary: {type(landmark)}")
+                    continue
+
+                landmark_id = landmark.get("landmark_id")
+                if not landmark_id:
+                    print(f"Warning: Missing landmark_id for landmark at index {i}")
+                    landmark_id = f"unknown_landmark_{i}"
+
+                landmark_name = landmark.get("class_name", "Landmark")
+                landmark_type = landmark.get("landmark_type", "architectural")
+                landmark_region = landmark.get("region", "middle_center")
+
+                # Create the main viewing zone for the landmark
+                zone_id = f"landmark_zone_{i+1}"
+                zone_name = f"{landmark_name} Viewing Area"
+
+                # Adjust the description according to the landmark type
+                if landmark_type == "natural":
+                    zone_description = f"Scenic viewpoint for observing {landmark_name}, a notable natural landmark in {landmark.get('location', 'this area')}."
+                    primary_function = "Nature observation and photography"
+                elif landmark_type == "monument":
+                    zone_description = f"Viewing area around {landmark_name}, a significant monument in {landmark.get('location', 'this area')}."
+                    primary_function = "Historical appreciation and cultural tourism"
+                else:  # architectural
+                    zone_description = f"Area centered around {landmark_name}, where visitors can observe and appreciate this iconic structure in {landmark.get('location', 'this area')}."
+                    primary_function = "Architectural tourism and photography"
+
+                # Determine objects related to the landmark
+                related_objects = ["person", "camera", "cell phone", "backpack"]
+
+                # Create the functional zone
+                landmark_zones[zone_id] = {
+                    "name": zone_name,
+                    "description": zone_description,
+                    "objects": ["landmark"] + [obj for obj in related_objects if obj in [o.get("class_name") for o in landmark_objects]],
+                    "region": landmark_region,
+                    "primary_function": primary_function
+                }
+
+                # Append construction year to the description if available
+                if "year_built" in landmark:
+                    landmark_zones[zone_id]["description"] += f" Built in {landmark['year_built']}."
+
+                # Append architectural style to the description if available
+                if "architectural_style" in landmark:
+                    landmark_zones[zone_id]["description"] += f" Features {landmark['architectural_style']} architectural style."
+
+                # Append significance to the description if available
+                if "significance" in landmark:
+                    landmark_zones[zone_id]["description"] += f" {landmark['significance']}."
+
+                try:
+                    # Create a photography spot
+                    photo_region = landmark_region  # Default to the same region as the landmark
+
+                    # Adjust the photo spot position relative to the landmark (usually in front of it)
+                    region_mapping = {
+                        "top_left": "bottom_right",
+                        "top_center": "bottom_center",
+                        "top_right": "bottom_left",
+                        "middle_left": "middle_right",
+                        "middle_center": "bottom_center",
+                        "middle_right": "middle_left",
+                        "bottom_left": "top_right",
+                        "bottom_center": "top_center",
+                        "bottom_right": "top_left"
+                    }
+
+                    if landmark_region in region_mapping:
+                        photo_region = region_mapping[landmark_region]
+
+                    landmark_zones[f"photo_spot_{i+1}"] = {
+                        "name": f"{landmark_name} Photography Spot",
+                        "description": f"Popular position for photographing {landmark_name} with optimal viewing angle.",
+                        "objects": ["camera", "person", "cell phone"],
+                        "region": photo_region,
+                        "primary_function": "Tourist photography"
+                    }
+                except Exception as e:
+                    print(f"Error creating photo spot zone: {e}")
+
+                try:
+                    # Famous landmarks may also have a souvenir area
+                    if landmark.get("confidence", 0) > 0.7:  # High-confidence landmarks are more likely to have one
+                        # Pick a suitable spot near the landmark (adjacent to, but not on, the landmark itself)
+                        adjacent_regions = {
+                            "top_left": ["top_center", "middle_left"],
+                            "top_center": ["top_left", "top_right"],
+                            "top_right": ["top_center", "middle_right"],
+                            "middle_left": ["top_left", "bottom_left"],
+                            "middle_center": ["middle_left", "middle_right"],
+                            "middle_right": ["top_right", "bottom_right"],
+                            "bottom_left": ["middle_left", "bottom_center"],
+                            "bottom_center": ["bottom_left", "bottom_right"],
+                            "bottom_right": ["bottom_center", "middle_right"]
+                        }
+
+                        if landmark_region in adjacent_regions:
+                            souvenir_region = adjacent_regions[landmark_region][0]  # Pick the first adjacent region
+
+                            landmark_zones[f"souvenir_area_{i+1}"] = {
+                                "name": f"{landmark_name} Souvenir Area",
+                                "description": f"Area where visitors can purchase souvenirs and memorabilia related to {landmark_name}.",
+                                "objects": ["person", "handbag", "backpack"],
+                                "region": souvenir_region,
+                                "primary_function": "Tourism commerce"
+                            }
+                except Exception as e:
+                    print(f"Error creating souvenir area zone: {e}")
+
+        except Exception as e:
+            print(f"Error in _identify_landmark_zones: {e}")
+            import traceback
+            traceback.print_exc()
+
+        return landmark_zones
+
     def _analyze_crossing_patterns(self, pedestrians: List[Dict], traffic_lights: List[Dict],
                                    region_distribution: Dict) -> Dict:
         """
@@ -601,7 +1061,7 @@ class SpatialAnalyzer:
         if not vehicles:
             return traffic_zones

-        #
+        # Group all vehicles into a single traffic zone
         vehicle_regions = {}
         for v in vehicles:
             region = v["region"]
@@ -652,7 +1112,7 @@ class SpatialAnalyzer:

     def _get_directional_description(self, region: str) -> str:
         """
-
+        Convert a grid region into a compass-style direction description (north, south, east, west).

         Args:
             region: Region name from the grid
@@ -1433,12 +1893,3 @@ class SpatialAnalyzer:
         return max(region_objects_dict.items(),
                    key=lambda x: len(x[1]),
                    default=("unknown", []))[0]
-
-    def _find_main_region(self, region_objects_dict: Dict) -> str:
-        """Find the main region with the most objects"""
-        if not region_objects_dict:
-            return "unknown"
-
-        return max(region_objects_dict.items(),
-                   key=lambda x: len(x[1]),
-                   default=("unknown", []))[0]
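To make the new feasibility gate above concrete, here is a toy sketch of the scoring on a small detection list. The constructor call and the use of the underscored helpers directly are assumptions for illustration; in the application these are invoked internally during functional-zone identification.

# Toy example of the feasibility scoring; SpatialAnalyzer's constructor arguments are assumed.
from spatial_analyzer import SpatialAnalyzer

analyzer = SpatialAnalyzer()
detections = [
    {"class_id": 60, "class_name": "dining table", "confidence": 0.82, "region": "middle_center"},
    {"class_id": 56, "class_name": "chair", "confidence": 0.74, "region": "middle_left"},
    {"class_id": 41, "class_name": "cup", "confidence": 0.55, "region": "middle_center"},
]

# Table + chair + cup gives strong functional relationships and two regions,
# so the score should clear the 0.45 threshold used for dining_area.
print(analyzer._evaluate_zone_identification_feasibility(detections, "dining_area"))
print(analyzer._identify_primary_functional_area(detections))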
video_processor.py
CHANGED
@@ -222,7 +222,7 @@ class VideoProcessor:
         else:
             obj_id = next_object_id
             next_object_id += 1
-
+
         # Use more prominent colors
         bright_colors = [
             (0, 0, 255),  # red