DawnC committed on
Commit
4d1f920
·
verified ·
1 Parent(s): 58a70f4

Upload 31 files

activity_templates.py CHANGED
@@ -320,5 +320,61 @@ ACTIVITY_TEMPLATES = {
320
  "Chef activities",
321
  "Commercial food handling",
322
  "Restaurant meal preparation"
323
  ]
324
  }
 
320
  "Chef activities",
321
  "Commercial food handling",
322
  "Restaurant meal preparation"
323
+ ],
324
+ "tourist_landmark": [
325
+ "Sightseeing",
326
+ "Photography",
327
+ "Guided tours",
328
+ "Learning about landmark history",
329
+ "Souvenir shopping",
330
+ "Cultural appreciation",
331
+ "Architectural observation"
332
+ ],
333
+ "natural_landmark": [
334
+ "Nature photography",
335
+ "Scenic viewing",
336
+ "Hiking",
337
+ "Nature appreciation",
338
+ "Wildlife watching",
339
+ "Outdoor recreation",
340
+ "Environmental education"
341
+ ],
342
+ "historical_monument": [
343
+ "Historical tours",
344
+ "Cultural heritage appreciation",
345
+ "Educational visits",
346
+ "Historical photography",
347
+ "Learning about past events",
348
+ "Architectural study",
349
+ "Heritage tourism"
350
+ ],
351
+ "general_indoor_space": [
352
+ "Engaging in general indoor activities",
353
+ "Resting or relaxing in an indoor setting",
354
+ "Possibly having a conversation or reading"
355
+ ],
356
+ "generic_street_view": [
357
+ "People walking or commuting",
358
+ "Vehicles driving on the road",
359
+ "Observing street traffic and urban activity",
360
+ "Waiting at a crosswalk or bus stop (if applicable objects present)"
361
+ ],
362
+ "desk_area_workspace": [
363
+ "Working on a computer or laptop",
364
+ "Studying or reading documents",
365
+ "Writing or taking notes",
366
+ "Participating in an online meeting (if computer present)"
367
+ ],
368
+ "outdoor_gathering_spot": [
369
+ "People socializing outdoors",
370
+ "Relaxing on a bench or in a park-like setting",
371
+ "Engaging in light recreational activities",
372
+ "Having a picnic (if food items or backpacks are present)"
373
+ ],
374
+ "kitchen_counter_or_utility_area": [
375
+ "Preparing food or drinks",
376
+ "Using kitchen appliances like a microwave or toaster",
377
+ "Washing dishes or cleaning",
378
+ "Storing food items"
379
  ]
380
  }
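The activity templates above are a plain Python dictionary keyed by scene type, so downstream code can look entries up directly. A minimal sketch under that assumption (the get_activities helper below is illustrative, not part of this commit):

from activity_templates import ACTIVITY_TEMPLATES

def get_activities(scene_type: str) -> list:
    # Fall back to the generic indoor template when a scene type has no entry
    return ACTIVITY_TEMPLATES.get(scene_type, ACTIVITY_TEMPLATES.get("general_indoor_space", []))

print(get_activities("tourist_landmark"))  # ['Sightseeing', 'Photography', ...]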
app.py CHANGED
@@ -19,8 +19,57 @@ from video_processor import VideoProcessor
19
  from llm_enhancer import LLMEnhancer
20
 
21
  # Initialize Processors with LLM support
22
- image_processor = ImageProcessor(use_llm=True, llm_model_path="meta-llama/Llama-3.2-3B-Instruct")
23
- video_processor = VideoProcessor(image_processor)
24
 
25
  # Helper Function
26
  def get_all_classes():
@@ -58,14 +107,93 @@ def get_all_classes():
58
  return sorted(default_classes.items())
59
 
60
  @spaces.GPU
61
- def handle_image_upload(image, model_name, confidence_threshold, filter_classes=None, use_llm=True):
62
  """Processes a single uploaded image."""
63
- print(f"Processing image with model: {model_name}, confidence: {confidence_threshold}, use_llm: {use_llm}")
64
  try:
65
  image_processor.use_llm = use_llm
66
- if hasattr(image_processor, 'scene_analyzer'):
67
- image_processor.scene_analyzer.use_llm = use_llm
68
- print(f"Updated existing scene_analyzer use_llm setting to: {use_llm}")
69
 
70
  class_ids_to_filter = None
71
  if filter_classes:
@@ -92,11 +220,13 @@ def handle_image_upload(image, model_name, confidence_threshold, filter_classes=
92
  print(f"Filtering image results for class IDs: {class_ids_to_filter}")
93
 
94
  # Call the existing image processing logic
 
95
  result_image, result_text, stats = image_processor.process_image(
96
  image,
97
  model_name,
98
  confidence_threshold,
99
- class_ids_to_filter
 
100
  )
101
 
102
  # Format stats for JSON display
@@ -191,15 +321,13 @@ def handle_image_upload(image, model_name, confidence_threshold, filter_classes=
191
 
192
  print(f"Original scene description (first 50 chars): {scene_desc[:50]}...")
193
 
194
- # Ensure a valid description is used
195
  clean_scene_desc = clean_description(scene_desc)
196
  print(f"Cleaned scene description (first 50 chars): {clean_scene_desc[:50]}...")
197
 
198
- # Even if cleaning leaves it empty, still show the original content
199
  if not clean_scene_desc.strip():
200
  clean_scene_desc = scene_desc
201
 
202
- # Build the HTML for the original description
203
  scene_desc_html = f"<div>{clean_scene_desc}</div>"
204
 
205
  # Get the LLM-enhanced description and default it to an empty string rather than None, otherwise a NoneType error occurs
@@ -210,18 +338,18 @@ def handle_image_upload(image, model_name, confidence_threshold, filter_classes=
210
  if not enhanced_description or not enhanced_description.strip():
211
  print("WARNING: LLM enhanced description is empty!")
212
 
213
- # Prepare the badge and description label
214
  llm_badge = ""
215
  description_to_show = ""
216
 
 
217
  if use_llm and enhanced_description:
218
  llm_badge = '<span style="display:inline-block; margin-left:8px; padding:3px 10px; border-radius:12px; background: linear-gradient(90deg, #38b2ac, #4299e1); color:white; font-size:0.7rem; font-weight:bold; box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1); border: 1px solid rgba(255, 255, 255, 0.2);">LLM Enhanced</span>'
219
  description_to_show = enhanced_description
220
- # Show the original description in the Original Scene Analysis accordion
221
  else:
222
  llm_badge = '<span style="display:inline-block; margin-left:8px; padding:3px 10px; border-radius:12px; background-color:#718096; color:white; font-size:0.7rem; font-weight:bold;">Basic</span>'
223
  description_to_show = clean_scene_desc
224
- # When the LLM is not used, the accordion shows no content
225
 
226
  # A badge appears in the heading when the LLM narrative is used
227
  scene_description_html = f'''
@@ -271,7 +399,7 @@ def handle_image_upload(image, model_name, confidence_threshold, filter_classes=
271
  print("WARNING: LLM enhanced description is empty!")
272
 
273
  return (result_image, result_text, formatted_stats, plot_figure,
274
- scene_description_html, original_desc_html,
275
  activities_list_data, safety_data, zones, lighting)
276
 
277
  except Exception as e:
@@ -471,6 +599,12 @@ def create_interface():
471
  info="Provides more detailed and natural language descriptions (may increase processing time)"
472
  )
473
 
474
  with gr.Accordion("Filter Classes", open=False):
475
  gr.HTML('<div class="section-heading" style="font-size: 1rem;">Common Categories</div>')
476
  with gr.Row():
@@ -490,24 +624,39 @@ def create_interface():
490
  with gr.Group(elem_classes="how-to-use"):
491
  gr.HTML('<div class="section-heading">How to Use (Image)</div>')
492
  gr.Markdown("""
493
- 1. Upload an image or use the camera
494
- 2. (Optional) Adjust settings like confidence threshold or model size (n, m=balanced, x=accurate)
495
- 3. In Analysis Settings, you can uncheck "Use LLM for enhanced scene descriptions" if you prefer faster processing
496
- 4. Optionally filter to specific object classes
497
- 5. Click **Detect Objects** button
498
  """)
499
  # Image Examples
500
  gr.Examples(
501
  examples=[
502
- "room_01.jpg",
503
- "room_02.jpg",
504
- "street_02.jpg",
505
- "street_04.jpg"
 
506
  ],
507
  inputs=image_input,
508
  label="Example Images"
509
  )
510
 
511
  # Right Column: Image Results
512
  with gr.Column(scale=6, elem_classes="output-panel"):
513
  with gr.Tabs(elem_classes="tabs"):
@@ -540,8 +689,8 @@ def create_interface():
540
  </p>
541
  </div>
542
  ''')
543
- image_scene_description_html = gr.HTML(label=None, elem_id="scene_analysis_description_text")
544
-
545
  # The original description is also shown when the LLM-enhanced narrative is used
546
  with gr.Accordion("Original Scene Analysis", open=False, elem_id="original_scene_analysis_accordion"):
547
  image_llm_description = gr.HTML(label=None, elem_id="original_scene_description_text")
@@ -709,7 +858,7 @@ def create_interface():
709
 
710
  image_detect_btn.click(
711
  fn=handle_image_upload,
712
- inputs=[image_input, image_model_dropdown, image_confidence, image_class_filter, use_llm],
713
  outputs=[
714
  image_result_image, image_result_text, image_stats_json, image_plot_output,
715
  image_scene_description_html, image_llm_description, image_activities_list, image_safety_list, image_zones_json,
@@ -732,18 +881,18 @@ def create_interface():
732
 
733
  # Footer
734
  gr.HTML("""
735
- <div class="footer" style="padding: 25px 0; text-align: center; background: linear-gradient(to right, #f5f9fc, #e1f5fe); border-top: 1px solid #e2e8f0; margin-top: 30px;">
736
- <div style="margin-bottom: 15px;">
737
- <p style="font-size: 14px; color: #4A5568; margin: 5px 0;">Powered by YOLOv8, CLIP, Meta Llama3.2 and Ultralytics • Created with Gradio</p>
738
- </div>
739
- <div style="display: flex; align-items: center; justify-content: center; gap: 20px; margin-top: 15px;">
740
- <p style="font-family: 'Arial', sans-serif; font-size: 14px; font-weight: 500; letter-spacing: 2px; background: linear-gradient(90deg, #38b2ac, #4299e1); -webkit-background-clip: text; -webkit-text-fill-color: transparent; margin: 0; text-transform: uppercase; display: inline-block;">EXPLORE THE CODE →</p>
741
- <a href="https://github.com/Eric-Chung-0511/Learning-Record/tree/main/Data%20Science%20Projects/VisionScout" target="_blank" style="text-decoration: none;">
742
- <img src="https://img.shields.io/badge/GitHub-VisionScout-4299e1?logo=github&style=for-the-badge">
743
- </a>
744
- </div>
745
- </div>
746
- """)
747
 
748
  return demo
749
 
@@ -751,4 +900,4 @@ def create_interface():
751
  if __name__ == "__main__":
752
  demo_interface = create_interface()
753
 
754
- demo_interface.launch()
 
19
  from llm_enhancer import LLMEnhancer
20
 
21
  # Initialize Processors with LLM support
22
+ image_processor = None
23
+ video_processor = None
24
+
25
+ def initialize_processors():
26
+ global image_processor, video_processor
27
+
28
+ try:
29
+ print("Attempting to initialize ImageProcessor with LLM support...")
30
+ image_processor = ImageProcessor(use_llm=True, llm_model_path="meta-llama/Llama-3.2-3B-Instruct")
31
+ print("ImageProcessor initialized successfully with LLM")
32
+
33
+ # Add diagnostic checks
34
+ if hasattr(image_processor, 'scene_analyzer'):
35
+ if image_processor.scene_analyzer is not None:
36
+ print(f"scene_analyzer initialized: {type(image_processor.scene_analyzer)}")
37
+ if hasattr(image_processor.scene_analyzer, 'use_llm'):
38
+ print(f"scene_analyzer.use_llm available: {image_processor.scene_analyzer.use_llm}")
39
+ else:
40
+ print("WARNING: scene_analyzer is None after initialization")
41
+ else:
42
+ print("WARNING: scene_analyzer attribute not found in image_processor")
43
+
44
+ video_processor = VideoProcessor(image_processor)
45
+ print("VideoProcessor initialized successfully")
46
+ return True
47
+
48
+ except Exception as e:
49
+ print(f"Error initializing processors with LLM: {e}")
50
+ import traceback
51
+ traceback.print_exc()
52
+
53
+ # Create fallback processor without LLM
54
+ try:
55
+ print("Attempting fallback initialization without LLM...")
56
+ image_processor = ImageProcessor(use_llm=False, enable_places365=False)
57
+ video_processor = VideoProcessor(image_processor)
58
+ print("Fallback processors initialized successfully without LLM and Places365")
59
+ return True
60
+
61
+ except Exception as fallback_error:
62
+ print(f"Fatal error: Cannot initialize processors: {fallback_error}")
63
+ import traceback
64
+ traceback.print_exc()
65
+ image_processor = None
66
+ video_processor = None
67
+ return False
68
+
69
+ # Initialize processors
70
+ initialization_success = initialize_processors()
71
+ if not initialization_success:
72
+ print("WARNING: Failed to initialize processors. Application may not function correctly.")
73
 
74
  # Helper Function
75
  def get_all_classes():
 
107
  return sorted(default_classes.items())
108
 
109
  @spaces.GPU
110
+ def handle_image_upload(image, model_name, confidence_threshold, filter_classes=None, use_llm=True, enable_landmark=True):
111
  """Processes a single uploaded image."""
112
+ # Enhanced safety check for image_processor
113
+ if image_processor is None:
114
+ error_msg = "Image processor is not initialized. Please restart the application or check system dependencies."
115
+ print(f"ERROR: {error_msg}")
116
+
117
+ # Create error plot
118
+ fig, ax = plt.subplots(figsize=(8, 6))
119
+ ax.text(0.5, 0.5, "Initialization Error\nProcessor Not Available",
120
+ color="red", ha="center", va="center", fontsize=14, fontweight="bold")
121
+ ax.axis('off')
122
+
123
+ return (None, error_msg, {}, fig, f"<div style='color: red; font-weight: bold;'>Error: {error_msg}</div>",
124
+ "<div style='color: red;'>Error: System not initialized</div>",
125
+ [["System Error"]], [["System Error"]], {}, {"time_of_day": "error", "confidence": 0})
126
+
127
+ # Additional safety check for processor attributes
128
+ if not hasattr(image_processor, 'use_llm'):
129
+ error_msg = "Image processor is corrupted. Missing required attributes."
130
+ print(f"ERROR: {error_msg}")
131
+
132
+ fig, ax = plt.subplots(figsize=(8, 6))
133
+ ax.text(0.5, 0.5, "Processor Error\nCorrupted State",
134
+ color="red", ha="center", va="center", fontsize=14, fontweight="bold")
135
+ ax.axis('off')
136
+
137
+ return (None, error_msg, {}, fig, f"<div style='color: red; font-weight: bold;'>Error: {error_msg}</div>",
138
+ "<div style='color: red;'>Error: Processor corrupted</div>",
139
+ [["Processor Error"]], [["Processor Error"]], {}, {"time_of_day": "error", "confidence": 0})
140
+
141
+ print(f"DIAGNOSTIC: Image upload handled with enable_landmark={enable_landmark}, use_llm={use_llm}")
142
+ print(f"Processing image with model: {model_name}, confidence: {confidence_threshold}, use_llm: {use_llm}, enable_landmark: {enable_landmark}")
143
  try:
144
  image_processor.use_llm = use_llm
145
+
146
+ # Ensure scene_analyzer is not None
147
+ if hasattr(image_processor, 'scene_analyzer') and image_processor.scene_analyzer is not None:
148
+ if hasattr(image_processor.scene_analyzer, 'use_llm'):
149
+ image_processor.scene_analyzer.use_llm = use_llm
150
+ print(f"Updated existing scene_analyzer use_llm setting to: {use_llm}")
151
+
152
+ # Check and configure landmark detection
153
+ if hasattr(image_processor.scene_analyzer, 'use_landmark_detection'):
154
+ # Set all related flags
155
+ image_processor.scene_analyzer.use_landmark_detection = enable_landmark
156
+ image_processor.scene_analyzer.enable_landmark = enable_landmark
157
+
158
+ # Make sure the processor itself also carries this option
159
+ image_processor.enable_landmark = enable_landmark
160
+
161
+ # Check and configure deeper components
162
+ if hasattr(image_processor.scene_analyzer, 'scene_describer') and image_processor.scene_analyzer.scene_describer is not None:
163
+ image_processor.scene_analyzer.scene_describer.enable_landmark = enable_landmark
164
+
165
+ # Check and set the flag on the CLIP analyzer
166
+ if hasattr(image_processor.scene_analyzer, 'clip_analyzer') and image_processor.scene_analyzer.clip_analyzer is not None:
167
+ if hasattr(image_processor.scene_analyzer.clip_analyzer, 'enable_landmark'):
168
+ image_processor.scene_analyzer.clip_analyzer.enable_landmark = enable_landmark
169
+
170
+ # Check and configure the LLM enhancer
171
+ if hasattr(image_processor.scene_analyzer, 'llm_enhancer') and image_processor.scene_analyzer.llm_enhancer is not None:
172
+ if hasattr(image_processor.scene_analyzer.llm_enhancer, 'enable_landmark'):
173
+ image_processor.scene_analyzer.llm_enhancer.enable_landmark = enable_landmark
174
+ print(f"Updated LLM enhancer enable_landmark to: {enable_landmark}")
175
+
176
+ print(f"Updated all landmark detection settings to: {enable_landmark}")
177
+ else:
178
+ print("WARNING: scene_analyzer is None or not available")
179
+ if hasattr(image_processor, 'enable_landmark'):
180
+ image_processor.enable_landmark = enable_landmark
181
+
182
+ # Configure deeper components
183
+ if hasattr(image_processor.scene_analyzer, 'scene_describer'):
184
+ image_processor.scene_analyzer.scene_describer.enable_landmark = enable_landmark
185
+
186
+ # Set the flag on the CLIP analyzer
187
+ if hasattr(image_processor.scene_analyzer, 'clip_analyzer'):
188
+ if hasattr(image_processor.scene_analyzer.clip_analyzer, 'enable_landmark'):
189
+ image_processor.scene_analyzer.clip_analyzer.enable_landmark = enable_landmark
190
+
191
+ # If an LLM enhancer is present, configure it as well
192
+ if hasattr(image_processor.scene_analyzer, 'llm_enhancer'):
193
+ image_processor.scene_analyzer.llm_enhancer.enable_landmark = enable_landmark
194
+ print(f"Updated LLM enhancer enable_landmark to: {enable_landmark}")
195
+
196
+ print(f"Updated all landmark detection settings to: {enable_landmark}")
197
 
198
  class_ids_to_filter = None
199
  if filter_classes:
 
220
  print(f"Filtering image results for class IDs: {class_ids_to_filter}")
221
 
222
  # Call the existing image processing logic
223
+ print(f"DEBUG: app.py 傳遞 enable_landmark={enable_landmark} 到 process_image")
224
  result_image, result_text, stats = image_processor.process_image(
225
  image,
226
  model_name,
227
  confidence_threshold,
228
+ class_ids_to_filter,
229
+ enable_landmark
230
  )
231
 
232
  # Format stats for JSON display
 
321
 
322
  print(f"Original scene description (first 50 chars): {scene_desc[:50]}...")
323
 
324
+ # Ensure a valid original description is used
325
  clean_scene_desc = clean_description(scene_desc)
326
  print(f"Cleaned scene description (first 50 chars): {clean_scene_desc[:50]}...")
327
 
 
328
  if not clean_scene_desc.strip():
329
  clean_scene_desc = scene_desc
330
 
 
331
  scene_desc_html = f"<div>{clean_scene_desc}</div>"
332
 
333
  # Get the LLM-enhanced description and default it to an empty string rather than None, otherwise a NoneType error occurs
 
338
  if not enhanced_description or not enhanced_description.strip():
339
  print("WARNING: LLM enhanced description is empty!")
340
 
341
+ # Badge and description label
342
  llm_badge = ""
343
  description_to_show = ""
344
 
345
+ # Show the original description in the Original Scene Analysis accordion
346
  if use_llm and enhanced_description:
347
  llm_badge = '<span style="display:inline-block; margin-left:8px; padding:3px 10px; border-radius:12px; background: linear-gradient(90deg, #38b2ac, #4299e1); color:white; font-size:0.7rem; font-weight:bold; box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1); border: 1px solid rgba(255, 255, 255, 0.2);">LLM Enhanced</span>'
348
  description_to_show = enhanced_description
349
+
350
  else:
351
  llm_badge = '<span style="display:inline-block; margin-left:8px; padding:3px 10px; border-radius:12px; background-color:#718096; color:white; font-size:0.7rem; font-weight:bold;">Basic</span>'
352
  description_to_show = clean_scene_desc
 
353
 
354
  # A badge appears in the heading when the LLM narrative is used
355
  scene_description_html = f'''
 
399
  print("WARNING: LLM enhanced description is empty!")
400
 
401
  return (result_image, result_text, formatted_stats, plot_figure,
402
+ scene_description_html, original_desc_html,
403
  activities_list_data, safety_data, zones, lighting)
404
 
405
  except Exception as e:
 
599
  info="Provides more detailed and natural language descriptions (may increase processing time)"
600
  )
601
 
602
+ use_landmark_detection = gr.Checkbox(
603
+ label="Use CLIP for Landmark Detection",
604
+ value=False,
605
+ info="Detect famous landmarks, monuments, and tourist attractions that standard object detection cannot recognize (increases processing time)"
606
+ )
607
+
608
  with gr.Accordion("Filter Classes", open=False):
609
  gr.HTML('<div class="section-heading" style="font-size: 1rem;">Common Categories</div>')
610
  with gr.Row():
 
624
  with gr.Group(elem_classes="how-to-use"):
625
  gr.HTML('<div class="section-heading">How to Use (Image)</div>')
626
  gr.Markdown("""
627
+ 1. Upload an image or use the camera
628
+ 2. *(Optional)* Adjust settings like confidence threshold or model size (n, m = balanced, x = accurate)
629
+ 3. In **Analysis Settings**, you can:
630
+ * Uncheck **Use LLM** to skip enhanced descriptions (faster)
631
+ * Check **Use CLIP for Landmark Detection** to identify famous landmarks like museums, monuments, and tourist attractions *(may take longer)*
632
+ * Filter object classes to focus on specific types of objects *(optional)*
633
+ 4. Click **Analyze Image** button
634
+
635
+ **💡 Tip:** For landmark recognition (e.g. Louvre Museum), make sure to enable **CLIP for Landmark Detection** in the settings above.
636
  """)
637
+
638
+
639
  # Image Examples
640
  gr.Examples(
641
  examples=[
642
+ "/content/drive/Othercomputers/我的 MacBook Pro/Learning/VisionScout/test_images/room_01.jpg",
643
+ "/content/drive/Othercomputers/我的 MacBook Pro/Learning/VisionScout/test_images/room_02.jpg",
644
+ "/content/drive/Othercomputers/我的 MacBook Pro/Learning/VisionScout/test_images/street_04.jpg",
645
+ "/content/drive/Othercomputers/我的 MacBook Pro/Learning/VisionScout/test_images/street_05.jpg",
646
+ "/content/drive/Othercomputers/我的 MacBook Pro/Learning/VisionScout/test_images/landmark_Louvre_01.jpg",
647
  ],
648
  inputs=image_input,
649
  label="Example Images"
650
  )
651
 
652
+ gr.HTML("""
653
+ <div style="text-align: center; margin-top: 8px; padding: 6px; background-color: #f8f9fa; border-radius: 4px; border: 1px solid #e2e8f0;">
654
+ <p style="font-size: 12px; color: #718096; margin: 0;">
655
+ 📷 Sample images sourced from <a href="https://unsplash.com" target="_blank" style="color: #3182ce; text-decoration: underline;">Unsplash</a>
656
+ </p>
657
+ </div>
658
+ """)
659
+
660
  # Right Column: Image Results
661
  with gr.Column(scale=6, elem_classes="output-panel"):
662
  with gr.Tabs(elem_classes="tabs"):
 
689
  </p>
690
  </div>
691
  ''')
692
+ image_scene_description_html = gr.HTML(label=None, elem_id="scene_analysis_description_text")
693
+
694
  # The original description is also shown when the LLM-enhanced narrative is used
695
  with gr.Accordion("Original Scene Analysis", open=False, elem_id="original_scene_analysis_accordion"):
696
  image_llm_description = gr.HTML(label=None, elem_id="original_scene_description_text")
 
858
 
859
  image_detect_btn.click(
860
  fn=handle_image_upload,
861
+ inputs=[image_input, image_model_dropdown, image_confidence, image_class_filter, use_llm, use_landmark_detection ],
862
  outputs=[
863
  image_result_image, image_result_text, image_stats_json, image_plot_output,
864
  image_scene_description_html, image_llm_description, image_activities_list, image_safety_list, image_zones_json,
 
881
 
882
  # Footer
883
  gr.HTML("""
884
+ <div class="footer" style="padding: 25px 0; text-align: center; background: linear-gradient(to right, #f5f9fc, #e1f5fe); border-top: 1px solid #e2e8f0; margin-top: 30px;">
885
+ <div style="margin-bottom: 15px;">
886
+ <p style="font-size: 14px; color: #4A5568; margin: 5px 0;">Powered by YOLOv8, CLIP, Places365, Meta Llama3.2 and Ultralytics • Created with Gradio</p>
887
+ </div>
888
+ <div style="display: flex; align-items: center; justify-content: center; gap: 20px; margin-top: 15px;">
889
+ <p style="font-family: 'Arial', sans-serif; font-size: 14px; font-weight: 500; letter-spacing: 2px; background: linear-gradient(90deg, #38b2ac, #4299e1); -webkit-background-clip: text; -webkit-text-fill-color: transparent; margin: 0; text-transform: uppercase; display: inline-block;">EXPLORE THE CODE →</p>
890
+ <a href="https://github.com/Eric-Chung-0511/Learning-Record/tree/main/Data%20Science%20Projects/VisionScout" target="_blank" style="text-decoration: none;">
891
+ <img src="https://img.shields.io/badge/GitHub-VisionScout-4299e1?logo=github&style=for-the-badge">
892
+ </a>
893
+ </div>
894
+ </div>
895
+ """)
896
 
897
  return demo
898
 
 
900
  if __name__ == "__main__":
901
  demo_interface = create_interface()
902
 
903
+ demo_interface.launch(debug=True)
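For reference, the expanded handler signature can also be exercised outside the Gradio UI. A minimal sketch, assuming the module-level processors initialized successfully; the image file name and model identifier below are placeholders, not values from this commit:

from PIL import Image
import app  # the module shown above

img = Image.open("street_04.jpg")  # placeholder test image
outputs = app.handle_image_upload(
    image=img,
    model_name="yolov8m.pt",        # placeholder; use a value from image_model_dropdown
    confidence_threshold=0.25,
    filter_classes=None,
    use_llm=False,                  # skip LLM enhancement for a faster run
    enable_landmark=True,           # new flag introduced in this commit
)
result_image, result_text, stats = outputs[:3]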
clip_analyzer.py CHANGED
@@ -20,12 +20,12 @@ class CLIPAnalyzer:
20
  Use CLIP to integrate scene understanding functionality
21
  """
22
 
23
- def __init__(self, model_name: str = "ViT-B/32", device: str = None):
24
  """
25
  Initialize the CLIP analyzer.
26
 
27
  Args:
28
- model_name: CLIP Model name, "ViT-B/32"、"ViT-B/16"、"ViT-L/14"
29
  device: Use the GPU if available
30
  """
31
  # Automatically select the device
@@ -55,49 +55,150 @@ class CLIPAnalyzer:
55
  self._prepare_text_prompts()
56
 
57
  def _prepare_text_prompts(self):
58
- """準備所有文本提示的 CLIP 特徵"""
59
- # base prompt
60
- scene_texts = [self.scene_type_prompts[scene_type] for scene_type in self.scene_type_prompts]
61
- self.scene_type_tokens = clip.tokenize(scene_texts).to(self.device)
62
-
63
- # cultural
64
- self.cultural_tokens_dict = {}
65
- for scene_type, prompts in self.cultural_scene_prompts.items():
66
- self.cultural_tokens_dict[scene_type] = clip.tokenize(prompts).to(self.device)
67
-
68
- # Light
69
- lighting_texts = [self.lighting_condition_prompts[cond] for cond in self.lighting_condition_prompts]
70
- self.lighting_tokens = clip.tokenize(lighting_texts).to(self.device)
71
-
72
- # specializes_status
73
- self.specialized_tokens_dict = {}
74
- for scene_type, prompts in self.specialized_scene_prompts.items():
75
- self.specialized_tokens_dict[scene_type] = clip.tokenize(prompts).to(self.device)
76
-
77
- # view point
78
- viewpoint_texts = [self.viewpoint_prompts[viewpoint] for viewpoint in self.viewpoint_prompts]
79
- self.viewpoint_tokens = clip.tokenize(viewpoint_texts).to(self.device)
80
-
81
- # object combination
82
- object_combination_texts = [self.object_combination_prompts[combo] for combo in self.object_combination_prompts]
83
- self.object_combination_tokens = clip.tokenize(object_combination_texts).to(self.device)
84
-
85
- # activity prompts
86
- activity_texts = [self.activity_prompts[activity] for activity in self.activity_prompts]
87
- self.activity_tokens = clip.tokenize(activity_texts).to(self.device)
88
-
89
- def analyze_image(self, image, include_cultural_analysis: bool = True) -> Dict[str, Any]:
90
  """
91
  Analyze the image and predict scene type and lighting conditions.
92
 
93
  Args:
94
  image: Input image (PIL Image or numpy array)
95
  include_cultural_analysis: Whether to include detailed analysis of cultural scenes
 
96
 
97
  Returns:
98
  Dict: Analysis results containing scene type predictions and lighting conditions
99
  """
100
  try:
 
101
  # Ensure the image is in PIL format
102
  if not isinstance(image, Image.Image):
103
  if isinstance(image, np.ndarray):
@@ -113,46 +214,127 @@ class CLIPAnalyzer:
113
  image_features = self.model.encode_image(image_input)
114
  image_features = image_features / image_features.norm(dim=-1, keepdim=True)
115
 
116
- # Analyze the scene type
117
- scene_scores = self._analyze_scene_type(image_features)
 
118
 
119
- # Analyze lighting conditions
120
  lighting_scores = self._analyze_lighting_condition(image_features)
121
-
122
- # Enhanced analysis for cultural scenes
123
  cultural_analysis = {}
124
- if include_cultural_analysis:
125
- for scene_type in self.cultural_scene_prompts:
126
- if scene_type in scene_scores and scene_scores[scene_type] > 0.2:
127
- cultural_analysis[scene_type] = self._analyze_cultural_scene(
128
- image_features, scene_type
 
129
  )
130
 
131
  specialized_analysis = {}
132
- for scene_type in self.specialized_scene_prompts:
133
- if scene_type in scene_scores and scene_scores[scene_type] > 0.2:
134
- specialized_analysis[scene_type] = self._analyze_specialized_scene(
135
- image_features, scene_type
136
  )
137
 
138
  viewpoint_scores = self._analyze_viewpoint(image_features)
139
-
140
  object_combination_scores = self._analyze_object_combinations(image_features)
141
-
142
  activity_scores = self._analyze_activities(image_features)
143
 
144
- # display results
 
 
145
  result = {
146
  "scene_scores": scene_scores,
147
- "top_scene": max(scene_scores.items(), key=lambda x: x[1]),
148
- "lighting_condition": max(lighting_scores.items(), key=lambda x: x[1]),
149
- "embedding": image_features.cpu().numpy().tolist()[0] if self.device == "cuda" else image_features.numpy().tolist()[0],
150
- "viewpoint": max(viewpoint_scores.items(), key=lambda x: x[1]),
151
- "object_combinations": sorted(object_combination_scores.items(), key=lambda x: x[1], reverse=True)[:3],
152
- "activities": sorted(activity_scores.items(), key=lambda x: x[1], reverse=True)[:3]
153
  }
154
 
155
- if cultural_analysis:
 
156
  result["cultural_analysis"] = cultural_analysis
157
 
158
  if specialized_analysis:
@@ -164,15 +346,49 @@ class CLIPAnalyzer:
164
  print(f"Error analyzing image with CLIP: {e}")
165
  import traceback
166
  traceback.print_exc()
167
- return {"error": str(e)}
 
168
 
169
- def _analyze_scene_type(self, image_features: torch.Tensor) -> Dict[str, float]:
170
- """分析圖像特徵與各場景類型的相似度"""
 
171
  with torch.no_grad():
172
  # Compute scene-type text features
173
  text_features = self.model.encode_text(self.scene_type_tokens)
174
  text_features = text_features / text_features.norm(dim=-1, keepdim=True)
175
 
176
  # 計算相似度分數
177
  similarity = (100 * image_features @ text_features.T).softmax(dim=-1)
178
  similarity = similarity.cpu().numpy()[0] if self.device == "cuda" else similarity.numpy()[0]
@@ -180,7 +396,36 @@ class CLIPAnalyzer:
180
  # 建立場景分數字典
181
  scene_scores = {}
182
  for i, scene_type in enumerate(self.scene_type_prompts.keys()):
183
- scene_scores[scene_type] = float(similarity[i])
 
 
 
184
 
185
  return scene_scores
186
 
@@ -388,3 +633,12 @@ class CLIPAnalyzer:
388
  result[query] = float(similarity[i])
389
 
390
  return result
 
 
 
20
  Use CLIP to integrate scene understanding functionality
21
  """
22
 
23
+ def __init__(self, model_name: str = "ViT-L/14", device: str = None):
24
  """
25
  Initialize the CLIP analyzer.
26
 
27
  Args:
28
+ model_name: CLIP model name, defaults to "ViT-L/14"
29
  device: Use the GPU if available
30
  """
31
  # Automatically select the device
 
55
  self._prepare_text_prompts()
56
 
57
  def _prepare_text_prompts(self):
58
+ """準備所有文本提示的 CLIP 特徵並存儲到 self.text_features_cache 中"""
59
+ self.text_features_cache = {}
60
+
61
+ # Handle base scene types (SCENE_TYPE_PROMPTS)
62
+ if hasattr(self, 'scene_type_prompts') and self.scene_type_prompts:
63
+ scene_texts = [prompt for scene_type, prompt in self.scene_type_prompts.items()]
64
+ if scene_texts:
65
+ self.text_features_cache["scene_type_keys"] = list(self.scene_type_prompts.keys())
66
+ try:
67
+ self.text_features_cache["scene_type_tokens"] = clip.tokenize(scene_texts).to(self.device)
68
+ except Exception as e:
69
+ print(f"Warning: Error tokenizing scene_type_prompts: {e}")
70
+ self.text_features_cache["scene_type_tokens"] = None # 標記錯誤或空
71
+ else:
72
+ self.text_features_cache["scene_type_keys"] = []
73
+ self.text_features_cache["scene_type_tokens"] = None
74
+ else:
75
+ self.text_features_cache["scene_type_keys"] = []
76
+ self.text_features_cache["scene_type_tokens"] = None
77
+
78
+ # Handle cultural scenes (CULTURAL_SCENE_PROMPTS)
79
+ # cultural_tokens_dict stores the tokenized prompts
80
+ cultural_tokens_dict_val = {}
81
+ if hasattr(self, 'cultural_scene_prompts') and self.cultural_scene_prompts:
82
+ for scene_type, prompts in self.cultural_scene_prompts.items():
83
+ if prompts and isinstance(prompts, list) and all(isinstance(p, str) for p in prompts):
84
+ try:
85
+ cultural_tokens_dict_val[scene_type] = clip.tokenize(prompts).to(self.device)
86
+ except Exception as e:
87
+ print(f"Warning: Error tokenizing cultural_scene_prompts for {scene_type}: {e}")
88
+ cultural_tokens_dict_val[scene_type] = None # 標記錯誤或空
89
+ else:
90
+ cultural_tokens_dict_val[scene_type] = None # prompts 不合規
91
+ self.text_features_cache["cultural_tokens_dict"] = cultural_tokens_dict_val
92
+
93
+ # Handle lighting conditions (LIGHTING_CONDITION_PROMPTS)
94
+ if hasattr(self, 'lighting_condition_prompts') and self.lighting_condition_prompts:
95
+ lighting_texts = [prompt for cond, prompt in self.lighting_condition_prompts.items()]
96
+ if lighting_texts:
97
+ self.text_features_cache["lighting_condition_keys"] = list(self.lighting_condition_prompts.keys())
98
+ try:
99
+ self.text_features_cache["lighting_tokens"] = clip.tokenize(lighting_texts).to(self.device)
100
+ except Exception as e:
101
+ print(f"Warning: Error tokenizing lighting_condition_prompts: {e}")
102
+ self.text_features_cache["lighting_tokens"] = None
103
+ else:
104
+ self.text_features_cache["lighting_condition_keys"] = []
105
+ self.text_features_cache["lighting_tokens"] = None
106
+ else:
107
+ self.text_features_cache["lighting_condition_keys"] = []
108
+ self.text_features_cache["lighting_tokens"] = None
109
+
110
+ # Handle specialized scenes (SPECIALIZED_SCENE_PROMPTS)
111
+ specialized_tokens_dict_val = {}
112
+ if hasattr(self, 'specialized_scene_prompts') and self.specialized_scene_prompts:
113
+ for scene_type, prompts in self.specialized_scene_prompts.items():
114
+ if prompts and isinstance(prompts, list) and all(isinstance(p, str) for p in prompts):
115
+ try:
116
+ specialized_tokens_dict_val[scene_type] = clip.tokenize(prompts).to(self.device)
117
+ except Exception as e:
118
+ print(f"Warning: Error tokenizing specialized_scene_prompts for {scene_type}: {e}")
119
+ specialized_tokens_dict_val[scene_type] = None
120
+ else:
121
+ specialized_tokens_dict_val[scene_type] = None
122
+ self.text_features_cache["specialized_tokens_dict"] = specialized_tokens_dict_val
123
+
124
+ # Handle viewpoints (VIEWPOINT_PROMPTS)
125
+ if hasattr(self, 'viewpoint_prompts') and self.viewpoint_prompts:
126
+ viewpoint_texts = [prompt for viewpoint, prompt in self.viewpoint_prompts.items()]
127
+ if viewpoint_texts:
128
+ self.text_features_cache["viewpoint_keys"] = list(self.viewpoint_prompts.keys())
129
+ try:
130
+ self.text_features_cache["viewpoint_tokens"] = clip.tokenize(viewpoint_texts).to(self.device)
131
+ except Exception as e:
132
+ print(f"Warning: Error tokenizing viewpoint_prompts: {e}")
133
+ self.text_features_cache["viewpoint_tokens"] = None
134
+ else:
135
+ self.text_features_cache["viewpoint_keys"] = []
136
+ self.text_features_cache["viewpoint_tokens"] = None
137
+ else:
138
+ self.text_features_cache["viewpoint_keys"] = []
139
+ self.text_features_cache["viewpoint_tokens"] = None
140
+
141
+ # Handle object combinations (OBJECT_COMBINATION_PROMPTS)
142
+ if hasattr(self, 'object_combination_prompts') and self.object_combination_prompts:
143
+ object_combination_texts = [prompt for combo, prompt in self.object_combination_prompts.items()]
144
+ if object_combination_texts:
145
+ self.text_features_cache["object_combination_keys"] = list(self.object_combination_prompts.keys())
146
+ try:
147
+ self.text_features_cache["object_combination_tokens"] = clip.tokenize(object_combination_texts).to(self.device)
148
+ except Exception as e:
149
+ print(f"Warning: Error tokenizing object_combination_prompts: {e}")
150
+ self.text_features_cache["object_combination_tokens"] = None
151
+ else:
152
+ self.text_features_cache["object_combination_keys"] = []
153
+ self.text_features_cache["object_combination_tokens"] = None
154
+ else:
155
+ self.text_features_cache["object_combination_keys"] = []
156
+ self.text_features_cache["object_combination_tokens"] = None
157
+
158
+ # Handle activities (ACTIVITY_PROMPTS)
159
+ if hasattr(self, 'activity_prompts') and self.activity_prompts:
160
+ activity_texts = [prompt for activity, prompt in self.activity_prompts.items()]
161
+ if activity_texts:
162
+ self.text_features_cache["activity_keys"] = list(self.activity_prompts.keys())
163
+ try:
164
+ self.text_features_cache["activity_tokens"] = clip.tokenize(activity_texts).to(self.device)
165
+ except Exception as e:
166
+ print(f"Warning: Error tokenizing activity_prompts: {e}")
167
+ self.text_features_cache["activity_tokens"] = None
168
+ else:
169
+ self.text_features_cache["activity_keys"] = []
170
+ self.text_features_cache["activity_tokens"] = None
171
+ else:
172
+ self.text_features_cache["activity_keys"] = []
173
+ self.text_features_cache["activity_tokens"] = None
174
+
175
+ self.scene_type_tokens = self.text_features_cache["scene_type_tokens"]
176
+ self.lighting_tokens = self.text_features_cache["lighting_tokens"]
177
+ self.viewpoint_tokens = self.text_features_cache["viewpoint_tokens"]
178
+ self.object_combination_tokens = self.text_features_cache["object_combination_tokens"]
179
+ self.activity_tokens = self.text_features_cache["activity_tokens"]
180
+ self.cultural_tokens_dict = self.text_features_cache["cultural_tokens_dict"]
181
+ self.specialized_tokens_dict = self.text_features_cache["specialized_tokens_dict"]
182
+
183
+ print("CLIP text_features_cache prepared.")
184
+
185
+ def analyze_image(self, image, include_cultural_analysis=True, exclude_categories=None, enable_landmark=True, places365_guidance=None):
186
  """
187
  Analyze the image and predict scene type and lighting conditions.
188
 
189
  Args:
190
  image: Input image (PIL Image or numpy array)
191
  include_cultural_analysis: Whether to include detailed analysis of cultural scenes
192
+ exclude_categories: List of categories to exclude
193
+ enable_landmark: Whether landmark detection is enabled
194
+ places365_guidance: Scene guidance information from Places365 (optional)
195
+
196
 
197
  Returns:
198
  Dict: Analysis results containing scene type predictions and lighting conditions
199
  """
200
  try:
201
+ self.enable_landmark = enable_landmark # 更新實例的 enable_landmark 狀態
202
  # Ensure the image is in PIL format
203
  if not isinstance(image, Image.Image):
204
  if isinstance(image, np.ndarray):
 
214
  image_features = self.model.encode_image(image_input)
215
  image_features = image_features / image_features.norm(dim=-1, keepdim=True)
216
 
217
+ places365_focus_areas = []
218
+ places365_scene_context = "" # 用於存儲 Places365 提供的場景描述
219
+
220
+ if places365_guidance and isinstance(places365_guidance, dict) and places365_guidance.get('confidence', 0) > 0.4:
221
+ mapped_scene = places365_guidance.get('mapped_scene_type', '')
222
+ scene_label = places365_guidance.get('scene_label', '')
223
+ # is_indoor = places365_guidance.get('is_indoor', None) # 未使用,可註釋
224
+ attributes = places365_guidance.get('attributes', [])
225
+
226
+ places365_scene_context = f"Scene identified by Places365 as {scene_label}" # 更新上下文描述
227
+
228
+ # Adjust CLIP analysis focus based on Places365 scene type
229
+ if mapped_scene in ['kitchen', 'dining_area', 'restaurant']:
230
+ places365_focus_areas.extend(['food preparation', 'dining setup', 'kitchen appliances'])
231
+ elif mapped_scene in ['office_workspace', 'educational_setting', 'library', 'conference_room']:
232
+ places365_focus_areas.extend(['work environment', 'professional setting', 'learning space', 'study area'])
233
+ elif mapped_scene in ['retail_store', 'shopping_mall', 'market', 'supermarket']: # 擴展匹配
234
+ places365_focus_areas.extend(['commercial space', 'shopping environment', 'retail display', 'goods for sale'])
235
+ elif mapped_scene in ['park_area', 'beach', 'natural_outdoor_area', 'playground', 'sports_field']: # 擴展匹配
236
+ places365_focus_areas.extend(['outdoor recreation', 'natural environment', 'leisure activity', 'open space'])
237
+
238
+ # Add more general focus areas based on the attributes
239
+ if isinstance(attributes, list): # make sure attributes is a list
240
+ if 'commercial' in attributes:
241
+ places365_focus_areas.append('business activity')
242
+ if 'recreational' in attributes:
243
+ places365_focus_areas.append('entertainment or leisure')
244
+ if 'residential' in attributes:
245
+ places365_focus_areas.append('living space')
246
+
247
+ # Deduplicate
248
+ places365_focus_areas = list(set(places365_focus_areas))
249
+
250
+ if places365_focus_areas: # 只有在確實有 focus areas 時才打印
251
+ print(f"CLIP analysis guided by Places365: {places365_scene_context}, focus areas: {places365_focus_areas}")
252
+
253
+ # Analyze the scene type, passing the enable_landmark flag and Places365 guidance
254
+ scene_scores = self._analyze_scene_type(image_features,
255
+ enable_landmark=self.enable_landmark, # 使用更新後的實例屬性
256
+ places365_focus=places365_focus_areas)
257
+
258
+ # If landmark detection is disabled, make sure landmark-related categories are excluded
259
+ current_exclude_categories = list(exclude_categories) if exclude_categories is not None else []
260
+ if not self.enable_landmark: # 使用更新後的實例屬性
261
+ landmark_related_terms = ["landmark", "monument", "tower", "tourist", "attraction", "historical", "famous", "iconic"]
262
+ for term in landmark_related_terms:
263
+ if term not in current_exclude_categories:
264
+ current_exclude_categories.append(term)
265
+
266
+ if current_exclude_categories:
267
+ filtered_scores = {}
268
+ for scene, score in scene_scores.items():
269
+ # Check whether the scene key (usually English) contains any excluded term
270
+ if not any(cat.lower() in scene.lower() for cat in current_exclude_categories):
271
+ filtered_scores[scene] = score
272
+
273
+ if filtered_scores:
274
+ total_score = sum(filtered_scores.values())
275
+ if total_score > 1e-5: # 避免除以零或非常小的數
276
+ scene_scores = {k: v / total_score for k, v in filtered_scores.items()}
277
+ else: # 如果總分趨近於0,則保持原樣或設為0
278
+ scene_scores = {k: 0.0 for k in filtered_scores.keys()} # 或者 scene_scores = filtered_scores
279
+ else: # 如果過濾後沒有場景了
280
+ scene_scores = {k: (0.0 if any(cat.lower() in k.lower() for cat in current_exclude_categories) else v) for k,v in scene_scores.items()}
281
+ if not any(s > 1e-5 for s in scene_scores.values()): # 如果還是全0
282
+ scene_scores = {"unknown": 1.0} # 給一個默認值避免空字典
283
 
 
284
  lighting_scores = self._analyze_lighting_condition(image_features)
 
 
285
  cultural_analysis = {}
286
+ if include_cultural_analysis and self.enable_landmark: # 使用更新後的實例屬性
287
+ for scene_type_cultural_key in self.text_features_cache.get("cultural_tokens_dict", {}).keys():
288
+ # Make sure scene_type_cultural_key is a key in SCENE_TYPE_PROMPTS, or that a mapping exists
289
+ if scene_type_cultural_key in scene_scores and scene_scores[scene_type_cultural_key] > 0.2:
290
+ cultural_analysis[scene_type_cultural_key] = self._analyze_cultural_scene(
291
+ image_features, scene_type_cultural_key
292
  )
293
 
294
  specialized_analysis = {}
295
+ for scene_type_specialized_key in self.text_features_cache.get("specialized_tokens_dict", {}).keys():
296
+ if scene_type_specialized_key in scene_scores and scene_scores[scene_type_specialized_key] > 0.2:
297
+ specialized_analysis[scene_type_specialized_key] = self._analyze_specialized_scene(
298
+ image_features, scene_type_specialized_key
299
  )
300
 
301
  viewpoint_scores = self._analyze_viewpoint(image_features)
 
302
  object_combination_scores = self._analyze_object_combinations(image_features)
 
303
  activity_scores = self._analyze_activities(image_features)
304
 
305
+ if scene_scores: # 確保 scene_scores 不是空的
306
+ top_scene = max(scene_scores.items(), key=lambda x: x[1])
307
+ # If landmarks are disabled, double-check that top_scene is not landmark-related
308
+ if not self.enable_landmark and any(cat.lower() in top_scene[0].lower() for cat in current_exclude_categories):
309
+ non_excluded_scores = {k:v for k,v in scene_scores.items() if not any(cat.lower() in k.lower() for cat in current_exclude_categories)}
310
+ if non_excluded_scores:
311
+ top_scene = max(non_excluded_scores.items(), key=lambda x: x[1])
312
+ else:
313
+ top_scene = ("unknown", 0.0) # 或其他合適的默認值
314
+ else:
315
+ top_scene = ("unknown", 0.0)
316
+
317
+
318
  result = {
319
  "scene_scores": scene_scores,
320
+ "top_scene": top_scene,
321
+ "lighting_condition": max(lighting_scores.items(), key=lambda x: x[1]) if lighting_scores else ("unknown", 0.0),
322
+ "embedding": image_features.cpu().numpy().tolist()[0], # 簡化
323
+ "viewpoint": max(viewpoint_scores.items(), key=lambda x: x[1]) if viewpoint_scores else ("unknown", 0.0),
324
+ "object_combinations": sorted(object_combination_scores.items(), key=lambda x: x[1], reverse=True)[:3] if object_combination_scores else [],
325
+ "activities": sorted(activity_scores.items(), key=lambda x: x[1], reverse=True)[:3] if activity_scores else []
326
  }
327
 
328
+ if places365_guidance and isinstance(places365_guidance, dict) and places365_focus_areas: # 檢查 places365_focus_areas 是否被填充
329
+ result["places365_guidance"] = {
330
+ "scene_context": places365_scene_context,
331
+ "focus_areas": places365_focus_areas, # 現在這個會包含基於 guidance 的內容
332
+ "guided_analysis": True,
333
+ "original_places365_scene": places365_guidance.get('scene_label', 'N/A'),
334
+ "original_places365_confidence": places365_guidance.get('confidence', 0.0)
335
+ }
336
+
337
+ if cultural_analysis and self.enable_landmark:
338
  result["cultural_analysis"] = cultural_analysis
339
 
340
  if specialized_analysis:
 
346
  print(f"Error analyzing image with CLIP: {e}")
347
  import traceback
348
  traceback.print_exc()
349
+ return {"error": str(e), "scene_scores": {}, "top_scene": ("error", 0.0)}
350
+
351
+ def _analyze_scene_type(self, image_features: torch.Tensor, enable_landmark: bool = True, places365_focus: List[str] = None) -> Dict[str, float]:
352
+ """
353
+ Compute the similarity between the image features and each scene type, optionally excluding landmark-related scenes
354
 
355
+ Args:
356
+ image_features: Image features encoded by CLIP
357
+ enable_landmark: Whether landmark recognition is enabled
358
+
359
+ Returns:
360
+ Dict[str, float]: Similarity scores for each scene type
361
+ """
362
  with torch.no_grad():
363
  # Compute scene-type text features
364
  text_features = self.model.encode_text(self.scene_type_tokens)
365
  text_features = text_features / text_features.norm(dim=-1, keepdim=True)
366
 
367
+ # Apply Places365 guidance if available
368
+ if places365_focus and len(places365_focus) > 0:
369
+ # Create enhanced prompts that incorporate Places365 guidance
370
+ enhanced_prompts = []
371
+ for scene_type in self.scene_type_prompts.keys():
372
+ base_prompt = self.scene_type_prompts[scene_type]
373
+
374
+ # Check if this scene type should be emphasized based on Places365 guidance
375
+ scene_lower = scene_type.lower()
376
+ should_enhance = False
377
+
378
+ for focus_area in places365_focus:
379
+ if any(keyword in scene_lower for keyword in focus_area.split()):
380
+ should_enhance = True
381
+ enhanced_prompts.append(f"{base_prompt} with {focus_area}")
382
+ break
383
+
384
+ if not should_enhance:
385
+ enhanced_prompts.append(base_prompt)
386
+
387
+ # Re-tokenize and encode enhanced prompts
388
+ enhanced_tokens = clip.tokenize(enhanced_prompts).to(self.device)
389
+ text_features = self.model.encode_text(enhanced_tokens)
390
+ text_features = text_features / text_features.norm(dim=-1, keepdim=True)
391
+
392
  # Compute similarity scores
393
  similarity = (100 * image_features @ text_features.T).softmax(dim=-1)
394
  similarity = similarity.cpu().numpy()[0] if self.device == "cuda" else similarity.numpy()[0]
 
396
  # Build the scene score dictionary
397
  scene_scores = {}
398
  for i, scene_type in enumerate(self.scene_type_prompts.keys()):
399
+ # If landmark detection is disabled, skip landmark-related scene types
400
+ if not enable_landmark and scene_type in ["tourist_landmark", "natural_landmark", "historical_monument"]:
401
+ scene_scores[scene_type] = 0.0 # 將地標場景分數設為零
402
+ else:
403
+ base_score = float(similarity[i])
404
+
405
+ # Apply Places365 guidance boost if applicable
406
+ if places365_focus:
407
+ scene_lower = scene_type.lower()
408
+ boost_factor = 1.0
409
+
410
+ for focus_area in places365_focus:
411
+ if any(keyword in scene_lower for keyword in focus_area.split()):
412
+ boost_factor = 1.15 # 15% boost for matching scenes
413
+ break
414
+
415
+ scene_scores[scene_type] = base_score * boost_factor
416
+ else:
417
+ scene_scores[scene_type] = base_score
418
+
419
+ # If landmark detection is disabled, renormalize the remaining scene scores
420
+ if not enable_landmark:
421
+ # Collect all non-zero scores
422
+ non_zero_scores = {k: v for k, v in scene_scores.items() if v > 0}
423
+ if non_zero_scores:
424
+ # Sum and normalize
425
+ total_score = sum(non_zero_scores.values())
426
+ if total_score > 0:
427
+ for scene_type in non_zero_scores:
428
+ scene_scores[scene_type] = non_zero_scores[scene_type] / total_score
429
 
430
  return scene_scores
431
 
 
633
  result[query] = float(similarity[i])
634
 
635
  return result
636
+
637
+ def get_clip_instance(self):
638
+ """
639
+ Return the initialized CLIP model instance so that other modules can reuse it
640
+
641
+ Returns:
642
+ tuple: (model instance, preprocess function, device name)
643
+ """
644
+ return self.model, self.preprocess, self.device
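get_clip_instance exposes the already loaded model so other modules can share it instead of loading a second copy. A minimal sketch of that reuse, stated as an assumption (the CLIPZeroShotClassifier added in this commit still loads its own model, so sharing would need a small change on the consumer side):

import torch
from clip_analyzer import CLIPAnalyzer

analyzer = CLIPAnalyzer()  # loads ViT-L/14 once
model, preprocess, device = analyzer.get_clip_instance()

def encode_image(pil_image):
    # Reuse the shared CLIP model to produce a normalized image embedding
    with torch.no_grad():
        tensor = preprocess(pil_image).unsqueeze(0).to(device)
        features = model.encode_image(tensor)
        return features / features.norm(dim=-1, keepdim=True)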
clip_prompts.py CHANGED
@@ -137,7 +137,7 @@ COMPARATIVE_PROMPTS = {
137
  "asian_vs_western_commercial": [
138
  "An Asian shopping street with vertical signage and compact multi-level shops.",
139
  "A Western commercial street with horizontal storefronts and wider sidewalks.",
140
- "An East Asian retail area with dense signage in Asian scripts and narrow walkways.",
141
  "A Western shopping district with uniform building heights and Latin alphabetic signs."
142
  ],
143
  "daytime_vs_nighttime": [
 
137
  "asian_vs_western_commercial": [
138
  "An Asian shopping street with vertical signage and compact multi-level shops.",
139
  "A Western commercial street with horizontal storefronts and wider sidewalks.",
140
+ "An East Asian retail area with dense signage in Asian scripts and narrow walkways."
141
  "A Western shopping district with uniform building heights and Latin alphabetic signs."
142
  ],
143
  "daytime_vs_nighttime": [
clip_zero_shot_classifier.py ADDED
@@ -0,0 +1,1415 @@
1
+
2
+ import torch
3
+ import clip
4
+ from PIL import Image
5
+ import numpy as np
6
+ from typing import List, Dict, Tuple, Optional, Union, Any
7
+
8
+ from landmark_data import ALL_LANDMARKS, get_all_landmark_prompts
9
+
10
+ class CLIPZeroShotClassifier:
11
+ """
12
+ Zero-shot classification with the CLIP model, focused on recognizing world-famous landmarks.
13
+ Complements YOLO detection by handling landmark buildings that standard object detection cannot recognize.
14
+ """
15
+ def __init__(self, model_name: str = "ViT-L/14", device: str = None):
16
+ """
17
+ Initialize the CLIP zero-shot classifier
18
+
19
+ Args:
20
+ model_name: CLIP model name, defaults to "ViT-L/14"
21
+ device: Runtime device; None selects automatically
22
+ """
23
+ # Select the runtime device
24
+ if device is None:
25
+ self.device = "cuda" if torch.cuda.is_available() else "cpu"
26
+ else:
27
+ self.device = device
28
+
29
+ print(f"Initializing CLIP Zero-Shot Landmark Classifier ({model_name}) on {self.device}")
30
+ try:
31
+ self.model, self.preprocess = clip.load(model_name, device=self.device)
32
+ print(f"Successfully loaded CLIP model")
33
+ except Exception as e:
34
+ print(f"Error loading CLIP model: {e}")
35
+ raise
36
+
37
+ # Load landmark data
38
+ try:
39
+ self.landmark_data = ALL_LANDMARKS
40
+ self.landmark_prompts = get_all_landmark_prompts()
41
+ print(f"Loaded {len(self.landmark_prompts)} landmark prompts for classification")
42
+
43
+ # Precompute landmark text features
44
+ self.landmark_text_features = self._precompute_text_features(self.landmark_prompts)
45
+
46
+ # Map landmark IDs to indices for fast lookup
47
+ self.landmark_id_to_index = {landmark_id: i for i, landmark_id in enumerate(ALL_LANDMARKS.keys())}
48
+
49
+ # Initialize batch-processing parameters
50
+ self.batch_size = 16 # default batch size
51
+ self.confidence_threshold_multipliers = {
52
+ "close_up": 0.9, # 近景標準閾值
53
+ "partial": 0.6, # 部分可見降低閾值要求
54
+ "distant": 0.5, # 遠景更低閾值要求
55
+ "full_image": 0.7 # 整張圖像需要更高閾值
56
+ }
57
+
58
+ self.landmark_type_thresholds = {
59
+ "tower": 0.5, # 塔型建築需要更高閾值
60
+ "skyscraper": 0.4, # 摩天大樓使用較低閾值
61
+ "building": 0.55, # 一般建築物閾值略微降低
62
+ "monument": 0.5, # 紀念碑閾值
63
+ "natural": 0.6 # 自然地標可以使用較低閾值
64
+ }
65
+
66
+ # Initialize the results cache
67
+ self.results_cache = {} # keyed by image hash
68
+ self.cache_max_size = 100 # maximum number of cached entries
69
+
70
+ except ImportError:
71
+ print("Warning: landmark_data.py not found. Landmark classification will be limited")
72
+ self.landmark_data = {}
73
+ self.landmark_prompts = []
74
+ self.landmark_text_features = None
75
+ self.landmark_id_to_index = {}
76
+ self.results_cache = {}
77
+
78
+ def _get_image_hash(self, image):
79
+ """
80
+ Generate a simple hash of the image, used for caching
81
+
82
+ Args:
83
+ image: PIL Image or numpy array
84
+
85
+ Returns:
86
+ str: Hash value of the image
87
+ """
88
+ if isinstance(image, np.ndarray):
89
+ # For numpy arrays, downsample and compute a simple hash
90
+ small_img = image[::10, ::10] if image.ndim == 3 else image
91
+ return hash(small_img.tobytes())
92
+ else:
93
+ # For PIL images, resize and then convert to bytes
94
+ small_img = image.resize((32, 32))
95
+ return hash(small_img.tobytes())
96
+
97
+ def _manage_cache(self):
98
+ """
99
+ Keep the results cache within its size limit
100
+ """
101
+ if len(self.results_cache) > self.cache_max_size:
102
+ oldest_key = next(iter(self.results_cache))
103
+ del self.results_cache[oldest_key]
104
+
105
+ def set_batch_size(self, batch_size: int):
106
+ """
107
+ Set the batch size
108
+
109
+ Args:
110
+ batch_size: New batch size
111
+ """
112
+ self.batch_size = max(1, batch_size)
113
+ print(f"Batch size set to {self.batch_size}")
114
+
115
+
116
+ def adjust_confidence_threshold(self, detection_type: str, multiplier: float):
117
+ """
118
+ Adjust the confidence-threshold multiplier for a specific detection type
119
+
120
+ Args:
121
+ detection_type: Detection type ('close_up', 'partial', 'distant', 'full_image')
122
+ multiplier: Confidence-threshold multiplier
123
+ """
124
+ if detection_type in self.confidence_threshold_multipliers:
125
+ self.confidence_threshold_multipliers[detection_type] = max(0.1, min(1.5, multiplier))
126
+ print(f"Adjusted confidence threshold multiplier for {detection_type} to {multiplier}")
127
+ else:
128
+ print(f"Unknown detection type: {detection_type}")
129
+
130
+
131
+ def _precompute_text_features(self, text_prompts: List[str]) -> torch.Tensor:
132
+ """
133
+ Precompute CLIP features for the text prompts to improve batch efficiency
134
+
135
+ Args:
136
+ text_prompts: List of text prompts
137
+
138
+ Returns:
139
+ torch.Tensor: Precomputed text features
140
+ """
141
+ if not text_prompts:
142
+ return None
143
+
144
+ with torch.no_grad():
145
+ # Process in batches to avoid CUDA memory issues
146
+ batch_size = 128 # Adjust based on GPU memory
147
+ features_list = []
148
+
149
+ for i in range(0, len(text_prompts), batch_size):
150
+ batch_prompts = text_prompts[i:i+batch_size]
151
+ text_tokens = clip.tokenize(batch_prompts).to(self.device)
152
+ batch_features = self.model.encode_text(text_tokens)
153
+ batch_features = batch_features / batch_features.norm(dim=-1, keepdim=True)
154
+ features_list.append(batch_features)
155
+
156
+ # Concatenate all batches
157
+ if len(features_list) > 1:
158
+ text_features = torch.cat(features_list, dim=0)
159
+ else:
160
+ text_features = features_list[0]
161
+
162
+ return text_features
163
+
164
+ def _perform_pyramid_analysis(self,
165
+ image: Union[Image.Image, np.ndarray],
166
+ levels: int = 4,
167
+ base_threshold: float = 0.25,
168
+ aspect_ratios: List[float] = [1.0, 0.75, 1.5]) -> Dict[str, Any]:
169
+ """
170
+ Performs multi-scale pyramid analysis on the image to improve landmark detection.
171
+
172
+ Args:
173
+ image: Input image
174
+ levels: Number of pyramid levels
175
+ base_threshold: Base confidence threshold
176
+ aspect_ratios: Different aspect ratios to try (for tall buildings vs wide landscapes)
177
+
178
+ Returns:
179
+ Dict: Results of pyramid analysis
180
+ """
181
+ # Ensure image is PIL format
182
+ if not isinstance(image, Image.Image):
183
+ if isinstance(image, np.ndarray):
184
+ image = Image.fromarray(image)
185
+ else:
186
+ raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
187
+
188
+ width, height = image.size
189
+ pyramid_results = []
190
+
191
+ # Process each scale / aspect-ratio combination
192
+ for level in range(levels):
193
+ # 計算縮放因子
194
+ scale_factor = 1.0 - (level * 0.2)
195
+
196
+ for aspect_ratio in aspect_ratios:
197
+ # Compute the new size, keeping the area approximately constant
198
+ if aspect_ratio != 1.0:
199
+ # Adjust the aspect ratio while keeping the area approximately constant
200
+ new_width = int(width * scale_factor * (1/aspect_ratio)**0.5)
201
+ new_height = int(height * scale_factor * aspect_ratio**0.5)
202
+ else:
203
+ new_width = int(width * scale_factor)
204
+ new_height = int(height * scale_factor)
205
+
206
+ # 調整圖像大小
207
+ scaled_image = image.resize((new_width, new_height), Image.LANCZOS)
208
+
209
+ # 預處理圖像
210
+ image_input = self.preprocess(scaled_image).unsqueeze(0).to(self.device)
211
+
212
+ # 獲取圖像特徵
213
+ with torch.no_grad():
214
+ image_features = self.model.encode_image(image_input)
215
+ image_features = image_features / image_features.norm(dim=-1, keepdim=True)
216
+
217
+ # 計算相似度
218
+ similarity = (100.0 * image_features @ self.landmark_text_features.T).softmax(dim=-1)
219
+ similarity = similarity.cpu().numpy()[0] if self.device == "cuda" else similarity.numpy()[0]
220
+
221
+ # Find the best match
222
+ best_idx = similarity.argmax().item()
223
+ best_score = similarity[best_idx]
224
+
225
+ if best_score >= base_threshold:
226
+ landmark_id = list(self.landmark_data.keys())[best_idx]
227
+ landmark_info = self.landmark_data[landmark_id]
228
+
229
+ pyramid_results.append({
230
+ "landmark_id": landmark_id,
231
+ "landmark_name": landmark_info["name"],
232
+ "confidence": float(best_score),
233
+ "scale_factor": scale_factor,
234
+ "aspect_ratio": aspect_ratio,
235
+ "location": landmark_info["location"]
236
+ })
237
+
238
+ # Sort by confidence
239
+ pyramid_results.sort(key=lambda x: x["confidence"], reverse=True)
240
+
241
+ return {
242
+ "is_landmark": len(pyramid_results) > 0,
243
+ "results": pyramid_results,
244
+ "best_result": pyramid_results[0] if pyramid_results else None
245
+ }
246
+
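A usage sketch (not part of the commit) for the pyramid analysis above, assuming clf is an initialized classifier instance and the image path is a placeholder:

from PIL import Image

img = Image.open("landmark_photo.jpg")     # placeholder path
pyramid = clf._perform_pyramid_analysis(img, levels=4, base_threshold=0.25)
if pyramid["is_landmark"]:
    best = pyramid["best_result"]
    print(best["landmark_name"], best["confidence"], best["scale_factor"])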
247
+ def _enhance_features(self, image: Union[Image.Image, np.ndarray]) -> Image.Image:
248
+ """
249
+ Enhances image features to improve landmark detection.
250
+
251
+ Args:
252
+ image: Input image
253
+
254
+ Returns:
255
+ PIL.Image: Enhanced image
256
+ """
257
+ # Ensure image is PIL format
258
+ if not isinstance(image, Image.Image):
259
+ if isinstance(image, np.ndarray):
260
+ image = Image.fromarray(image)
261
+ else:
262
+ raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
263
+
264
+ # Convert to numpy for processing
265
+ img_array = np.array(image)
266
+
267
+ # Skip processing for grayscale images
268
+ if len(img_array.shape) < 3:
269
+ return image
270
+
271
+ # Apply adaptive contrast enhancement
272
+ # Convert to LAB color space
273
+ from skimage import color, exposure
274
+ try:
275
+ # Convert to LAB color space
276
+ if img_array.shape[2] == 4: # Handle RGBA
277
+ img_array = img_array[:,:,:3]
278
+
279
+ lab = color.rgb2lab(img_array[:,:,:3] / 255.0)
280
+ l_channel = lab[:,:,0]
281
+
282
+ # Enhance contrast of L channel
283
+ p2, p98 = np.percentile(l_channel, (2, 98))
284
+ l_channel_enhanced = exposure.rescale_intensity(l_channel, in_range=(p2, p98))
285
+
286
+ # Replace L channel and convert back to RGB
287
+ lab[:,:,0] = l_channel_enhanced
288
+ enhanced_img = color.lab2rgb(lab) * 255.0
289
+ enhanced_img = enhanced_img.astype(np.uint8)
290
+
291
+ return Image.fromarray(enhanced_img)
292
+ except ImportError:
293
+ print("Warning: skimage not available for feature enhancement")
294
+ return image
295
+ except Exception as e:
296
+ print(f"Error in feature enhancement: {e}")
297
+ return image
298
+
299
+ def _determine_landmark_type(self, landmark_id):
300
+ """
301
+ Automatically determine the landmark type from the landmark data and its naming
302
+
303
+ Returns:
304
+ str: landmark type, used for threshold adjustment
305
+ """
306
+ if not landmark_id:
307
+ return "building" # 預設類型
308
+
309
+ # 獲取地標詳細數據
310
+ landmark_data = self.landmark_data if hasattr(self, 'landmark_data') else {}
311
+ landmark_info = landmark_data.get(landmark_id, {})
312
+
313
+ # 獲取地標相關文本
314
+ landmark_id_lower = landmark_id.lower()
315
+ landmark_name = landmark_info.get("name", "").lower()
316
+ landmark_location = landmark_info.get("location", "").lower()
317
+ landmark_aliases = [alias.lower() for alias in landmark_info.get("aliases", [])]
318
+
319
+ # 合併所有文本數據用於特徵判斷
320
+ combined_text = " ".join([landmark_id_lower, landmark_name] + landmark_aliases)
321
+
322
+ # Characteristic keywords for each landmark type
323
+ type_features = {
324
+ "skyscraper": ["skyscraper", "tall", "tower", "高樓", "摩天", "大厦", "タワー"],
325
+ "tower": ["tower", "bell", "clock", "塔", "鐘樓", "タワー", "campanile"],
326
+ "monument": ["monument", "memorial", "statue", "紀念", "雕像", "像", "memorial"],
327
+ "natural": ["mountain", "lake", "canyon", "falls", "beach", "山", "湖", "峽谷", "瀑布", "海灘"],
328
+ "temple": ["temple", "shrine", "寺", "神社", "廟"],
329
+ "palace": ["palace", "castle", "宮", "城", "皇宮", "宫殿"],
330
+ "distinctive": ["unique", "leaning", "slanted", "傾斜", "斜", "獨特", "傾く"]
331
+ }
332
+
333
+ # Check whether the landmark is located in an Asian region
334
+ asian_regions = ["china", "japan", "korea", "taiwan", "singapore", "vietnam", "thailand",
335
+ "hong kong", "中國", "日本", "韓國", "台灣", "新加坡", "越南", "泰國", "香港"]
336
+ is_asian = any(region in landmark_location for region in asian_regions)
337
+
338
+ # 判斷地標類型
339
+ best_type = None
340
+ max_matches = 0
341
+
342
+ for type_name, features in type_features.items():
343
+ # 計算特徵詞匹配數量
344
+ matches = sum(1 for feature in features if feature in combined_text)
345
+ if matches > max_matches:
346
+ max_matches = matches
347
+ best_type = type_name
348
+
349
+ # Handle the special case of Asian regions
350
+ if is_asian and best_type == "tower":
351
+ best_type = "skyscraper" # tower-like buildings in Asian regions use the lower skyscraper threshold
352
+
353
+ # Special case: detect leaning structures
354
+ if any(term in combined_text for term in ["leaning", "slanted", "tilt", "inclined", "斜", "傾斜"]):
355
+ return "distinctive" # 傾斜建築需要特殊處理
356
+
357
+ return best_type if best_type and max_matches > 0 else "building" # default to a generic building
358
+
359
+ def classify_image_region(self,
360
+ image: Union[Image.Image, np.ndarray],
361
+ box: List[float],
362
+ threshold: float = 0.25,
363
+ detection_type: str = "close_up") -> Dict[str, Any]:
364
+ """
365
+ Classify a landmark within a specific region of the image, with enhanced multi-scale and partial-view recognition
366
+
367
+ Args:
368
+ image: original image (PIL Image or numpy array)
369
+ box: bounding box [x1, y1, x2, y2]
370
+ threshold: base classification confidence threshold
371
+ detection_type: detection type, which affects the confidence adjustment
372
+
373
+ Returns:
374
+ Dict: landmark classification result
375
+ """
376
+ # Ensure the image is in PIL format
377
+ if not isinstance(image, Image.Image):
378
+ if isinstance(image, np.ndarray):
379
+ image = Image.fromarray(image)
380
+ else:
381
+ raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
382
+
383
+ # Generate a hash for this image region to use as the cache key
384
+ region_key = (self._get_image_hash(image), tuple(box), detection_type)
385
+ if region_key in self.results_cache:
386
+ return self.results_cache[region_key]
387
+
388
+ # 裁剪區域
389
+ x1, y1, x2, y2 = map(int, box)
390
+ cropped_image = image.crop((x1, y1, x2, y2))
391
+ enhanced_image = self._enhance_features(cropped_image)
392
+
393
+ # 分析視角信息
394
+ viewpoint_info = self._analyze_viewpoint(enhanced_image)
395
+ dominant_viewpoint = viewpoint_info["dominant_viewpoint"]
396
+
397
+ # 計算區域信息
398
+ region_width = x2 - x1
399
+ region_height = y2 - y1
400
+ image_width, image_height = image.size
401
+
402
+ # Infer the likely detection type from the region size
403
+ region_area_ratio = (region_width * region_height) / (image_width * image_height)
404
+ if detection_type == "auto":
405
+ if region_area_ratio > 0.5:
406
+ detection_type = "close_up"
407
+ elif region_area_ratio > 0.2:
408
+ detection_type = "partial"
409
+ else:
410
+ detection_type = "distant"
411
+
412
+ # Adjust the detection type based on the viewpoint
413
+ if dominant_viewpoint == "close_up" and detection_type != "close_up":
414
+ detection_type = "close_up"
415
+ elif dominant_viewpoint == "distant" and detection_type != "distant":
416
+ detection_type = "distant"
417
+ elif dominant_viewpoint == "angled_view":
418
+ detection_type = "partial" # 角度視圖可能是部分可見
419
+
420
+ # Adjust the confidence threshold
421
+ base_multiplier = self.confidence_threshold_multipliers.get(detection_type, 1.0)
422
+ adjusted_threshold = threshold * base_multiplier
423
+
424
+ # Adjust the scale range and aspect ratios for multi-scale processing, improving support for leaning structures
425
+ scales = [1.0] # default scale
426
+
427
+ # Choose suitable scales and aspect ratios based on the viewpoint
428
+ if detection_type in ["partial", "distant"]:
429
+ scales = [0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3] # standard range
430
+
431
+ # For special viewpoints, widen the scale range further
432
+ if dominant_viewpoint in ["angled_view", "low_angle"]:
433
+ scales = [0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4] # wider range
434
+
435
+ # Prepare aspect ratios - supports both wide and tall landmarks
436
+ aspect_ratios = [1.0, 0.8, 1.2] # standard aspect ratios
437
+
438
+ # Add more aspect ratios for potentially leaning structures
439
+ if dominant_viewpoint in ["angled_view", "unique_feature"]:
440
+ aspect_ratios = [0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.5] # more varied aspect ratios
441
+
442
+ best_result = {
443
+ "landmark_id": None,
444
+ "landmark_name": None,
445
+ "confidence": 0.0,
446
+ "is_landmark": False
447
+ }
448
+
449
+ # 多尺度和縱橫比分析
450
+ for scale in scales:
451
+ for aspect_ratio in aspect_ratios:
452
+ # 縮放裁剪區域
453
+ current_width, current_height = cropped_image.size
454
+
455
+ # Compute the new size, keeping area constant while adjusting the aspect ratio
456
+ if aspect_ratio != 1.0:
457
+ new_width = int(current_width * scale * (1/aspect_ratio)**0.5)
458
+ new_height = int(current_height * scale * aspect_ratio**0.5)
459
+ else:
460
+ new_width = int(current_width * scale)
461
+ new_height = int(current_height * scale)
462
+
463
+ # 確保尺寸至少為1像素
464
+ new_width = max(1, new_width)
465
+ new_height = max(1, new_height)
466
+
467
+ # 縮放圖像
468
+ try:
469
+ scaled_image = cropped_image.resize((new_width, new_height), Image.LANCZOS)
470
+ except Exception as e:
471
+ print(f"Failed to resize image to {new_width}x{new_height}: {e}")
472
+ continue
473
+
474
+ # 預處理裁剪圖像
475
+ try:
476
+ image_input = self.preprocess(scaled_image).unsqueeze(0).to(self.device)
477
+ except Exception as e:
478
+ print(f"Failed to preprocess image: {e}")
479
+ continue
480
+
481
+ # 獲取圖像特徵
482
+ with torch.no_grad():
483
+ try:
484
+ image_features = self.model.encode_image(image_input)
485
+ image_features = image_features / image_features.norm(dim=-1, keepdim=True)
486
+
487
+ # 計算與地標提示的相似度
488
+ similarity = (100.0 * image_features @ self.landmark_text_features.T).softmax(dim=-1)
489
+ similarity = similarity.cpu().numpy()[0] if self.device == "cuda" else similarity.numpy()[0]
490
+
491
+ # 找到最佳匹配
492
+ best_idx = similarity.argmax().item()
493
+ best_score = similarity[best_idx]
494
+
495
+ # 如果當前尺度結果更好,則更新
496
+ if best_score > best_result["confidence"]:
497
+ landmark_id = list(self.landmark_data.keys())[best_idx]
498
+ landmark_info = self.landmark_data[landmark_id]
499
+
500
+ best_result = {
501
+ "landmark_id": landmark_id,
502
+ "landmark_name": landmark_info["name"],
503
+ "location": landmark_info["location"],
504
+ "confidence": float(best_score),
505
+ "is_landmark": best_score >= adjusted_threshold,
506
+ "scale_used": scale,
507
+ "aspect_ratio_used": aspect_ratio,
508
+ "viewpoint": dominant_viewpoint
509
+ }
510
+
511
+ # Add any extra available information
512
+ for key in ["year_built", "architectural_style", "significance"]:
513
+ if key in landmark_info:
514
+ best_result[key] = landmark_info[key]
515
+ except Exception as e:
516
+ print(f"Error in calculating similarity: {e}")
517
+ continue
518
+
519
+ # Apply the landmark-type threshold adjustment only when a landmark ID was identified with high enough confidence
520
+ if best_result["landmark_id"]:
521
+ landmark_type = self._determine_landmark_type(best_result["landmark_id"])
522
+
523
+ # Check for distinctive structure types such as leaning towers
524
+ if landmark_type == "distinctive":
525
+ # Lower the threshold by 25% for distinctive structures
526
+ type_multiplier = 0.75
527
+ else:
528
+ # Use the existing type-specific threshold
529
+ type_multiplier = self.landmark_type_thresholds.get(landmark_type, 1.0) / 0.5
530
+
531
+ # Update the criterion for deciding whether this is a landmark
532
+ final_threshold = adjusted_threshold * type_multiplier
533
+ best_result["is_landmark"] = best_result["confidence"] >= final_threshold
534
+ best_result["landmark_type"] = landmark_type # record the landmark type
535
+ best_result["threshold_applied"] = final_threshold # record the threshold that was applied
536
+
537
+ # Cache the result
538
+ self.results_cache[region_key] = best_result
539
+ self._manage_cache()
540
+
541
+ return best_result
542
+
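A usage sketch for region-level classification, assuming clf is an initialized classifier instance and img is a PIL image as above; the bounding box values are hypothetical:

box = [120, 80, 480, 360]                  # illustrative [x1, y1, x2, y2] in pixels
region = clf.classify_image_region(img, box, threshold=0.25, detection_type="auto")
if region["is_landmark"]:
    print(region["landmark_name"], region["confidence"], region.get("landmark_type"))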
543
+ def classify_batch_regions(self,
544
+ image: Union[Image.Image, np.ndarray],
545
+ boxes: List[List[float]],
546
+ threshold: float = 0.28) -> List[Dict[str, Any]]:
547
+ """
548
+ Process multiple image regions in one batch for efficiency
549
+
550
+ Args:
551
+ image: original image
552
+ boxes: list of bounding boxes
553
+ threshold: confidence threshold
554
+
555
+ Returns:
556
+ List[Dict]: list of classification results
557
+ """
558
+ if self.landmark_text_features is None:
559
+ return [{"is_landmark": False, "confidence": 0.0} for _ in boxes]
560
+
561
+ # 確保圖像是PIL格式
562
+ if not isinstance(image, Image.Image):
563
+ if isinstance(image, np.ndarray):
564
+ image = Image.fromarray(image)
565
+ else:
566
+ raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
567
+
568
+ # Nothing to do when no boxes are provided
569
+ if not boxes:
570
+ return []
571
+
572
+ # Crop and preprocess every region
573
+ cropped_inputs = []
574
+ for box in boxes:
575
+ x1, y1, x2, y2 = map(int, box)
576
+ cropped_image = image.crop((x1, y1, x2, y2))
577
+ processed_image = self.preprocess(cropped_image).unsqueeze(0)
578
+ cropped_inputs.append(processed_image)
579
+
580
+ # batch process
581
+ batch_tensor = torch.cat(cropped_inputs).to(self.device)
582
+
583
+ # batch encoding
584
+ with torch.no_grad():
585
+ image_features = self.model.encode_image(batch_tensor)
586
+ image_features = image_features / image_features.norm(dim=-1, keepdim=True)
587
+
588
+ # 計算相似度
589
+ similarity = (100.0 * image_features @ self.landmark_text_features.T).softmax(dim=-1)
590
+ similarity = similarity.cpu().numpy() if self.device == "cuda" else similarity.numpy()
591
+
592
+ # 處理每個區域的結果
593
+ results = []
594
+ for i, sim in enumerate(similarity):
595
+ best_idx = sim.argmax().item()
596
+ best_score = sim[best_idx]
597
+
598
+ if best_score >= threshold:
599
+ landmark_id = list(self.landmark_data.keys())[best_idx]
600
+ landmark_info = self.landmark_data[landmark_id]
601
+
602
+ results.append({
603
+ "landmark_id": landmark_id,
604
+ "landmark_name": landmark_info["name"],
605
+ "location": landmark_info["location"],
606
+ "confidence": float(best_score),
607
+ "is_landmark": True,
608
+ "box": boxes[i]
609
+ })
610
+ else:
611
+ results.append({
612
+ "landmark_id": None,
613
+ "landmark_name": None,
614
+ "confidence": float(best_score),
615
+ "is_landmark": False,
616
+ "box": boxes[i]
617
+ })
618
+
619
+ return results
620
+
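A usage sketch for the batched variant, under the same assumptions (clf, img); the boxes are illustrative:

boxes = [[0, 0, 320, 240], [320, 0, 640, 240]]
for r in clf.classify_batch_regions(img, boxes, threshold=0.28):
    print(r["box"], r["is_landmark"], r["confidence"])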
621
+ def search_entire_image(self,
622
+ image: Union[Image.Image, np.ndarray],
623
+ threshold: float = 0.35,
624
+ detailed_analysis: bool = False) -> Dict[str, Any]:
625
+ """
626
+ Check whether the entire image contains a landmark, with enhanced analysis capabilities
627
+
628
+ Args:
629
+ image: original image
630
+ threshold: confidence threshold
631
+ detailed_analysis: whether to run detailed analysis, including multi-region detection
632
+
633
+ Returns:
634
+ Dict: landmark classification result
635
+ """
636
+ # 確保圖像是PIL格式
637
+ if not isinstance(image, Image.Image):
638
+ if isinstance(image, np.ndarray):
639
+ image = Image.fromarray(image)
640
+ else:
641
+ raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
642
+
643
+ # Check the cache
644
+ image_key = (self._get_image_hash(image), "entire_image", detailed_analysis)
645
+ if image_key in self.results_cache:
646
+ return self.results_cache[image_key]
647
+
648
+ # 調整閾值
649
+ adjusted_threshold = threshold * self.confidence_threshold_multipliers.get("full_image", 1.0)
650
+
651
+ # 預處理圖像
652
+ image_input = self.preprocess(image).unsqueeze(0).to(self.device)
653
+
654
+ # 獲取圖像特徵
655
+ with torch.no_grad():
656
+ image_features = self.model.encode_image(image_input)
657
+ image_features = image_features / image_features.norm(dim=-1, keepdim=True)
658
+
659
+ # 計算與地標提示的相似度
660
+ similarity = (100.0 * image_features @ self.landmark_text_features.T).softmax(dim=-1)
661
+ similarity = similarity.cpu().numpy()[0] if self.device == "cuda" else similarity.numpy()[0]
662
+
663
+ # 找到最佳匹配
664
+ best_idx = similarity.argmax().item()
665
+ best_score = similarity[best_idx]
666
+
667
+ # top3 landmark
668
+ top_indices = similarity.argsort()[-3:][::-1]
669
+ top_landmarks = []
670
+
671
+ for idx in top_indices:
672
+ score = similarity[idx]
673
+ landmark_id = list(self.landmark_data.keys())[idx]
674
+ landmark_info = self.landmark_data[landmark_id]
675
+
676
+ landmark_result = {
677
+ "landmark_id": landmark_id,
678
+ "landmark_name": landmark_info["name"],
679
+ "location": landmark_info["location"],
680
+ "confidence": float(score)
681
+ }
682
+
683
+ # 添加額外可用信息
684
+ if "year_built" in landmark_info:
685
+ landmark_result["year_built"] = landmark_info["year_built"]
686
+ if "architectural_style" in landmark_info:
687
+ landmark_result["architectural_style"] = landmark_info["architectural_style"]
688
+ if "significance" in landmark_info:
689
+ landmark_result["significance"] = landmark_info["significance"]
690
+
691
+ top_landmarks.append(landmark_result)
692
+
693
+ # main result
694
+ result = {}
695
+ if best_score >= adjusted_threshold:
696
+ landmark_id = list(self.landmark_data.keys())[best_idx]
697
+ landmark_info = self.landmark_data[landmark_id]
698
+
699
+ # Apply the landmark-type-specific threshold
700
+ landmark_type = self._determine_landmark_type(landmark_id)
701
+ type_multiplier = self.landmark_type_thresholds.get(landmark_type, 1.0) / 0.5
702
+ final_threshold = adjusted_threshold * type_multiplier
703
+
704
+ if best_score >= final_threshold:
705
+ result = {
706
+ "landmark_id": landmark_id,
707
+ "landmark_name": landmark_info["name"],
708
+ "location": landmark_info["location"],
709
+ "confidence": float(best_score),
710
+ "is_landmark": True,
711
+ "landmark_type": landmark_type,
712
+ "top_landmarks": top_landmarks
713
+ }
714
+
715
+ # 添加額外可用信息
716
+ if "year_built" in landmark_info:
717
+ result["year_built"] = landmark_info["year_built"]
718
+ if "architectural_style" in landmark_info:
719
+ result["architectural_style"] = landmark_info["architectural_style"]
720
+ if "significance" in landmark_info:
721
+ result["significance"] = landmark_info["significance"]
722
+ else:
723
+ result = {
724
+ "landmark_id": None,
725
+ "landmark_name": None,
726
+ "confidence": float(best_score),
727
+ "is_landmark": False,
728
+ "top_landmarks": top_landmarks
729
+ }
730
+
731
+ # If detailed analysis is requested and this is a landmark, analyse image regions further
732
+ if detailed_analysis and result.get("is_landmark", False):
733
+ # Create several regions for deeper analysis
734
+ width, height = image.size
735
+ regions = [
736
+ # Center region
737
+ [width * 0.25, height * 0.25, width * 0.75, height * 0.75],
738
+ # Left half
739
+ [0, 0, width * 0.5, height],
740
+ # Right half
741
+ [width * 0.5, 0, width, height],
742
+ # Top half
743
+ [0, 0, width, height * 0.5],
744
+ # Bottom half
745
+ [0, height * 0.5, width, height]
746
+ ]
747
+
748
+ region_results = []
749
+ for i, box in enumerate(regions):
750
+ region_result = self.classify_image_region(
751
+ image,
752
+ box,
753
+ threshold=threshold * 0.9,
754
+ detection_type="partial"
755
+ )
756
+ if region_result["is_landmark"]:
757
+ region_result["region_name"] = ["center", "left", "right", "top", "bottom"][i]
758
+ region_results.append(region_result)
759
+
760
+ # 添加區域分析結果
761
+ if region_results:
762
+ result["region_analyses"] = region_results
763
+
764
+ # Cache the result
765
+ self.results_cache[image_key] = result
766
+ self._manage_cache()
767
+
768
+ return result
769
+
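A usage sketch for whole-image search, same assumptions as above (clf, img):

scene = clf.search_entire_image(img, threshold=0.35, detailed_analysis=True)
print(scene.get("is_landmark"), scene.get("landmark_name"))
for candidate in scene.get("top_landmarks", []):
    print(candidate["landmark_name"], candidate["confidence"])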
770
+ def enhanced_landmark_detection(self,
771
+ image: Union[Image.Image, np.ndarray],
772
+ threshold: float = 0.3) -> Dict[str, Any]:
773
+ """
774
+ Enhanced landmark detection using multiple analysis techniques.
775
+
776
+ Args:
777
+ image: Input image
778
+ threshold: Base confidence threshold
779
+
780
+ Returns:
781
+ Dict: Comprehensive landmark detection results
782
+ """
783
+ # Ensure image is PIL format
784
+ if not isinstance(image, Image.Image):
785
+ if isinstance(image, np.ndarray):
786
+ image = Image.fromarray(image)
787
+ else:
788
+ raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
789
+
790
+ # Phase 1: Analyze viewpoint to adjust detection parameters
791
+ viewpoint_info = self._analyze_viewpoint(image)
792
+ viewpoint = viewpoint_info["dominant_viewpoint"]
793
+
794
+ # Adjust threshold based on viewpoint
795
+ if viewpoint == "distant":
796
+ adjusted_threshold = threshold * 0.7 # Lower threshold for distant views
797
+ elif viewpoint == "close_up":
798
+ adjusted_threshold = threshold * 1.1 # Higher threshold for close-ups
799
+ else:
800
+ adjusted_threshold = threshold
801
+
802
+ # Phase 2: Perform multi-scale pyramid analysis
803
+ pyramid_results = self._perform_pyramid_analysis(image, levels=3, base_threshold=adjusted_threshold)
804
+
805
+ # Phase 3: Perform grid-based region analysis
806
+ grid_results = []
807
+ width, height = image.size
808
+
809
+ # Create adaptive grid based on viewpoint
810
+ if viewpoint == "distant":
811
+ grid_size = 3 # Coarser grid for distant views
812
+ elif viewpoint == "close_up":
813
+ grid_size = 5 # Finer grid for close-ups
814
+ else:
815
+ grid_size = 4 # Default grid size
816
+
817
+ # Generate grid regions
818
+ for i in range(grid_size):
819
+ for j in range(grid_size):
820
+ box = [
821
+ width * (j/grid_size),
822
+ height * (i/grid_size),
823
+ width * ((j+1)/grid_size),
824
+ height * ((i+1)/grid_size)
825
+ ]
826
+
827
+ # Apply feature enhancement
828
+ region_result = self.classify_image_region(
829
+ image,
830
+ box,
831
+ threshold=adjusted_threshold,
832
+ detection_type="auto"
833
+ )
834
+
835
+ if region_result["is_landmark"]:
836
+ region_result["grid_position"] = (i, j)
837
+ grid_results.append(region_result)
838
+
839
+ # Phase 4: Cross-validate and combine results
840
+ all_detections = []
841
+
842
+ # Add pyramid results
843
+ if pyramid_results["is_landmark"] and pyramid_results["best_result"]:
844
+ all_detections.append({
845
+ "source": "pyramid",
846
+ "landmark_id": pyramid_results["best_result"]["landmark_id"],
847
+ "landmark_name": pyramid_results["best_result"]["landmark_name"],
848
+ "confidence": pyramid_results["best_result"]["confidence"],
849
+ "scale_factor": pyramid_results["best_result"].get("scale_factor", 1.0)
850
+ })
851
+
852
+ # Add grid results
853
+ for result in grid_results:
854
+ all_detections.append({
855
+ "source": "grid",
856
+ "landmark_id": result["landmark_id"],
857
+ "landmark_name": result["landmark_name"],
858
+ "confidence": result["confidence"],
859
+ "grid_position": result.get("grid_position", (0, 0))
860
+ })
861
+
862
+ # Search entire image
863
+ full_image_result = self.search_entire_image(image, threshold=adjusted_threshold)
864
+ if full_image_result and full_image_result.get("is_landmark", False):
865
+ all_detections.append({
866
+ "source": "full_image",
867
+ "landmark_id": full_image_result["landmark_id"],
868
+ "landmark_name": full_image_result["landmark_name"],
869
+ "confidence": full_image_result["confidence"]
870
+ })
871
+
872
+ # Group by landmark_id and calculate aggregate confidence
873
+ landmark_groups = {}
874
+ for detection in all_detections:
875
+ landmark_id = detection["landmark_id"]
876
+ if landmark_id not in landmark_groups:
877
+ landmark_groups[landmark_id] = {
878
+ "landmark_id": landmark_id,
879
+ "landmark_name": detection["landmark_name"],
880
+ "detections": [],
881
+ "sources": set()
882
+ }
883
+
884
+ landmark_groups[landmark_id]["detections"].append(detection)
885
+ landmark_groups[landmark_id]["sources"].add(detection["source"])
886
+
887
+ # Calculate aggregate confidence for each landmark
888
+ for landmark_id, group in landmark_groups.items():
889
+ detections = group["detections"]
890
+
891
+ # Base confidence is the maximum confidence from any source
892
+ max_confidence = max(d["confidence"] for d in detections)
893
+
894
+ # Bonus for detection from multiple sources
895
+ source_count = len(group["sources"])
896
+ source_bonus = min(0.15, (source_count - 1) * 0.05) # Up to 15% bonus
897
+
898
+ # Consistency bonus for multiple detections of the same landmark
899
+ detection_count = len(detections)
900
+ consistency_bonus = min(0.1, (detection_count - 1) * 0.02) # Up to 10% bonus
901
+
902
+ # Calculate final confidence
903
+ aggregate_confidence = min(1.0, max_confidence + source_bonus + consistency_bonus)
904
+
905
+ group["confidence"] = aggregate_confidence
906
+ group["detection_count"] = detection_count
907
+ group["source_count"] = source_count
908
+
909
+ # Sort landmarks by confidence
910
+ sorted_landmarks = sorted(
911
+ landmark_groups.values(),
912
+ key=lambda x: x["confidence"],
913
+ reverse=True
914
+ )
915
+
916
+ return {
917
+ "is_landmark_scene": len(sorted_landmarks) > 0,
918
+ "detected_landmarks": sorted_landmarks,
919
+ "viewpoint_info": viewpoint_info,
920
+ "primary_landmark": sorted_landmarks[0] if sorted_landmarks else None
921
+ }
922
+
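A usage sketch combining the pyramid, grid, and full-image passes, same assumptions as above (clf, img):

combined = clf.enhanced_landmark_detection(img, threshold=0.3)
if combined["is_landmark_scene"]:
    primary = combined["primary_landmark"]
    print(primary["landmark_name"], primary["confidence"], sorted(primary["sources"]))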
923
+ def _analyze_architectural_features(self, image):
924
+ """
925
+ Analyzes the architectural features of a structure in the image without hardcoding specific landmarks.
926
+
927
+ Args:
928
+ image: Input image
929
+
930
+ Returns:
931
+ Dict: Architectural feature analysis results
932
+ """
933
+ # Define universal architectural feature prompts that apply to all types of landmarks
934
+ architecture_prompts = {
935
+ "tall_structure": "a tall vertical structure standing alone",
936
+ "tiered_building": "a building with multiple stacked tiers or segments",
937
+ "historical_structure": "a building with historical architectural elements",
938
+ "modern_design": "a modern structure with contemporary architectural design",
939
+ "segmented_exterior": "a structure with visible segmented or sectioned exterior",
940
+ "viewing_platform": "a tall structure with observation area at the top",
941
+ "time_display": "a structure with timepiece features",
942
+ "glass_facade": "a building with prominent glass exterior surfaces",
943
+ "memorial_structure": "a monument or memorial structure",
944
+ "ancient_construction": "ancient constructed elements or archaeological features",
945
+ "natural_landmark": "a natural geographic formation or landmark",
946
+ "slanted_design": "a structure with non-vertical or leaning profile"
947
+ }
948
+
949
+ # Calculate similarity scores against universal architectural patterns
950
+ context_scores = self.calculate_similarity_scores(image, architecture_prompts)
951
+
952
+ # Determine most relevant architectural features
953
+ top_features = sorted(context_scores.items(), key=lambda x: x[1], reverse=True)[:3]
954
+
955
+ # Calculate feature confidence
956
+ context_confidence = sum(score for _, score in top_features) / 3
957
+
958
+ # Determine primary architectural category based on top features
959
+ architectural_categories = {
960
+ "tower": ["tall_structure", "viewing_platform", "time_display"],
961
+ "skyscraper": ["tall_structure", "modern_design", "glass_facade"],
962
+ "historical": ["historical_structure", "ancient_construction", "memorial_structure"],
963
+ "natural": ["natural_landmark"],
964
+ "distinctive": ["tiered_building", "segmented_exterior", "slanted_design"]
965
+ }
966
+
967
+ # Score each category based on the top features
968
+ category_scores = {}
969
+ for category, features in architectural_categories.items():
970
+ category_score = 0
971
+ for feature, score in context_scores.items():
972
+ if feature in features:
973
+ category_score += score
974
+ category_scores[category] = category_score
975
+
976
+ primary_category = max(category_scores.items(), key=lambda x: x[1])[0]
977
+
978
+ return {
979
+ "architectural_features": top_features,
980
+ "context_confidence": context_confidence,
981
+ "primary_category": primary_category,
982
+ "category_scores": category_scores
983
+ }
984
+
985
+ def intelligent_landmark_search(self,
986
+ image: Union[Image.Image, np.ndarray],
987
+ yolo_boxes: Optional[List[List[float]]] = None,
988
+ base_threshold: float = 0.25) -> Dict[str, Any]:
989
+ """
990
+ Perform an intelligent landmark search on the image, combining whole-image and region-level analysis
991
+
992
+ Args:
993
+ image: original image
994
+ yolo_boxes: bounding boxes detected by YOLO (optional)
995
+ base_threshold: base confidence threshold
996
+
997
+ Returns:
998
+ Dict: combined analysis containing all detection results
999
+ """
1000
+ # 確保圖像是PIL格式
1001
+ if not isinstance(image, Image.Image):
1002
+ if isinstance(image, np.ndarray):
1003
+ image = Image.fromarray(image)
1004
+ else:
1005
+ raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
1006
+
1007
+ # When no YOLO boxes are provided, lower the threshold slightly to improve recall
1008
+ actual_threshold = base_threshold * 0.85 if yolo_boxes is None or len(yolo_boxes) == 0 else base_threshold
1009
+
1010
+ # 首先對整張圖像進行分析
1011
+ try:
1012
+ full_image_result = self.search_entire_image(
1013
+ image,
1014
+ threshold=actual_threshold,
1015
+ detailed_analysis=True # 確保詳細分析開啟
1016
+ )
1017
+
1018
+ # Without YOLO boxes, run multi-scale analysis to improve the chance of detection
1019
+ if (yolo_boxes is None or len(yolo_boxes) == 0) and (not full_image_result or not full_image_result.get("is_landmark", False)):
1020
+ print("No YOLO boxes provided, attempting multi-scale pyramid analysis")
1021
+ try:
1022
+ if hasattr(self, '_perform_pyramid_analysis'):
1023
+ pyramid_results = self._perform_pyramid_analysis(
1024
+ image,
1025
+ levels=4,
1026
+ base_threshold=actual_threshold,
1027
+ aspect_ratios=[1.0, 0.75, 1.5, 0.5, 2.0]
1028
+ )
1029
+
1030
+ if pyramid_results and pyramid_results.get("is_landmark", False) and pyramid_results.get("best_result", {}).get("confidence", 0) > actual_threshold:
1031
+ # 使用金字塔分析結果增強或替代全圖結果
1032
+ if not full_image_result or not full_image_result.get("is_landmark", False):
1033
+ full_image_result = {
1034
+ "is_landmark": True,
1035
+ "landmark_id": pyramid_results["best_result"]["landmark_id"],
1036
+ "landmark_name": pyramid_results["best_result"]["landmark_name"],
1037
+ "confidence": pyramid_results["best_result"]["confidence"],
1038
+ "location": pyramid_results["best_result"].get("location", "Unknown Location")
1039
+ }
1040
+ print(f"Pyramid analysis detected landmark: {pyramid_results['best_result']['landmark_name']} with confidence {pyramid_results['best_result']['confidence']:.3f}")
1041
+ else:
1042
+ print("Pyramid analysis not available, skipping multi-scale detection")
1043
+ except Exception as e:
1044
+ print(f"Error in pyramid analysis: {e}")
1045
+ except Exception as e:
1046
+ print(f"Error in search_entire_image: {e}")
1047
+ import traceback
1048
+ traceback.print_exc()
1049
+ full_image_result = None
1050
+
1051
+ # 初始化結果字典
1052
+ result = {
1053
+ "full_image_analysis": full_image_result if full_image_result else {},
1054
+ "is_landmark_scene": False, # 默認值
1055
+ "detected_landmarks": []
1056
+ }
1057
+
1058
+ # 上下文感知比較,處理接近的排名結果
1059
+ if full_image_result and "top_landmarks" in full_image_result and len(full_image_result["top_landmarks"]) >= 2:
1060
+ top_landmarks = full_image_result["top_landmarks"]
1061
+
1062
+ # Check whether the top two results are very close (confidence gap under 0.1)
1063
+ if len(top_landmarks) >= 2 and abs(top_landmarks[0]["confidence"] - top_landmarks[1]["confidence"]) < 0.1:
1064
+ # 對於接近的結果,使用通用建築特徵分析進行區分
1065
+ try:
1066
+ # 分析建築特徵
1067
+ if hasattr(self, '_analyze_architectural_features'):
1068
+ architectural_analysis = self._analyze_architectural_features(image)
1069
+ top_features = architectural_analysis.get("architectural_features", [])
1070
+ primary_category = architectural_analysis.get("primary_category", "")
1071
+
1072
+ # 根據建築特徵調整地標置信度
1073
+ for i, landmark in enumerate(top_landmarks[:2]):
1074
+ if i >= len(top_landmarks):
1075
+ continue
1076
+
1077
+ landmark_id = landmark.get("landmark_id", "").lower()
1078
+ confidence_boost = 0
1079
+
1080
+ # 使用主要建築類別來調整置信度,使用通用條件而非特定地標名稱
1081
+ if primary_category == "tower" and any(term in landmark_id for term in ["tower", "spire", "needle"]):
1082
+ confidence_boost += 0.05
1083
+ elif primary_category == "skyscraper" and any(term in landmark_id for term in ["building", "skyscraper", "tall"]):
1084
+ confidence_boost += 0.05
1085
+ elif primary_category == "historical" and any(term in landmark_id for term in ["monument", "castle", "palace", "temple"]):
1086
+ confidence_boost += 0.05
1087
+ elif primary_category == "distinctive" and any(term in landmark_id for term in ["unusual", "unique", "special", "famous"]):
1088
+ confidence_boost += 0.05
1089
+
1090
+ # 根據特定特徵進一步微調,使用通用特徵描述而非特定地標
1091
+ for feature, score in top_features:
1092
+ if feature == "time_display" and "clock" in landmark_id:
1093
+ confidence_boost += 0.03
1094
+ elif feature == "segmented_exterior" and "segmented" in landmark_id:
1095
+ confidence_boost += 0.03
1096
+ elif feature == "slanted_design" and "leaning" in landmark_id:
1097
+ confidence_boost += 0.03
1098
+
1099
+ # 應用信心度調整
1100
+ if confidence_boost > 0 and i < len(top_landmarks):
1101
+ top_landmarks[i]["confidence"] += confidence_boost
1102
+ print(f"Boosted {landmark['landmark_name']} confidence by {confidence_boost:.2f} based on architectural features ({primary_category})")
1103
+
1104
+ # 重新排序
1105
+ top_landmarks.sort(key=lambda x: x["confidence"], reverse=True)
1106
+ full_image_result["top_landmarks"] = top_landmarks
1107
+ if top_landmarks:
1108
+ full_image_result["landmark_id"] = top_landmarks[0]["landmark_id"]
1109
+ full_image_result["landmark_name"] = top_landmarks[0]["landmark_name"]
1110
+ full_image_result["confidence"] = top_landmarks[0]["confidence"]
1111
+ full_image_result["location"] = top_landmarks[0].get("location", "Unknown Location")
1112
+ except Exception as e:
1113
+ print(f"Error in architectural feature analysis: {e}")
1114
+ import traceback
1115
+ traceback.print_exc()
1116
+
1117
+ if full_image_result and full_image_result.get("is_landmark", False):
1118
+ result["is_landmark_scene"] = True
1119
+ landmark_id = full_image_result.get("landmark_id", "unknown")
1120
+
1121
+ # extract landmark info
1122
+ landmark_specific_info = self._extract_landmark_specific_info(landmark_id)
1123
+
1124
+ landmark_info = {
1125
+ "landmark_id": landmark_id,
1126
+ "landmark_name": full_image_result.get("landmark_name", "Unknown Landmark"),
1127
+ "confidence": full_image_result.get("confidence", 0.0),
1128
+ "location": full_image_result.get("location", "Unknown Location"),
1129
+ "region_type": "full_image",
1130
+ "box": [0, 0, getattr(image, 'width', 0), getattr(image, 'height', 0)]
1131
+ }
1132
+
1133
+ # 整合地標特定info,確保正確的名稱被使用
1134
+ landmark_info.update(landmark_specific_info)
1135
+
1136
+ # 如果特定信息中有更準確的地標名稱,使用它
1137
+ if landmark_specific_info.get("landmark_name"):
1138
+ landmark_info["landmark_name"] = landmark_specific_info["landmark_name"]
1139
+
1140
+ result["detected_landmarks"].append(landmark_info)
1141
+
1142
+ # 確保地標特定活動被正確設置為主要結果
1143
+ if landmark_specific_info.get("has_specific_activities", False):
1144
+ result["primary_landmark_activities"] = landmark_specific_info.get("landmark_specific_activities", [])
1145
+ print(f"Set primary landmark activities: {len(result['primary_landmark_activities'])} activities for {landmark_info['landmark_name']}")
1146
+
1147
+ # 如果提供了YOLO邊界框,分析這些區域
1148
+ if yolo_boxes and len(yolo_boxes) > 0:
1149
+ for box in yolo_boxes:
1150
+ try:
1151
+ if hasattr(self, 'classify_image_region'):
1152
+ box_result = self.classify_image_region(
1153
+ image,
1154
+ box,
1155
+ threshold=base_threshold,
1156
+ detection_type="auto"
1157
+ )
1158
+
1159
+ # 如果檢測到地標
1160
+ if box_result and box_result.get("is_landmark", False):
1161
+ # 檢查是否與已檢測的地標重複
1162
+ is_duplicate = False
1163
+ for existing in result["detected_landmarks"]:
1164
+ if existing.get("landmark_id") == box_result.get("landmark_id"):
1165
+ # 如果新的置信度更高,則更新
1166
+ if box_result.get("confidence", 0) > existing.get("confidence", 0):
1167
+ existing.update({
1168
+ "confidence": box_result.get("confidence", 0),
1169
+ "region_type": "yolo_box",
1170
+ "box": box
1171
+ })
1172
+ is_duplicate = True
1173
+ break
1174
+
1175
+ # 如果不是重複的,添加到列表
1176
+ if not is_duplicate:
1177
+ result["detected_landmarks"].append({
1178
+ "landmark_id": box_result.get("landmark_id", "unknown"),
1179
+ "landmark_name": box_result.get("landmark_name", "Unknown Landmark"),
1180
+ "confidence": box_result.get("confidence", 0.0),
1181
+ "location": box_result.get("location", "Unknown Location"),
1182
+ "region_type": "yolo_box",
1183
+ "box": box
1184
+ })
1185
+ except Exception as e:
1186
+ print(f"Error in analyzing YOLO box: {e}")
1187
+ continue
1188
+
1189
+ # Finally, run an additional grid search to catch landmarks that may have been missed,
1190
+ # but only when no landmark has been found yet, or only low-confidence ones
1191
+ should_do_grid_search = (
1192
+ len(result["detected_landmarks"]) == 0 or
1193
+ max([landmark.get("confidence", 0) for landmark in result["detected_landmarks"]], default=0) < 0.5
1194
+ )
1195
+
1196
+ if should_do_grid_search and hasattr(self, 'classify_image_region'):
1197
+ try:
1198
+ # 創建5x5網格
1199
+ width, height = getattr(image, 'size', (getattr(image, 'width', 0), getattr(image, 'height', 0)))
1200
+ if not isinstance(width, (int, float)) or width <= 0:
1201
+ width = getattr(image, 'width', 0)
1202
+ if not isinstance(height, (int, float)) or height <= 0:
1203
+ height = getattr(image, 'height', 0)
1204
+
1205
+ if width > 0 and height > 0:
1206
+ grid_boxes = []
1207
+ for i in range(5):
1208
+ for j in range(5):
1209
+ grid_boxes.append([
1210
+ width * (j/5), height * (i/5),
1211
+ width * ((j+1)/5), height * ((i+1)/5)
1212
+ ])
1213
+
1214
+ # 分析每個網格區域
1215
+ for box in grid_boxes:
1216
+ try:
1217
+ grid_result = self.classify_image_region(
1218
+ image,
1219
+ box,
1220
+ threshold=base_threshold * 0.9, # 稍微降低網格搜索閾值
1221
+ detection_type="partial"
1222
+ )
1223
+
1224
+ # 如果檢測到地標
1225
+ if grid_result and grid_result.get("is_landmark", False):
1226
+ # 檢查是否與已檢測的地標重複
1227
+ is_duplicate = False
1228
+ for existing in result["detected_landmarks"]:
1229
+ if existing.get("landmark_id") == grid_result.get("landmark_id"):
1230
+ is_duplicate = True
1231
+ break
1232
+
1233
+ # 如果不是重複的,添加到列表
1234
+ if not is_duplicate:
1235
+ result["detected_landmarks"].append({
1236
+ "landmark_id": grid_result.get("landmark_id", "unknown"),
1237
+ "landmark_name": grid_result.get("landmark_name", "Unknown Landmark"),
1238
+ "confidence": grid_result.get("confidence", 0.0),
1239
+ "location": grid_result.get("location", "Unknown Location"),
1240
+ "region_type": "grid",
1241
+ "box": box
1242
+ })
1243
+ except Exception as e:
1244
+ print(f"Error in analyzing grid region: {e}")
1245
+ continue
1246
+ except Exception as e:
1247
+ print(f"Error in grid search: {e}")
1248
+ import traceback
1249
+ traceback.print_exc()
1250
+
1251
+ # 按置信度排序檢測結果
1252
+ result["detected_landmarks"].sort(key=lambda x: x.get("confidence", 0), reverse=True)
1253
+
1254
+ # 更新整體場景類型判斷
1255
+ if len(result["detected_landmarks"]) > 0:
1256
+ result["is_landmark_scene"] = True
1257
+ result["primary_landmark"] = result["detected_landmarks"][0]
1258
+
1259
+ # Attach the clip_analysis result from the full image so the LLM has more context
1260
+ if full_image_result and "clip_analysis" in full_image_result:
1261
+ result["clip_analysis_on_full_image"] = full_image_result["clip_analysis"]
1262
+
1263
+ return result
1264
+
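A usage sketch for the combined search, assuming clf and img as above and a hypothetical YOLO detection:

yolo_boxes = [[50, 40, 600, 420]]          # illustrative detector output
analysis = clf.intelligent_landmark_search(img, yolo_boxes=yolo_boxes, base_threshold=0.25)
if analysis["is_landmark_scene"]:
    top = analysis["primary_landmark"]
    print(top["landmark_name"], top["confidence"], top["region_type"])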
1265
+ def _extract_landmark_specific_info(self, landmark_id: str) -> Dict[str, Any]:
1266
+ """
1267
+ Extract detailed information for a specific landmark, including feature templates and activity suggestions
1268
+
1269
+ Args:
1270
+ landmark_id: landmark ID
1271
+
1272
+ Returns:
1273
+ Dict: landmark-specific information
1274
+ """
1275
+ if not landmark_id or landmark_id == "unknown":
1276
+ return {"has_specific_activities": False}
1277
+
1278
+ specific_info = {"has_specific_activities": False}
1279
+
1280
+ # Extract basic information from ALL_LANDMARKS or self.landmark_data
1281
+ landmark_data_source = None
1282
+
1283
+ # 優先嘗試從類屬性獲取
1284
+ if hasattr(self, 'landmark_data') and self.landmark_data and landmark_id in self.landmark_data:
1285
+ landmark_data_source = self.landmark_data[landmark_id]
1286
+ print(f"Using landmark data from class attribute for {landmark_id}")
1287
+ else:
1288
+ try:
1289
+ if landmark_id in ALL_LANDMARKS:
1290
+ landmark_data_source = ALL_LANDMARKS[landmark_id]
1291
+ print(f"Using landmark data from ALL_LANDMARKS for {landmark_id}")
1292
+ except ImportError:
1293
+ print("Warning: Could not import ALL_LANDMARKS from landmark_data")
1294
+ except Exception as e:
1295
+ print(f"Error accessing ALL_LANDMARKS: {e}")
1296
+
1297
+ # 處理地標基本數據
1298
+ if landmark_data_source:
1299
+ # 提取正確的地標名稱
1300
+ if "name" in landmark_data_source:
1301
+ specific_info["landmark_name"] = landmark_data_source["name"]
1302
+
1303
+ # 提取所有可用的 prompts 作為特色模板
1304
+ if "prompts" in landmark_data_source:
1305
+ specific_info["feature_templates"] = landmark_data_source["prompts"][:5]
1306
+ specific_info["primary_template"] = landmark_data_source["prompts"][0]
1307
+
1308
+ # 提取別名info
1309
+ if "aliases" in landmark_data_source:
1310
+ specific_info["aliases"] = landmark_data_source["aliases"]
1311
+
1312
+ # 提取位置信息
1313
+ if "location" in landmark_data_source:
1314
+ specific_info["location"] = landmark_data_source["location"]
1315
+
1316
+ # 提取其他相關信息
1317
+ for key in ["year_built", "architectural_style", "significance", "description"]:
1318
+ if key in landmark_data_source:
1319
+ specific_info[key] = landmark_data_source[key]
1320
+
1321
+ # Try to pull activity suggestions from LANDMARK_ACTIVITIES
1322
+ try:
1323
+ if landmark_id in LANDMARK_ACTIVITIES:
1324
+ activities = LANDMARK_ACTIVITIES[landmark_id]
1325
+ specific_info["landmark_specific_activities"] = activities
1326
+ specific_info["has_specific_activities"] = True
1327
+ print(f"Found {len(activities)} specific activities for landmark {landmark_id}")
1328
+ else:
1329
+ print(f"No specific activities found for landmark {landmark_id} in LANDMARK_ACTIVITIES")
1330
+ specific_info["has_specific_activities"] = False
1331
+ except ImportError:
1332
+ print("Warning: Could not import LANDMARK_ACTIVITIES from landmark_activities")
1333
+ specific_info["has_specific_activities"] = False
1334
+ except Exception as e:
1335
+ print(f"Error loading landmark activities for {landmark_id}: {e}")
1336
+ specific_info["has_specific_activities"] = False
1337
+
1338
+ return specific_info
1339
+
1340
+ def _analyze_viewpoint(self, image: Union[Image.Image, np.ndarray]) -> Dict[str, float]:
1341
+ """
1342
+ Analyzes the image viewpoint to adjust detection parameters.
1343
+
1344
+ Args:
1345
+ image: Input image
1346
+
1347
+ Returns:
1348
+ Dict: Viewpoint analysis results
1349
+ """
1350
+ viewpoint_prompts = {
1351
+ "aerial_view": "an aerial view from above looking down",
1352
+ "street_level": "a street level view looking up at a tall structure",
1353
+ "eye_level": "an eye-level horizontal view of a landmark",
1354
+ "distant": "a distant view of a landmark on the horizon",
1355
+ "close_up": "a close-up detailed view of architectural features",
1356
+ "interior": "an interior view inside a structure"
1357
+ }
1358
+
1359
+ # Calculate similarity scores
1360
+ viewpoint_scores = self.calculate_similarity_scores(image, viewpoint_prompts)
1361
+
1362
+ # Find dominant viewpoint
1363
+ dominant_viewpoint = max(viewpoint_scores.items(), key=lambda x: x[1])
1364
+
1365
+ return {
1366
+ "viewpoint_scores": viewpoint_scores,
1367
+ "dominant_viewpoint": dominant_viewpoint[0],
1368
+ "confidence": dominant_viewpoint[1]
1369
+ }
1370
+
1371
+ def calculate_similarity_scores(self, image: Union[Image.Image, np.ndarray],
1372
+ prompts: Dict[str, str]) -> Dict[str, float]:
1373
+ """
1374
+ Compute similarity scores between the image and a set of specific prompts
1375
+
1376
+ Args:
1377
+ image: input image
1378
+ prompts: dictionary of prompts {name: prompt text}
1379
+
1380
+ Returns:
1381
+ Dict[str, float]: similarity score for each prompt
1382
+ """
1383
+ # 確保圖像是PIL格式
1384
+ if not isinstance(image, Image.Image):
1385
+ if isinstance(image, np.ndarray):
1386
+ image = Image.fromarray(image)
1387
+ else:
1388
+ raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
1389
+
1390
+ # 預處理圖像
1391
+ image_input = self.preprocess(image).unsqueeze(0).to(self.device)
1392
+
1393
+ # 獲取圖像特徵
1394
+ with torch.no_grad():
1395
+ image_features = self.model.encode_image(image_input)
1396
+ image_features = image_features / image_features.norm(dim=-1, keepdim=True)
1397
+
1398
+ # Compute similarity with each prompt
1399
+ scores = {}
1400
+ prompt_texts = list(prompts.values())
1401
+ prompt_tokens = clip.tokenize(prompt_texts).to(self.device)
1402
+
1403
+ with torch.no_grad():
1404
+ prompt_features = self.model.encode_text(prompt_tokens)
1405
+ prompt_features = prompt_features / prompt_features.norm(dim=-1, keepdim=True)
1406
+
1407
+ # calculate similarity
1408
+ similarity = (100.0 * image_features @ prompt_features.T).softmax(dim=-1)
1409
+ similarity = similarity.cpu().numpy()[0] if self.device == "cuda" else similarity.numpy()[0]
1410
+
1411
+ # 填充結果字典
1412
+ for i, (name, _) in enumerate(prompts.items()):
1413
+ scores[name] = float(similarity[i])
1414
+
1415
+ return scores
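A usage sketch for the generic similarity helper, same assumptions as above (clf, img); the prompt set is illustrative:

prompts = {
    "aerial_view": "an aerial view from above looking down",
    "close_up": "a close-up detailed view of architectural features",
}
scores = clf.calculate_similarity_scores(img, prompts)
print(max(scores, key=scores.get), scores)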
enhance_scene_describer.py CHANGED
@@ -1,6 +1,7 @@
1
  import os
2
  import re
3
  import json
 
4
  import random
5
  import numpy as np
6
  from typing import Dict, List, Tuple, Any, Optional
@@ -12,6 +13,7 @@ from lighting_conditions import LIGHTING_CONDITIONS
12
  from viewpoint_templates import VIEWPOINT_TEMPLATES
13
  from cultural_templates import CULTURAL_TEMPLATES
14
  from confifence_templates import CONFIDENCE_TEMPLATES
 
15
 
16
  class EnhancedSceneDescriber:
17
  """
@@ -21,7 +23,7 @@ class EnhancedSceneDescriber:
21
  detection results and scene classification.
22
  """
23
 
24
- def __init__(self, templates_db: Optional[Dict] = None, scene_types: Optional[Dict] = None):
25
  """
26
  Initialize the enhanced scene describer.
27
 
@@ -29,6 +31,15 @@ class EnhancedSceneDescriber:
29
  templates_db: Optional custom templates database
30
  scene_types: Dictionary of scene type definitions
31
  """
 
 
 
 
 
 
 
 
 
32
  # Load or use provided scene types
33
  self.scene_types = scene_types or self._load_default_scene_types()
34
 
@@ -57,7 +68,7 @@ class EnhancedSceneDescriber:
57
  """
58
  templates = {}
59
 
60
- # 直接從導入的 Python 模組中獲取模板
61
  templates["scene_detail_templates"] = SCENE_DETAIL_TEMPLATES
62
  templates["object_template_fillers"] = OBJECT_TEMPLATE_FILLERS
63
  templates["viewpoint_templates"] = VIEWPOINT_TEMPLATES
@@ -100,19 +111,19 @@ class EnhancedSceneDescriber:
100
  "low": "This might be {description}, but the confidence is low. {details}"
101
  }
102
 
103
- # 場景細節模板 - 如果未從外部導入
104
  if "scene_detail_templates" not in templates:
105
  templates["scene_detail_templates"] = {
106
  "default": ["A space with various objects."]
107
  }
108
 
109
- # 物體填充模板 - 用於生成物體描述
110
  if "object_template_fillers" not in templates:
111
  templates["object_template_fillers"] = {
112
  "default": ["various items"]
113
  }
114
 
115
- # 視角模板 - 雖然我們現在從專門模組導入,但作為備份
116
  if "viewpoint_templates" not in templates:
117
  # 使用簡化版的默認視角模板
118
  templates["viewpoint_templates"] = {
@@ -147,6 +158,7 @@ class EnhancedSceneDescriber:
147
  "unknown": "The lighting conditions are not easily determined."
148
  }
149
 
 
150
  def _initialize_viewpoint_parameters(self):
151
  """
152
  Initialize parameters used for viewpoint detection.
@@ -165,232 +177,444 @@ class EnhancedSceneDescriber:
165
  "elevated_top_threshold": 0.3 # Few objects at top of frame
166
  }
167
 
168
-
169
- def generate_description(self,
170
- scene_type: str,
171
- detected_objects: List[Dict],
172
- confidence: float,
173
- lighting_info: Optional[Dict] = None,
174
- functional_zones: Optional[Dict] = None) -> str:
175
  """
176
- Generate enhanced scene description based on detection results, scene type,
177
- and additional contextual information.
178
-
179
- This is the main entry point that replaces the original _generate_scene_description.
180
 
181
  Args:
182
- scene_type: Identified scene type
183
- detected_objects: List of detected objects
184
- confidence: Scene classification confidence
185
- lighting_info: Optional lighting condition information
186
- functional_zones: Optional identified functional zones
 
187
 
188
  Returns:
189
- str: Natural language description of the scene
190
  """
191
- # Handle unknown scene type or very low confidence
192
- if scene_type == "unknown" or confidence < 0.4:
193
- return self._format_final_description(self._generate_generic_description(detected_objects, lighting_info))
194
-
195
- # Detect viewpoint
196
- viewpoint = self._detect_viewpoint(detected_objects)
197
-
198
- # Process aerial viewpoint scene types
199
- if viewpoint == "aerial":
200
- if "intersection" in scene_type or self._is_intersection(detected_objects):
201
- scene_type = "aerial_view_intersection"
202
- elif any(keyword in scene_type for keyword in ["commercial", "shopping", "retail"]):
203
- scene_type = "aerial_view_commercial_area"
204
- elif any(keyword in scene_type for keyword in ["plaza", "square"]):
205
- scene_type = "aerial_view_plaza"
 
 
 
 
 
 
 
 
 
 
 
 
 
206
  else:
207
- scene_type = "aerial_view_intersection"
208
 
209
- # Detect cultural context - only for non-aerial viewpoints
210
- cultural_context = None
211
- if viewpoint != "aerial":
212
- cultural_context = self._detect_cultural_context(scene_type, detected_objects)
213
 
214
- # Select appropriate template based on confidence
215
- if confidence > 0.75:
216
- confidence_level = "high"
217
- elif confidence > 0.5:
218
- confidence_level = "medium"
219
- else:
220
- confidence_level = "low"
221
 
222
- # Get base description for the scene type
223
- if viewpoint == "aerial":
224
- if 'base_description' not in locals():
225
- base_description = "An aerial view showing the layout and movement patterns from above"
226
- elif scene_type in self.scene_types:
227
- base_description = self.scene_types[scene_type].get("description", "A scene")
228
- else:
229
- base_description = "A scene"
230
 
231
- # Generate detailed scene information
232
- scene_details = self._generate_scene_details(
233
- scene_type,
234
- detected_objects,
235
- lighting_info,
236
- viewpoint
237
- )
238
 
239
- # Start with the base description
240
- description = base_description
 
241
 
242
- # If there's a secondary description from the scene type template, append it properly
243
- if scene_type in self.scene_types and "secondary_description" in self.scene_types[scene_type]:
244
- secondary_desc = self.scene_types[scene_type]["secondary_description"]
245
- if secondary_desc:
246
- description = self._smart_append(description, secondary_desc)
247
 
248
- # Improve description based on people count
249
- people_objs = [obj for obj in detected_objects if obj["class_id"] == 0] # Person class
250
- if people_objs:
251
- people_count = len(people_objs)
252
- if people_count > 5:
253
- people_phrase = f"numerous people ({people_count})"
254
- else:
255
- people_phrase = f"{people_count} {'people' if people_count > 1 else 'person'}"
256
 
257
- # Add people information to the scene details if not already mentioned
258
- if "people" not in description.lower() and "pedestrian" not in description.lower():
259
- description = self._smart_append(description, f"The scene includes {people_phrase}")
260
 
261
- # Apply cultural context if detected (only for non-aerial viewpoints)
262
- if cultural_context and viewpoint != "aerial":
263
- cultural_elements = self._generate_cultural_elements(cultural_context)
264
- if cultural_elements:
265
- description = self._smart_append(description, cultural_elements)
 
 
266
 
267
- # Now append the detailed scene information if available
268
- if scene_details:
269
- # Use smart_append to ensure proper formatting between base description and details
270
- description = self._smart_append(description, scene_details)
271
 
272
- # Include lighting information if available
273
- lighting_description = ""
274
  if lighting_info and "time_of_day" in lighting_info:
275
  lighting_type = lighting_info["time_of_day"]
276
  if lighting_type in self.templates.get("lighting_templates", {}):
277
  lighting_description = self.templates["lighting_templates"][lighting_type]
 
278
 
279
- # Add lighting description if available
280
- if lighting_description and lighting_description not in description:
281
- description = self._smart_append(description, lighting_description)
282
-
283
- # Process viewpoint information
284
  if viewpoint != "eye_level" and viewpoint in self.templates.get("viewpoint_templates", {}):
285
  viewpoint_template = self.templates["viewpoint_templates"][viewpoint]
286
 
287
- # Special handling for viewpoint prefix
288
  prefix = viewpoint_template.get('prefix', '')
289
  if prefix and not description.startswith(prefix):
290
- # Prefix is a phrase like "From above, " that should precede the description
291
  if description and description[0].isupper():
292
- # Maintain the flow by lowercasing the first letter after the prefix
293
  description = prefix + description[0].lower() + description[1:]
294
  else:
295
  description = prefix + description
296
 
297
- # Get appropriate scene elements description based on viewpoint
298
- if viewpoint == "aerial":
299
- scene_elements = "the crossing patterns and pedestrian movement"
300
- else:
301
- scene_elements = "objects and layout"
302
-
303
  viewpoint_desc = viewpoint_template.get("observation", "").format(
304
- scene_elements=scene_elements
305
  )
306
 
307
- # Add viewpoint observation if not already included
308
  if viewpoint_desc and viewpoint_desc not in description:
309
  description = self._smart_append(description, viewpoint_desc)
310
 
311
- # Add information about functional zones if available
312
  if functional_zones and len(functional_zones) > 0:
313
  zones_desc = self._describe_functional_zones(functional_zones)
314
  if zones_desc:
315
  description = self._smart_append(description, zones_desc)
316
 
317
- # Calculate actual people count
318
- people_count = len([obj for obj in detected_objects if obj["class_id"] == 0])
319
-
320
- # Check for inconsistencies in people count descriptions
321
- if people_count > 5:
322
- # Identify fragments that might contain smaller people counts
323
- small_people_patterns = [
324
- r"Area with \d+ people\.",
325
- r"Area with \d+ person\.",
326
- r"with \d+ people",
327
- r"with \d+ person"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
328
  ]
329
 
330
- # Check and remove each pattern
331
- filtered_description = description
332
- for pattern in small_people_patterns:
333
- matches = re.findall(pattern, filtered_description)
334
- for match in matches:
335
- # Extract the number from the match
336
- number_match = re.search(r'\d+', match)
337
- if number_match:
338
- try:
339
- people_mentioned = int(number_match.group())
340
- # If the mentioned count is less than total, remove the entire sentence
341
- if people_mentioned < people_count:
342
- # Split description into sentences
343
- sentences = re.split(r'(?<=[.!?])\s+', filtered_description)
344
- # Remove sentences containing the match
345
- filtered_sentences = []
346
- for sentence in sentences:
347
- if match not in sentence:
348
- filtered_sentences.append(sentence)
349
- # Recombine the description
350
- filtered_description = " ".join(filtered_sentences)
351
- except ValueError:
352
- # Failed number conversion, continue processing
353
- continue
354
 
355
- # Use the filtered description
356
- description = filtered_description
357
 
358
- # Final formatting to ensure correct punctuation and capitalization
359
- description = self._format_final_description(description)
 
360
 
361
- description_lines = description.split('\n')
362
- clean_description = []
363
- skip_block = False # 添加這個變數的定義
364
 
365
- for line in description_lines:
366
- # 檢查是否需要跳過這行
367
- if line.strip().startswith(':param') or line.strip().startswith('"""'):
368
- continue
369
- if line.strip().startswith("Exercise") or "class SceneDescriptionSystem" in line:
370
- skip_block = True
371
- continue
372
- if ('def generate_scene_description' in line or
373
- 'def enhance_scene_descriptions' in line or
374
- 'def __init__' in line):
375
- skip_block = True
376
- continue
377
- if line.strip().startswith('#TEST'):
378
- skip_block = True
379
- continue
 
380
 
381
- # 空行結束跳過模式
382
- if skip_block and line.strip() == "":
383
- skip_block = False
384
 
385
- # 如果不需要跳過,添加這行到結果
386
- if not skip_block:
387
- clean_description.append(line)
388
 
389
- # 如果過濾後的描述為空,返回原始描述
390
- if not clean_description:
391
- return description
392
- else:
393
- return '\n'.join(clean_description)
 
394
 
395
  def _smart_append(self, current_text: str, new_fragment: str) -> str:
396
  """
@@ -424,13 +648,17 @@ class EnhancedSceneDescriber:
424
  (new_fragment.startswith("A ") or new_fragment.startswith("An ")):
425
  return current_text + ". " + new_fragment
426
 
 
 
 
 
427
  # Decide how to join the texts
428
  if ends_with_sentence:
429
  # After a sentence, start with uppercase and add proper spacing
430
  joined_text = current_text + " " + (new_fragment[0].upper() + new_fragment[1:])
431
  elif ends_with_comma:
432
  # After a comma, maintain flow with lowercase unless it's a proper noun or special case
433
- if new_fragment.startswith(('I ', 'I\'', 'A ', 'An ', 'The ')) or new_fragment[0].isupper():
434
  joined_text = current_text + " " + new_fragment
435
  else:
436
  joined_text = current_text + " " + new_fragment[0].lower() + new_fragment[1:]
@@ -440,7 +668,7 @@ class EnhancedSceneDescriber:
440
  else:
441
  # For other cases, decide based on the content
442
  if self._is_related_phrases(current_text, new_fragment):
443
- if new_fragment.startswith(('I ', 'I\'', 'A ', 'An ', 'The ')) or new_fragment[0].isupper():
444
  joined_text = current_text + ", " + new_fragment
445
  else:
446
  joined_text = current_text + ", " + new_fragment[0].lower() + new_fragment[1:]
@@ -489,88 +717,78 @@ class EnhancedSceneDescriber:
489
 
490
  return False
491
 
 
492
  def _format_final_description(self, text: str) -> str:
493
  """
494
  Format the final description text to ensure correct punctuation,
495
  capitalization, and spacing.
496
-
497
- Args:
498
- text: The text to format
499
-
500
- Returns:
501
- str: The properly formatted text
502
  """
503
- import re
504
-
505
- if not text:
506
  return ""
507
 
508
- # 1. Special-case consecutive fragments that both start with "A" (a common issue)
509
- text = re.sub(r'(A\s[^.!?]+?)\s+(A\s)', r'\1. \2', text, flags=re.IGNORECASE)
510
- text = re.sub(r'(An\s[^.!?]+?)\s+(An?\s)', r'\1. \2', text, flags=re.IGNORECASE)
511
 
512
- # 2. Make sure the first letter is capitalized
513
- text = text[0].upper() + text[1:] if text else ""
 
514
 
515
- # 3. Fix spacing problems between words
516
- text = re.sub(r'\s{2,}', ' ', text) # collapse multiple spaces into one
517
- text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text) # add a space between a lowercase and an uppercase letter
518
 
519
- # 4. Fix words that were joined together
520
- text = re.sub(r'([a-zA-Z])and', r'\1 and', text) # add a space before "and"
521
- text = re.sub(r'([a-zA-Z])with', r'\1 with', text) # add a space before "with"
522
- text = re.sub(r'plants(and|with|or)', r'plants \1', text) # fix cases such as "plantsand"
523
-
524
- # 5. Fix capitalization after sentence-ending punctuation
525
- text = re.sub(r'\.(\s+)([a-z])', lambda m: f'.{m.group(1)}{m.group(2).upper()}', text) # capitalize after a period
 
 
526
 
527
- # 6. Fix capitalized words that follow a comma
528
  def fix_capitalization_after_comma(match):
529
- word = match.group(2)
530
- # Exceptions: keep capitalization for proper nouns, personal pronouns, etc.
531
- if word in ["I", "I'm", "I've", "I'd", "I'll"]:
532
- return match.group(0) # keep as is
533
-
534
- # Keep capitalization for months, weekdays, and other proper nouns
535
- proper_nouns = ["January", "February", "March", "April", "May", "June", "July",
536
- "August", "September", "October", "November", "December",
537
- "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
538
- if word in proper_nouns:
539
- return match.group(0) # keep as is
540
-
541
- # Otherwise lowercase the first letter
542
- return match.group(1) + word[0].lower() + word[1:]
543
-
544
- # Match a comma followed by whitespace and a capitalized word
545
- text = re.sub(r'(,\s+)([A-Z][a-zA-Z]*)', fix_capitalization_after_comma, text)
546
-
547
-
548
- common_phrases = [
549
- (r'Social or seating area', r'social or seating area'),
550
- (r'Sleeping area', r'sleeping area'),
551
- (r'Dining area', r'dining area'),
552
- (r'Living space', r'living space')
553
- ]
 
 
 
 
 
554
 
555
- for phrase, replacement in common_phrases:
556
- # Only adjust the term mid-sentence; keep capitalization at the start of a sentence
557
- text = re.sub(r'(?<=[.!?]\s)' + phrase, replacement, text)
558
- # Adjust the term after a comma while keeping sentence-initial capitalization
559
- text = re.sub(r'(?<=,\s)' + phrase, replacement, text)
560
 
561
- # 7. Ensure there is a space after punctuation
562
- text = re.sub(r'\s+([.,;:!?])', r'\1', text) # no space before punctuation
563
- text = re.sub(r'([.,;:!?])([a-zA-Z0-9])', r'\1 \2', text) # one space after punctuation
564
-
565
- # 8. Fix repeated punctuation
566
- text = re.sub(r'\.{2,}', '.', text) # collapse multiple periods into one
567
- text = re.sub(r',{2,}', ',', text) # collapse multiple commas into one
568
 
569
- # 9. Ensure the text ends with punctuation
570
- if text and not text[-1] in '.!?':
571
- text += '.'
572
-
573
- return text
574
 
575
  def _is_intersection(self, detected_objects: List[Dict]) -> bool:
576
  """
@@ -652,65 +870,585 @@ class EnhancedSceneDescriber:
652
 
653
  return base_desc
654
 
 
 
655
  def _generate_scene_details(self,
656
- scene_type: str,
657
- detected_objects: List[Dict],
658
- lighting_info: Optional[Dict] = None,
659
- viewpoint: str = "eye_level") -> str:
 
 
 
 
 
660
  """
661
  Generate detailed description based on scene type and detected objects.
 
662
 
663
  Args:
664
- scene_type: Identified scene type
665
- detected_objects: List of detected objects
666
- lighting_info: Optional lighting condition information
667
- viewpoint: Detected viewpoint (aerial, eye_level, etc.)
 
 
 
 
668
 
669
  Returns:
670
- str: Detailed scene description
671
  """
672
- # Get scene-specific templates
673
  scene_details = ""
674
  scene_templates = self.templates.get("scene_detail_templates", {})
675
 
676
- # Handle specific scene types
677
- if scene_type in scene_templates:
678
- # Select a template appropriate for the viewpoint if available
679
- viewpoint_key = f"{scene_type}_{viewpoint}"
 
 
680
 
681
- if viewpoint_key in scene_templates:
682
- # We have a viewpoint-specific template
683
- templates_list = scene_templates[viewpoint_key]
 
 
684
  else:
685
- # Fall back to general templates for this scene type
686
- templates_list = scene_templates[scene_type]
 
 
687
 
688
- # Select a random template from the list
689
  if templates_list:
690
  detail_template = random.choice(templates_list)
691
-
692
- # Fill the template with object information
693
  scene_details = self._fill_detail_template(
694
  detail_template,
695
  detected_objects,
696
- scene_type
697
- )
698
- else:
699
- # Use default templates if specific ones aren't available
700
- if "default" in scene_templates:
701
- detail_template = random.choice(scene_templates["default"])
702
- scene_details = self._fill_detail_template(
703
- detail_template,
704
- detected_objects,
705
- "default"
706
  )
707
  else:
708
- # Fall back to basic description if no templates are available
709
- scene_details = self._generate_basic_details(scene_type, detected_objects)
 
 
710
 
711
- return scene_details
712
 
713
- def _fill_detail_template(self, template: str, detected_objects: List[Dict], scene_type: str) -> str:
714
  """
715
  Fill a template with specific details based on detected objects.
716
 
@@ -731,6 +1469,41 @@ class EnhancedSceneDescriber:
731
  # Get object template fillers
732
  fillers = self.templates.get("object_template_fillers", {})
733
 
 
 
734
  # Set default values for every variable that may appear in a template
735
  default_replacements = {
736
  # 室內相關
@@ -910,6 +1683,36 @@ class EnhancedSceneDescriber:
910
  "knowledge_transfer": "learning exchanges"
911
  }
912
 
 
 
913
  # For each placeholder, try to fill with appropriate content
914
  for placeholder in placeholders:
915
  if placeholder in fillers:
@@ -1137,7 +1940,7 @@ class EnhancedSceneDescriber:
1137
  if not detected_objects:
1138
  return "eye_level" # default
1139
 
1140
- # 提取物體位置和大小
1141
  top_region_count = 0
1142
  bottom_region_count = 0
1143
  total_objects = len(detected_objects)
@@ -1153,29 +1956,29 @@ class EnhancedSceneDescriber:
1153
  crosswalk_pattern_detected = False
1154
 
1155
  for obj in detected_objects:
1156
- # Count objects in the top/bottom regions
1157
  region = obj["region"]
1158
  if "top" in region:
1159
  top_region_count += 1
1160
  elif "bottom" in region:
1161
  bottom_region_count += 1
1162
 
1163
- # Compute the normalized size (area)
1164
  if "normalized_area" in obj:
1165
  sizes.append(obj["normalized_area"])
1166
 
1167
- # Compute the height/width ratio
1168
  if "normalized_size" in obj:
1169
  width, height = obj["normalized_size"]
1170
  if width > 0:
1171
  height_width_ratios.append(height / width)
1172
 
1173
- # Collect people positions for pattern detection
1174
  if obj["class_id"] == 0: # 人
1175
  if "normalized_center" in obj:
1176
  people_positions.append(obj["normalized_center"])
1177
 
1178
- # Detection logic added specifically for crosswalk intersections
1179
  # Check whether there is a clear vertical and horizontal distribution of pedestrians
1180
  people_objs = [obj for obj in detected_objects if obj["class_id"] == 0] # person
1181
 
@@ -1194,7 +1997,7 @@ class EnhancedSceneDescriber:
1194
  y_range = max(y_coords) - min(y_coords)
1195
 
1196
  # Try to detect a cross-shaped distribution
1197
- # If both the x and y ranges are large and similar, this may be an intersection
1198
  if x_range > 0.5 and y_range > 0.5 and 0.7 < (x_range / y_range) < 1.3:
1199
 
1200
  # Compute distances to the center point
@@ -1391,7 +2194,6 @@ class EnhancedSceneDescriber:
1391
  description = description.replace("a bed in the room", "a bed")
1392
 
1393
  # Handle repeated object lists
1394
- # Look for patterns of the form "item, item, item"
1395
  object_lists = re.findall(r'with ([^\.]+?)(?:\.|\band\b)', description)
1396
 
1397
  for obj_list in object_lists:
@@ -1441,6 +2243,20 @@ class EnhancedSceneDescriber:
1441
  if not functional_zones:
1442
  return ""
1443
 
 
 
1444
  # Count the total number of people in the scene
1445
  total_people_count = 0
1446
  people_by_zone = {}
@@ -1480,12 +2296,12 @@ class EnhancedSceneDescriber:
1480
 
1481
  # Generate the summary description
1482
  summary = ""
1483
- max_mentioned_people = 0 # track the largest people count already mentioned
1484
 
1485
  # If the total people count is significant and not yet mentioned in the main description, add it
1486
  if total_people_count > 5:
1487
  summary = f"The scene contains a significant number of pedestrians ({total_people_count} people). "
1488
- max_mentioned_people = total_people_count # update the largest mentioned people count
1489
 
1490
  # Process each zone description and keep the people-count information consistent
1491
  processed_zones = []
@@ -1494,7 +2310,7 @@ class EnhancedSceneDescriber:
1494
  zone_desc = zone_info.get("description", "a functional zone")
1495
  zone_people_count = people_by_zone.get(zone_name, 0)
1496
 
1497
- # Check whether the description already contains people-count information
1498
  contains_people_info = "with" in zone_desc and ("person" in zone_desc.lower() or "people" in zone_desc.lower())
1499
 
1500
  # If the description contains a people count smaller than the one already mentioned, adjust it
 
1
  import os
2
  import re
3
  import json
4
+ import logging
5
  import random
6
  import numpy as np
7
  from typing import Dict, List, Tuple, Any, Optional
 
13
  from viewpoint_templates import VIEWPOINT_TEMPLATES
14
  from cultural_templates import CULTURAL_TEMPLATES
15
  from confifence_templates import CONFIDENCE_TEMPLATES
16
+ from landmark_data import ALL_LANDMARKS
17
 
18
  class EnhancedSceneDescriber:
19
  """
 
23
  detection results and scene classification.
24
  """
25
 
26
+ def __init__(self, templates_db: Optional[Dict] = None, scene_types: Optional[Dict] = None, spatial_analyzer_instance: Optional[Any] = None):
27
  """
28
  Initialize the enhanced scene describer.
29
 
 
31
  templates_db: Optional custom templates database
32
  scene_types: Dictionary of scene type definitions
33
  """
34
+ self.logger = logging.getLogger(self.__class__.__name__) # Use class name for logger
35
+ self.logger.setLevel(logging.INFO) # Or your desired logging level
36
+ # Optional: Add a handler if not configured globally
37
+ if not self.logger.hasHandlers():
38
+ handler = logging.StreamHandler()
39
+ formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
40
+ handler.setFormatter(formatter)
41
+ self.logger.addHandler(handler)
42
+
43
  # Load or use provided scene types
44
  self.scene_types = scene_types or self._load_default_scene_types()
45
 
 
68
  """
69
  templates = {}
70
 
71
+ # Load the prepared template sets
72
  templates["scene_detail_templates"] = SCENE_DETAIL_TEMPLATES
73
  templates["object_template_fillers"] = OBJECT_TEMPLATE_FILLERS
74
  templates["viewpoint_templates"] = VIEWPOINT_TEMPLATES
 
111
  "low": "This might be {description}, but the confidence is low. {details}"
112
  }
113
 
114
+ # Scene detail templates
115
  if "scene_detail_templates" not in templates:
116
  templates["scene_detail_templates"] = {
117
  "default": ["A space with various objects."]
118
  }
119
 
120
+ # Object filler templates used to build object descriptions
121
  if "object_template_fillers" not in templates:
122
  templates["object_template_fillers"] = {
123
  "default": ["various items"]
124
  }
125
 
126
+ # Viewpoint templates; now imported from a dedicated module, kept here as a fallback
127
  if "viewpoint_templates" not in templates:
128
  # Use a simplified set of default viewpoint templates
129
  templates["viewpoint_templates"] = {
 
158
  "unknown": "The lighting conditions are not easily determined."
159
  }
160
 
161
+
162
  def _initialize_viewpoint_parameters(self):
163
  """
164
  Initialize parameters used for viewpoint detection.
 
177
  "elevated_top_threshold": 0.3 # Few objects at top of frame
178
  }
179
 
180
+ def _generate_landmark_description(self,
181
+ scene_type: str,
182
+ detected_objects: List[Dict],
183
+ confidence: float,
184
+ lighting_info: Optional[Dict] = None,
185
+ functional_zones: Optional[Dict] = None,
186
+ landmark_objects: Optional[List[Dict]] = None) -> str:
187
  """
188
+ Generate a scene description that incorporates landmark information.
 
189
 
190
  Args:
191
+ scene_type: The identified scene type
192
+ detected_objects: List of detected objects
193
+ confidence: Scene classification confidence
194
+ lighting_info: Lighting condition information (optional)
195
+ functional_zones: Functional zone information (optional)
196
+ landmark_objects: List of objects identified as landmarks (optional)
197
 
198
  Returns:
199
+ str: A natural-language scene description that includes the landmark information
200
  """
201
+ # If no landmark objects were provided, filter them from the detected objects
202
+ if landmark_objects is None:
203
+ landmark_objects = [obj for obj in detected_objects if obj.get("is_landmark", False)]
204
+
205
+ # If there are no landmarks, fall back to the standard description
206
+ if not landmark_objects:
207
+ if scene_type in ["tourist_landmark", "natural_landmark", "historical_monument"]:
208
+ # The scene type is a landmark type, but no specific landmark object was found
209
+ base_description = "A scenic area that appears to be a tourist destination, though specific landmarks are not clearly identifiable."
210
+ else:
211
+ # Generate a basic description using the standard method
212
+ return self._format_final_description(self._generate_scene_details(
213
+ scene_type,
214
+ detected_objects,
215
+ lighting_info,
216
+ self._detect_viewpoint(detected_objects)
217
+ ))
218
+ else:
219
+ # Get the primary landmark (the one with the highest confidence)
220
+ primary_landmark = max(landmark_objects, key=lambda x: x.get("confidence", 0))
221
+ landmark_name = primary_landmark.get("class_name", "landmark")
222
+ landmark_location = primary_landmark.get("location", "")
223
+
224
+ # Choose an appropriate description template based on the landmark type
225
+ if scene_type == "natural_landmark" or primary_landmark.get("landmark_type") == "natural":
226
+ base_description = f"A natural landmark scene featuring {landmark_name} in {landmark_location}."
227
+ elif scene_type == "historical_monument" or primary_landmark.get("landmark_type") == "monument":
228
+ base_description = f"A historical monument scene showcasing {landmark_name}, a significant landmark in {landmark_location}."
229
  else:
230
+ base_description = f"A tourist landmark scene centered around {landmark_name}, an iconic structure in {landmark_location}."
231
 
232
+ # Add extra information about the landmarks
233
+ landmark_details = []
234
+ for landmark in landmark_objects:
235
+ details = []
236
 
237
+ # Add the year of construction
238
+ if "year_built" in landmark:
239
+ details.append(f"built in {landmark['year_built']}")
 
 
 
 
240
 
241
+ # Add the architectural style
242
+ if "architectural_style" in landmark:
243
+ details.append(f"featuring {landmark['architectural_style']} architectural style")
 
 
 
 
 
244
 
245
+ # Add the significance
246
+ if "significance" in landmark:
247
+ details.append(landmark["significance"])
 
 
 
 
248
 
249
+ # If there are details, add them to the description
250
+ if details:
251
+ landmark_details.append(f"{landmark['class_name']} ({', '.join(details)})")
252
 
253
+ # Append the detailed information to the base description
254
+ if landmark_details:
255
+ description = base_description + " " + "The scene features " + ", ".join(landmark_details) + "."
256
+ else:
257
+ description = base_description
258
 
259
+ # Determine the viewpoint
260
+ viewpoint = self._detect_viewpoint(detected_objects)
 
 
 
 
 
 
261
 
262
+ # Generate a description of people and their activity
263
+ people_count = len([obj for obj in detected_objects if obj["class_id"] == 0]) # class_id 0 is typically "person"
 
264
 
265
+ if people_count > 0:
266
+ if people_count == 1:
267
+ people_description = "There is one person in the scene, likely a tourist or visitor."
268
+ elif people_count < 5:
269
+ people_description = f"There are {people_count} people in the scene, possibly tourists visiting the landmark."
270
+ else:
271
+ people_description = f"The scene includes a group of {people_count} people, indicating this is a popular tourist destination."
272
 
273
+ description = self._smart_append(description, people_description)
 
 
 
274
 
275
+ # Add lighting information
 
276
  if lighting_info and "time_of_day" in lighting_info:
277
  lighting_type = lighting_info["time_of_day"]
278
  if lighting_type in self.templates.get("lighting_templates", {}):
279
  lighting_description = self.templates["lighting_templates"][lighting_type]
280
+ description = self._smart_append(description, lighting_description)
281
 
282
+ # Add the viewpoint description
 
 
 
 
283
  if viewpoint != "eye_level" and viewpoint in self.templates.get("viewpoint_templates", {}):
284
  viewpoint_template = self.templates["viewpoint_templates"][viewpoint]
285
 
286
+ # Add the viewpoint prefix
287
  prefix = viewpoint_template.get('prefix', '')
288
  if prefix and not description.startswith(prefix):
289
+ # Keep the sentence flowing naturally
290
  if description and description[0].isupper():
 
291
  description = prefix + description[0].lower() + description[1:]
292
  else:
293
  description = prefix + description
294
 
295
+ # Add the viewpoint observation
 
 
 
 
 
296
  viewpoint_desc = viewpoint_template.get("observation", "").format(
297
+ scene_elements="the landmark and surrounding area"
298
  )
299
 
 
300
  if viewpoint_desc and viewpoint_desc not in description:
301
  description = self._smart_append(description, viewpoint_desc)
302
 
303
+ # Add the functional zone description
304
  if functional_zones and len(functional_zones) > 0:
305
  zones_desc = self._describe_functional_zones(functional_zones)
306
  if zones_desc:
307
  description = self._smart_append(description, zones_desc)
308
 
309
+ # Describe likely activities
310
+ landmark_activities = []
311
+
312
+ # Generate generic activities based on the landmark type
313
+ if scene_type == "natural_landmark" or any(obj.get("landmark_type") == "natural" for obj in landmark_objects):
314
+ landmark_activities = [
315
+ "nature photography",
316
+ "scenic viewing",
317
+ "hiking or walking",
318
+ "guided nature tours",
319
+ "outdoor appreciation"
320
+ ]
321
+ elif scene_type == "historical_monument" or any(obj.get("landmark_type") == "monument" for obj in landmark_objects):
322
+ landmark_activities = [
323
+ "historical sightseeing",
324
+ "educational tours",
325
+ "cultural appreciation",
326
+ "photography of historical architecture",
327
+ "learning about historical significance"
328
+ ]
329
+ else:
330
+ landmark_activities = [
331
+ "sightseeing",
332
+ "taking photographs",
333
+ "guided tours",
334
+ "cultural tourism",
335
+ "souvenir shopping"
336
  ]
337
 
338
+ # Add the activity description
339
+ if landmark_activities:
340
+ activities_text = "Common activities at this location include " + ", ".join(landmark_activities[:3]) + "."
341
+ description = self._smart_append(description, activities_text)
 
 
342
 
343
+ # Apply final formatting to the description
344
+ return self._format_final_description(description)
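For orientation, here is a minimal standalone sketch of the sentence-building step in _generate_landmark_description: it picks the highest-confidence landmark from a list of detection dictionaries and assembles the base sentence plus optional details, without the templates, viewpoint, or zone handling. The dictionary keys follow the fields referenced in the method; the sample values are invented for illustration.

# Standalone sketch only; the landmark data below is fabricated sample input.
landmarks = [
    {"class_name": "Eiffel Tower", "confidence": 0.92, "location": "Paris, France", "year_built": "1889"},
    {"class_name": "Louvre Pyramid", "confidence": 0.41, "location": "Paris, France"},
]
primary = max(landmarks, key=lambda x: x.get("confidence", 0))
base = (f"A tourist landmark scene centered around {primary['class_name']}, "
        f"an iconic structure in {primary['location']}.")
details = []
for lm in landmarks:
    parts = [f"built in {lm['year_built']}"] if "year_built" in lm else []
    if parts:
        details.append(f"{lm['class_name']} ({', '.join(parts)})")
description = base + (" The scene features " + ", ".join(details) + "." if details else "")
print(description)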
345
 
346
+ def filter_landmark_references(self, text, enable_landmark=True):
347
+ """
348
+ Dynamically filter landmark references out of a piece of text.
349
 
350
+ Args:
351
+ text: The text to filter
352
+ enable_landmark: Whether the landmark feature is enabled
353
 
354
+ Returns:
355
+ str: The filtered text
356
+ """
357
+ if enable_landmark or not text:
358
+ return text
359
+
360
+ try:
361
+ # Dynamically collect all landmark names and locations
362
+ landmark_names = []
363
+ locations = []
364
+
365
+ for landmark_id, info in ALL_LANDMARKS.items():
366
+ # Collect landmark names and their aliases
367
+ landmark_names.append(info["name"])
368
+ landmark_names.extend(info.get("aliases", []))
369
+
370
+ # Collect geographic locations
371
+ if "location" in info:
372
+ location = info["location"]
373
+ locations.append(location)
374
+
375
+ # Handle city and country names separately
376
+ parts = location.split(",")
377
+ if len(parts) >= 1:
378
+ locations.append(parts[0].strip())
379
+ if len(parts) >= 2:
380
+ locations.append(parts[1].strip())
381
+
382
+ # Use regular expressions to replace every landmark name
383
+ import re
384
+ for name in landmark_names:
385
+ if name and len(name) > 2: # skip names that are too short
386
+ text = re.sub(r'\b' + re.escape(name) + r'\b', "tall structure", text, flags=re.IGNORECASE)
387
+
388
+ # Replace all location references
389
+ for location in locations:
390
+ if location and len(location) > 2:
391
+ # Replace common location phrasing patterns
392
+ text = re.sub(r'in ' + re.escape(location), "in the urban area", text, flags=re.IGNORECASE)
393
+ text = re.sub(r'of ' + re.escape(location), "of the urban area", text, flags=re.IGNORECASE)
394
+ text = re.sub(r'\b' + re.escape(location) + r'\b', "the urban area", text, flags=re.IGNORECASE)
395
+
396
+ except ImportError:
397
+ # If the import fails, fall back to the generic patterns below
398
+ pass
399
+
400
+ # Generic landmark description pattern replacements
401
+ landmark_patterns = [
402
+ (r'a (tourist|popular|famous) landmark', r'an urban structure'),
403
+ (r'an iconic structure in ([A-Z][a-zA-Z\s,]+)', r'an urban structure in the area'),
404
+ (r'a famous (monument|tower|landmark) in ([A-Z][a-zA-Z\s,]+)', r'an urban structure in the area'),
405
+ (r'(centered|built|located|positioned) around the ([A-Z][a-zA-Z\s]+? (Tower|Monument|Landmark))', r'located in this area'),
406
+ (r'(sightseeing|guided tours|cultural tourism) (at|around|near) (this landmark|the [A-Z][a-zA-Z\s]+)', r'\1 in this area'),
407
+ (r'this (famous|iconic|historic|well-known) (landmark|monument|tower|structure)', r'this urban structure'),
408
+ (r'([A-Z][a-zA-Z\s]+) Tower', r'tall structure'),
409
+ (r'a (tower|structure) in ([A-Z][a-zA-Z\s,]+)', r'a \1 in the area'),
410
+ (r'landmark scene', r'urban scene'),
411
+ (r'tourist destination', r'urban area'),
412
+ (r'tourist attraction', r'urban area')
413
+ ]
414
+
415
+ for pattern, replacement in landmark_patterns:
416
+ text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
417
 
418
+ return text
 
 
419
 
 
 
 
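As a rough, self-contained illustration of the masking strategy used in filter_landmark_references, the sketch below applies the same kind of regular-expression replacements to a sample sentence with a couple of hard-coded names and locations; the real method iterates over ALL_LANDMARKS and the longer landmark_patterns list shown above.

import re

names = ["Eiffel Tower", "Tour Eiffel"]   # stand-ins for names/aliases from ALL_LANDMARKS
locations = ["Paris", "France"]           # stand-ins for collected locations
text = "A tourist landmark scene centered around Eiffel Tower, an iconic structure in Paris, France."

for name in names:
    if len(name) > 2:
        text = re.sub(r'\b' + re.escape(name) + r'\b', "tall structure", text, flags=re.IGNORECASE)
for loc in locations:
    if len(loc) > 2:
        text = re.sub(r'in ' + re.escape(loc), "in the urban area", text, flags=re.IGNORECASE)
        text = re.sub(r'\b' + re.escape(loc) + r'\b', "the urban area", text, flags=re.IGNORECASE)

print(text)  # the landmark name and the city/country references are replaced with generic wording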
420
 
421
+ def generate_description(self, scene_type: str, detected_objects: List[Dict], confidence: float,
422
+ lighting_info: Dict, functional_zones: List[str], enable_landmark: bool = True,
423
+ scene_scores: Optional[Dict] = None, spatial_analysis: Optional[Dict] = None,
424
+ image_dimensions: Optional[Dict] = None, places365_info: Optional[Dict] = None,
425
+ object_statistics: Optional[Dict] = None) -> str:
426
+ """
427
+ Generate enhanced scene description based on detection results, scene type,
428
+ and additional contextual information.
429
+ This version ensures that the main scene_details (from the first call)
430
+ is properly integrated and not overwritten by a simplified second call.
431
+ """
432
+ # Handle unknown scene type or very low confidence as an early exit
433
+ if scene_type == "unknown" or confidence < 0.4:
434
+ # _generate_generic_description should also ideally use image_dimensions if it does spatial reasoning
435
+ generic_desc = self._generate_generic_description(detected_objects, lighting_info)
436
+ return self._format_final_description(generic_desc)
437
+
438
+ # Filter out landmark objects if landmark detection is disabled for this run
439
+ current_detected_objects = detected_objects
440
+ if not enable_landmark:
441
+ current_detected_objects = [obj for obj in detected_objects if not obj.get("is_landmark", False)]
442
+
443
+ # Log Places365 context if available
444
+ places365_context = ""
445
+ if places365_info and places365_info.get('confidence', 0) > 0.3:
446
+ scene_label = places365_info.get('scene_label', '')
447
+ attributes = places365_info.get('attributes', [])
448
+ is_indoor = places365_info.get('is_indoor', None)
449
+
450
+ if scene_label:
451
+ places365_context = f"Scene context: {scene_label}"
452
+ if attributes:
453
+ places365_context += f" with characteristics: {', '.join(attributes[:3])}"
454
+ if is_indoor is not None:
455
+ indoor_outdoor = "indoor" if is_indoor else "outdoor"
456
+ places365_context += f" ({indoor_outdoor} environment)"
457
+
458
+ print(f"Enhanced description incorporating Places365 context: {places365_context}")
459
+
460
+ landmark_objects_in_scene = [obj for obj in current_detected_objects if obj.get("is_landmark", False)]
461
+ has_landmark_in_scene = len(landmark_objects_in_scene) > 0
462
+
463
+ # If landmark processing is enabled and it's a landmark scene or landmarks are detected
464
+ if enable_landmark and (scene_type in ["tourist_landmark", "natural_landmark", "historical_monument"] or has_landmark_in_scene):
465
+ landmark_desc = self._generate_landmark_description(
466
+ scene_type,
467
+ current_detected_objects, # Pass potentially filtered list
468
+ confidence,
469
+ lighting_info,
470
+ functional_zones,
471
+ landmark_objects_in_scene # Pass the explicitly filtered landmark objects
472
+ )
473
+ return self._format_final_description(landmark_desc)
474
+
475
+ # **[Start of main description construction for non-landmark or landmark-disabled everyday scenes]**
476
+
477
+ # Detect viewpoint based on current (potentially filtered) objects
478
+ viewpoint = self._detect_viewpoint(current_detected_objects)
479
+ current_scene_type = scene_type # Use a mutable variable for scene_type if it can change
480
+
481
+ # Process aerial viewpoint scene types (may re-assign current_scene_type)
482
+ if viewpoint == "aerial":
483
+ if "intersection" in current_scene_type.lower() or self._is_intersection(current_detected_objects): # Use lower for robustness
484
+ current_scene_type = "aerial_view_intersection"
485
+ elif any(keyword in current_scene_type.lower() for keyword in ["commercial", "shopping", "retail"]):
486
+ current_scene_type = "aerial_view_commercial_area"
487
+ elif any(keyword in current_scene_type.lower() for keyword in ["plaza", "square"]):
488
+ current_scene_type = "aerial_view_plaza"
489
+ else: # Default aerial if specific not matched
490
+ current_scene_type = "aerial_view_general" # Or use a specific default like aerial_view_intersection
491
+
492
+ # Detect cultural context (only for non-aerial viewpoints)
493
+ cultural_context = None
494
+ if viewpoint != "aerial":
495
+ cultural_context = self._detect_cultural_context(current_scene_type, current_detected_objects)
496
+
497
+ # Get base description for the (potentially updated) scene type
498
+ base_description = "A scene" # Default initialization
499
+ if viewpoint == "aerial":
500
+ # Check if current_scene_type (which might be an aerial type) has a base description
501
+ if current_scene_type in self.scene_types:
502
+ base_description = self.scene_types[current_scene_type].get("description", "An aerial view showing the layout and movement patterns from above")
503
+ else:
504
+ base_description = "An aerial view showing the layout and movement patterns from above"
505
+ elif current_scene_type in self.scene_types:
506
+ base_description = self.scene_types[current_scene_type].get("description", "A scene")
507
+
508
+ # Build the core scene details from the scene type, detected objects, spatial analysis, and image dimensions. This is where dynamic description or template filling happens.
509
+ core_scene_details = self._generate_scene_details(
510
+ current_scene_type, # Use the potentially updated scene_type
511
+ current_detected_objects,
512
+ lighting_info,
513
+ viewpoint,
514
+ spatial_analysis=spatial_analysis, # Pass this through
515
+ image_dimensions=image_dimensions, # Pass this through
516
+ places365_info=places365_info, # Pass Places365 info
517
+ object_statistics=object_statistics # Pass object statistics
518
+ )
519
+
520
+ # Start with the base description derived from SCENE_TYPES or a default.
521
+ description = base_description
522
+ if core_scene_details and core_scene_details.strip() != "": # Ensure core_scene_details is not empty
523
+ # If base_description is generic like "A scene", consider replacing it or appending smartly.
524
+ if base_description.lower() == "a scene" and len(core_scene_details) > len(base_description):
525
+ description = core_scene_details # Prioritize dynamic/template-filled details if base is too generic
526
+ else:
527
+ description = self._smart_append(description, core_scene_details)
528
+ elif not core_scene_details and not description: # If both are empty, use a generic fallback
529
+ description = self._generate_generic_description(current_detected_objects, lighting_info)
530
+
531
+
532
+ # Append secondary description from scene type template, if any
533
+ if current_scene_type in self.scene_types and "secondary_description" in self.scene_types[current_scene_type]:
534
+ secondary_desc = self.scene_types[current_scene_type]["secondary_description"]
535
+ if secondary_desc:
536
+ description = self._smart_append(description, secondary_desc)
537
+
538
+ # Append people count information
539
+ people_objs = [obj for obj in current_detected_objects if obj.get("class_id") == 0]
540
+ if people_objs:
541
+ people_count = len(people_objs)
542
+
543
+ if people_count == 1: people_phrase = "a single person"
544
+ elif people_count > 1 and people_count <= 3: people_phrase = f"{people_count} people" # Accurate for small counts
545
+ elif people_count > 3 and people_count <=7: people_phrase = "several people"
546
+ else: people_phrase = "multiple people" # For larger counts, or use "numerous"
547
+
548
+ # Only add if not already well covered in core_scene_details or base_description
549
+ if "person" not in description.lower() and "people" not in description.lower() and "pedestrian" not in description.lower():
550
+ description = self._smart_append(description, f"The scene includes {people_phrase}.")
551
+
552
+ # Append cultural context
553
+ if cultural_context and viewpoint != "aerial": # Already checked viewpoint
554
+ cultural_elements = self._generate_cultural_elements(cultural_context)
555
+ if cultural_elements:
556
+ description = self._smart_append(description, cultural_elements)
557
+
558
+ # Append lighting information
559
+ lighting_description_text = ""
560
+ if lighting_info and "time_of_day" in lighting_info:
561
+ lighting_type = lighting_info["time_of_day"]
562
+ lighting_desc_template = self.templates.get("lighting_templates", {}).get(lighting_type)
563
+ if lighting_desc_template:
564
+ lighting_description_text = lighting_desc_template
565
+ if lighting_description_text and lighting_description_text.lower() not in description.lower():
566
+ description = self._smart_append(description, lighting_description_text)
567
+
568
+ # Append viewpoint information (if not eye-level)
569
+ if viewpoint != "eye_level" and viewpoint in self.templates.get("viewpoint_templates", {}):
570
+ viewpoint_template = self.templates["viewpoint_templates"][viewpoint]
571
+ prefix = viewpoint_template.get('prefix', '')
572
+ observation_template = viewpoint_template.get("observation", "")
573
+
574
+ # Determine scene_elements for the observation template
575
+ scene_elements_for_vp = "the overall layout and objects" # Generic default
576
+ if viewpoint == "aerial":
577
+ scene_elements_for_vp = "crossing patterns and general layout"
578
+
579
+ viewpoint_observation_text = observation_template.format(scene_elements=scene_elements_for_vp)
580
+
581
+ # Combine prefix and observation carefully
582
+ full_viewpoint_text = ""
583
+ if prefix:
584
+ full_viewpoint_text = prefix.strip() + " "
585
+ if viewpoint_observation_text and viewpoint_observation_text[0].islower():
586
+ full_viewpoint_text += viewpoint_observation_text
587
+ elif viewpoint_observation_text:
588
+ full_viewpoint_text = prefix + viewpoint_observation_text[0].lower() + viewpoint_observation_text[1:] if description else prefix + viewpoint_observation_text
589
+
590
+ elif viewpoint_observation_text: # No prefix, but observation exists
591
+ full_viewpoint_text = viewpoint_observation_text[0].upper() + viewpoint_observation_text[1:]
592
+
593
+
594
+ if full_viewpoint_text and full_viewpoint_text.lower() not in description.lower():
595
+ description = self._smart_append(description, full_viewpoint_text)
596
+
597
+
598
+ # Append functional zones information
599
+ if functional_zones and len(functional_zones) > 0:
600
+ zones_desc_text = self._describe_functional_zones(functional_zones)
601
+ if zones_desc_text:
602
+ description = self._smart_append(description, zones_desc_text)
603
+
604
+ final_formatted_description = self._format_final_description(description)
605
+
606
+ if not enable_landmark:
607
+ final_formatted_description = self.filter_landmark_references(final_formatted_description, enable_landmark=False)
608
+
609
+ # If after all processing, description is empty, fallback to a very generic one.
610
+ if not final_formatted_description.strip() or final_formatted_description.strip() == ".":
611
+ self.logger.warning(f"Description for scene_type '{current_scene_type}' became empty after processing. Falling back.")
612
+ final_formatted_description = self._format_final_description(
613
+ self._generate_generic_description(current_detected_objects, lighting_info)
614
+ )
615
+
616
+ return final_formatted_description
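A hedged usage sketch for this entry point follows. The import path, the sample detections, and the lighting dictionary are assumptions made for illustration; the detection dictionaries use only keys this file already relies on (class_id, class_name, confidence, region, normalized_area, normalized_center).

# Hypothetical call; module path and sample data are assumed, not taken from the repository docs.
from enhanced_scene_describer import EnhancedSceneDescriber

describer = EnhancedSceneDescriber()
detections = [
    {"class_id": 0, "class_name": "person", "confidence": 0.81,
     "region": "middle_center", "normalized_area": 0.04, "normalized_center": (0.5, 0.6)},
    {"class_id": 56, "class_name": "chair", "confidence": 0.66,
     "region": "bottom_left", "normalized_area": 0.07, "normalized_center": (0.2, 0.8)},
]
text = describer.generate_description(
    scene_type="general_indoor_space",
    detected_objects=detections,
    confidence=0.55,
    lighting_info={"time_of_day": "indoor", "is_indoor": True},
    functional_zones={},
    enable_landmark=False,
)
print(text)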
617
+
618
 
619
  def _smart_append(self, current_text: str, new_fragment: str) -> str:
620
  """
 
648
  (new_fragment.startswith("A ") or new_fragment.startswith("An ")):
649
  return current_text + ". " + new_fragment
650
 
651
+ # Check whether the new fragment contains a landmark name (usually a proper noun)
652
+ has_landmark_name = any(word[0].isupper() for word in new_fragment.split()
653
+ if len(word) > 2 and not word.startswith(("A ", "An ", "The ")))
654
+
655
  # Decide how to join the texts
656
  if ends_with_sentence:
657
  # After a sentence, start with uppercase and add proper spacing
658
  joined_text = current_text + " " + (new_fragment[0].upper() + new_fragment[1:])
659
  elif ends_with_comma:
660
  # After a comma, maintain flow with lowercase unless it's a proper noun or special case
661
+ if new_fragment.startswith(('I ', 'I\'', 'A ', 'An ', 'The ')) or new_fragment[0].isupper() or has_landmark_name:
662
  joined_text = current_text + " " + new_fragment
663
  else:
664
  joined_text = current_text + " " + new_fragment[0].lower() + new_fragment[1:]
 
668
  else:
669
  # For other cases, decide based on the content
670
  if self._is_related_phrases(current_text, new_fragment):
671
+ if new_fragment.startswith(('I ', 'I\'', 'A ', 'An ', 'The ')) or new_fragment[0].isupper() or has_landmark_name:
672
  joined_text = current_text + ", " + new_fragment
673
  else:
674
  joined_text = current_text + ", " + new_fragment[0].lower() + new_fragment[1:]
 
717
 
718
  return False
719
 
720
+
721
  def _format_final_description(self, text: str) -> str:
722
  """
723
  Format the final description text to ensure correct punctuation,
724
  capitalization, and spacing.
 
 
 
 
 
 
725
  """
726
+ if not text or not text.strip(): # Also check if text is just whitespace
 
 
727
  return ""
728
 
729
+ # Trim leading/trailing whitespace first
730
+ text = text.strip()
 
731
 
732
+ # 1. Handle consecutive "A/An" segments (potentially split them into sentences)
733
+ text = re.sub(r'(A\s+[^.!?]+?[\w\.])\s+(A\s+)', r'\1. \2', text, flags=re.IGNORECASE)
734
+ text = re.sub(r'(An\s+[^.!?]+?[\w\.])\s+(An?\s+)', r'\1. \2', text, flags=re.IGNORECASE)
735
 
736
+ # 2. Ensure first character of the entire text is uppercase
737
+ if text:
738
+ text = text[0].upper() + text[1:]
739
 
740
+ # 3. Normalize whitespace: multiple spaces to one
741
+ text = re.sub(r'\s{2,}', ' ', text)
 
 
742
 
743
+ # 4. Capitalize after sentence-ending punctuation (. ! ?)
744
+ def capitalize_after_punctuation(match):
745
+ return match.group(1) + match.group(2).upper()
746
+ text = re.sub(r'([.!?]\s+)([a-z])', capitalize_after_punctuation, text)
747
 
748
+ # 5. Handle capitalization after commas (your existing robust logic is good)
749
  def fix_capitalization_after_comma(match):
750
+ leading_comma_space = match.group(1) # (,\s+)
751
+ word_after_comma = match.group(2) # ([A-Z][a-zA-Z]*)
752
+
753
+ proper_nouns_exceptions = ["I", "I'm", "I've", "I'd", "I'll",
754
+ "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday",
755
+ "January", "February", "March", "April", "May", "June", "July",
756
+ "August", "September", "October", "November", "December"]
757
+
758
+ if word_after_comma in proper_nouns_exceptions:
759
+ return match.group(0)
760
+ # If the word looks like a proper noun (e.g., multi-word capitalized, or a known location/brand)
761
+ # This heuristic can be tricky. For simplicity, if it's already capitalized and not a common word, keep it.
762
+ if len(word_after_comma) > 2 and word_after_comma[0].isupper() and word_after_comma.lower() not in ["this", "that", "these", "those", "they", "their", "then", "thus"]:
763
+ return match.group(0) # Keep it if it looks like a proper noun already
764
+
765
+ return leading_comma_space + word_after_comma[0].lower() + word_after_comma[1:]
766
+ text = re.sub(r'(,\s+)([A-Z][a-zA-Z\'\-]+)', fix_capitalization_after_comma, text) # Added hyphen and apostrophe to word
767
+
768
+ # 6. Correct spacing around punctuation
769
+ text = re.sub(r'\s*([.,;:!?])\s*', r'\1 ', text) # Ensures one space AFTER punctuation, none before
770
+ text = text.replace(' .', '.').replace(' ,', ',') # Clean up potential space before period/comma from previous rule
771
+
772
+ # 7. Consolidate multiple sentence-ending punctuations (e.g., "!!", "?.", ".?")
773
+ text = re.sub(r'[.!?]{2,}', '.', text) # Convert multiple to a single period
774
+ text = re.sub(r',+', ',', text) # Multiple commas to one
775
+
776
+ # 8. Ensure text ends with a single sentence-ending punctuation mark
777
+ text = text.strip() # Remove trailing whitespace before checking last char
778
+ if text and not text[-1] in '.!?':
779
+ text += '.'
780
 
781
+ # 9. Remove any leading punctuation or extra spaces that might have been introduced
782
+ text = re.sub(r'^[.,;:!?\s]+', '', text)
 
 
 
783
 
784
+ # 10. Final check for first letter capitalization
785
+ if text:
786
+ text = text[0].upper() + text[1:]
787
 
788
+ # 11. Remove space before final punctuation mark if accidentally added by rule 7
789
+ text = re.sub(r'\s+([.!?])$', r'\1', text)
 
790
 
791
+ return text.strip() # Final strip
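To make the effect of these rules concrete, here is a small standalone trace using only the regular expressions shown above (collapse spaces, capitalize after sentence punctuation, normalize punctuation spacing, deduplicate periods, and guarantee a trailing period); the input string is invented.

import re

text = "a dining area with chairs ,  a table..  there are two people".strip()
text = text[0].upper() + text[1:]
text = re.sub(r'\s{2,}', ' ', text)                               # collapse runs of spaces
text = re.sub(r'([.!?]\s+)([a-z])', lambda m: m.group(1) + m.group(2).upper(), text)
text = re.sub(r'\s*([.,;:!?])\s*', r'\1 ', text)                  # one space after punctuation, none before
text = text.replace(' .', '.').replace(' ,', ',')
text = re.sub(r'[.!?]{2,}', '.', text)                            # ".." -> "."
text = text.strip()
if text and text[-1] not in '.!?':
    text += '.'
print(text)  # "A dining area with chairs, a table. There are two people."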
 
 
 
 
792
 
793
  def _is_intersection(self, detected_objects: List[Dict]) -> bool:
794
  """
 
870
 
871
  return base_desc
872
 
873
+ def _get_prominent_objects(self, detected_objects: List[Dict], min_prominence_score: float = 0.1, max_categories_to_return: int = 5, max_total_objects: int = 7) -> List[Dict]:
874
+ """
875
+ Helper function to get the most prominent objects.
876
+ Prioritizes high-confidence, large objects, and ensures a diversity of object types.
877
+
878
+ Args:
879
+ detected_objects: List of detected objects.
880
+ min_prominence_score: Minimum score for an object to be considered initially.
881
+ max_categories_to_return: Max number of different object categories to prioritize.
882
+ max_total_objects: Overall cap on the number of prominent objects returned.
883
+
884
+ Returns:
885
+ List of prominent detected objects.
886
+ """
887
+ if not detected_objects:
888
+ return []
889
+
890
+ scored_objects = []
891
+ for obj in detected_objects:
892
+ area = obj.get("normalized_area", 0.0) + 1e-6
893
+ confidence = obj.get("confidence", 0.0)
894
+
895
+ # Base score: area and confidence are key
896
+ score = (area * 0.65) + (confidence * 0.35) # Slightly more weight to area
897
+
898
+ # Bonus for generally important object classes (in a generic way)
899
+ # This is a simple heuristic. More advanced would be context-dependent.
900
+ # For example, 'person' is often more salient.
901
+ # Avoid hardcoding specific class_ids here if possible, or use broad categories if available.
902
+ # For simplicity, we'll keep the landmark bonus for now.
903
+ if obj.get("class_name") == "person": # Example: person is generally prominent
904
+ score += 0.1
905
+ if obj.get("is_landmark"): # Landmarks are always prominent
906
+ score += 0.5
907
+
908
+ if score >= min_prominence_score:
909
+ scored_objects.append((obj, score))
910
+
911
+ if not scored_objects:
912
+ return []
913
+
914
+ # Sort by score in descending order
915
+ scored_objects.sort(key=lambda x: x[1], reverse=True)
916
+
917
+ # Prioritize diversity of object categories first
918
+ prominent_by_category = {}
919
+ final_prominent_objects = []
920
+
921
+ for obj, score in scored_objects:
922
+ category = obj.get("class_name", "unknown")
923
+ if category not in prominent_by_category:
924
+ if len(prominent_by_category) < max_categories_to_return:
925
+ prominent_by_category[category] = obj
926
+ final_prominent_objects.append(obj)
927
+
928
+ elif len(final_prominent_objects) < max_total_objects and obj not in final_prominent_objects:
929
+ if score > 0.3:
930
+ final_prominent_objects.append(obj)
931
+
932
+ # If still under max_total_objects, fill with highest scored remaining objects regardless of category
933
+ if len(final_prominent_objects) < max_total_objects:
934
+ for obj, score in scored_objects:
935
+ if len(final_prominent_objects) >= max_total_objects:
936
+ break
937
+ if obj not in final_prominent_objects:
938
+ final_prominent_objects.append(obj)
939
+
940
+ # Re-sort the final list by original prominence score to maintain order
941
+ final_prominent_objects_with_scores = []
942
+ for obj in final_prominent_objects:
943
+ for original_obj, original_score in scored_objects:
944
+ if obj is original_obj: # Check for object identity
945
+ final_prominent_objects_with_scores.append((obj, original_score))
946
+ break
947
+
948
+ final_prominent_objects_with_scores.sort(key=lambda x: x[1], reverse=True)
949
+
950
+ return [obj for obj, score in final_prominent_objects_with_scores[:max_total_objects]]
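The prominence score used above can be exercised on its own. The sketch below ranks a few fabricated detections with the same area/confidence mix plus the person and landmark bonuses, which is the core of the selection before the category-diversity pass.

detections = [
    {"class_name": "person", "normalized_area": 0.05, "confidence": 0.90},
    {"class_name": "car", "normalized_area": 0.20, "confidence": 0.60},
    {"class_name": "Eiffel Tower", "normalized_area": 0.30, "confidence": 0.70, "is_landmark": True},
]

def prominence(obj: dict) -> float:
    score = (obj.get("normalized_area", 0.0) + 1e-6) * 0.65 + obj.get("confidence", 0.0) * 0.35
    if obj.get("class_name") == "person":
        score += 0.1
    if obj.get("is_landmark"):
        score += 0.5
    return score

for obj in sorted(detections, key=prominence, reverse=True):
    print(f"{obj['class_name']}: {prominence(obj):.3f}")
# Landmark first (0.5 bonus), then the person (class bonus), then the larger but less salient car.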
951
+
952
+
953
+ def _format_object_list_for_description(self,
954
+ objects: List[Dict],
955
+ use_indefinite_article_for_one: bool = False,
956
+ count_threshold_for_generalization: int = -1, # Default to -1 for precise counts
957
+ max_types_to_list: int = 5
958
+ ) -> str:
959
+ """
960
+ Formats a list of detected objects into a human-readable string with counts.
961
+ Args:
962
+ objects: List of object dictionaries, each expected to have 'class_name'.
963
+ use_indefinite_article_for_one: If True, uses "a/an" for single items. If False, uses "one".
964
+ count_threshold_for_generalization: If count exceeds this, use general terms. -1 means precise counts.
965
+ max_types_to_list: Maximum number of different object types to include in the list.
966
+ """
967
+ if not objects:
968
+ return "no specific objects clearly identified"
969
+
970
+ counts: Dict[str, int] = {}
971
+ for obj in objects:
972
+ name = obj.get("class_name", "unknown object")
973
+ if name == "unknown object" or not name: # Skip unknown or empty names
974
+ continue
975
+ counts[name] = counts.get(name, 0) + 1
976
+
977
+ if not counts:
978
+ return "no specific objects clearly identified"
979
+
980
+ descriptions = []
981
+ # Sort by count (desc) then name (asc) for consistent output order
982
+ # Limit the number of distinct object types being listed
983
+ sorted_counts = sorted(counts.items(), key=lambda item: (-item[1], item[0]))[:max_types_to_list]
984
+
985
+
986
+ for name, count in sorted_counts:
987
+ if count == 1:
988
+ if use_indefinite_article_for_one:
989
+ if name[0].lower() in 'aeiou':
990
+ descriptions.append(f"an {name}")
991
+ else:
992
+ descriptions.append(f"a {name}")
993
+ else:
994
+ descriptions.append(f"one {name}") # Output "one car" instead of "a car"
995
+ else: # count > 1
996
+ plural_name = name
997
+ if name.endswith("y") and not name.lower().endswith(("ay", "ey", "iy", "oy", "uy")):
998
+ plural_name = name[:-1] + "ies"
999
+ elif name.endswith(("s", "sh", "ch", "x", "z")):
1000
+ plural_name = name + "es"
1001
+ elif not name.endswith("s"): # Avoid double 's' like "buss"
1002
+ plural_name = name + "s"
1003
+
1004
+ if count_threshold_for_generalization != -1 and count > count_threshold_for_generalization:
1005
+ if count <= count_threshold_for_generalization + 3:
1006
+ descriptions.append(f"several {plural_name}")
1007
+ else:
1008
+ descriptions.append(f"many {plural_name}")
1009
+ else: # Use exact count (e.g., "6 cars")
1010
+ descriptions.append(f"{count} {plural_name}")
1011
+
1012
+ if not descriptions:
1013
+ return "no specific objects clearly identified"
1014
+
1015
+ if len(descriptions) == 1:
1016
+ return descriptions[0]
1017
+ elif len(descriptions) == 2:
1018
+ return f"{descriptions[0]} and {descriptions[1]}"
1019
+ else:
1020
+ # Oxford comma for lists of 3 or more.
1021
+ return ", ".join(descriptions[:-1]) + f", and {descriptions[-1]}"
1022
+
1023
+ def _get_spatial_description(self, obj: Dict, image_width: Optional[int] = None, image_height: Optional[int] = None) -> str:
1024
+ """
1025
+ Generates a brief spatial description for an object.
1026
+ (This is a new helper function)
1027
+ """
1028
+ region = obj.get("region")
1029
+ if region:
1030
+ # Convert region name to more descriptive terms
1031
+ region_map = {
1032
+ "top_left": "in the top-left", "top_center": "at the top-center", "top_right": "in the top-right",
1033
+ "middle_left": "on the middle-left side", "middle_center": "in the center", "middle_right": "on the middle-right side",
1034
+ "bottom_left": "in the bottom-left", "bottom_center": "at the bottom-center", "bottom_right": "in the bottom-right"
1035
+ }
1036
+ # More general terms if exact region is not critical
1037
+ if "top" in region: general_v_pos = "towards the top"
1038
+ elif "bottom" in region: general_v_pos = "towards the bottom"
1039
+ else: general_v_pos = "in the middle vertically"
1040
+
1041
+ if "left" in region: general_h_pos = "towards the left"
1042
+ elif "right" in region: general_h_pos = "towards the right"
1043
+ else: general_h_pos = "in the center horizontally"
1044
+
1045
+ # Prioritize specific region if available, else use general
1046
+ specific_desc = region_map.get(region, "")
1047
+ if specific_desc:
1048
+ return f"{specific_desc} of the frame"
1049
+ else:
1050
+ return f"{general_v_pos} and {general_h_pos} of the frame"
1051
+
1052
+ # Fallback if region info is not detailed enough or missing
1053
+ # We can use normalized_center if available
1054
+ norm_center = obj.get("normalized_center")
1055
+ if norm_center and image_width and image_height: # Check if image_width/height are provided
1056
+ x_norm, y_norm = norm_center
1057
+ h_pos = "left" if x_norm < 0.4 else "right" if x_norm > 0.6 else "center"
1058
+ v_pos = "top" if y_norm < 0.4 else "bottom" if y_norm > 0.6 else "middle"
1059
+
1060
+ if h_pos == "center" and v_pos == "middle":
1061
+ return "near the center of the image"
1062
+ return f"in the {v_pos}-{h_pos} area of the image"
1063
+
1064
+ return "in the scene" # Generic fallback
1065
+
1066
+
1067
+ def _generate_dynamic_everyday_description(self,
1068
+ detected_objects: List[Dict],
1069
+ lighting_info: Optional[Dict] = None,
1070
+ viewpoint: str = "eye_level",
1071
+ spatial_analysis: Optional[Dict] = None,
1072
+ image_dimensions: Optional[Tuple[int, int]] = None,
1073
+ places365_info: Optional[Dict] = None,
1074
+ object_statistics: Optional[Dict] = None
1075
+ ) -> str:
1076
+ """
1077
+ Dynamically generates a description for everyday scenes based on ALL relevant detected_objects,
1078
+ their counts, and context.
1079
+ It aims to describe the overall scene first, then details of object groups including accurate counts.
1080
+ """
1081
+ description_segments = []
1082
+ image_width, image_height = image_dimensions if image_dimensions else (None, None)
1083
+
1084
+ if hasattr(self, 'logger'):
1085
+ self.logger.info(f"DynamicDesc: Start. Total Raw Objects: {len(detected_objects)}, View: {viewpoint}, Light: {lighting_info is not None}")
1086
+
1087
+ # 1. Overall Ambiance (Lighting and Viewpoint)
1088
+ ambiance_parts = []
1089
+ if lighting_info:
1090
+ time_of_day = lighting_info.get("time_of_day", "unknown lighting")
1091
+ is_indoor = lighting_info.get("is_indoor")
1092
+ ambiance_statement = "This is"
1093
+ if is_indoor is True: ambiance_statement += " an indoor scene"
1094
+ elif is_indoor is False: ambiance_statement += " an outdoor scene"
1095
+ else: ambiance_statement += " a scene"
1096
+ lighting_map = self.templates.get("lighting_templates", {})
1097
+ readable_lighting_base = lighting_map.get(time_of_day, f"with {time_of_day.replace('_', ' ')} lighting conditions")
1098
+ readable_lighting = readable_lighting_base.lower().replace("the scene is captured", "").replace("the scene has", "").strip()
1099
+ ambiance_statement += f", likely {readable_lighting}."
1100
+ ambiance_parts.append(ambiance_statement)
1101
+
1102
+ if viewpoint and viewpoint != "eye_level":
1103
+ vp_templates = self.templates.get("viewpoint_templates", {})
1104
+ if viewpoint in vp_templates:
1105
+ vp_prefix = vp_templates[viewpoint].get("prefix", "").strip()
1106
+ if vp_prefix:
1107
+ if not ambiance_parts:
1108
+ ambiance_parts.append(f"{vp_prefix.capitalize()} the general layout of the scene is observed.")
1109
+ else:
1110
+ ambiance_parts[-1] = ambiance_parts[-1].rstrip('.') + f", viewed {vp_templates[viewpoint].get('short_desc', viewpoint)}."
1111
+
1112
+ if ambiance_parts:
1113
+ description_segments.append(" ".join(ambiance_parts))
1114
+
1115
+ # 2. Describe ALL detected objects, grouped by class, with accurate counts and locations
1116
+ if not detected_objects:
1117
+ # This part remains, but the conditions to reach here might change based on confident_objects check
1118
+ if not description_segments:
1119
+ description_segments.append("A general scene is visible, but no specific objects were clearly identified.")
1120
+ else:
1121
+ description_segments.append("Within this setting, no specific objects were clearly identified.")
1122
+ else:
1123
+ objects_by_class: Dict[str, List[Dict]] = {}
1124
+
1125
+ # keeping 0.25 as a placeholder
1126
+ confidence_filter_threshold = getattr(self, 'confidence_threshold_for_description', 0.25)
1127
+ confident_objects = [obj for obj in detected_objects if obj.get("confidence", 0) >= confidence_filter_threshold]
1128
+
1129
+ if not confident_objects:
1130
+ # This message is more appropriate if objects existed but none met confidence
1131
+ no_confident_obj_msg = "While some elements might be present, no objects were identified with sufficient confidence for a detailed description."
1132
+ if not description_segments: description_segments.append(no_confident_obj_msg)
1133
+ else: description_segments.append(no_confident_obj_msg.lower().capitalize()) # Append as a new sentence
1134
+ else:
1135
+ if object_statistics:
1136
+ # 使用預計算的統計信息,並採用動態置信度策略
1137
+ for class_name, stats in object_statistics.items():
1138
+ count = stats.get("count", 0)
1139
+ avg_confidence = stats.get("avg_confidence", 0)
1140
+
1141
+ # 動態調整置信度閾值:裝飾性物品使用較低閾值
1142
+ dynamic_threshold = confidence_filter_threshold
1143
+ if class_name in ["potted plant", "vase", "clock", "book"]:
1144
+ dynamic_threshold = max(0.15, confidence_filter_threshold * 0.6)
1145
+ elif count >= 3: # 數量多的物品降低閾值
1146
+ dynamic_threshold = max(0.2, confidence_filter_threshold * 0.8)
1147
+
1148
+ if count > 0 and avg_confidence >= dynamic_threshold:
1149
+ matching_objects = [obj for obj in confident_objects if obj.get("class_name") == class_name]
1150
+ if not matching_objects:
1151
+ # 如果高信心度的物體中沒有,從原始列表中尋找
1152
+ matching_objects = [obj for obj in detected_objects
1153
+ if obj.get("class_name") == class_name and obj.get("confidence", 0) >= dynamic_threshold]
1154
+
1155
+ if matching_objects:
1156
+ actual_count = min(stats["count"], len(matching_objects))
1157
+ objects_by_class[class_name] = matching_objects[:actual_count]
1158
+ else:
1159
+ # 回退邏輯同樣使用動態閾值
1160
+ for obj in confident_objects:
1161
+ name = obj.get("class_name", "unknown object")
1162
+ if name == "unknown object" or not name: continue
1163
+ if name not in objects_by_class:
1164
+ objects_by_class[name] = []
1165
+ objects_by_class[name].append(obj)
1166
+
1167
+ if not objects_by_class: # Should be rare if confident_objects was not empty and had valid names
1168
+ description_segments.append("No common objects were confidently identified for detailed description.")
1169
+ else:
1170
+ def sort_key_object_groups(item_tuple: Tuple[str, List[Dict]]):
1171
+ class_name_key, obj_group_list = item_tuple
1172
+ priority = 3 # 預設優先級
1173
+ count = len(obj_group_list)
1174
+
1175
+ # Dynamic priority based on scene relevance and quantity
1176
+ if class_name_key == "person":
1177
+ priority = 0
1178
+ elif class_name_key in ["dining table", "chair", "sofa", "bed"]:
1179
+ priority = 1 # 室內主要家具
1180
+ elif class_name_key in ["car", "bus", "truck", "traffic light"]:
1181
+ priority = 2 # 交通相關物體
1182
+ elif count >= 3: # 數量多的物體提升優先級
1183
+ priority = max(1, priority - 1)
1184
+ elif class_name_key in ["potted plant", "vase", "clock", "book"] and count >= 2:
1185
+ priority = 2 # 裝飾性物品有一定數量時提升優先級
1186
+
1187
+ avg_area = sum(o.get("normalized_area", 0.0) for o in obj_group_list) / len(obj_group_list) if obj_group_list else 0
1188
+
1189
+ # Add a quantity weight: several objects of the same class matter more
1190
+ quantity_bonus = min(count / 5.0, 1.0) # 最多1.0的加成
1191
+
1192
+ return (priority, -len(obj_group_list), -avg_area, -quantity_bonus)
1193
+
1194
+ # Remove duplicate detections
1195
+ deduplicated_objects_by_class = {}
1196
+ processed_positions = []
1197
+
1198
+ for class_name, group_of_objects in objects_by_class.items():
1199
+ unique_objects = []
1200
+
1201
+ for obj in group_of_objects:
1202
+ obj_position = obj.get("normalized_center", [0.5, 0.5])
1203
+ is_duplicate = False
1204
+
1205
+ # Check whether this object overlaps a position that has already been processed
1206
+ for processed_pos in processed_positions:
1207
+ position_distance = abs(obj_position[0] - processed_pos[0]) + abs(obj_position[1] - processed_pos[1])
1208
+ if position_distance < 0.15: # 位置重疊閾值
1209
+ is_duplicate = True
1210
+ break
1211
+
1212
+ if not is_duplicate:
1213
+ unique_objects.append(obj)
1214
+ processed_positions.append(obj_position)
1215
+
1216
+ if unique_objects:
1217
+ deduplicated_objects_by_class[class_name] = unique_objects
1218
+
1219
+ objects_by_class = deduplicated_objects_by_class
1220
+
1221
+ sorted_object_groups = sorted(objects_by_class.items(), key=sort_key_object_groups)
1222
+
1223
+ object_clauses = [] # Stores individual object group descriptions
1224
+
1225
+ for class_name, group_of_objects in sorted_object_groups:
1226
+ count = len(group_of_objects)
1227
+ if count == 0: continue
1228
+
1229
+ # Use the statistics to keep the count description accurate
1230
+ if object_statistics and class_name in object_statistics:
1231
+ actual_count = object_statistics[class_name]["count"]
1232
+ # Generate the description from the actual counted quantity
1233
+ if actual_count == 1:
1234
+ formatted_name_with_exact_count = f"one {class_name}"
1235
+ else:
1236
+ plural_form = f"{class_name}s" if not class_name.endswith('s') else class_name
1237
+ formatted_name_with_exact_count = f"{actual_count} {plural_form}"
1238
+ else:
1239
+ # 回退到原有的格式化邏輯
1240
+ formatted_name_with_exact_count = self._format_object_list_for_description(
1241
+ [group_of_objects[0]] * count,
1242
+ use_indefinite_article_for_one=False,
1243
+ count_threshold_for_generalization=-1
1244
+ )
1245
+
1246
+ if formatted_name_with_exact_count == "no specific objects clearly identified" or not formatted_name_with_exact_count:
1247
+ continue
1248
+
1249
+ # Determine collective location for the group
1250
+ location_description_suffix = "" # e.g., "is in the center" or "are in the west area"
1251
+ if count == 1:
1252
+ location_description_suffix = f"is {self._get_spatial_description(group_of_objects[0], image_width, image_height)}"
1253
+ else:
1254
+ distinct_regions = sorted(list(set(obj.get("region", "unknown_region") for obj in group_of_objects)))
1255
+ known_regions = [r for r in distinct_regions if r != "unknown_region"]
1256
+ if not known_regions and "unknown_region" in distinct_regions:
1257
+ location_description_suffix = "are visible in the scene"
1258
+ elif len(known_regions) == 1:
1259
+ location_description_suffix = f"are primarily in the {known_regions[0].replace('_', ' ')} area"
1260
+ elif len(known_regions) == 2:
1261
+ location_description_suffix = f"are mainly across the {known_regions[0].replace('_',' ')} and {known_regions[1].replace('_',' ')} areas"
1262
+ elif len(known_regions) > 2:
1263
+ location_description_suffix = "are distributed in various parts of the scene"
1264
+ else:
1265
+ location_description_suffix = "are visible in the scene"
1266
+
1267
+ # Capitalize the object description (e.g., "Six cars")
1268
+ formatted_name_capitalized = formatted_name_with_exact_count[0].upper() + formatted_name_with_exact_count[1:]
1269
+ object_clauses.append(f"{formatted_name_capitalized} {location_description_suffix}")
1270
+
1271
+ if object_clauses:
1272
+ # Join object clauses into one or more sentences.
1273
+ if not description_segments: # If no ambiance, start with the first object clause.
1274
+ if object_clauses:
1275
+ first_clause = object_clauses.pop(0) # Take the first one out
1276
+ description_segments.append(first_clause + ".")
1277
+ else: # Ambiance exists, prepend with "The scene features..." or similar
1278
+ if object_clauses:
1279
+ description_segments.append("The scene features:") # Or "Key elements include:"
1280
+
1281
+ # Add remaining object clauses as separate points or a continuous sentence
1282
+ # For now, let's join them into a single continuous sentence string to be added.
1283
+ if object_clauses: # If there are more clauses after the first (or after "The scene features:")
1284
+ joined_object_clauses = ". ".join(object_clauses)
1285
+ if joined_object_clauses and not joined_object_clauses.endswith("."):
1286
+ joined_object_clauses += "."
1287
+ description_segments.append(joined_object_clauses)
1288
+
1289
+ elif not description_segments : # No ambiance and no describable objects after filtering
1290
+ return "The image depicts a scene, but specific objects could not be described with confidence or detail."
1291
+
1292
+ # --- Final assembly and formatting ---
1293
+ # Join all collected segments. _smart_append might be better if parts are not full sentences.
1294
+ # Since we aim for full sentences in segments, simple join then format.
1295
+ raw_description = ""
1296
+ for i, segment in enumerate(filter(None, description_segments)):
1297
+ segment = segment.strip()
1298
+ if not segment: continue
1299
+
1300
+ if not raw_description: # First non-empty segment
1301
+ raw_description = segment
1302
+ else:
1303
+ if not raw_description.endswith(('.', '!', '?')):
1304
+ raw_description += "."
1305
+ raw_description += " " + (segment[0].upper() + segment[1:] if len(segment) > 1 else segment.upper())
1306
+
1307
+ if raw_description and not raw_description.endswith(('.', '!', '?')):
1308
+ raw_description += "."
1309
+
1310
+ final_description = self._format_final_description(raw_description) # Crucial for final polish
1311
+
1312
+ if not final_description or len(final_description.strip()) < 20:
1313
+ # Fallback if description is too short or empty after processing
1314
+ # Use a more informative fallback if confident_objects existed
1315
+ if 'confident_objects' in locals() and confident_objects:
1316
+ return "The scene contains several detected objects, but a detailed textual description could not be fully constructed."
1317
+ else:
1318
+ return "A general scene is depicted with no objects identified with high confidence."
1319
+
1320
+ return final_description
1321
+
1322
+
1323
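The deduplication above can be exercised on its own. A minimal standalone sketch, assuming detections are dicts carrying a normalized_center in the 0-1 range; the helper name and sample data are hypothetical, and the 0.15 Manhattan-distance threshold mirrors the value used in the code:

def dedup_by_position(objects, threshold=0.15):
    kept, seen = [], []
    for obj in objects:
        cx, cy = obj.get("normalized_center", (0.5, 0.5))
        if any(abs(cx - px) + abs(cy - py) < threshold for px, py in seen):
            continue  # too close to an already-kept object, treat as a duplicate detection
        kept.append(obj)
        seen.append((cx, cy))
    return kept

sample = [
    {"class_name": "car", "normalized_center": (0.20, 0.50)},
    {"class_name": "car", "normalized_center": (0.22, 0.52)},  # near-duplicate of the first
    {"class_name": "car", "normalized_center": (0.80, 0.50)},
]
print(len(dedup_by_position(sample)))  # 2

Note that the code above shares the processed-positions list across classes, so an object of one class can also suppress a nearby object of another class.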
  def _generate_scene_details(self,
1324
+ scene_type: str,
1325
+ detected_objects: List[Dict],
1326
+ lighting_info: Optional[Dict] = None,
1327
+ viewpoint: str = "eye_level",
1328
+ spatial_analysis: Optional[Dict] = None,
1329
+ image_dimensions: Optional[Tuple[int, int]] = None,
1330
+ places365_info: Optional[Dict] = None,
1331
+ object_statistics: Optional[Dict] = None
1332
+ ) -> str:
1333
  """
1334
  Generate detailed description based on scene type and detected objects.
1335
+ Enhanced to handle everyday scenes dynamically with accurate object counting.
1336
 
1337
  Args:
1338
+ scene_type: Identified scene type.
1339
+ detected_objects: List of detected objects.
1340
+ lighting_info: Optional lighting condition information.
1341
+ viewpoint: Detected viewpoint (aerial, eye_level, etc.).
1342
+ spatial_analysis: Optional results from SpatialAnalyzer.
1343
+ image_dimensions: Optional tuple of (image_width, image_height).
1344
+ places365_info: Optional Places365 scene classification results.
1345
+ object_statistics: Optional detailed object statistics with counts and confidence.
1346
 
1347
  Returns:
1348
+ str: Detailed scene description.
1349
  """
 
1350
  scene_details = ""
1351
  scene_templates = self.templates.get("scene_detail_templates", {})
1352
 
1353
+ # List of scene types considered "everyday" or generic
1354
+ everyday_scene_types = [
1355
+ "general_indoor_space", "generic_street_view",
1356
+ "desk_area_workspace", "outdoor_gathering_spot",
1357
+ "kitchen_counter_or_utility_area", "unknown"
1358
+ ]
1359
 
1360
+ # Extract Places365 attributes for enhanced description
1361
+ places365_attributes = []
1362
+ scene_specific_details = ""
1363
+
1364
+ if places365_info and places365_info.get('confidence', 0) > 0.4:
1365
+ attributes = places365_info.get('attributes', [])
1366
+ scene_label = places365_info.get('scene_label', '')
1367
+
1368
+ # Filter relevant attributes for description enhancement
1369
+ relevant_attributes = [attr for attr in attributes if attr in [
1370
+ 'natural_lighting', 'artificial_lighting', 'commercial', 'residential',
1371
+ 'workplace', 'recreational', 'educational', 'open_space', 'enclosed_space'
1372
+ ]]
1373
+ places365_attributes = relevant_attributes[:2]
1374
+
1375
+ # Generate scene-specific contextual details using object statistics
1376
+ if object_statistics:
1377
+ if 'commercial' in attributes and object_statistics.get('person', {}).get('count', 0) > 0:
1378
+ person_count = object_statistics['person']['count']
1379
+ if person_count == 1:
1380
+ scene_specific_details = "This appears to be an active commercial environment with a customer present."
1381
+ else:
1382
+ scene_specific_details = f"This appears to be an active commercial environment with {person_count} people present."
1383
+ elif 'residential' in attributes and scene_type in ['living_room', 'bedroom', 'kitchen']:
1384
+ scene_specific_details = "The setting suggests a comfortable residential living space."
1385
+ elif 'workplace' in attributes and any(object_statistics.get(obj, {}).get('count', 0) > 0
1386
+ for obj in ['laptop', 'keyboard', 'monitor']):
1387
+ scene_specific_details = "The environment indicates an active workspace or office setting."
1388
  else:
1389
+ # Fallback to original logic if object_statistics not available
1390
+ if 'commercial' in attributes and any(obj['class_name'] in ['person', 'chair', 'table'] for obj in detected_objects):
1391
+ scene_specific_details = "This appears to be an active commercial environment with customer activity."
1392
+ elif 'residential' in attributes and scene_type in ['living_room', 'bedroom', 'kitchen']:
1393
+ scene_specific_details = "The setting suggests a comfortable residential living space."
1394
+ elif 'workplace' in attributes and any(obj['class_name'] in ['laptop', 'keyboard', 'monitor'] for obj in detected_objects):
1395
+ scene_specific_details = "The environment indicates an active workspace or office setting."
1396
+
1397
+ # Determine scene description approach
1398
+ is_confident_specific_scene = scene_type not in everyday_scene_types and scene_type in scene_templates
1399
+ treat_as_everyday = scene_type in everyday_scene_types
1400
+
1401
+ if hasattr(self, 'enable_landmark') and not self.enable_landmark:
1402
+ if scene_type not in ["kitchen", "bedroom", "living_room", "office_workspace", "dining_area", "professional_kitchen"]:
1403
+ treat_as_everyday = True
1404
+
1405
+ if treat_as_everyday or not is_confident_specific_scene:
1406
+ # Generate dynamic description for everyday scenes with object statistics
1407
+ self.logger.info(f"Generating dynamic description for scene_type: {scene_type}")
1408
+ scene_details = self._generate_dynamic_everyday_description(
1409
+ detected_objects,
1410
+ lighting_info,
1411
+ viewpoint,
1412
+ spatial_analysis,
1413
+ image_dimensions,
1414
+ places365_info,
1415
+ object_statistics # Pass object statistics to dynamic description
1416
+ )
1417
+ elif scene_type in scene_templates:
1418
+ # Use template-based description with enhanced object information
1419
+ self.logger.info(f"Using template for scene_type: {scene_type}")
1420
+ viewpoint_key = f"{scene_type}_{viewpoint}"
1421
+ templates_list = scene_templates.get(viewpoint_key, scene_templates.get(scene_type, []))
1422
 
 
1423
  if templates_list:
1424
  detail_template = random.choice(templates_list)
 
 
1425
  scene_details = self._fill_detail_template(
1426
  detail_template,
1427
  detected_objects,
1428
+ scene_type,
1429
+ places365_info,
1430
+ object_statistics # Pass object statistics to template filling
 
 
 
 
 
 
 
1431
  )
1432
  else:
1433
+ scene_details = self._generate_dynamic_everyday_description(
1434
+ detected_objects, lighting_info, viewpoint, spatial_analysis,
1435
+ image_dimensions, places365_info, object_statistics
1436
+ )
1437
+ else:
1438
+ # Fallback to dynamic description with object statistics
1439
+ self.logger.info(f"No specific template for {scene_type}, generating dynamic description.")
1440
+ scene_details = self._generate_dynamic_everyday_description(
1441
+ detected_objects, lighting_info, viewpoint, spatial_analysis,
1442
+ image_dimensions, places365_info, object_statistics
1443
+ )
1444
+
1445
+ # Filter out landmark references if landmark detection is disabled
1446
+ if hasattr(self, 'enable_landmark') and not self.enable_landmark:
1447
+ scene_details = self.filter_landmark_references(scene_details, enable_landmark=False)
1448
 
1449
+ return scene_details if scene_details else "A scene with some visual elements."
1450
 
1451
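Both description paths above phrase object counts the same way; a small helper captures the convention. A sketch only, with a hypothetical name and the same naive trailing-s pluralization used in the code:

def format_count_phrase(class_name: str, count: int) -> str:
    # "one chair", "4 chairs"; names already ending in "s" are left unchanged
    if count == 1:
        return f"one {class_name}"
    plural = class_name if class_name.endswith("s") else f"{class_name}s"
    return f"{count} {plural}"

print(format_count_phrase("chair", 1))     # one chair
print(format_count_phrase("chair", 4))     # 4 chairs
print(format_count_phrase("scissors", 2))  # 2 scissors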
+ def _fill_detail_template(self, template: str, detected_objects: List[Dict], scene_type: str, places365_info: Optional[Dict] = None, object_statistics: Optional[Dict] = None) -> str:
1452
  """
1453
  Fill a template with specific details based on detected objects.
1454
 
 
1469
  # Get object template fillers
1470
  fillers = self.templates.get("object_template_fillers", {})
1471
 
1472
+ # Build more accurate template fill-ins from the object statistics
1473
+ statistics_based_replacements = {}
1474
+ if object_statistics:
1475
+ # Generate concrete object phrases from the statistics
1476
+ for class_name, stats in object_statistics.items():
1477
+ count = stats.get("count", 0)
1478
+ if count > 0:
1479
+ # Produce statistics-based phrases for common object classes
1480
+ if class_name == "potted plant":
1481
+ if count == 1:
1482
+ statistics_based_replacements["plant_elements"] = "a potted plant"
1483
+ elif count <= 3:
1484
+ statistics_based_replacements["plant_elements"] = f"{count} potted plants"
1485
+ else:
1486
+ statistics_based_replacements["plant_elements"] = f"multiple potted plants ({count} total)"
1487
+
1488
+ elif class_name == "chair":
1489
+ if count == 1:
1490
+ statistics_based_replacements["seating"] = "a chair"
1491
+ elif count <= 4:
1492
+ statistics_based_replacements["seating"] = f"{count} chairs"
1493
+ else:
1494
+ statistics_based_replacements["seating"] = f"numerous chairs ({count} total)"
1495
+
1496
+ elif class_name == "person":
1497
+ if count == 1:
1498
+ statistics_based_replacements["people_and_vehicles"] = "a person"
1499
+ statistics_based_replacements["pedestrian_flow"] = "an individual walking"
1500
+ elif count <= 5:
1501
+ statistics_based_replacements["people_and_vehicles"] = f"{count} people"
1502
+ statistics_based_replacements["pedestrian_flow"] = f"{count} people walking"
1503
+ else:
1504
+ statistics_based_replacements["people_and_vehicles"] = f"many people ({count} individuals)"
1505
+ statistics_based_replacements["pedestrian_flow"] = f"a crowd of {count} people"
1506
+
1507
  # Set default values for every template variable that may appear
1508
  default_replacements = {
1509
  # indoor-related
 
1683
  "knowledge_transfer": "learning exchanges"
1684
  }
1685
 
1686
+ # Merge the statistics-based replacements into the defaults
1687
+ default_replacements.update(statistics_based_replacements)
1688
+
1689
+ # Add Places365-specific template variables
1690
+ places365_scene_context = ""
1691
+ places365_atmosphere = ""
1692
+
1693
+ if places365_info and places365_info.get('confidence', 0) > 0.35:
1694
+ scene_label = places365_info.get('scene_label', '').replace('_', ' ')
1695
+ attributes = places365_info.get('attributes', [])
1696
+
1697
+ if scene_label and scene_label != scene_type:
1698
+ places365_scene_context = f"characteristic of a {scene_label}"
1699
+
1700
+ if 'natural_lighting' in attributes:
1701
+ places365_atmosphere = "with natural illumination"
1702
+ elif 'artificial_lighting' in attributes:
1703
+ places365_atmosphere = "under artificial lighting"
1704
+
1705
+ # Update default_replacements with Places365 context
1706
+ if places365_scene_context:
1707
+ default_replacements["places365_context"] = places365_scene_context
1708
+ else:
1709
+ default_replacements["places365_context"] = ""
1710
+
1711
+ if places365_atmosphere:
1712
+ default_replacements["places365_atmosphere"] = places365_atmosphere
1713
+ else:
1714
+ default_replacements["places365_atmosphere"] = ""
1715
+
1716
  # For each placeholder, try to fill with appropriate content
1717
  for placeholder in placeholders:
1718
  if placeholder in fillers:
 
1940
  if not detected_objects:
1941
  return "eye_level" # default
1942
 
1943
+ # Extract spatial placement and size statistics
1944
  top_region_count = 0
1945
  bottom_region_count = 0
1946
  total_objects = len(detected_objects)
 
1956
  crosswalk_pattern_detected = False
1957
 
1958
  for obj in detected_objects:
1959
+ # Count objects in the top and bottom regions
1960
  region = obj["region"]
1961
  if "top" in region:
1962
  top_region_count += 1
1963
  elif "bottom" in region:
1964
  bottom_region_count += 1
1965
 
1966
+ # Collect normalized areas
1967
  if "normalized_area" in obj:
1968
  sizes.append(obj["normalized_area"])
1969
 
1970
+ # Compute height-to-width ratios
1971
  if "normalized_size" in obj:
1972
  width, height = obj["normalized_size"]
1973
  if width > 0:
1974
  height_width_ratios.append(height / width)
1975
 
1976
+ # Collect the positions of people
1977
  if obj["class_id"] == 0: # 人
1978
  if "normalized_center" in obj:
1979
  people_positions.append(obj["normalized_center"])
1980
 
1981
+ # Dedicated detection logic for crosswalk intersections
1982
  # Check for clear vertical and horizontal pedestrian distributions
1983
  people_objs = [obj for obj in detected_objects if obj["class_id"] == 0] # people
1984
 
 
1997
  y_range = max(y_coords) - min(y_coords)
1998
 
1999
  # Try to detect a cross-shaped distribution
2000
+ # If both the x and y spreads are large and roughly equal, this is likely an intersection
2001
  if x_range > 0.5 and y_range > 0.5 and 0.7 < (x_range / y_range) < 1.3:
2002
 
2003
  # Compute distances to the center point
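The cross-shaped pedestrian check can be isolated as well. A sketch under the same assumptions as the code: people are normalized (x, y) centers, and an intersection is suggested when both spreads exceed 0.5 and are roughly equal:

def looks_like_crosswalk(people_positions, min_range=0.5, ratio_low=0.7, ratio_high=1.3):
    # people_positions: normalized (x, y) centers of detected persons
    if len(people_positions) < 4:
        return False
    xs = [p[0] for p in people_positions]
    ys = [p[1] for p in people_positions]
    x_range, y_range = max(xs) - min(xs), max(ys) - min(ys)
    if x_range <= min_range or y_range <= min_range:
        return False
    return ratio_low < (x_range / y_range) < ratio_high

print(looks_like_crosswalk([(0.1, 0.5), (0.9, 0.5), (0.5, 0.1), (0.5, 0.9)]))  # True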
 
2194
  description = description.replace("a bed in the room", "a bed")
2195
 
2196
  # Handle duplicated object lists
 
2197
  object_lists = re.findall(r'with ([^\.]+?)(?:\.|\band\b)', description)
2198
 
2199
  for obj_list in object_lists:
 
2243
  if not functional_zones:
2244
  return ""
2245
 
2246
+ # Handle the different accepted types for the functional_zones parameter
2247
+ if isinstance(functional_zones, list):
2248
+ # If a list is given, convert it to dictionary form
2249
+ zones_dict = {}
2250
+ for i, zone in enumerate(functional_zones):
2251
+ if isinstance(zone, dict) and 'name' in zone:
2252
+ zone_name = zone['name']
2253
+ else:
2254
+ zone_name = f"zone_{i}"
2255
+ zones_dict[zone_name] = zone if isinstance(zone, dict) else {"description": str(zone)}
2256
+ functional_zones = zones_dict
2257
+ elif not isinstance(functional_zones, dict):
2258
+ return ""
2259
+
2260
  # Count the total number of people in the scene
2261
  total_people_count = 0
2262
  people_by_zone = {}
 
2296
 
2297
  # Generate the summary description
2298
  summary = ""
2299
+ max_mentioned_people = 0 # track the largest people count mentioned so far
2300
 
2301
  # If the total people count is significant and not yet mentioned, add it to the summary
2302
  if total_people_count > 5:
2303
  summary = f"The scene contains a significant number of pedestrians ({total_people_count} people). "
2304
+ max_mentioned_people = total_people_count # update the largest mentioned count
2305
 
2306
  # Process each zone's description, keeping people counts consistent
2307
  processed_zones = []
 
2310
  zone_desc = zone_info.get("description", "a functional zone")
2311
  zone_people_count = people_by_zone.get(zone_name, 0)
2312
 
2313
+ # Check whether the description already contains people-count information
2314
  contains_people_info = "with" in zone_desc and ("person" in zone_desc.lower() or "people" in zone_desc.lower())
2315
 
2316
  # If the description mentions a smaller count than already reported, adjust it
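The functional_zones normalization earlier in this hunk can be read as one helper. A sketch with a hypothetical name and illustrative input:

def normalize_zones(functional_zones):
    # Accept either {zone_name: zone_info} or a list of zone dicts and
    # return the dict form the description logic works with.
    if isinstance(functional_zones, dict):
        return functional_zones
    if isinstance(functional_zones, list):
        zones = {}
        for i, zone in enumerate(functional_zones):
            name = zone["name"] if isinstance(zone, dict) and "name" in zone else f"zone_{i}"
            zones[name] = zone if isinstance(zone, dict) else {"description": str(zone)}
        return zones
    return {}

print(normalize_zones([{"name": "seating area", "description": "a seating area with 3 chairs"}]))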
evaluation_metrics.py CHANGED
@@ -138,7 +138,7 @@ class EvaluationMetrics:
138
  # Create empty plot if error
139
  fig, ax = plt.subplots(figsize=figsize)
140
  ax.text(0.5, 0.5, viz_data["error"],
141
- ha='center', va='center', fontsize=14, fontfamily='Arial')
142
  ax.set_xlim(0, 1)
143
  ax.set_ylim(0, 1)
144
  ax.axis('off')
@@ -148,7 +148,7 @@ class EvaluationMetrics:
148
  # Create empty plot if no data
149
  fig, ax = plt.subplots(figsize=figsize)
150
  ax.text(0.5, 0.5, "No detection data available",
151
- ha='center', va='center', fontsize=14, fontfamily='Arial')
152
  ax.set_xlim(0, 1)
153
  ax.set_ylim(0, 1)
154
  ax.axis('off')
@@ -163,7 +163,6 @@ class EvaluationMetrics:
163
  colors = [item["color"] for item in class_data]
164
 
165
  # Create figure and horizontal bar chart with improved styling
166
- plt.rcParams['font.family'] = 'Arial'
167
  fig, ax = plt.subplots(figsize=figsize)
168
 
169
  # Set background color to white
@@ -181,15 +180,15 @@ class EvaluationMetrics:
181
  conf = class_data[i]["average_confidence"]
182
  ax.text(width + 0.3, bar.get_y() + bar.get_height()/2,
183
  f"{width:.0f} (conf: {conf:.2f})",
184
- va='center', fontsize=12, fontfamily='Arial')
185
 
186
  # Customize axis and labels with larger fonts
187
  ax.set_yticks(y_pos)
188
- ax.set_yticklabels(class_names, fontsize=14, fontfamily='Arial')
189
  ax.invert_yaxis() # Labels read top-to-bottom
190
- ax.set_xlabel('Count', fontsize=14, fontfamily='Arial')
191
  ax.set_title(f'Objects Detected: {viz_data["total_objects"]} Total',
192
- fontsize=16, fontfamily='Arial', fontweight='bold')
193
 
194
  # Add grid for better readability
195
  ax.set_axisbelow(True)
@@ -204,7 +203,7 @@ class EvaluationMetrics:
204
  f"Average Confidence: {viz_data['average_confidence']:.2f}\n"
205
  f"Unique Classes: {len(viz_data['class_data'])}"
206
  )
207
- plt.figtext(0.02, 0.02, summary_text, fontsize=12, fontfamily='Arial',
208
  bbox=dict(facecolor='white', alpha=0.9, boxstyle='round,pad=0.5',
209
  edgecolor='#E5E7EB'))
210
 
 
138
  # Create empty plot if error
139
  fig, ax = plt.subplots(figsize=figsize)
140
  ax.text(0.5, 0.5, viz_data["error"],
141
+ ha='center', va='center', fontsize=14)
142
  ax.set_xlim(0, 1)
143
  ax.set_ylim(0, 1)
144
  ax.axis('off')
 
148
  # Create empty plot if no data
149
  fig, ax = plt.subplots(figsize=figsize)
150
  ax.text(0.5, 0.5, "No detection data available",
151
+ ha='center', va='center', fontsize=14)
152
  ax.set_xlim(0, 1)
153
  ax.set_ylim(0, 1)
154
  ax.axis('off')
 
163
  colors = [item["color"] for item in class_data]
164
 
165
  # Create figure and horizontal bar chart with improved styling
 
166
  fig, ax = plt.subplots(figsize=figsize)
167
 
168
  # Set background color to white
 
180
  conf = class_data[i]["average_confidence"]
181
  ax.text(width + 0.3, bar.get_y() + bar.get_height()/2,
182
  f"{width:.0f} (conf: {conf:.2f})",
183
+ va='center', fontsize=12)
184
 
185
  # Customize axis and labels with larger fonts
186
  ax.set_yticks(y_pos)
187
+ ax.set_yticklabels(class_names, fontsize=14)
188
  ax.invert_yaxis() # Labels read top-to-bottom
189
+ ax.set_xlabel('Count', fontsize=14)
190
  ax.set_title(f'Objects Detected: {viz_data["total_objects"]} Total',
191
+ fontsize=16, fontweight='bold')
192
 
193
  # Add grid for better readability
194
  ax.set_axisbelow(True)
 
203
  f"Average Confidence: {viz_data['average_confidence']:.2f}\n"
204
  f"Unique Classes: {len(viz_data['class_data'])}"
205
  )
206
+ plt.figtext(0.02, 0.02, summary_text, fontsize=12,
207
  bbox=dict(facecolor='white', alpha=0.9, boxstyle='round,pad=0.5',
208
  edgecolor='#E5E7EB'))
209
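The per-call fontfamily='Arial' arguments are dropped above, presumably because that font is not guaranteed to exist in the deployment environment. If a preferred typeface is still wanted, one option (an assumption, not part of this commit) is to declare it once with fallbacks instead of per call:

import matplotlib
matplotlib.rcParams["font.family"] = "sans-serif"
matplotlib.rcParams["font.sans-serif"] = ["Arial", "DejaVu Sans", "Liberation Sans"]  # first available wins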
 
image_processor.py CHANGED
@@ -13,6 +13,7 @@ from visualization_helper import VisualizationHelper
13
  from evaluation_metrics import EvaluationMetrics
14
  from lighting_analyzer import LightingAnalyzer
15
  from scene_analyzer import SceneAnalyzer
 
16
 
17
  class ImageProcessor:
18
  """
@@ -20,13 +21,76 @@ class ImageProcessor:
20
  Separates processing logic from UI components
21
  """
22
 
23
- def __init__(self, use_llm=True, llm_model_path=None):
24
  """Initialize the image processor with required components"""
25
- self.color_mapper = ColorMapper()
26
- self.model_instances = {}
27
- self.lighting_analyzer = LightingAnalyzer()
28
- self.use_llm = use_llm
29
- self.llm_model_path = llm_model_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
  def get_model_instance(self, model_name: str, confidence: float = 0.25, iou: float = 0.25) -> DetectionModel:
32
  """
@@ -53,48 +117,74 @@ class ImageProcessor:
53
 
54
  return self.model_instances[model_name]
55
 
56
- def analyze_scene(self, detection_result: Any, lighting_info: Optional[Dict] = None) -> Dict:
57
  """
58
  Perform scene analysis on detection results
59
 
60
  Args:
61
  detection_result: Object detection result from YOLOv8
62
  lighting_info: Lighting condition analysis results (optional)
 
 
63
 
64
  Returns:
65
  Dictionary containing scene analysis results
66
  """
 
67
  try:
68
- # Initialize scene analyzer if not already done
69
- if not hasattr(self, 'scene_analyzer'):
 
 
 
 
70
  self.scene_analyzer = SceneAnalyzer(
71
- class_names=detection_result.names,
72
  use_llm=self.use_llm,
 
 
73
  llm_model_path=self.llm_model_path
74
  )
75
 
76
- # 確保類名正確更新
77
- if self.scene_analyzer.class_names is None:
78
- self.scene_analyzer.class_names = detection_result.names
79
- self.scene_analyzer.spatial_analyzer.class_names = detection_result.names
 
 
 
 
 
 
 
 
 
 
 
80
 
81
- # Perform scene analysis with lighting info
82
  scene_analysis = self.scene_analyzer.analyze(
83
  detection_result=detection_result,
84
  lighting_info=lighting_info,
85
  class_confidence_threshold=0.35,
86
- scene_confidence_threshold=0.6
 
 
87
  )
88
 
89
  return scene_analysis
 
90
  except Exception as e:
91
  print(f"Error in scene analysis: {str(e)}")
92
  import traceback
93
  traceback.print_exc()
 
 
94
  return {
95
  "scene_type": "unknown",
96
  "confidence": 0.0,
97
  "description": f"Error during scene analysis: {str(e)}",
 
98
  "objects_present": [],
99
  "object_count": 0,
100
  "regions": {},
@@ -103,146 +193,256 @@ class ImageProcessor:
103
  "lighting_conditions": lighting_info or {"time_of_day": "unknown", "confidence": 0.0}
104
  }
105
 
106
- def analyze_lighting_conditions(self, image):
107
  """
108
- 分析光照條件。
109
 
110
  Args:
111
  image: input image
 
112
 
113
  Returns:
114
  Dict: lighting analysis results
115
  """
116
- return self.lighting_analyzer.analyze(image)
117
 
118
- def process_image(self, image, model_name: str, confidence_threshold: float, filter_classes: Optional[List[int]] = None) -> Tuple[Any, str, Dict]:
119
  """
120
- Process an image for object detection
121
 
122
  Args:
123
- image: Input image (numpy array or PIL Image)
124
- model_name: Name of the model to use
125
- confidence_threshold: Confidence threshold for detection
126
- filter_classes: Optional list of classes to filter results
127
 
128
  Returns:
129
- Tuple of (result_image, result_text, stats_data)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
  """
131
- # Get model instance
132
  model_instance = self.get_model_instance(model_name, confidence_threshold)
 
 
133
 
134
- # Initialize key variables
135
  result = None
136
- stats = {}
137
  temp_path = None
 
138
 
139
  try:
140
- # Processing input image
141
  if isinstance(image, np.ndarray):
142
- # Convert BGR to RGB if needed
143
- if image.shape[2] == 3:
144
- image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
 
 
 
 
 
 
145
  else:
146
- image_rgb = image
147
- pil_image = Image.fromarray(image_rgb)
 
148
  elif image is None:
149
  return None, "No image provided. Please upload an image.", {}
150
  else:
151
- pil_image = image
 
 
 
152
 
153
- # Analyze lighting conditions
154
- lighting_info = self.analyze_lighting_conditions(pil_image)
155
 
156
- # Store temp files
157
- temp_dir = tempfile.gettempdir() # Use system temp directory
 
158
  temp_filename = f"temp_{uuid.uuid4().hex}.jpg"
159
  temp_path = os.path.join(temp_dir, temp_filename)
160
- pil_image.save(temp_path)
161
 
162
- # Object detection
163
  result = model_instance.detect(temp_path)
164
 
165
- if result is None:
166
- return None, "Detection failed. Please try again with a different image.", {}
167
-
168
- # Calculate stats
169
- stats = EvaluationMetrics.calculate_basic_stats(result)
 
 
170
 
171
- # Add space calculation
 
172
  spatial_metrics = EvaluationMetrics.calculate_distance_metrics(result)
173
- stats["spatial_metrics"] = spatial_metrics
 
 
 
174
 
175
- # Add lighting information
176
- stats["lighting_conditions"] = lighting_info
177
-
178
- # Apply filter if specified
179
  if filter_classes and len(filter_classes) > 0:
180
- # Get classes, boxes, confidence
181
  classes = result.boxes.cls.cpu().numpy().astype(int)
182
  confs = result.boxes.conf.cpu().numpy()
183
- boxes = result.boxes.xyxy.cpu().numpy()
184
-
185
- mask = np.zeros_like(classes, dtype=bool)
186
- for cls_id in filter_classes:
187
- mask = np.logical_or(mask, classes == cls_id)
188
-
189
- filtered_stats = {
190
- "total_objects": int(np.sum(mask)),
191
- "class_statistics": {},
192
- "average_confidence": float(np.mean(confs[mask])) if np.any(mask) else 0,
193
- "spatial_metrics": stats["spatial_metrics"],
194
  "lighting_conditions": lighting_info
195
  }
196
-
197
- # Update stats
198
  names = result.names
199
- for cls, conf in zip(classes[mask], confs[mask]):
200
- cls_name = names[int(cls)]
201
- if cls_name not in filtered_stats["class_statistics"]:
202
- filtered_stats["class_statistics"][cls_name] = {
203
- "count": 0,
204
- "average_confidence": 0
205
- }
206
-
207
- filtered_stats["class_statistics"][cls_name]["count"] += 1
208
- filtered_stats["class_statistics"][cls_name]["average_confidence"] = conf
209
-
210
- stats = filtered_stats
211
-
212
- viz_data = EvaluationMetrics.generate_visualization_data(
213
- result,
214
- self.color_mapper.get_all_colors()
 
215
  )
216
 
217
- result_image = VisualizationHelper.visualize_detection(
218
- temp_path, result, color_mapper=self.color_mapper, figsize=(12, 12), return_pil=True, filter_classes=filter_classes
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
219
  )
220
 
221
- result_text = EvaluationMetrics.format_detection_summary(viz_data)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
222
 
223
- if result is not None:
224
- # Perform scene analysis with lighting info
225
- scene_analysis = self.analyze_scene(result, lighting_info)
 
 
226
 
227
- # Add scene analysis to stats
228
- stats["scene_analysis"] = scene_analysis
 
229
 
230
- return result_image, result_text, stats
 
 
 
 
 
 
 
 
 
 
 
 
 
 
231
 
232
  except Exception as e:
233
- error_message = f"Error Occurs: {str(e)}"
234
  import traceback
235
  traceback.print_exc()
236
- print(error_message)
237
- return None, error_message, {}
238
-
239
  finally:
240
  if temp_path and os.path.exists(temp_path):
241
- try:
242
- os.remove(temp_path)
243
- except Exception as e:
244
- print(f"Cannot delete temp files {temp_path}: {str(e)}")
245
-
246
 
247
  def format_result_text(self, stats: Dict) -> str:
248
  """
@@ -281,7 +481,7 @@ class ImageProcessor:
281
  else:
282
  lines.append("No class information available.")
283
 
284
- # 添加空間信息
285
  if "spatial_metrics" in stats and "spatial_distribution" in stats["spatial_metrics"]:
286
  lines.append("Object Distribution:")
287
 
 
13
  from evaluation_metrics import EvaluationMetrics
14
  from lighting_analyzer import LightingAnalyzer
15
  from scene_analyzer import SceneAnalyzer
16
+ from places365_model import Places365Model
17
 
18
  class ImageProcessor:
19
  """
 
21
  Separates processing logic from UI components
22
  """
23
 
24
+ def __init__(self, use_llm=True, llm_model_path=None, enable_places365=True, places365_model_name='resnet50_places365'):
25
  """Initialize the image processor with required components"""
26
+ print(f"Initializing ImageProcessor with use_llm={use_llm}, enable_places365={enable_places365}")
27
+
28
+ try:
29
+ # Initialize basic components first
30
+ self.use_llm = use_llm
31
+ self.llm_model_path = llm_model_path
32
+ self.enable_places365 = enable_places365
33
+ self.model_instances = {}
34
+
35
+ # Initialize ColorMapper
36
+ self.color_mapper = ColorMapper()
37
+ print("ColorMapper initialized successfully")
38
+
39
+ # Initialize LightingAnalyzer
40
+ self.lighting_analyzer = LightingAnalyzer()
41
+ print("LightingAnalyzer initialized successfully")
42
+
43
+ # Initialize Places365 model if enabled
44
+ self.places365_model = None
45
+ if self.enable_places365:
46
+ try:
47
+ self.places365_model = Places365Model(
48
+ model_name=places365_model_name,
49
+ device=None
50
+ )
51
+ print(f"Places365 model initialized successfully with {places365_model_name}")
52
+ except Exception as e:
53
+ print(f"Warning: Failed to initialize Places365 model: {e}")
54
+ print("Continuing without Places365 analysis")
55
+ self.enable_places365 = False
56
+ self.places365_model = None
57
+
58
+ # Initialize SceneAnalyzer with error handling
59
+ self.scene_analyzer = None
60
+ self.class_names = None # Will be set when first model is loaded
61
+
62
+ try:
63
+ # Initialize SceneAnalyzer without class_names (will be set later)
64
+ self.scene_analyzer = SceneAnalyzer(
65
+ class_names=None,
66
+ use_llm=self.use_llm,
67
+ use_clip=True,
68
+ enable_landmark=True,
69
+ llm_model_path=self.llm_model_path
70
+ )
71
+ print("SceneAnalyzer initialized successfully")
72
+
73
+ # Verify critical components
74
+ if self.scene_analyzer is not None:
75
+ print(f"SceneAnalyzer status - spatial_analyzer: {hasattr(self.scene_analyzer, 'spatial_analyzer')}, "
76
+ f"descriptor: {hasattr(self.scene_analyzer, 'descriptor')}, "
77
+ f"scene_describer: {hasattr(self.scene_analyzer, 'scene_describer')}")
78
+ else:
79
+ print("WARNING: scene_analyzer is None after initialization")
80
+
81
+ except Exception as e:
82
+ print(f"Error initializing SceneAnalyzer: {e}")
83
+ import traceback
84
+ traceback.print_exc()
85
+ self.scene_analyzer = None
86
+
87
+ print("ImageProcessor initialization completed successfully")
88
+
89
+ except Exception as e:
90
+ print(f"Critical error during ImageProcessor initialization: {e}")
91
+ import traceback
92
+ traceback.print_exc()
93
+ raise RuntimeError(f"Failed to initialize ImageProcessor: {str(e)}")
94
 
95
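A usage sketch for the constructor above; it assumes this repository's dependencies and model weights are available and only illustrates the new flags:

from image_processor import ImageProcessor

processor = ImageProcessor(
    use_llm=False,           # skip the LLM enhancement stage
    enable_places365=False,  # skip Places365 scene classification entirely
)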
  def get_model_instance(self, model_name: str, confidence: float = 0.25, iou: float = 0.25) -> DetectionModel:
96
  """
 
117
 
118
  return self.model_instances[model_name]
119
 
120
+ def analyze_scene(self, detection_result: Any, lighting_info: Optional[Dict] = None, enable_landmark=True, places365_info=None) -> Dict:
121
  """
122
  Perform scene analysis on detection results
123
 
124
  Args:
125
  detection_result: Object detection result from YOLOv8
126
  lighting_info: Lighting condition analysis results (optional)
127
+ enable_landmark: Whether to enable landmark detection
128
+ places365_info: Places365 analysis results (optional)
129
 
130
  Returns:
131
  Dictionary containing scene analysis results
132
  """
133
+ print(f"DEBUG: analyze_scene received enable_landmark={enable_landmark}")
134
  try:
135
+ # Check if detection_result has valid names
136
+ class_names = getattr(detection_result, 'names', None) if detection_result else None
137
+
138
+ # Initialize or reinitialize scene analyzer if needed
139
+ if self.scene_analyzer is None:
140
+ print("Scene analyzer not initialized, creating new instance")
141
  self.scene_analyzer = SceneAnalyzer(
142
+ class_names=class_names,
143
  use_llm=self.use_llm,
144
+ use_clip=True,
145
+ enable_landmark=enable_landmark,
146
  llm_model_path=self.llm_model_path
147
  )
148
 
149
+ if self.scene_analyzer is None:
150
+ raise ValueError("Failed to create SceneAnalyzer instance")
151
+ else:
152
+ # Update existing scene analyzer settings
153
+ self.scene_analyzer.enable_landmark = enable_landmark
154
+
155
+ # Update class names if available and different
156
+ if class_names and self.scene_analyzer.class_names != class_names:
157
+ self.scene_analyzer.class_names = class_names
158
+ if hasattr(self.scene_analyzer, 'spatial_analyzer') and self.scene_analyzer.spatial_analyzer:
159
+ self.scene_analyzer.spatial_analyzer.class_names = class_names
160
+
161
+ # Update landmark detection settings in child components
162
+ if hasattr(self.scene_analyzer, 'spatial_analyzer') and self.scene_analyzer.spatial_analyzer:
163
+ self.scene_analyzer.spatial_analyzer.enable_landmark = enable_landmark
164
 
165
+ # Perform scene analysis with lighting info and Places365 context
166
  scene_analysis = self.scene_analyzer.analyze(
167
  detection_result=detection_result,
168
  lighting_info=lighting_info,
169
  class_confidence_threshold=0.35,
170
+ scene_confidence_threshold=0.6,
171
+ enable_landmark=enable_landmark,
172
+ places365_info=places365_info
173
  )
174
 
175
  return scene_analysis
176
+
177
  except Exception as e:
178
  print(f"Error in scene analysis: {str(e)}")
179
  import traceback
180
  traceback.print_exc()
181
+
182
+ # Return a valid default result
183
  return {
184
  "scene_type": "unknown",
185
  "confidence": 0.0,
186
  "description": f"Error during scene analysis: {str(e)}",
187
+ "enhanced_description": "Scene analysis could not be completed due to an error.",
188
  "objects_present": [],
189
  "object_count": 0,
190
  "regions": {},
 
193
  "lighting_conditions": lighting_info or {"time_of_day": "unknown", "confidence": 0.0}
194
  }
195
 
196
+ def analyze_lighting_conditions(self, image, places365_info: Optional[Dict] = None):
197
  """
198
+ Analyze lighting conditions, taking Places365 scene information into account.
199
 
200
  Args:
201
  image: input image
202
+ places365_info: Places365 scene analysis results, used for override logic
203
 
204
  Returns:
205
  Dict: lighting analysis results
206
  """
207
+ return self.lighting_analyzer.analyze(image, places365_info=places365_info)
208
 
209
+ def analyze_places365_scene(self, image):
210
  """
211
+ Analyze scene using Places365 model.
212
 
213
  Args:
214
+ image: Input image (PIL Image)
 
 
 
215
 
216
  Returns:
217
+ Dict: Places365 analysis results or None if disabled/failed
218
+ """
219
+ if not self.enable_places365 or self.places365_model is None:
220
+ return None
221
+
222
+ try:
223
+ if not isinstance(image, Image.Image):
224
+ if isinstance(image, np.ndarray):
225
+ image = Image.fromarray(image)
226
+ else:
227
+ print(f"Warning: Cannot process image of type {type(image)} for Places365")
228
+ return None
229
+
230
+ places365_result = self.places365_model.predict(image)
231
+
232
+ if places365_result and places365_result.get('confidence', 0) > 0.1:
233
+ print(f"Places365 detected: {places365_result['scene_label']} "
234
+ f"(mapped: {places365_result['mapped_scene_type']}) "
235
+ f"confidence: {places365_result['confidence']:.3f}")
236
+ return places365_result
237
+ else:
238
+ print("Places365 analysis failed or low confidence")
239
+ return None
240
+
241
+ except Exception as e:
242
+ print(f"Error in Places365 analysis: {str(e)}")
243
+ return None
244
+
245
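How the two analyses above are meant to be chained, with the Places365 result feeding the lighting analyzer. A sketch assuming a processor instance and a local test image; the printed keys follow the fallback structure used earlier in this file:

from PIL import Image

image = Image.open("test.jpg").convert("RGB")                 # hypothetical local image
places365_info = processor.analyze_places365_scene(image)     # may be None
lighting_info = processor.analyze_lighting_conditions(image, places365_info=places365_info)
print(lighting_info.get("time_of_day"), lighting_info.get("confidence"))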
+ def process_image(self, image: Any, model_name: str, confidence_threshold: float, filter_classes: Optional[List[int]] = None, enable_landmark: bool = True) -> Tuple[Any, str, Dict]:
246
+ """
247
+ Process an image for object detection and scene analysis.
248
+ Args:
249
+ image: Input image (numpy array or PIL Image).
250
+ model_name: Name of the model to use.
251
+ confidence_threshold: Confidence threshold for detection.
252
+ filter_classes: Optional list of classes to filter results.
253
+ enable_landmark: Whether to enable landmark detection for this run.
254
+ Returns:
255
+ Tuple of (result_image_pil, result_text, stats_data_with_scene_analysis).
256
  """
 
257
  model_instance = self.get_model_instance(model_name, confidence_threshold)
258
+ if model_instance is None:
259
+ return None, f"Failed to load model: {model_name}. Please check model configuration.", {}
260
 
 
261
  result = None
262
+ stats_data = {}
263
  temp_path = None
264
+ pil_image_for_processing = None # Use this to store the consistently processed PIL image
265
 
266
  try:
 
267
  if isinstance(image, np.ndarray):
268
+ if image.ndim == 3 and image.shape[2] == 3: # RGB or BGR
269
+ # Assuming BGR from OpenCV, convert to RGB for PIL standard
270
+ image_rgb_np = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
271
+ pil_image_for_processing = Image.fromarray(image_rgb_np)
272
+ elif image.ndim == 3 and image.shape[2] == 4: # RGBA or BGRA
273
+ image_rgba_np = cv2.cvtColor(image, cv2.COLOR_BGRA2RGBA) # Ensure RGBA
274
+ pil_image_for_processing = Image.fromarray(image_rgba_np).convert("RGB") # Convert to RGB
275
+ elif image.ndim == 2: # Grayscale
276
+ pil_image_for_processing = Image.fromarray(image).convert("RGB")
277
  else:
278
+ pil_image_for_processing = Image.fromarray(image) # Hope for the best
279
+ elif isinstance(image, Image.Image):
280
+ pil_image_for_processing = image.copy() # Use a copy
281
  elif image is None:
282
  return None, "No image provided. Please upload an image.", {}
283
  else:
284
+ return None, f"Unsupported image type: {type(image)}. Please provide a NumPy array or PIL Image.", {}
285
+
286
+ if pil_image_for_processing.mode != "RGB": # Ensure final image is RGB
287
+ pil_image_for_processing = pil_image_for_processing.convert("RGB")
288
 
289
+ # Add Places365 scene analysis parallel to lighting analysis
290
+ places365_info = self.analyze_places365_scene(pil_image_for_processing)
291
 
292
+ lighting_info = self.analyze_lighting_conditions(pil_image_for_processing, places365_info=places365_info)
293
+
294
+ temp_dir = tempfile.gettempdir()
295
  temp_filename = f"temp_{uuid.uuid4().hex}.jpg"
296
  temp_path = os.path.join(temp_dir, temp_filename)
297
+ pil_image_for_processing.save(temp_path, format="JPEG")
298
 
 
299
  result = model_instance.detect(temp_path)
300
 
301
+ if result is None or not hasattr(result, 'boxes'):
302
+ scene_analysis_no_yolo = self.analyze_scene(result, lighting_info, enable_landmark=enable_landmark, places365_info=places365_info)
303
+ desc_no_yolo = scene_analysis_no_yolo.get("enhanced_description", scene_analysis_no_yolo.get("description", "Detection failed, scene context analysis attempted."))
304
+ stats_data["scene_analysis"] = scene_analysis_no_yolo
305
+ if places365_info:
306
+ stats_data["places365_analysis"] = places365_info
307
+ return pil_image_for_processing, desc_no_yolo, stats_data
308
 
309
+ # Basic statistics
310
+ stats_data = EvaluationMetrics.calculate_basic_stats(result)
311
  spatial_metrics = EvaluationMetrics.calculate_distance_metrics(result)
312
+ stats_data["spatial_metrics"] = spatial_metrics
313
+ stats_data["lighting_conditions"] = lighting_info
314
+ if places365_info:
315
+ stats_data["places365_analysis"] = places365_info
316
 
 
 
 
 
317
  if filter_classes and len(filter_classes) > 0:
 
318
  classes = result.boxes.cls.cpu().numpy().astype(int)
319
  confs = result.boxes.conf.cpu().numpy()
320
+ mask = np.isin(classes, filter_classes)
321
+ filtered_stats_data = {
322
+ "total_objects": int(np.sum(mask)), "class_statistics": {},
323
+ "average_confidence": float(np.mean(confs[mask])) if np.any(mask) else 0.0,
324
+ "spatial_metrics": stats_data.get("spatial_metrics",{}),
 
 
 
 
 
 
325
  "lighting_conditions": lighting_info
326
  }
327
+ if places365_info:
328
+ filtered_stats_data["places365_analysis"] = places365_info
329
  names = result.names
330
+ class_conf_sums = {}
331
+ for cls_id_int, conf_val in zip(classes[mask], confs[mask]):
332
+ cls_name = names[cls_id_int]
333
+ if cls_name not in filtered_stats_data["class_statistics"]:
334
+ filtered_stats_data["class_statistics"][cls_name] = {"count": 0}
335
+ class_conf_sums[cls_name] = 0.0
336
+ filtered_stats_data["class_statistics"][cls_name]["count"] += 1 # 累計統計資訊
337
+ class_conf_sums[cls_name] += conf_val
338
+ for cls_name_stat, data_stat in filtered_stats_data["class_statistics"].items():
339
+ data_stat["average_confidence"] = round(class_conf_sums[cls_name_stat] / data_stat["count"] if data_stat["count"] > 0 else 0.0, 4)
340
+ stats_data = filtered_stats_data
341
+
342
+ viz_data = EvaluationMetrics.generate_visualization_data(result, self.color_mapper.get_all_colors())
343
+
344
+ result_image_pil = VisualizationHelper.visualize_detection(
345
+ temp_path, result, color_mapper=self.color_mapper,
346
+ figsize=(12, 12), return_pil=True, filter_classes=filter_classes
347
  )
348
 
349
+ result_text_summary = EvaluationMetrics.format_detection_summary(viz_data)
350
+
351
+ # Pass the enable_landmark parameter from function signature
352
+ # Initialize or update scene analyzer if needed
353
+ if self.scene_analyzer is None:
354
+ print("Creating SceneAnalyzer in process_image")
355
+ self.scene_analyzer = SceneAnalyzer(
356
+ class_names=result.names if result else None,
357
+ use_llm=self.use_llm,
358
+ use_clip=True,
359
+ enable_landmark=enable_landmark,
360
+ llm_model_path=self.llm_model_path
361
+ )
362
+
363
+ if self.scene_analyzer is None:
364
+ print("ERROR: Failed to create SceneAnalyzer in process_image")
365
+ else:
366
+ # Update existing scene analyzer with current settings
367
+ if result and hasattr(result, 'names'):
368
+ self.scene_analyzer.class_names = result.names
369
+ if hasattr(self.scene_analyzer, 'spatial_analyzer') and self.scene_analyzer.spatial_analyzer:
370
+ self.scene_analyzer.spatial_analyzer.class_names = result.names
371
+
372
+ self.scene_analyzer.enable_landmark = enable_landmark
373
+ if hasattr(self.scene_analyzer, 'spatial_analyzer') and self.scene_analyzer.spatial_analyzer:
374
+ self.scene_analyzer.spatial_analyzer.enable_landmark = enable_landmark
375
+
376
+ # Perform scene analysis using the existing analyze_scene method
377
+ scene_analysis_result = self.analyze_scene(
378
+ detection_result=result,
379
+ lighting_info=lighting_info,
380
+ enable_landmark=enable_landmark,
381
+ places365_info=places365_info
382
  )
383
 
384
+ stats_data["scene_analysis"] = scene_analysis_result
385
+
386
+ final_result_text = result_text_summary
387
+
388
+ # Use enable_landmark parameter for landmark block
389
+ if enable_landmark and "detected_landmarks" in scene_analysis_result:
390
+ landmarks_detected = scene_analysis_result.get("detected_landmarks", [])
391
+ if not landmarks_detected and scene_analysis_result.get("primary_landmark"):
392
+ primary_lm = scene_analysis_result.get("primary_landmark")
393
+ if isinstance(primary_lm, dict): landmarks_detected = [primary_lm]
394
+
395
+ if landmarks_detected:
396
+ final_result_text += "\n\n--- Detected Landmarks ---\n"
397
+ # Ensure drawing on the correct PIL image
398
+ img_to_draw_on = result_image_pil.copy() # Draw on a copy
399
+ img_for_drawing_cv2 = cv2.cvtColor(np.array(img_to_draw_on), cv2.COLOR_RGB2BGR)
400
+
401
+ for landmark_item in landmarks_detected:
402
+ if not isinstance(landmark_item, dict): continue
403
+
404
+ # Use .get() for all potentially missing keys, to stay safe
405
+ landmark_name_disp = landmark_item.get("class_name", landmark_item.get("name", "N/A"))
406
+ landmark_loc_disp = landmark_item.get("location", "N/A")
407
+ landmark_conf_disp = landmark_item.get("confidence", 0.0)
408
+
409
+ final_result_text += f"• {landmark_name_disp} ({landmark_loc_disp}, confidence: {landmark_conf_disp:.2f})\n"
410
 
411
+ if "box" in landmark_item:
412
+ box = landmark_item["box"]
413
+ pt1 = (int(box[0]), int(box[1])); pt2 = (int(box[2]), int(box[3]))
414
+ color_lm = (255, 0, 255); thickness_lm = 3 # Magenta BGR
415
+ cv2.rectangle(img_for_drawing_cv2, pt1, pt2, color_lm, thickness_lm)
416
 
417
+ label_lm = f"{landmark_name_disp} ({landmark_conf_disp:.2f})"
418
+ font_scale_lm = 0.6; font_thickness_lm = 1
419
+ (w_text, h_text), baseline = cv2.getTextSize(label_lm, cv2.FONT_HERSHEY_SIMPLEX, font_scale_lm, font_thickness_lm)
420
 
421
+ # Label position logic (simplified from your extensive one for brevity)
422
+ label_y_pos = pt1[1] - baseline - 3
423
+ if label_y_pos < h_text : # If label goes above image, put it below box
424
+ label_y_pos = pt2[1] + h_text + baseline + 3
425
+
426
+ label_bg_pt1 = (pt1[0], label_y_pos - h_text - baseline)
427
+ label_bg_pt2 = (pt1[0] + w_text, label_y_pos + baseline)
428
+
429
+ cv2.rectangle(img_for_drawing_cv2, label_bg_pt1, label_bg_pt2, color_lm, -1)
430
+ cv2.putText(img_for_drawing_cv2, label_lm, (pt1[0], label_y_pos),
431
+ cv2.FONT_HERSHEY_SIMPLEX, font_scale_lm, (255,255,255), font_thickness_lm, cv2.LINE_AA)
432
+
433
+ result_image_pil = Image.fromarray(cv2.cvtColor(img_for_drawing_cv2, cv2.COLOR_BGR2RGB))
434
+
435
+ return result_image_pil, final_result_text, stats_data
436
 
437
  except Exception as e:
438
+ error_message = f"Error in ImageProcessor.process_image: {str(e)}"
439
  import traceback
440
  traceback.print_exc()
441
+ return pil_image_for_processing if pil_image_for_processing else None, error_message, {}
 
 
442
  finally:
443
  if temp_path and os.path.exists(temp_path):
444
+ try: os.remove(temp_path)
445
+ except Exception as e: print(f"Warning: Cannot delete temp file {temp_path}: {str(e)}")
 
 
 
446
 
447
  def format_result_text(self, stats: Dict) -> str:
448
  """
 
481
  else:
482
  lines.append("No class information available.")
483
 
484
+ # Add spatial information
485
  if "spatial_metrics" in stats and "spatial_distribution" in stats["spatial_metrics"]:
486
  lines.append("Object Distribution:")
487
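The input normalization at the top of process_image can be read as one small helper. A sketch only, keeping the same branch order (BGR, BGRA, grayscale ndarray, then PIL); the helper name is hypothetical:

import cv2
import numpy as np
from PIL import Image

def to_rgb_pil(image):
    # NumPy arrays are assumed to come from OpenCV (BGR/BGRA channel order).
    if isinstance(image, np.ndarray):
        if image.ndim == 3 and image.shape[2] == 3:
            image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        elif image.ndim == 3 and image.shape[2] == 4:
            image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGRA2RGBA))
        else:
            image = Image.fromarray(image)  # grayscale or already-RGB data
    return image.convert("RGB")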
 
landmark_activities.py ADDED
The diff for this file is too large to render. See raw diff
 
landmark_data.py ADDED
The diff for this file is too large to render. See raw diff
 
lighting_analyzer.py CHANGED
The diff for this file is too large to render. See raw diff
 
lighting_conditions.py CHANGED
@@ -12,6 +12,36 @@ LIGHTING_CONDITIONS = {
12
  "bright": "The scene has the diffused bright lighting of an overcast day.",
13
  "medium": "The scene has even, soft lighting typical of a cloudy day.",
14
  "dim": "The scene has the muted lighting of a heavily overcast day."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  },
16
  "sunset/sunrise": {
17
  "general": "The scene is captured during golden hour with warm lighting.",
@@ -81,6 +111,10 @@ LIGHTING_CONDITIONS = {
81
  "beach_lighting": "sun-drenched",
82
  "sports_venue_lighting": "arena-lit",
83
  "professional_kitchen_lighting": "kitchen-task lit",
 
 
 
 
84
  "unknown": ""
85
  },
86
  "activity_modifiers": {
@@ -127,5 +161,11 @@ LIGHTING_CONDITIONS = {
127
  "bright": "The space blends bright natural and artificial light sources across indoor-outdoor boundaries.",
128
  "medium": "The area combines moderate indoor lighting with outdoor illumination in a balanced way.",
129
  "dim": "The transition space features subtle lighting gradients between indoor and outdoor zones."
 
 
 
 
 
 
130
  }
131
  }
 
12
  "bright": "The scene has the diffused bright lighting of an overcast day.",
13
  "medium": "The scene has even, soft lighting typical of a cloudy day.",
14
  "dim": "The scene has the muted lighting of a heavily overcast day."
15
+ },
16
+ "day_cloudy_gray": {
17
+ "general": "The scene is captured during an overcast day with muted gray lighting.",
18
+ "bright": "The scene has bright but diffused gray daylight from heavy cloud cover.",
19
+ "medium": "The scene has even, muted lighting typical of a gray, overcast day.",
20
+ "dim": "The scene has subdued lighting under thick gray clouds."
21
+ },
22
+ "indoor_residential_natural": {
23
+ "general": "The scene is captured in a residential setting with natural window lighting.",
24
+ "bright": "The residential space is brightly lit with abundant natural light from windows.",
25
+ "medium": "The home interior has comfortable natural lighting complemented by artificial sources.",
26
+ "dim": "The residential space has soft natural lighting creating a cozy atmosphere."
27
+ },
28
+ "indoor_designer_residential": {
29
+ "general": "The scene is captured in a well-designed residential space with curated lighting.",
30
+ "bright": "The residential interior features bright, designer lighting creating an elegant atmosphere.",
31
+ "medium": "The home space has thoughtfully planned lighting balancing aesthetics and functionality.",
32
+ "dim": "The residential area has sophisticated mood lighting enhancing the design elements."
33
+ },
34
+ "indoor_bright_natural_mix": {
35
+ "general": "The scene is captured indoors with a blend of natural and artificial lighting.",
36
+ "bright": "The indoor space combines bright natural window light with artificial illumination.",
37
+ "medium": "The interior has balanced mixed lighting from windows and electric sources.",
38
+ "dim": "The indoor area has gentle mixed lighting creating comfortable illumination."
39
+ },
40
+ "indoor_restaurant_bar": {
41
+ "general": "The scene is captured inside a restaurant or bar with characteristic warm lighting.",
42
+ "bright": "The dining establishment is well-lit with warm illumination emphasizing ambiance.",
43
+ "medium": "The restaurant/bar has moderate warm lighting creating a comfortable social atmosphere.",
44
+ "dim": "The establishment features soft, warm lighting creating an intimate dining or social atmosphere."
45
  },
46
  "sunset/sunrise": {
47
  "general": "The scene is captured during golden hour with warm lighting.",
 
111
  "beach_lighting": "sun-drenched",
112
  "sports_venue_lighting": "arena-lit",
113
  "professional_kitchen_lighting": "kitchen-task lit",
114
+ "day_cloudy_gray": "gray-lit",
115
+ "indoor_residential_natural": "naturally-lit residential",
116
+ "indoor_designer_residential": "designer-lit residential",
117
+ "indoor_bright_natural_mix": "mixed-lit indoor",
118
  "unknown": ""
119
  },
120
  "activity_modifiers": {
 
161
  "bright": "The space blends bright natural and artificial light sources across indoor-outdoor boundaries.",
162
  "medium": "The area combines moderate indoor lighting with outdoor illumination in a balanced way.",
163
  "dim": "The transition space features subtle lighting gradients between indoor and outdoor zones."
164
+ },
165
+ "stadium_or_floodlit_area": {
166
+ "general": "The scene is captured under powerful floodlights creating uniform bright illumination.",
167
+ "bright": "The area is intensely illuminated by floodlights, similar to stadium conditions.",
168
+ "medium": "The space has even, powerful lighting typical of sports facilities or outdoor events.",
169
+ "dim": "The area has moderate floodlight illumination providing consistent lighting across the space."
170
  }
171
  }
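A lookup sketch for the templates above; describe_lighting is a hypothetical helper and assumes the nesting shown (condition name, then brightness level), falling back to the 'general' entry:

from lighting_conditions import LIGHTING_CONDITIONS

def describe_lighting(condition: str, brightness: str = "general") -> str:
    templates = LIGHTING_CONDITIONS.get(condition)
    if not isinstance(templates, dict):
        return ""
    return templates.get(brightness) or templates.get("general", "")

print(describe_lighting("indoor_restaurant_bar", "dim"))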
llm_enhancer.py CHANGED
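The hunks below mostly tighten the prompt templates and response post-processing. As background, a sketch of the intro-stripping idea used by _remove_introduction_sentences; the pattern list here is illustrative, not the module's full set:

import re

INTRO_PATTERNS = [
    r"^Here is the (?:rewritten|enhanced).*?description:\s*",
    r"^The (?:rewritten|enhanced) description:\s*",
]

def strip_intro(response: str) -> str:
    for pattern in INTRO_PATTERNS:
        response = re.sub(pattern, "", response, flags=re.IGNORECASE).lstrip()
    return response

print(strip_intro("Here is the enhanced description: A bright kitchen with two chairs."))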
@@ -19,7 +19,6 @@ class LLMEnhancer:
19
  top_p: float = 0.85):
20
  """
21
  Initialize the LLM enhancer
22
-
23
  Args:
24
  model_path: path to the LLM model or a HuggingFace model ID; defaults to Llama 3.2
25
  tokenizer_path: path to the tokenizer, usually the same as model_path
@@ -38,7 +37,7 @@ class LLMEnhancer:
38
  self.model_path = model_path or "meta-llama/Llama-3.2-3B-Instruct"
39
  self.tokenizer_path = tokenizer_path or self.model_path
40
 
41
- # 確定運行設備
42
  self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
43
  self.logger.info(f"Using device: {self.device}")
44
 
@@ -50,7 +49,7 @@ class LLMEnhancer:
50
  self.model = None
51
  self.tokenizer = None
52
 
53
- # 計數器,用來追蹤模型調用次數
54
  self.call_count = 0
55
 
56
  self._initialize_prompts()
@@ -124,17 +123,12 @@ class LLMEnhancer:
124
  self.enhance_description_template = """
125
  <|system|>
126
  You are an expert visual analyst. Your task is to improve the readability and fluency of scene descriptions using STRICT factual accuracy.
127
-
128
  Your **top priority is to avoid hallucination** or fabrication. You are working in a computer vision pipeline using object detection (YOLO) and image embeddings. You MUST treat the input object list as a whitelist. Do not speculate beyond this list.
129
-
130
  </|system|>
131
-
132
  <|user|>
133
  Rewrite the following scene description to be fluent and clear. DO NOT add any objects, events, or spatial relationships that are not explicitly present in the original or object list.
134
-
135
  ORIGINAL:
136
  {original_description}
137
-
138
  CRITICAL RULES:
139
  1. NEVER assume room type, object function, or scene purpose unless directly stated.
140
  2. NEVER invent object types. You are limited to: {object_list}
@@ -143,60 +137,51 @@ class LLMEnhancer:
143
  5. You MAY describe confirmed materials, colors, and composition style if visually obvious and non-speculative.
144
  6. Write 2–4 complete, well-structured sentences with punctuation.
145
  7. Final output MUST be a single fluent paragraph of 60–200 words (not longer).
146
- 8. NEVER include explanations, reasoning, or tags. ONLY provide the enhanced description.
147
- 9. Do not repeat any sentence structure or phrase more than once.
 
 
 
 
 
 
148
  </|user|>
149
-
150
  <|assistant|>
151
  """
152
 
153
-
154
  # Prompt for detection-error verification
155
  self.verify_detection_template = """
156
  Task: You are an advanced vision system that verifies computer vision detections for accuracy.
157
-
158
  Analyze the following detection results and identify any potential errors or inconsistencies:
159
-
160
  SCENE TYPE: {scene_type}
161
  SCENE NAME: {scene_name}
162
  CONFIDENCE: {confidence:.2f}
163
-
164
  DETECTED OBJECTS: {detected_objects}
165
-
166
  CLIP ANALYSIS RESULTS:
167
  {clip_analysis}
168
-
169
  Possible Errors to Check:
170
  1. Objects misidentified (e.g., architectural elements labeled as vehicles)
171
  2. Cultural elements misunderstood (e.g., Asian temple structures labeled as boats)
172
  3. Objects that seem out of place for this type of scene
173
  4. Inconsistencies between different detection systems
174
-
175
  If you find potential errors, list them clearly with explanations. If the detections seem reasonable, state that they appear accurate.
176
-
177
  Verification Results:
178
  """
179
 
180
  # Prompt for handling the no-detection case
181
  self.no_detection_template = """
182
  Task: You are an advanced scene understanding system analyzing an image where standard object detection failed to identify specific objects.
183
-
184
  Based on advanced image embeddings (CLIP analysis), we have the following information:
185
-
186
  MOST LIKELY SCENE: {top_scene} (confidence: {top_confidence:.2f})
187
  VIEWPOINT: {viewpoint}
188
  LIGHTING: {lighting_condition}
189
-
190
  CULTURAL ANALYSIS: {cultural_analysis}
191
-
192
  Create a detailed description of what might be in this scene, considering:
193
  1. The most likely type of location or setting
194
  2. Possible architectural or natural elements present
195
  3. The lighting and atmosphere
196
  4. Potential cultural or regional characteristics
197
-
198
  Your description should be natural, flowing, and offer insights into what the image likely contains despite the lack of specific object detection.
199
-
200
  Scene Description:
201
  """
202
 
@@ -300,7 +285,7 @@ class LLMEnhancer:
300
  self.logger.info("Model not loaded, no context to reset")
301
 
302
  def _remove_introduction_sentences(self, response: str) -> str:
303
- """移除生成文本中可能的介紹性句子"""
304
  # 識別常見的介紹性模式
305
  intro_patterns = [
306
  r'^Here is the (?:rewritten|enhanced) .*?description:',
@@ -318,7 +303,7 @@ class LLMEnhancer:
318
  return response
319
 
320
  def enhance_description(self, scene_data: Dict[str, Any]) -> str:
321
- """改進的場景描述增強器,處理各種場景類型並保留視角與光照資訊,並作為總窗口可運用於其他class"""
322
  try:
323
  # Reset the context
324
  self.reset_context()
@@ -332,7 +317,7 @@ class LLMEnhancer:
332
  if not original_desc:
333
  return "No original description provided."
334
 
335
- # 獲取scene type 並標準化
336
  scene_type = scene_data.get("scene_type", "unknown scene")
337
  scene_type = self._clean_scene_type(scene_type)
338
 
@@ -357,16 +342,28 @@ class LLMEnhancer:
357
  if confidence >= high_confidence_threshold:
358
  filtered_objects.append(obj)
359
 
360
- # 計算物件列表和數量 - 僅使用過濾後的高信心度物件
 
361
  object_counts = {}
362
- for obj in filtered_objects:
363
- class_name = obj.get("class_name", "")
364
- if class_name not in object_counts:
365
- object_counts[class_name] = 0
366
- object_counts[class_name] += 1
367
 
368
- # 將高置信度物件格式化為清單
369
- high_confidence_objects = ", ".join([f"{count} {obj}" for obj, count in object_counts.items()])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
370
 
371
  # 如果沒有高信心度物件,回退到使用原始描述中的關鍵詞
372
  if not high_confidence_objects:
@@ -399,6 +396,29 @@ class LLMEnhancer:
399
  response = self._generate_llm_response(prompt)
400
 
401
  # Stricter checks for response completeness
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
402
  is_incomplete = (
403
  len(response) < 100 or # too short
404
  (len(response) < 200 and "." not in response[-30:]) or # 結尾沒有適當的標點符號
@@ -442,7 +462,15 @@ class LLMEnhancer:
442
  if perspective and perspective.lower() not in result.lower():
443
  result = f"{perspective}, {result[0].lower()}{result[1:]}"
444
 
445
- return str(result)
 
 
 
 
 
 
 
 
446
 
447
  except Exception as e:
448
  self.logger.error(f"Enhancement failed: {str(e)}")
@@ -451,7 +479,7 @@ class LLMEnhancer:
451
  return original_desc # Return the original description if any error occurs
452
 
453
  def _verify_factual_accuracy(self, original: str, generated: str, object_list: str) -> str:
454
- """驗證生成的描述不包含原始描述或物體列表中沒有的信息"""
455
 
456
  # Merge the original description and the object list into the authorized vocabulary source
457
  authorized_content = original.lower() + " " + object_list.lower()
@@ -475,6 +503,55 @@ class LLMEnhancer:
475
  pattern = re.compile(r'\b' + term + r'\b', re.IGNORECASE)
476
  generated = pattern.sub(replacement, generated)
477
 
 
 
 
 
 
 
478
  return generated
479
 
480
 
@@ -486,14 +563,12 @@ class LLMEnhancer:
486
  confidence: float) -> Dict[str, Any]:
487
  """
488
  Verify and potentially correct the YOLO detection results
489
-
490
  Args:
491
  detected_objects: YOLO檢測到的物體列表
492
  clip_analysis: CLIP分析結果
493
  scene_type: 識別的場景類型
494
  scene_name: 場景名稱
495
  confidence: 場景分類的信心度
496
-
497
  Returns:
498
  Dict: 包含驗證結果和建議的字典
499
  """
@@ -520,7 +595,7 @@ class LLMEnhancer:
520
  result = {
521
  "verification_text": verification_result,
522
  "has_errors": "appear accurate" not in verification_result.lower(),
523
- "corrected_objects": None # 可能在未來版本實現詳細錯誤修正
524
  }
525
 
526
  return result
@@ -567,10 +642,8 @@ class LLMEnhancer:
567
  def handle_no_detection(self, clip_analysis: Dict[str, Any]) -> str:
568
  """
569
  Handle the case where YOLO does not detect any objects
570
-
571
  Args:
572
  clip_analysis: CLIP分析結果
573
-
574
  Returns:
575
  str: 生成的場景描述
576
  """
@@ -603,10 +676,8 @@ class LLMEnhancer:
603
  def _clean_input_text(self, text: str) -> str:
604
  """
605
  Perform general formatting cleanup on the input text, handling common formatting issues.
606
-
607
  Args:
608
  text: 輸入文本
609
-
610
  Returns:
611
  清理後的文本
612
  """
@@ -635,13 +706,11 @@ class LLMEnhancer:
635
  def _fact_check_description(self, original_desc: str, enhanced_desc: str, scene_type: str, detected_objects: List[str]) -> str:
636
  """
637
  Verify and potentially correct the enhanced description to make sure factual accuracy is preserved.
638
-
639
  Args:
640
  original_desc: 原始場景描述
641
  enhanced_desc: 增強後的描述待驗證
642
  scene_type: 場景類型
643
  detected_objects: 檢測到的物體名稱列表
644
-
645
  Returns:
646
  經過事實檢查的描述
647
  """
@@ -842,13 +911,14 @@ class LLMEnhancer:
842
  # Set Llama-specific generation parameters
843
  if "llama" in self.model_path.lower():
844
  generation_params.update({
845
- "temperature": 0.4, # 不要太高, 否則模型可能會太有主觀意見
846
  "max_new_tokens": 600,
847
  "do_sample": True,
848
- "top_p": 0.8,
849
- "repetition_penalty": 1.2, # 重複的懲罰權重,可避免掉重複字
850
- "num_beams": 4 ,
851
- "length_penalty": 1.2,
 
852
  })
853
 
854
  else:
@@ -885,9 +955,9 @@ class LLMEnhancer:
885
  if response.startswith(input_text):
886
  response = response[len(input_text):].strip()
887
 
888
- # 確保不返回空響應
889
  if not response or len(response.strip()) < 10:
890
- self.logger.warning("生成的回應為空的或太短,返回默認回應")
891
  return "No detailed description could be generated."
892
 
893
  return response
@@ -902,10 +972,8 @@ class LLMEnhancer:
902
  """
903
  Clean the LLM response to ensure the output contains only clean descriptive text.
904
  Sometimes it will not only display the description but display tags, notes...etc
905
-
906
  Args:
907
  response: Original response from the LLM
908
-
909
  Returns:
910
  Cleaned description text
911
  """
@@ -939,13 +1007,27 @@ class LLMEnhancer:
939
  for marker in section_markers:
940
  response = re.sub(marker, '', response, flags=re.IGNORECASE)
941
 
 
 
 
 
 
 
 
 
 
 
942
  # 3. Remove common prefixes and suffixes
943
  prefixes_to_remove = [
944
  "Enhanced Description:",
945
  "Scene Description:",
946
  "Description:",
947
  "Here is the enhanced description:",
948
- "Here's the enhanced description:"
 
 
 
 
949
  ]
950
 
951
  for prefix in prefixes_to_remove:
@@ -1004,6 +1086,49 @@ class LLMEnhancer:
1004
  # Recombine unique sentences
1005
  response = ' '.join(unique_sentences)
1006
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1007
  # 10. Ensure word count is within limits (50-150 words)
1008
  words = response.split()
1009
  if len(words) > 200:
@@ -1035,7 +1160,20 @@ class LLMEnhancer:
1035
  # Remove the last preposition or conjunction
1036
  response = " ".join(words[:-1]) + "."
1037
 
1038
- # 12. Ensure haven't over-filtered
 
 
 
 
 
 
 
 
 
 
 
 
 
1039
  if not response or len(response) < 40:
1040
  # Try to get the first meaningful paragraph from the original response
1041
  paragraphs = [p for p in original_response.split('\n\n') if p.strip()]
@@ -1052,7 +1190,7 @@ class LLMEnhancer:
1052
  # If still no good content, return a simple message
1053
  return "Unable to generate a valid enhanced description."
1054
 
1055
- # 13. Final cleaning - catch any missed special cases
1056
  response = re.sub(r'</?\|.*?\|>', '', response) # Any remaining tags
1057
  response = re.sub(r'\(.*?\)', '', response) # Any remaining parenthetical content
1058
  response = re.sub(r'Note:.*?(?=\n|$)', '', response, flags=re.IGNORECASE) # Any remaining notes
@@ -1064,7 +1202,7 @@ class LLMEnhancer:
1064
  if response and response[0].islower():
1065
  response = response[0].upper() + response[1:]
1066
 
1067
- # 14. 統一格式 - 確保輸出始終是單一段落
1068
  response = re.sub(r'\s*\n\s*', ' ', response) # Replace all line breaks with spaces
1069
  response = ' '.join(response.split())
1070
 
 
19
  top_p: float = 0.85):
20
  """
21
  Initialize the LLM enhancer
 
22
  Args:
23
  model_path: Path to the LLM or a Hugging Face model ID; defaults to Llama 3.2
24
  tokenizer_path: Path to the tokenizer, usually the same as model_path
 
37
  self.model_path = model_path or "meta-llama/Llama-3.2-3B-Instruct"
38
  self.tokenizer_path = tokenizer_path or self.model_path
39
 
40
+ # check device
41
  self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
42
  self.logger.info(f"Using device: {self.device}")
43
 
 
49
  self.model = None
50
  self.tokenizer = None
51
 
52
+ # Track the number of model calls
53
  self.call_count = 0
54
 
55
  self._initialize_prompts()
 
123
  self.enhance_description_template = """
124
  <|system|>
125
  You are an expert visual analyst. Your task is to improve the readability and fluency of scene descriptions using STRICT factual accuracy.
 
126
  Your **top priority is to avoid hallucination** or fabrication. You are working in a computer vision pipeline using object detection (YOLO) and image embeddings. You MUST treat the input object list as a whitelist. Do not speculate beyond this list.
 
127
  </|system|>
 
128
  <|user|>
129
  Rewrite the following scene description to be fluent and clear. DO NOT add any objects, events, or spatial relationships that are not explicitly present in the original or object list.
 
130
  ORIGINAL:
131
  {original_description}
 
132
  CRITICAL RULES:
133
  1. NEVER assume room type, object function, or scene purpose unless directly stated.
134
  2. NEVER invent object types. You are limited to: {object_list}
 
137
  5. You MAY describe confirmed materials, colors, and composition style if visually obvious and non-speculative.
138
  6. Write 2–4 complete, well-structured sentences with punctuation.
139
  7. Final output MUST be a single fluent paragraph of 60–200 words (not longer).
140
+ 8. Begin your response directly with the scene description. Do NOT include any introductory phrases, explanations, or formatting indicators.
141
+ 9. Ensure grammatical completeness in all sentences. Each sentence must have a complete subject and predicate structure.
142
+ 10. Vary sentence structures naturally while maintaining grammatical accuracy. Avoid incomplete phrases or dangling modifiers.
143
+ 11. Limit repetition of descriptive verbs and spatial indicators to maintain text diversity and readability.
144
+ 12. Create natural spatial flow by connecting object descriptions organically rather than listing positions mechanically.
145
+ 13. Use transitional phrases to connect ideas smoothly, varying expression patterns throughout the description.
146
+ 14. End with a conclusive observation about atmosphere, style, or overall impression rather than restating layout information.
147
+ 15. When describing quantities or arrangements, use only information explicitly confirmed by the object detection system.
148
  </|user|>
 
149
  <|assistant|>
150
  """
151
 
 
152
  # Prompt for error detection
153
  self.verify_detection_template = """
154
  Task: You are an advanced vision system that verifies computer vision detections for accuracy.
 
155
  Analyze the following detection results and identify any potential errors or inconsistencies:
 
156
  SCENE TYPE: {scene_type}
157
  SCENE NAME: {scene_name}
158
  CONFIDENCE: {confidence:.2f}
 
159
  DETECTED OBJECTS: {detected_objects}
 
160
  CLIP ANALYSIS RESULTS:
161
  {clip_analysis}
 
162
  Possible Errors to Check:
163
  1. Objects misidentified (e.g., architectural elements labeled as vehicles)
164
  2. Cultural elements misunderstood (e.g., Asian temple structures labeled as boats)
165
  3. Objects that seem out of place for this type of scene
166
  4. Inconsistencies between different detection systems
 
167
  If you find potential errors, list them clearly with explanations. If the detections seem reasonable, state that they appear accurate.
 
168
  Verification Results:
169
  """
170
 
171
  # Prompt for handling cases where no objects were detected
172
  self.no_detection_template = """
173
  Task: You are an advanced scene understanding system analyzing an image where standard object detection failed to identify specific objects.
 
174
  Based on advanced image embeddings (CLIP analysis), we have the following information:
 
175
  MOST LIKELY SCENE: {top_scene} (confidence: {top_confidence:.2f})
176
  VIEWPOINT: {viewpoint}
177
  LIGHTING: {lighting_condition}
 
178
  CULTURAL ANALYSIS: {cultural_analysis}
 
179
  Create a detailed description of what might be in this scene, considering:
180
  1. The most likely type of location or setting
181
  2. Possible architectural or natural elements present
182
  3. The lighting and atmosphere
183
  4. Potential cultural or regional characteristics
 
184
  Your description should be natural, flowing, and offer insights into what the image likely contains despite the lack of specific object detection.
 
185
  Scene Description:
186
  """
187
 
 
285
  self.logger.info("Model not loaded, no context to reset")
286
 
287
  def _remove_introduction_sentences(self, response: str) -> str:
288
+ """remove introduction sentences"""
289
  # Identify common introductory sentence patterns
290
  intro_patterns = [
291
  r'^Here is the (?:rewritten|enhanced) .*?description:',
 
303
  return response
304
 
305
  def enhance_description(self, scene_data: Dict[str, Any]) -> str:
306
+ """場景描述增強器,處理各種場景類型並保留視角與光照資訊,並作為總窗口可運用於其他class"""
307
  try:
308
  # Reset the context
309
  self.reset_context()
 
317
  if not original_desc:
318
  return "No original description provided."
319
 
320
+ # Get the scene type and normalize it
321
  scene_type = scene_data.get("scene_type", "unknown scene")
322
  scene_type = self._clean_scene_type(scene_type)
323
 
 
342
  if confidence >= high_confidence_threshold:
343
  filtered_objects.append(obj)
344
 
345
+ # Prefer the object statistics passed in; compute them only if they are absent
346
+ object_statistics = scene_data.get("object_statistics", {})
347
  object_counts = {}
 
 
 
 
 
348
 
349
+ if object_statistics:
350
+ # Use the precomputed statistics to keep the counts accurate
351
+ for class_name, stats in object_statistics.items():
352
+ if stats.get("count", 0) > 0 and stats.get("avg_confidence", 0) >= high_confidence_threshold:
353
+ object_counts[class_name] = stats["count"]
354
+ else:
355
+ # Fall back to the original counting method
356
+ for obj in filtered_objects:
357
+ class_name = obj.get("class_name", "")
358
+ if class_name not in object_counts:
359
+ object_counts[class_name] = 0
360
+ object_counts[class_name] += 1
361
+
362
+ # 將物件格式化為更精確的描述
363
+ high_confidence_objects = ", ".join([
364
+ f"{count} {obj}{'s' if count > 1 else ''}"
365
+ for obj, count in object_counts.items()
366
+ ])
367
 
368
  # 如果沒有高信心度物件,回退到使用原始描述中的關鍵詞
369
  if not high_confidence_objects:
 
396
  response = self._generate_llm_response(prompt)
397
 
398
  # 檢查回應完整性的更嚴格標準
399
+ is_landmark_only = (
400
+ scene_data.get("scene_type") in ["tourist_landmark", "natural_landmark", "historical_monument"] and
401
+ (not scene_data.get("detected_objects") or len(scene_data.get("detected_objects", [])) <= 1)
402
+ )
403
+
404
+ # 如果是只有地標的情況,調整相關邏輯
405
+ if is_landmark_only:
406
+ # 確保原始描述不為空
407
+ original_desc = scene_data.get("original_description", "")
408
+ if not original_desc or len(original_desc.strip()) < 10:
409
+ # 從場景類型和地標信息生成基本描述
410
+ scene_type = scene_data.get("scene_type", "unknown")
411
+ scene_name = scene_data.get("scene_name", "Unknown")
412
+ if "primary_landmark" in scene_data:
413
+ landmark_name = scene_data["primary_landmark"].get("name", "unnamed landmark")
414
+ original_desc = f"A {scene_type.replace('_', ' ')} scene featuring {landmark_name}."
415
+ else:
416
+ original_desc = f"A {scene_type.replace('_', ' ')} scene."
417
+
418
+ # Update the scene data
419
+ scene_data["original_description"] = original_desc
420
+
421
+ # Stricter criteria for checking response completeness (unchanged)
422
  is_incomplete = (
423
  len(response) < 100 or # too short
424
  (len(response) < 200 and "." not in response[-30:]) or # 結尾沒有適當的標點符號
 
462
  if perspective and perspective.lower() not in result.lower():
463
  result = f"{perspective}, {result[0].lower()}{result[1:]}"
464
 
465
+ final_result = str(result)
466
+ if not final_result or len(final_result.strip()) < 20:
467
+ self.logger.warning(f"WARNING: LLM enhanced description is empty or too short!")
468
+ self.logger.info(f"Original description: {original_desc[:50]}...")
469
+ self.logger.info(f"Input data: scene_type={scene_data.get('scene_type')}, objects={len(scene_data.get('detected_objects', []))}")
470
+ else:
471
+ self.logger.info(f"LLM enhanced description generated successfully ({len(final_result)} chars)")
472
+
473
+ return final_result
474
 
475
  except Exception as e:
476
  self.logger.error(f"Enhancement failed: {str(e)}")
 
479
  return original_desc # 發生任何錯誤時返回原始描述
480
 
481
  def _verify_factual_accuracy(self, original: str, generated: str, object_list: str) -> str:
482
+ """驗證生成的描述不包含原始描述或物體列表中沒有的信息,並檢測重複用詞問題"""
483
 
484
  # Merge the original description and the object list into the authorized vocabulary source
485
  authorized_content = original.lower() + " " + object_list.lower()
 
503
  pattern = re.compile(r'\b' + term + r'\b', re.IGNORECASE)
504
  generated = pattern.sub(replacement, generated)
505
 
506
+ # Check for repetitive descriptive vocabulary
507
+ repetitive_patterns = [
508
+ (r'\b(visible)\b.*?\b(visible)\b', 'Multiple uses of "visible" detected'),
509
+ (r'\b(positioned)\b.*?\b(positioned)\b', 'Multiple uses of "positioned" detected'),
510
+ (r'\b(located)\b.*?\b(located)\b', 'Multiple uses of "located" detected'),
511
+ (r'\b(situated)\b.*?\b(situated)\b', 'Multiple uses of "situated" detected'),
512
+ (r'\b(appears)\b.*?\b(appears)\b', 'Multiple uses of "appears" detected'),
513
+ (r'\b(features)\b.*?\b(features)\b', 'Multiple uses of "features" detected'),
514
+ (r'\bThis\s+(\w+)\s+.*?\bThis\s+\1\b', 'Repetitive sentence structure detected')
515
+ ]
516
+
517
+ # Define a replacement dictionary that provides varied expressions
518
+ replacement_dict = {
519
+ 'visible': ['present', 'evident', 'apparent', 'observable'],
520
+ 'positioned': ['arranged', 'placed', 'set', 'organized'],
521
+ 'located': ['found', 'placed', 'situated', 'established'],
522
+ 'situated': ['placed', 'positioned', 'arranged', 'set'],
523
+ 'appears': ['seems', 'looks', 'presents', 'exhibits'],
524
+ 'features': ['includes', 'contains', 'displays', 'showcases']
525
+ }
526
+
527
+ for pattern, issue in repetitive_patterns:
528
+ matches = list(re.finditer(pattern, generated, re.IGNORECASE | re.DOTALL))
529
+ if matches:
530
+ self.logger.warning(f"Text quality issue detected: {issue}")
531
+
532
+ # Replace the specific repeated words
533
+ for word in replacement_dict.keys():
534
+ if word in issue.lower():
535
+ word_pattern = re.compile(r'\b' + word + r'\b', re.IGNORECASE)
536
+ word_matches = list(word_pattern.finditer(generated))
537
+
538
+ # Keep the first occurrence and replace subsequent ones
539
+ for i, match in enumerate(word_matches[1:], 1):
540
+ if i <= len(replacement_dict[word]):
541
+ replacement = replacement_dict[word][(i-1) % len(replacement_dict[word])]
542
+
543
+ # Preserve the original casing
544
+ if match.group().isupper():
545
+ replacement = replacement.upper()
546
+ elif match.group().istitle():
547
+ replacement = replacement.capitalize()
548
+
549
+ # Perform the replacement
550
+ generated = generated[:match.start()] + replacement + generated[match.end():]
551
+ # Recompute the positions of subsequent matches
552
+ word_matches = list(word_pattern.finditer(generated))
553
+ break
554
+
555
  return generated
556
 
557
 
 
563
  confidence: float) -> Dict[str, Any]:
564
  """
565
  Verify and potentially correct the YOLO detection results

566
  Args:
567
  detected_objects: List of objects detected by YOLO
568
  clip_analysis: CLIP analysis results
569
  scene_type: The identified scene type
570
  scene_name: The scene name
571
  confidence: Confidence score of the scene classification

572
  Returns:
573
  Dict: A dictionary containing the verification results and suggestions
574
  """
 
595
  result = {
596
  "verification_text": verification_result,
597
  "has_errors": "appear accurate" not in verification_result.lower(),
598
+ "corrected_objects": None
599
  }
600
 
601
  return result
 
642
  def handle_no_detection(self, clip_analysis: Dict[str, Any]) -> str:
643
  """
644
  Handle the case where YOLO does not detect any objects

645
  Args:
646
  clip_analysis: CLIP analysis results

647
  Returns:
648
  str: The generated scene description
649
  """
 
676
  def _clean_input_text(self, text: str) -> str:
677
  """
678
  Perform general formatting cleanup on the input text, handling common formatting issues.

679
  Args:
680
  text: The input text

681
  Returns:
682
  The cleaned text
683
  """
 
706
  def _fact_check_description(self, original_desc: str, enhanced_desc: str, scene_type: str, detected_objects: List[str]) -> str:
707
  """
708
  Verify and potentially correct the enhanced description to make sure factual accuracy is preserved.

709
  Args:
710
  original_desc: The original scene description
711
  enhanced_desc: The enhanced description to be verified
712
  scene_type: The scene type
713
  detected_objects: List of detected object names

714
  Returns:
715
  The fact-checked description
716
  """
 
911
  # Set Llama-specific generation parameters
912
  if "llama" in self.model_path.lower():
913
  generation_params.update({
914
+ "temperature": 0.35, # 不要太高, 否則模型可能會太有主觀意見
915
  "max_new_tokens": 600,
916
  "do_sample": True,
917
+ "top_p": 0.75,
918
+ "repetition_penalty": 1.5, # 重複的懲罰權重,可避免掉重複字
919
+ "num_beams": 5 ,
920
+ "length_penalty": 1,
921
+ "no_repeat_ngram_size": 3
922
  })
923
 
924
  else:
 
955
  if response.startswith(input_text):
956
  response = response[len(input_text):].strip()
957
 
958
+ # Ensure an empty response is not returned
959
  if not response or len(response.strip()) < 10:
960
+ self.logger.warning("response is too short or empty")
961
  return "No detailed description could be generated."
962
 
963
  return response
 
972
  """
973
  Clean the LLM response to ensure the output contains only clean descriptive text.
974
  Sometimes it will not only display the description but display tags, notes...etc
 
975
  Args:
976
  response: Original response from the LLM
 
977
  Returns:
978
  Cleaned description text
979
  """
 
1007
  for marker in section_markers:
1008
  response = re.sub(marker, '', response, flags=re.IGNORECASE)
1009
 
1010
+ # 2.5. Strip "Here is..."-style introductory prefixes
1011
+ intro_prefixes = [
1012
+ r'^Here\s+is\s+(?:a\s+|the\s+)?(?:rewritten\s+|enhanced\s+)?scene\s+description.*?:\s*',
1013
+ r'^The\s+(?:rewritten\s+|enhanced\s+)?(?:scene\s+)?description\s+is.*?:\s*',
1014
+ r'^Here\'s\s+(?:a\s+|the\s+)?(?:rewritten\s+|enhanced\s+)?description.*?:\s*'
1015
+ ]
1016
+
1017
+ for prefix_pattern in intro_prefixes:
1018
+ response = re.sub(prefix_pattern, '', response, flags=re.IGNORECASE)
1019
+
1020
  # 3. Remove common prefixes and suffixes
1021
  prefixes_to_remove = [
1022
  "Enhanced Description:",
1023
  "Scene Description:",
1024
  "Description:",
1025
  "Here is the enhanced description:",
1026
+ "Here's the enhanced description:",
1027
+ "Here is a rewritten scene description that adheres to the provided critical rules:",
1028
+ "Here is the rewritten scene description:",
1029
+ "Here's a rewritten scene description:",
1030
+ "The rewritten scene description is as follows:"
1031
  ]
1032
 
1033
  for prefix in prefixes_to_remove:
 
1086
  # Recombine unique sentences
1087
  response = ' '.join(unique_sentences)
1088
 
1089
+ # 9.5. Advanced repetition detection and replacement
1090
+ repetitive_descriptors = ['visible', 'positioned', 'located', 'situated', 'appears', 'features', 'shows', 'displays']
1091
+ word_usage_count = {}
1092
+
1093
+ # Count occurrences of each repetitive descriptor
1094
+ for word in repetitive_descriptors:
1095
+ count = len(re.findall(r'\b' + word + r'\b', response, re.IGNORECASE))
1096
+ if count > 1:
1097
+ word_usage_count[word] = count
1098
+
1099
+ # Replace excessive repetitions with varied alternatives
1100
+ replacement_alternatives = {
1101
+ 'visible': ['present', 'evident', 'apparent', 'observable'],
1102
+ 'positioned': ['arranged', 'placed', 'set', 'organized'],
1103
+ 'located': ['found', 'placed', 'situated', 'established'],
1104
+ 'situated': ['placed', 'positioned', 'arranged', 'set'],
1105
+ 'appears': ['seems', 'looks', 'presents', 'exhibits'],
1106
+ 'features': ['includes', 'contains', 'displays', 'showcases'],
1107
+ 'shows': ['reveals', 'presents', 'exhibits', 'demonstrates'],
1108
+ 'displays': ['presents', 'exhibits', 'shows', 'reveals']
1109
+ }
1110
+
1111
+ for word, count in word_usage_count.items():
1112
+ if count > 1 and word in replacement_alternatives:
1113
+ # Find all occurrences
1114
+ pattern = re.compile(r'\b' + word + r'\b', re.IGNORECASE)
1115
+ matches = list(pattern.finditer(response))
1116
+
1117
+ # Replace subsequent occurrences (keep first one)
1118
+ for i, match in enumerate(matches[1:], 1):
1119
+ if i <= len(replacement_alternatives[word]):
1120
+ replacement = replacement_alternatives[word][(i-1) % len(replacement_alternatives[word])]
1121
+ # Maintain original case pattern
1122
+ if match.group().isupper():
1123
+ replacement = replacement.upper()
1124
+ elif match.group().istitle():
1125
+ replacement = replacement.capitalize()
1126
+
1127
+ response = response[:match.start()] + replacement + response[match.end():]
1128
+ # Update remaining matches positions
1129
+ offset = len(replacement) - len(match.group())
1130
+ matches = list(pattern.finditer(response))
1131
+
1132
  # 10. Ensure word count is within limits (50-150 words)
1133
  words = response.split()
1134
  if len(words) > 200:
 
1160
  # Remove the last preposition or conjunction
1161
  response = " ".join(words[:-1]) + "."
1162
 
1163
+ # 12. Grammar completeness check
1164
+ incomplete_patterns = [
1165
+ r'\b(fine|the)\s+(the\s+)?(?:urban|area|scene)\b(?!\s+\w)', # 檢測不完整的片語
1166
+ r'\b(and|or|but|with|from|in|at|on)\s*[.!?]', # 介詞後直接結束
1167
+ r'\b\w+\s+\1\b' # 重複詞語檢測
1168
+ ]
1169
+
1170
+ for pattern in incomplete_patterns:
1171
+ if re.search(pattern, response, re.IGNORECASE):
1172
+ # 移除有問題的片段或進行修正
1173
+ response = re.sub(pattern, '', response, flags=re.IGNORECASE)
1174
+ response = re.sub(r'\s{2,}', ' ', response) # 清理多餘空格
1175
+
1176
+ # 13. Ensure haven't over-filtered
1177
  if not response or len(response) < 40:
1178
  # Try to get the first meaningful paragraph from the original response
1179
  paragraphs = [p for p in original_response.split('\n\n') if p.strip()]
 
1190
  # If still no good content, return a simple message
1191
  return "Unable to generate a valid enhanced description."
1192
 
1193
+ # 14. Final cleaning - catch any missed special cases
1194
  response = re.sub(r'</?\|.*?\|>', '', response) # Any remaining tags
1195
  response = re.sub(r'\(.*?\)', '', response) # Any remaining parenthetical content
1196
  response = re.sub(r'Note:.*?(?=\n|$)', '', response, flags=re.IGNORECASE) # Any remaining notes
 
1202
  if response and response[0].islower():
1203
  response = response[0].upper() + response[1:]
1204
 
1205
+ # 15. 統一格式 - 確保輸出始終是單一段落
1206
  response = re.sub(r'\s*\n\s*', ' ', response) # 將所有換行符替換為空格
1207
  response = ' '.join(response.split())
1208
 
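
A minimal usage sketch of the enhancer shown in the diff above. It assumes the default constructor and the scene_data keys that enhance_description reads (original_description, scene_type, detected_objects with class_name/confidence, and the optional object_statistics block); the sample values are illustrative only.

from llm_enhancer import LLMEnhancer

enhancer = LLMEnhancer()  # defaults to meta-llama/Llama-3.2-3B-Instruct per the constructor above
scene_data = {
    "original_description": "A city street with several cars and a few pedestrians.",
    "scene_type": "city_street",
    "detected_objects": [
        {"class_name": "car", "confidence": 0.82},
        {"class_name": "person", "confidence": 0.74},
    ],
    # Optional precomputed statistics; when present they take priority over recounting
    "object_statistics": {"car": {"count": 3, "avg_confidence": 0.82}},
}
enhanced = enhancer.enhance_description(scene_data)
print(enhanced)
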
object_template_fillers.py CHANGED
@@ -74,5 +74,10 @@ OBJECT_TEMPLATE_FILLERS = {
74
  "playing_surface": ["marked court boundaries", "manicured field turf", "running tracks", "competition equipment", "sports field markers"],
75
  "construction_equipment": ["tower cranes", "excavators", "cement mixers", "scaffolding structures", "construction barriers"],
76
  "medical_elements": ["examination furniture", "monitoring equipment", "sanitation stations", "privacy screens", "medical supply carts"],
77
- "educational_furniture": ["student desks", "lecture podiums", "laboratory benches", "learning stations", "collaborative workspace tables"]
78
- }
 
 
 
 
 
 
74
  "playing_surface": ["marked court boundaries", "manicured field turf", "running tracks", "competition equipment", "sports field markers"],
75
  "construction_equipment": ["tower cranes", "excavators", "cement mixers", "scaffolding structures", "construction barriers"],
76
  "medical_elements": ["examination furniture", "monitoring equipment", "sanitation stations", "privacy screens", "medical supply carts"],
77
+ "educational_furniture": ["student desks", "lecture podiums", "laboratory benches", "learning stations", "collaborative workspace tables"],
78
+
79
+ "landmark_features": ["distinctive architecture", "iconic structural elements", "famous design features", "recognized silhouette", "impressive proportions"],
80
+ "tourist_activities": ["sightseeing", "guided tours", "photography", "cultural exploration", "souvenir shopping"],
81
+ "outdoor_activities": ["nature photography", "hiking", "scenic viewing", "wildlife observation", "outdoor exploration"],
82
+ "historical_elements": ["cultural heritage", "historical events", "architectural periods", "traditional craftsmanship", "significant achievements"]
83
+ }
places365_model.py ADDED
@@ -0,0 +1,492 @@
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torchvision.transforms as transforms
4
+ import numpy as np
5
+ from PIL import Image
6
+ from typing import Dict, List, Tuple, Optional, Any
7
+ import logging
8
+
9
+ class Places365Model:
10
+ """
11
+ Places365 scene classification model wrapper for scene understanding integration.
12
+ Provides scene classification and scene attribute prediction capabilities.
13
+ """
14
+
15
+ def __init__(self, model_name: str = 'resnet50_places365', device: Optional[str] = None):
16
+ """
17
+ Initialize Places365 model with configurable architecture and device.
18
+
19
+ Args:
20
+ model_name: Model architecture name (defaults to resnet50)
21
+ device: Target device for inference (auto-detected if None)
22
+ """
23
+ self.logger = logging.getLogger(self.__class__.__name__)
24
+
25
+ # Device configuration with fallback logic
26
+ if device is None:
27
+ self.device = "cuda" if torch.cuda.is_available() else "cpu"
28
+ else:
29
+ self.device = device
30
+
31
+ self.model_name = model_name
32
+ self.model = None
33
+ self.scene_classes = []
34
+ self.scene_attributes = []
35
+
36
+ # Model configuration mapping
37
+ self.model_configs = {
38
+ 'resnet18_places365': {
39
+ 'arch': 'resnet18',
40
+ 'num_classes': 365,
41
+ 'url': 'http://places2.csail.mit.edu/models_places365/resnet18_places365.pth.tar'
42
+ },
43
+ 'resnet50_places365': {
44
+ 'arch': 'resnet50',
45
+ 'num_classes': 365,
46
+ 'url': 'http://places2.csail.mit.edu/models_places365/resnet50_places365.pth.tar'
47
+ },
48
+ 'densenet161_places365': {
49
+ 'arch': 'densenet161',
50
+ 'num_classes': 365,
51
+ 'url': 'http://places2.csail.mit.edu/models_places365/densenet161_places365.pth.tar'
52
+ }
53
+ }
54
+
55
+ self._load_model()
56
+ self._load_class_names()
57
+ self._setup_scene_mapping()
58
+
59
+ def _load_model(self):
60
+ """載入與初始化 Places365 model"""
61
+ try:
62
+ if self.model_name not in self.model_configs:
63
+ raise ValueError(f"Unsupported model name: {self.model_name}")
64
+
65
+ config = self.model_configs[self.model_name]
66
+
67
+ # Import model architecture
68
+ if config['arch'].startswith('resnet'):
69
+ import torchvision.models as models
70
+ if config['arch'] == 'resnet18':
71
+ self.model = models.resnet18(num_classes=config['num_classes'])
72
+ elif config['arch'] == 'resnet50':
73
+ self.model = models.resnet50(num_classes=config['num_classes'])
74
+ elif config['arch'] == 'densenet161':
75
+ import torchvision.models as models
76
+ self.model = models.densenet161(num_classes=config['num_classes'])
77
+
78
+ # Load pretrained weights
79
+ checkpoint = torch.hub.load_state_dict_from_url(
80
+ config['url'],
81
+ map_location=self.device,
82
+ progress=True
83
+ )
84
+
85
+ # Handle different checkpoint formats
86
+ if 'state_dict' in checkpoint:
87
+ state_dict = checkpoint['state_dict']
88
+ # Remove 'module.' prefix if present
89
+ state_dict = {k.replace('module.', ''): v for k, v in state_dict.items()}
90
+ else:
91
+ state_dict = checkpoint
92
+
93
+ self.model.load_state_dict(state_dict)
94
+ self.model.to(self.device)
95
+ self.model.eval()
96
+
97
+ self.logger.info(f"Places365 model {self.model_name} loaded successfully on {self.device}")
98
+
99
+ except Exception as e:
100
+ self.logger.error(f"Error loading Places365 model: {str(e)}")
101
+ raise
102
+
103
+ def _load_class_names(self):
104
+ """Load Places365 class names and scene attributes."""
105
+ try:
106
+ # Load scene class names (365 categories)
107
+ import urllib.request
108
+
109
+ class_url = 'https://raw.githubusercontent.com/csailvision/places365/master/categories_places365.txt'
110
+ class_file = urllib.request.urlopen(class_url)
111
+
112
+ self.scene_classes = []
113
+ for line in class_file:
114
+ class_name = line.decode('utf-8').strip().split(' ')[0][3:] # Remove /x/ prefix
115
+ self.scene_classes.append(class_name)
116
+
117
+ # Load scene attributes (optional, for enhanced description)
118
+ attr_url = 'https://raw.githubusercontent.com/csailvision/places365/master/labels_sunattribute.txt'
119
+ try:
120
+ attr_file = urllib.request.urlopen(attr_url)
121
+ self.scene_attributes = []
122
+ for line in attr_file:
123
+ attr_name = line.decode('utf-8').strip()
124
+ self.scene_attributes.append(attr_name)
125
+ except Exception:
126
+ self.logger.warning("Scene attributes not loaded, continuing with basic classification")
127
+ self.scene_attributes = []
128
+
129
+ self.logger.info(f"Loaded {len(self.scene_classes)} scene classes and {len(self.scene_attributes)} attributes")
130
+
131
+ except Exception as e:
132
+ self.logger.error(f"Error loading class names: {str(e)}")
133
+ # Fallback to basic class names if download fails
134
+ self.scene_classes = [f"scene_class_{i}" for i in range(365)]
135
+ self.scene_attributes = []
136
+
137
+ def _setup_scene_mapping(self):
138
+ """Setup mapping from Places365 classes to common scene types."""
139
+ # Build the mapping from Places365 classes to the common scene types used by the system
140
+ self.scene_type_mapping = {
141
+ # Indoor scenes
142
+ 'living_room': 'living_room',
143
+ 'bedroom': 'bedroom',
144
+ 'kitchen': 'kitchen',
145
+ 'dining_room': 'dining_area',
146
+ 'bathroom': 'bathroom',
147
+ 'office': 'office_workspace',
148
+ 'conference_room': 'office_workspace',
149
+ 'classroom': 'educational_setting',
150
+ 'library': 'library',
151
+ 'restaurant': 'restaurant',
152
+ 'cafe': 'cafe',
153
+ 'bar': 'bar',
154
+ 'hotel_room': 'hotel_room',
155
+ 'hospital_room': 'medical_facility',
156
+ 'gym': 'gym',
157
+ 'supermarket': 'retail_store',
158
+ 'clothing_store': 'retail_store',
159
+
160
+ # Outdoor urban scenes
161
+ 'street': 'city_street',
162
+ 'crosswalk': 'intersection',
163
+ 'parking_lot': 'parking_lot',
164
+ 'gas_station': 'gas_station',
165
+ 'bus_station': 'bus_stop',
166
+ 'train_station': 'train_station',
167
+ 'airport_terminal': 'airport',
168
+ 'subway_station': 'subway_station',
169
+ 'bridge': 'bridge',
170
+ 'highway': 'highway',
171
+ 'downtown': 'commercial_district',
172
+ 'shopping_mall': 'shopping_mall',
173
+
174
+ # Natural outdoor scenes
175
+ 'park': 'park_area',
176
+ 'beach': 'beach',
177
+ 'forest': 'forest',
178
+ 'mountain': 'mountain',
179
+ 'lake': 'lake',
180
+ 'river': 'river',
181
+ 'ocean': 'ocean',
182
+ 'desert': 'desert',
183
+ 'field': 'field',
184
+ 'garden': 'garden',
185
+
186
+ # Landmark and tourist areas
187
+ 'castle': 'historical_monument',
188
+ 'palace': 'historical_monument',
189
+ 'temple': 'temple',
190
+ 'church': 'church',
191
+ 'mosque': 'mosque',
192
+ 'museum': 'museum',
193
+ 'art_gallery': 'art_gallery',
194
+ 'tower': 'tourist_landmark',
195
+ 'monument': 'historical_monument',
196
+
197
+ # Sports and entertainment
198
+ 'stadium': 'stadium',
199
+ 'basketball_court': 'sports_field',
200
+ 'tennis_court': 'sports_field',
201
+ 'swimming_pool': 'swimming_pool',
202
+ 'playground': 'playground',
203
+ 'amusement_park': 'amusement_park',
204
+ 'theater': 'theater',
205
+ 'concert_hall': 'concert_hall',
206
+
207
+ # Transportation
208
+ 'airplane_cabin': 'airplane_cabin',
209
+ 'train_interior': 'train_interior',
210
+ 'car_interior': 'car_interior',
211
+
212
+ # Construction and industrial
213
+ 'construction_site': 'construction_site',
214
+ 'factory': 'factory',
215
+ 'warehouse': 'warehouse'
216
+ }
217
+
218
+ # Indoor/outdoor classification helper
219
+ self.indoor_classes = {
220
+ 'living_room', 'bedroom', 'kitchen', 'dining_room', 'bathroom', 'office',
221
+ 'conference_room', 'classroom', 'library', 'restaurant', 'cafe', 'bar',
222
+ 'hotel_room', 'hospital_room', 'gym', 'supermarket', 'clothing_store',
223
+ 'airplane_cabin', 'train_interior', 'car_interior', 'theater', 'concert_hall',
224
+ 'museum', 'art_gallery', 'shopping_mall'
225
+ }
226
+
227
+ self.outdoor_classes = {
228
+ 'street', 'crosswalk', 'parking_lot', 'gas_station', 'bus_station',
229
+ 'train_station', 'airport_terminal', 'bridge', 'highway', 'downtown',
230
+ 'park', 'beach', 'forest', 'mountain', 'lake', 'river', 'ocean',
231
+ 'desert', 'field', 'garden', 'stadium', 'basketball_court', 'tennis_court',
232
+ 'swimming_pool', 'playground', 'amusement_park', 'construction_site',
233
+ 'factory', 'warehouse', 'castle', 'palace', 'temple', 'church', 'mosque',
234
+ 'tower', 'monument'
235
+ }
236
+
237
+ def preprocess(self, image_pil: Image.Image) -> torch.Tensor:
238
+ """
239
+ Preprocess PIL image for Places365 model inference.
240
+
241
+ Args:
242
+ image_pil: Input PIL image
243
+
244
+ Returns:
245
+ torch.Tensor: Preprocessed image tensor
246
+ """
247
+ # Places365 standard preprocessing
248
+ transform = transforms.Compose([
249
+ transforms.Resize((256, 256)),
250
+ transforms.CenterCrop(224),
251
+ transforms.ToTensor(),
252
+ transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
253
+ ])
254
+
255
+ # Convert to RGB if needed
256
+ if image_pil.mode != 'RGB':
257
+ image_pil = image_pil.convert('RGB')
258
+
259
+ # Apply preprocessing
260
+ input_tensor = transform(image_pil).unsqueeze(0)
261
+ return input_tensor.to(self.device)
262
+
263
+ def predict(self, image_pil: Image.Image) -> Dict[str, Any]:
264
+ """
265
+ Predict scene classification and attributes for input image.
266
+
267
+ Args:
268
+ image_pil: Input PIL image
269
+
270
+ Returns:
271
+ Dict containing scene predictions and confidence scores
272
+ """
273
+ try:
274
+ # Preprocess image
275
+ input_tensor = self.preprocess(image_pil)
276
+
277
+ # Model inference
278
+ with torch.no_grad():
279
+ outputs = self.model(input_tensor)
280
+ probabilities = torch.nn.functional.softmax(outputs, dim=1)
281
+
282
+ # Return the most likely predictions
283
+ top_k = min(10, len(self.scene_classes)) # Configurable top-k
284
+ top_probs, top_indices = torch.topk(probabilities, top_k, dim=1)
285
+
286
+ # Extract results
287
+ top_probs = top_probs.cpu().numpy()[0]
288
+ top_indices = top_indices.cpu().numpy()[0]
289
+
290
+ # Build prediction results
291
+ predictions = []
292
+ for i in range(top_k):
293
+ class_idx = top_indices[i]
294
+ confidence = float(top_probs[i])
295
+ scene_class = self.scene_classes[class_idx]
296
+
297
+ predictions.append({
298
+ 'class_name': scene_class,
299
+ 'class_index': class_idx,
300
+ 'confidence': confidence
301
+ })
302
+
303
+ # Get primary prediction
304
+ primary_prediction = predictions[0]
305
+ primary_class = primary_prediction['class_name']
306
+
307
+ # Determine whether the scene is indoor or outdoor
308
+ is_indoor = self._classify_indoor_outdoor(primary_class)
309
+
310
+ # Map to common scene type
311
+ mapped_scene_type = self._map_places365_to_scene_types(primary_class)
312
+
313
+ # Determine scene attributes (basic inference based on class)
314
+ scene_attributes = self._infer_scene_attributes(primary_class)
315
+
316
+ result = {
317
+ 'scene_label': primary_class,
318
+ 'mapped_scene_type': mapped_scene_type,
319
+ 'confidence': primary_prediction['confidence'],
320
+ 'is_indoor': is_indoor,
321
+ 'attributes': scene_attributes,
322
+ 'top_predictions': predictions,
323
+ 'all_probabilities': probabilities.cpu().numpy()[0].tolist()
324
+ }
325
+
326
+ return result
327
+
328
+ except Exception as e:
329
+ self.logger.error(f"Error in Places365 prediction: {str(e)}")
330
+ return {
331
+ 'scene_label': 'unknown',
332
+ 'mapped_scene_type': 'unknown',
333
+ 'confidence': 0.0,
334
+ 'is_indoor': None,
335
+ 'attributes': [],
336
+ 'top_predictions': [],
337
+ 'error': str(e)
338
+ }
339
+
340
+ def _classify_indoor_outdoor(self, scene_class: str) -> Optional[bool]:
341
+ """
342
+ Classify if scene is indoor or outdoor based on Places365 class.
343
+
344
+ Args:
345
+ scene_class: Places365 scene class name
346
+
347
+ Returns:
348
+ bool or None: True for indoor, False for outdoor, None if uncertain
349
+ """
350
+ if scene_class in self.indoor_classes:
351
+ return True
352
+ elif scene_class in self.outdoor_classes:
353
+ return False
354
+ else:
355
+ # For ambiguous classes, use heuristics
356
+ indoor_keywords = ['room', 'office', 'store', 'shop', 'hall', 'interior', 'indoor']
357
+ outdoor_keywords = ['street', 'road', 'park', 'field', 'beach', 'mountain', 'outdoor']
358
+
359
+ scene_lower = scene_class.lower()
360
+ if any(keyword in scene_lower for keyword in indoor_keywords):
361
+ return True
362
+ elif any(keyword in scene_lower for keyword in outdoor_keywords):
363
+ return False
364
+ else:
365
+ return None
366
+
367
+ def _map_places365_to_scene_types(self, places365_class: str) -> str:
368
+ """
369
+ Map Places365 class to common scene type used by the system.
370
+
371
+ Args:
372
+ places365_class: Places365 scene class name
373
+
374
+ Returns:
375
+ str: Mapped scene type
376
+ """
377
+ # Direct mapping lookup
378
+ if places365_class in self.scene_type_mapping:
379
+ return self.scene_type_mapping[places365_class]
380
+
381
+ # Fuzzy matching for similar classes
382
+ places365_lower = places365_class.lower()
383
+
384
+ # Indoor fuzzy matching
385
+ if any(keyword in places365_lower for keyword in ['living', 'bedroom', 'kitchen']):
386
+ return 'general_indoor_space'
387
+ elif any(keyword in places365_lower for keyword in ['office', 'conference', 'meeting']):
388
+ return 'office_workspace'
389
+ elif any(keyword in places365_lower for keyword in ['dining', 'restaurant', 'cafe']):
390
+ return 'dining_area'
391
+ elif any(keyword in places365_lower for keyword in ['store', 'shop', 'market']):
392
+ return 'retail_store'
393
+ elif any(keyword in places365_lower for keyword in ['school', 'class', 'library']):
394
+ return 'educational_setting'
395
+
396
+ # Outdoor fuzzy matching
397
+ elif any(keyword in places365_lower for keyword in ['street', 'road', 'crosswalk']):
398
+ return 'city_street'
399
+ elif any(keyword in places365_lower for keyword in ['park', 'garden', 'plaza']):
400
+ return 'park_area'
401
+ elif any(keyword in places365_lower for keyword in ['beach', 'ocean', 'lake']):
402
+ return 'beach'
403
+ elif any(keyword in places365_lower for keyword in ['mountain', 'forest', 'desert']):
404
+ return 'natural_outdoor_area'
405
+ elif any(keyword in places365_lower for keyword in ['parking', 'garage']):
406
+ return 'parking_lot'
407
+ elif any(keyword in places365_lower for keyword in ['station', 'terminal', 'airport']):
408
+ return 'transportation_hub'
409
+
410
+ # Landmark fuzzy matching
411
+ elif any(keyword in places365_lower for keyword in ['castle', 'palace', 'monument', 'temple']):
412
+ return 'historical_monument'
413
+ elif any(keyword in places365_lower for keyword in ['tower', 'landmark']):
414
+ return 'tourist_landmark'
415
+ elif any(keyword in places365_lower for keyword in ['museum', 'gallery']):
416
+ return 'cultural_venue'
417
+
418
+ # Default fallback based on indoor/outdoor
419
+ is_indoor = self._classify_indoor_outdoor(places365_class)
420
+ if is_indoor is True:
421
+ return 'general_indoor_space'
422
+ elif is_indoor is False:
423
+ return 'generic_street_view'
424
+ else:
425
+ return 'unknown'
426
+
427
+ def _infer_scene_attributes(self, scene_class: str) -> List[str]:
428
+ """
429
+ Infer basic scene attributes from Places365 class.
430
+
431
+ Args:
432
+ scene_class: Places365 scene class name
433
+
434
+ Returns:
435
+ List[str]: Inferred scene attributes
436
+ """
437
+ attributes = []
438
+ scene_lower = scene_class.lower()
439
+
440
+ # Lighting attributes
441
+ if any(keyword in scene_lower for keyword in ['outdoor', 'street', 'park', 'beach']):
442
+ attributes.append('natural_lighting')
443
+ elif any(keyword in scene_lower for keyword in ['indoor', 'room', 'office']):
444
+ attributes.append('artificial_lighting')
445
+
446
+ # Functional attributes
447
+ if any(keyword in scene_lower for keyword in ['commercial', 'store', 'shop', 'restaurant']):
448
+ attributes.append('commercial')
449
+ elif any(keyword in scene_lower for keyword in ['residential', 'home', 'living', 'bedroom']):
450
+ attributes.append('residential')
451
+ elif any(keyword in scene_lower for keyword in ['office', 'conference', 'meeting']):
452
+ attributes.append('workplace')
453
+ elif any(keyword in scene_lower for keyword in ['recreation', 'park', 'playground', 'stadium']):
454
+ attributes.append('recreational')
455
+ elif any(keyword in scene_lower for keyword in ['educational', 'school', 'library', 'classroom']):
456
+ attributes.append('educational')
457
+
458
+ # Spatial attributes
459
+ if any(keyword in scene_lower for keyword in ['open', 'field', 'plaza', 'stadium']):
460
+ attributes.append('open_space')
461
+ elif any(keyword in scene_lower for keyword in ['enclosed', 'room', 'interior']):
462
+ attributes.append('enclosed_space')
463
+
464
+ return attributes
465
+
466
+ def get_scene_probabilities(self, image_pil: Image.Image) -> Dict[str, float]:
467
+ """
468
+ Get probability distribution over all scene classes.
469
+
470
+ Args:
471
+ image_pil: Input PIL image
472
+
473
+ Returns:
474
+ Dict mapping scene class names to probabilities
475
+ """
476
+ try:
477
+ input_tensor = self.preprocess(image_pil)
478
+
479
+ with torch.no_grad():
480
+ outputs = self.model(input_tensor)
481
+ probabilities = torch.nn.functional.softmax(outputs, dim=1)
482
+
483
+ probs = probabilities.cpu().numpy()[0]
484
+
485
+ return {
486
+ self.scene_classes[i]: float(probs[i])
487
+ for i in range(len(self.scene_classes))
488
+ }
489
+
490
+ except Exception as e:
491
+ self.logger.error(f"Error getting scene probabilities: {str(e)}")
492
+ return {}
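
A short usage sketch for the Places365Model wrapper defined above. Weights and class labels are downloaded on first use, so network access is required; the image path is a hypothetical placeholder.

from PIL import Image
from places365_model import Places365Model

scene_model = Places365Model(model_name='resnet50_places365')
image = Image.open('street.jpg')  # hypothetical input image
result = scene_model.predict(image)
# predict() returns the raw Places365 label, the mapped system scene type, and attributes
print(result['scene_label'], result['mapped_scene_type'], round(result['confidence'], 2))
print('indoor' if result['is_indoor'] else 'outdoor/unknown', result['attributes'])
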
requirements.txt CHANGED
@@ -1,16 +1,17 @@
1
- torch>=2.0.0
2
- torchvision>=0.15.0
3
- ultralytics>=8.0.0
4
- opencv-python>=4.7.0
5
- pillow>=9.4.0
6
- numpy>=1.23.5
7
- matplotlib>=3.7.0
8
- gradio>=3.32.0
9
- git+https://github.com/openai/CLIP.git
10
- yt-dlp>=2023.3.4
11
- requests>=2.28.1
12
- transformers
13
- accelerate
14
- bitsandbytes
15
- sentencepiece
16
- huggingface_hub>=0.19.0
 
 
1
+ # torch>=2.0.0
2
+ # torchvision>=0.15.0
3
+ # ultralytics>=8.0.0
4
+ # opencv-python>=4.7.0
5
+ # pillow>=9.4.0
6
+ # numpy>=1.23.5
7
+ # matplotlib>=3.7.0
8
+ # gradio>=3.32.0
9
+ # git+https://github.com/openai/CLIP.git
10
+ # yt-dlp>=2023.3.4
11
+ # requests>=2.28.1
12
+ # transformers
13
+ # accelerate
14
+ # bitsandbytes
15
+ # sentencepiece
16
+ # huggingface_hub>=0.19.0
17
+ # urllib3>=1.26.0
scene_analyzer.py CHANGED
The diff for this file is too large to render. See raw diff
 
scene_description.py CHANGED
@@ -59,7 +59,7 @@ class SceneDescriptor:
59
  "low": "This might be {description}, but the confidence is low. {details}"
60
  }
61
 
62
- # Provide only the most basic template as a fallback
63
  self.scene_detail_templates = {
64
  "default": ["A space with various objects."]
65
  }
@@ -105,53 +105,90 @@ class SceneDescriptor:
105
  return alternatives
106
 
107
 
108
- def _infer_possible_activities(self, scene_type: str, detected_objects: List[Dict]) -> List[str]:
109
  """
110
  Infer possible activities based on scene type and detected objects.
111
 
112
  Args:
113
  scene_type: Identified scene type
114
  detected_objects: List of detected objects
 
 
115
 
116
  Returns:
117
  List of possible activities
118
  """
119
  activities = []
120
 
 
 
 
 
 
 
 
121
  if scene_type.startswith("aerial_view_"):
122
  if scene_type == "aerial_view_intersection":
123
- # Use the predefined intersection activities
124
  activities.extend(self.activity_templates.get("aerial_view_intersection", []))
125
-
126
- # Add activities specific to pedestrians and vehicles
127
  pedestrians = [obj for obj in detected_objects if obj["class_id"] == 0]
128
  vehicles = [obj for obj in detected_objects if obj["class_id"] in [2, 5, 7]] # Car, bus, truck
129
-
130
  if pedestrians and vehicles:
131
  activities.append("Waiting for an opportunity to cross the street")
132
  activities.append("Obeying traffic signals")
133
-
134
  elif scene_type == "aerial_view_commercial_area":
135
  activities.extend(self.activity_templates.get("aerial_view_commercial_area", []))
136
-
137
  elif scene_type == "aerial_view_plaza":
138
  activities.extend(self.activity_templates.get("aerial_view_plaza", []))
139
-
140
  else:
141
- # Handle other aerial-view scenes that are not explicitly defined
142
  aerial_activities = [
143
- "Street crossing",
144
- "Waiting for signals",
145
- "Following traffic rules",
146
  "Pedestrian movement"
147
  ]
148
  activities.extend(aerial_activities)
149
 
 
150
  if scene_type in self.activity_templates:
151
  activities.extend(self.activity_templates[scene_type])
152
  elif "default" in self.activity_templates:
153
  activities.extend(self.activity_templates["default"])
154
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  detected_class_ids = [obj["class_id"] for obj in detected_objects]
156
 
157
  # Add activities based on specific object combinations
@@ -181,8 +218,48 @@ class SceneDescriptor:
181
  if 24 in detected_class_ids or 26 in detected_class_ids: # Backpack or handbag
182
  activities.append("Carrying personal items")
183
 
184
- # Remove duplicates
185
- return list(set(activities))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
 
187
  def _identify_safety_concerns(self, detected_objects: List[Dict], scene_type: str) -> List[str]:
188
  """
@@ -198,8 +275,6 @@ class SceneDescriptor:
198
  concerns = []
199
  detected_class_ids = [obj["class_id"] for obj in detected_objects]
200
 
201
- # ORIGINAL SAFETY CONCERNS LOGIC
202
-
203
  # General safety concerns
204
  if 42 in detected_class_ids or 43 in detected_class_ids: # Fork or knife
205
  concerns.append("Sharp utensils present")
@@ -232,8 +307,6 @@ class SceneDescriptor:
232
  if obj["region"] in ["top_left", "top_center", "top_right"] and obj["normalized_area"] > 0.05:
233
  concerns.append(f"Elevated {obj['class_name']} might be unstable")
234
 
235
- # NEW SAFETY CONCERNS LOGIC FOR ADDITIONAL SCENE TYPES
236
-
237
  # Upscale dining safety concerns
238
  if scene_type == "upscale_dining":
239
  # Check for fragile items
@@ -295,7 +368,6 @@ class SceneDescriptor:
295
  concerns.append("Two-wheeled vehicles in pedestrian areas")
296
 
297
  # Check for potential trip hazards
298
- # We can't directly detect this, but can infer from context
299
  if scene_type == "asian_commercial_street" and "bottom" in " ".join([obj["region"] for obj in detected_objects if obj["class_id"] == 0]):
300
  # If people are in bottom regions, they might be walking on uneven surfaces
301
  concerns.append("Potential uneven walking surfaces in commercial area")
@@ -324,7 +396,6 @@ class SceneDescriptor:
324
  concerns.append("Busy traffic area potentially without visible traffic signals in view")
325
 
326
  # Time of day considerations
327
- # We don't have direct time data, but can infer from vehicle lights
328
  vehicle_objs = [obj for obj in detected_objects if obj["class_id"] in [2, 5, 7]]
329
  if vehicle_objs and any("lighting_conditions" in obj for obj in detected_objects):
330
  # If vehicles are present and it might be evening/night
 
59
  "low": "This might be {description}, but the confidence is low. {details}"
60
  }
61
 
62
+ # Provide only the most basic template as a fallback
63
  self.scene_detail_templates = {
64
  "default": ["A space with various objects."]
65
  }
 
105
  return alternatives
106
 
107
 
108
+ def _infer_possible_activities(self, scene_type: str, detected_objects: List[Dict], enable_landmark: bool = True, scene_scores: Optional[Dict] = None) -> List[str]:
109
  """
110
  Infer possible activities based on scene type and detected objects.
111
 
112
  Args:
113
  scene_type: Identified scene type
114
  detected_objects: List of detected objects
115
+ enable_landmark: Whether landmark detection is enabled
116
+ scene_scores: Optional dictionary of scene type scores
117
 
118
  Returns:
119
  List of possible activities
120
  """
121
  activities = []
122
 
123
+ # Dynamically replace landmark scene types when landmark detection is disabled
124
+ if not enable_landmark and scene_type in ["tourist_landmark", "natural_landmark", "historical_monument"]:
125
+ alternative_scene_type = self._get_alternative_scene_type(scene_type, detected_objects, scene_scores)
126
+ print(f"Replacing landmark scene type '{scene_type}' with '{alternative_scene_type}' for activity inference")
127
+ scene_type = alternative_scene_type
128
+
129
+ # Process aerial view scenes
130
  if scene_type.startswith("aerial_view_"):
131
  if scene_type == "aerial_view_intersection":
132
+ # Use predefined intersection activities
133
  activities.extend(self.activity_templates.get("aerial_view_intersection", []))
134
+
135
+ # Add pedestrian and vehicle specific activities
136
  pedestrians = [obj for obj in detected_objects if obj["class_id"] == 0]
137
  vehicles = [obj for obj in detected_objects if obj["class_id"] in [2, 5, 7]] # Car, bus, truck
138
+
139
  if pedestrians and vehicles:
140
  activities.append("Waiting for an opportunity to cross the street")
141
  activities.append("Obeying traffic signals")
142
+
143
  elif scene_type == "aerial_view_commercial_area":
144
  activities.extend(self.activity_templates.get("aerial_view_commercial_area", []))
145
+
146
  elif scene_type == "aerial_view_plaza":
147
  activities.extend(self.activity_templates.get("aerial_view_plaza", []))
148
+
149
  else:
150
+ # Handle other undefined aerial view scenes
151
  aerial_activities = [
152
+ "Street crossing",
153
+ "Waiting for signals",
154
+ "Following traffic rules",
155
  "Pedestrian movement"
156
  ]
157
  activities.extend(aerial_activities)
158
 
159
+ # Add scene-specific activities from templates
160
  if scene_type in self.activity_templates:
161
  activities.extend(self.activity_templates[scene_type])
162
  elif "default" in self.activity_templates:
163
  activities.extend(self.activity_templates["default"])
164
 
165
+ # Filter out landmark-related activities when landmark detection is disabled
166
+ if not enable_landmark:
167
+ filtered_activities = []
168
+ landmark_keywords = ["sightseeing", "landmark", "tourist", "monument", "historical",
169
+ "guided tour", "photography", "cultural tourism", "heritage"]
170
+
171
+ for activity in activities:
172
+ if not any(keyword in activity.lower() for keyword in landmark_keywords):
173
+ filtered_activities.append(activity)
174
+
175
+ activities = filtered_activities
176
+
177
+ # If we filtered out all activities, add some generic ones based on scene type
178
+ if not activities:
179
+ generic_activities = {
180
+ "city_street": ["Walking", "Commuting", "Shopping"],
181
+ "intersection": ["Crossing the street", "Waiting for traffic signals"],
182
+ "commercial_district": ["Shopping", "Walking", "Dining"],
183
+ "pedestrian_area": ["Walking", "Socializing", "Shopping"],
184
+ "park_area": ["Relaxing", "Walking", "Exercise"],
185
+ "outdoor_natural_area": ["Walking", "Nature observation", "Relaxation"],
186
+ "urban_architecture": ["Walking", "Urban exploration", "Photography"]
187
+ }
188
+
189
+ activities.extend(generic_activities.get(scene_type, ["Walking", "Observing surroundings"]))
190
+
191
+ # Add activities based on detected objects
192
  detected_class_ids = [obj["class_id"] for obj in detected_objects]
193
 
194
  # Add activities based on specific object combinations
 
218
  if 24 in detected_class_ids or 26 in detected_class_ids: # Backpack or handbag
219
  activities.append("Carrying personal items")
220
 
221
+ # Add more person count-dependent activities
222
+ person_count = detected_class_ids.count(0)
223
+ if person_count > 3:
224
+ activities.append("Group gathering")
225
+ elif person_count > 1:
226
+ activities.append("Social interaction")
227
+
228
+ # Add additional activities based on significant objects
229
+ if 43 in detected_class_ids: # cup
230
+ activities.append("Drinking beverages")
231
+
232
+ if 32 in detected_class_ids: # sports ball
233
+ activities.append("Playing sports")
234
+
235
+ if 25 in detected_class_ids: # umbrella
236
+ activities.append("Sheltering from weather")
237
+
238
+ # Add location-specific activities based on environment objects
239
+ if any(furniture in detected_class_ids for furniture in [56, 57, 58, 59, 60]): # furniture items
240
+ activities.append("Using indoor facilities")
241
+
242
+ if any(outdoor_item in detected_class_ids for outdoor_item in [13, 14, 15]): # bench, outdoor items
243
+ activities.append("Enjoying outdoor spaces")
244
+
245
+ # Remove duplicates and ensure reasonable number of activities
246
+ unique_activities = list(set(activities))
247
+
248
+ # Limit to reasonable number (maximum 8 activities)
249
+ if len(unique_activities) > 8:
250
+ # Prioritize more specific activities over general ones
251
+ general_activities = ["Walking", "Observing surroundings", "Commuting", "Using indoor facilities"]
252
+ specific_activities = [a for a in unique_activities if a not in general_activities]
253
+
254
+ # Take all specific activities first, then fill with general ones if needed
255
+ if len(specific_activities) <= 8:
256
+ result = specific_activities + general_activities[:8-len(specific_activities)]
257
+ else:
258
+ result = specific_activities[:8]
259
+ else:
260
+ result = unique_activities
261
+
262
+ return result
263
 
264
  def _identify_safety_concerns(self, detected_objects: List[Dict], scene_type: str) -> List[str]:
265
  """
 
275
  concerns = []
276
  detected_class_ids = [obj["class_id"] for obj in detected_objects]
277
 
 
 
278
  # General safety concerns
279
  if 42 in detected_class_ids or 43 in detected_class_ids: # Fork or knife
280
  concerns.append("Sharp utensils present")
 
307
  if obj["region"] in ["top_left", "top_center", "top_right"] and obj["normalized_area"] > 0.05:
308
  concerns.append(f"Elevated {obj['class_name']} might be unstable")
309
 
 
 
310
  # Upscale dining safety concerns
311
  if scene_type == "upscale_dining":
312
  # Check for fragile items
 
368
  concerns.append("Two-wheeled vehicles in pedestrian areas")
369
 
370
  # Check for potential trip hazards
 
371
  if scene_type == "asian_commercial_street" and "bottom" in " ".join([obj["region"] for obj in detected_objects if obj["class_id"] == 0]):
372
  # If people are in bottom regions, they might be walking on uneven surfaces
373
  concerns.append("Potential uneven walking surfaces in commercial area")
 
396
  concerns.append("Busy traffic area potentially without visible traffic signals in view")
397
 
398
  # Time of day considerations
 
399
  vehicle_objs = [obj for obj in detected_objects if obj["class_id"] in [2, 5, 7]]
400
  if vehicle_objs and any("lighting_conditions" in obj for obj in detected_objects):
401
  # If vehicles are present and it might be evening/night
scene_detail_templates.py CHANGED
@@ -200,4 +200,19 @@ SCENE_DETAIL_TEMPLATES = {
 
200
  "This professional culinary area contains {kitchen_equipment} arranged in stations for {food_preparation}.",
201
  "An industrial kitchen featuring {kitchen_equipment} designed for efficient {food_preparation}."
202
  ],
203
+ "tourist_landmark": [
204
+ "This notable landmark attracts visitors who come to see {landmark_features} and experience {tourist_activities}.",
205
+ "A famous landmark site where tourists can observe {landmark_features} and engage in {tourist_activities}.",
206
+ "This iconic landmark showcases {landmark_features} and is a popular destination for {tourist_activities}."
207
+ ],
208
+ "natural_landmark": [
209
+ "This natural landmark features {landmark_features} and offers opportunities for {outdoor_activities}.",
210
+ "A scenic natural formation with {landmark_features} where visitors enjoy {outdoor_activities}.",
211
+ "This impressive natural landmark displays {landmark_features} and attracts nature enthusiasts for {outdoor_activities}."
212
+ ],
213
+ "historical_monument": [
214
+ "This historical monument exhibits {landmark_features} and has significance related to {historical_elements}.",
215
+ "An important historical site featuring {landmark_features} and representing {historical_elements}.",
216
+ "This heritage monument showcases {landmark_features} and commemorates {historical_elements}."
217
+ ]
218
  }
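Note: the landmark templates above are plain str.format strings. A minimal, illustrative sketch of how one of them could be filled (the fill_template helper and the sample values are assumptions for illustration, not code from this commit):

import random

SAMPLE_TEMPLATES = {
    "tourist_landmark": [
        "This iconic landmark showcases {landmark_features} and is a popular destination for {tourist_activities}."
    ]
}

def fill_template(templates, scene_type, context):
    # Pick one template for the scene type and substitute its placeholders.
    options = templates.get(scene_type, [])
    return random.choice(options).format(**context) if options else ""

print(fill_template(SAMPLE_TEMPLATES, "tourist_landmark", {
    "landmark_features": "a wrought-iron lattice tower",
    "tourist_activities": "photography and guided tours",
}))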
scene_type.py CHANGED
@@ -384,4 +384,127 @@ SCENE_TYPES = {
 
384
  "minimum_required": 3,
385
  "description": "A commercial kitchen with professional cooking equipment and food preparation areas"
386
  },
387
+ "tourist_landmark": {
388
+ "name": "Tourist Landmark",
389
+ "required_objects": [0], # person
390
+ "optional_objects": [24, 26, 67], # backpack, handbag, cell phone
391
+ "minimum_required": 0, # there may be no people present, but it is still a landmark
392
+ "description": "A location featuring a famous landmark with tourist activity",
393
+ "priority": 1.2 # boost priority for landmark scenes
394
+ },
395
+ "natural_landmark": {
396
+ "name": "Natural Landmark",
397
+ "required_objects": [0], # person
398
+ "optional_objects": [24, 26, 67], # backpack, handbag, cell phone
399
+ "minimum_required": 0,
400
+ "description": "A natural landmark site with scenic views",
401
+ "priority": 1.2
402
+ },
403
+ "historical_monument": {
404
+ "name": "Historical Monument",
405
+ "required_objects": [0], # person
406
+ "optional_objects": [24, 26, 67], # backpack, handbag, cell phone
407
+ "minimum_required": 0,
408
+ "description": "A historical monument or heritage site",
409
+ "priority": 1.2
410
+ },
411
+ "general_indoor_space": {
412
+ "name": "General Indoor Space",
413
+ "required_objects": [], # No strict required objects, depends on combination
414
+ "optional_objects": [
415
+ 56, # chair
416
+ 57, # couch
417
+ 58, # potted plant
418
+ 59, # bed
419
+ 60, # dining table
420
+ 61, # toilet
421
+ 62, # tv
422
+ 63, # laptop
423
+ 66, # keyboard
424
+ 67, # cell phone
425
+ 73, # book
426
+ 74, # clock
427
+ 75, # vase
428
+ 39, # bottle
429
+ 41, # cup
430
+ ],
431
+ "minimum_required": 2, # Needs at least a few common indoor items
432
+ "description": "An indoor area with various common household or functional items.",
433
+ "priority": 0.8 # Lower priority than more specific scenes
434
+ },
435
+ "generic_street_view": {
436
+ "name": "Generic Street View",
437
+ "required_objects": [], # More about the combination
438
+ "optional_objects": [
439
+ 0, # person
440
+ 1, # bicycle
441
+ 2, # car
442
+ 3, # motorcycle
443
+ 5, # bus
444
+ 7, # truck
445
+ 9, # traffic light
446
+ 10, # fire hydrant
447
+ 11, # stop sign
448
+ 13, # bench
449
+ # Consider adding building if YOLO detects it (not a standard COCO class for YOLOv8, but some custom models might)
450
+ ],
451
+ "minimum_required": 2, # e.g., a car and a person, or multiple vehicles
452
+ "description": "An outdoor street view, likely in an urban or suburban setting, with vehicles and/or pedestrians.",
453
+ "priority": 0.85
454
+ },
455
+ "desk_area_workspace": {
456
+ "name": "Desk Area / Workspace",
457
+ "required_objects": [
458
+ 63, # laptop or 62 (tv as monitor) or 66 (keyboard)
459
+ ],
460
+ "optional_objects": [
461
+ 56, # chair
462
+ 60, # dining table (often used as a desk)
463
+ 64, # mouse
464
+ 66, # keyboard
465
+ 73, # book
466
+ 41, # cup
467
+ 67, # cell phone
468
+ 74, # clock
469
+ ],
470
+ "minimum_required": 2, # e.g., laptop and chair, or table and keyboard
471
+ "description": "A workspace or desk area, typically featuring a computer and related accessories.",
472
+ "priority": 0.9
473
+ },
474
+ "outdoor_gathering_spot": {
475
+ "name": "Outdoor Gathering Spot",
476
+ "required_objects": [
477
+ 0, # person
478
+ ],
479
+ "optional_objects": [
480
+ 13, # bench
481
+ 32, # sports ball
482
+ 24, # backpack
483
+ 25, # umbrella
484
+ 29, # frisbee
485
+ 33, # kite
486
+ 58, # potted plant (if in a more structured park area)
487
+ ],
488
+ "minimum_required": 2, # e.g., person and bench, or multiple people
489
+ "description": "An outdoor area where people might gather for leisure or activity.",
490
+ "priority": 0.8
491
+ },
492
+ "kitchen_counter_or_utility_area": {
493
+ "name": "Kitchen Counter or Utility Area",
494
+ "required_objects": [],
495
+ "optional_objects": [
496
+ 39, # bottle
497
+ 41, # cup
498
+ 44, # spoon
499
+ 45, # bowl
500
+ 68, # microwave
501
+ 69, # oven
502
+ 70, # toaster
503
+ 71, # sink
504
+ 72, # refrigerator
505
+ ],
506
+ "minimum_required": 2, # e.g., sink and microwave, or refrigerator and bottles
507
+ "description": "An area likely used for food preparation or kitchen utilities.",
508
+ "priority": 0.9
509
+ }
510
  }
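Note: each entry above pairs COCO class IDs with a minimum_required count and an optional priority weight. A rough, illustrative sketch of how detections could be scored against such definitions (score_scene_types is a hypothetical helper, not the evaluator this project actually uses):

def score_scene_types(detected_class_ids, scene_types):
    # Count how many relevant classes are present; keep scene types that reach
    # their minimum_required threshold and weight the hit count by priority.
    scores = {}
    detected = set(detected_class_ids)
    for name, spec in scene_types.items():
        relevant = set(spec.get("required_objects", [])) | set(spec.get("optional_objects", []))
        hits = len(relevant & detected)
        if hits >= spec.get("minimum_required", 0):
            scores[name] = hits * spec.get("priority", 1.0)
    return scores

# Example: a laptop (63), keyboard (66) and chair (56) reach the
# "desk_area_workspace" minimum of 2 and score 3 * 0.9 = 2.7.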
spatial_analyzer.py CHANGED
@@ -282,19 +282,29 @@ class SpatialAnalyzer:
282
  # Group objects by category and region
283
  category_regions = {}
284
 
 
 
285
  for obj in detected_objects:
286
- # Find object category
287
- category = "other"
288
- for cat_name, cat_ids in self.OBJECT_CATEGORIES.items():
289
- if obj["class_id"] in cat_ids:
290
- category = cat_name
291
- break
292
-
293
- # Add to category-region mapping
294
  if category not in category_regions:
295
  category_regions[category] = {}
296
 
297
- region = obj["region"]
298
  if region not in category_regions[category]:
299
  category_regions[category][region] = []
300
 
@@ -328,156 +338,470 @@ class SpatialAnalyzer:
328
  elif scene_type == "upscale_dining":
329
  # Upscale dining specific logic
330
  zones.update(self._identify_upscale_dining_zones(category_regions, detected_objects))
 
 
331
  else:
332
  # Default zone identification for other scene types
333
  zones.update(self._identify_default_zones(category_regions, detected_objects))
334
 
335
- # If no zones were identified, try the default approach
 
 
336
  if not zones:
337
  zones.update(self._identify_default_zones(category_regions, detected_objects))
338
 
 
 
339
  return zones
340
 
341
- def _identify_indoor_zones(self, category_regions: Dict, detected_objects: List[Dict], scene_type: str) -> Dict:
342
  """
343
- Identify functional zones for indoor scenes.
 
344
 
345
  Args:
346
- category_regions: Objects grouped by category and region
347
  detected_objects: List of detected objects
348
- scene_type: Specific indoor scene type
349
 
350
  Returns:
351
- Dict: Indoor functional zones
352
  """
353
  zones = {}
354
 
355
- # Seating/social zone
356
- if "furniture" in category_regions:
357
- furniture_regions = category_regions["furniture"]
358
- main_furniture_region = max(furniture_regions.items(),
359
- key=lambda x: len(x[1]),
360
- default=(None, []))
361
 
362
- if main_furniture_region[0] is not None and len(main_furniture_region[1]) >= 2:
363
- zone_objects = [obj["class_name"] for obj in main_furniture_region[1]]
364
- zones["social_zone"] = {
365
- "region": main_furniture_region[0],
366
- "objects": zone_objects,
367
- "description": f"Social or seating area with {', '.join(zone_objects)}"
368
- }
369
 
370
- # Entertainment zone
371
- if "electronics" in category_regions:
372
- electronics_items = []
373
- for region_objects in category_regions["electronics"].values():
374
- electronics_items.extend([obj["class_name"] for obj in region_objects])
375
-
376
- if electronics_items:
377
- zones["entertainment_zone"] = {
378
- "region": self._find_main_region(category_regions.get("electronics", {})),
379
- "objects": electronics_items,
380
- "description": f"Entertainment or media area with {', '.join(electronics_items)}"
381
- }
382
 
383
- # Dining/food zone
384
- food_zone_categories = ["kitchen_items", "food"]
385
- food_items = []
386
- food_regions = {}
387
-
388
- for category in food_zone_categories:
389
- if category in category_regions:
390
- for region, objects in category_regions[category].items():
391
- if region not in food_regions:
392
- food_regions[region] = []
393
- food_regions[region].extend(objects)
394
- food_items.extend([obj["class_name"] for obj in objects])
395
-
396
- if food_items:
397
- main_food_region = max(food_regions.items(),
398
- key=lambda x: len(x[1]),
399
- default=(None, []))
400
 
401
- if main_food_region[0] is not None:
402
- zones["dining_zone"] = {
403
- "region": main_food_region[0],
404
- "objects": list(set(food_items)),
405
- "description": f"Dining or food preparation area with {', '.join(list(set(food_items))[:3])}"
406
  }
407
 
408
- # Work/study zone - enhanced to detect even when scene_type is not explicitly office
409
- work_items = []
410
- work_regions = {}
411
 
412
- for obj in detected_objects:
413
- if obj["class_id"] in [56, 60, 63, 64, 66, 73]: # chair, table, laptop, mouse, keyboard, book
414
- region = obj["region"]
415
- if region not in work_regions:
416
- work_regions[region] = []
417
- work_regions[region].append(obj)
418
- work_items.append(obj["class_name"])
419
-
420
- # Check for laptop and table/chair combinations that suggest a workspace
421
- has_laptop = any(obj["class_id"] == 63 for obj in detected_objects)
422
- has_keyboard = any(obj["class_id"] == 66 for obj in detected_objects)
423
- has_table = any(obj["class_id"] == 60 for obj in detected_objects)
424
- has_chair = any(obj["class_id"] == 56 for obj in detected_objects)
425
-
426
- # If we have electronics with furniture in the same region, likely a workspace
427
- workspace_detected = (has_laptop or has_keyboard) and (has_table or has_chair)
428
-
429
- if (workspace_detected or scene_type in ["office_workspace", "meeting_room"]) and work_items:
430
- main_work_region = max(work_regions.items(),
431
- key=lambda x: len(x[1]),
432
- default=(None, []))
433
 
434
- if main_work_region[0] is not None:
435
- zones["workspace_zone"] = {
436
- "region": main_work_region[0],
437
- "objects": list(set(work_items)),
438
- "description": f"Work or study area with {', '.join(list(set(work_items))[:3])}"
439
- }
440
 
441
- # Bedroom-specific zones
442
- if scene_type == "bedroom":
443
- bed_objects = [obj for obj in detected_objects if obj["class_id"] == 59] # Bed
444
- if bed_objects:
445
- bed_region = bed_objects[0]["region"]
446
- zones["sleeping_zone"] = {
447
- "region": bed_region,
448
- "objects": ["bed"],
449
- "description": "Sleeping area with bed"
450
- }
 
 
451
 
452
- # Kitchen-specific zones
453
- if scene_type == "kitchen":
454
- # Look for appliances (refrigerator, oven, microwave, sink)
455
- appliance_ids = [68, 69, 71, 72] # microwave, oven, sink, refrigerator
456
- appliance_objects = [obj for obj in detected_objects if obj["class_id"] in appliance_ids]
 
457
 
458
- if appliance_objects:
459
- appliance_regions = {}
460
- for obj in appliance_objects:
461
- region = obj["region"]
462
- if region not in appliance_regions:
463
- appliance_regions[region] = []
464
- appliance_regions[region].append(obj)
465
-
466
- if appliance_regions:
467
- main_appliance_region = max(appliance_regions.items(),
468
- key=lambda x: len(x[1]),
469
- default=(None, []))
470
-
471
- if main_appliance_region[0] is not None:
472
- appliance_names = [obj["class_name"] for obj in main_appliance_region[1]]
473
- zones["kitchen_appliance_zone"] = {
474
- "region": main_appliance_region[0],
475
- "objects": appliance_names,
476
- "description": f"Kitchen appliance area with {', '.join(appliance_names)}"
477
- }
 
 
 
478
 
479
  return zones
480
 
 
 
481
  def _identify_intersection_zones(self, category_regions: Dict, detected_objects: List[Dict], viewpoint: str) -> Dict:
482
  """
483
  Identify functional zones for urban intersections with enhanced spatial awareness.
@@ -532,6 +856,142 @@ class SpatialAnalyzer:
532
 
533
  return zones
534
 
 
 
535
  def _analyze_crossing_patterns(self, pedestrians: List[Dict], traffic_lights: List[Dict],
536
  region_distribution: Dict) -> Dict:
537
  """
@@ -601,7 +1061,7 @@ class SpatialAnalyzer:
601
  if not vehicles:
602
  return traffic_zones
603
 
604
- # Group vehicles by region
605
  vehicle_regions = {}
606
  for v in vehicles:
607
  region = v["region"]
@@ -652,7 +1112,7 @@ class SpatialAnalyzer:
652
 
653
  def _get_directional_description(self, region: str) -> str:
654
  """
655
- Convert region name to a directional description.
656
 
657
  Args:
658
  region: Region name from the grid
@@ -1433,12 +1893,3 @@ class SpatialAnalyzer:
1433
  return max(region_objects_dict.items(),
1434
  key=lambda x: len(x[1]),
1435
  default=("unknown", []))[0]
1436
-
1437
- def _find_main_region(self, region_objects_dict: Dict) -> str:
1438
- """Find the main region with the most objects"""
1439
- if not region_objects_dict:
1440
- return "unknown"
1441
-
1442
- return max(region_objects_dict.items(),
1443
- key=lambda x: len(x[1]),
1444
- default=("unknown", []))[0]
 
282
  # Group objects by category and region
283
  category_regions = {}
284
 
285
+ if not getattr(self, 'enable_landmark', True):
286
+ detected_objects = [obj for obj in detected_objects if not obj.get("is_landmark", False)]
287
+
288
+ # Filter out landmark-related scene types
289
+ if scene_type in ["tourist_landmark", "natural_landmark", "historical_monument"]:
290
+ scene_type = "city_street"
291
+
292
+ # MODIFIED: Smart threshold evaluation instead of fixed values
293
+ should_identify = self._evaluate_zone_identification_feasibility(detected_objects, scene_type)
294
+
295
+ if not should_identify:
296
+ return {}
297
+
298
+ # MODIFIED: Build category_regions mapping (was missing in original)
299
  for obj in detected_objects:
300
+ category = self._categorize_object(obj)
301
+ if not category:
302
+ continue
303
+
 
 
304
  if category not in category_regions:
305
  category_regions[category] = {}
306
 
307
+ region = obj.get("region", "center")
308
  if region not in category_regions[category]:
309
  category_regions[category][region] = []
310
 
 
338
  elif scene_type == "upscale_dining":
339
  # Upscale dining specific logic
340
  zones.update(self._identify_upscale_dining_zones(category_regions, detected_objects))
341
+ elif scene_type == "tourist_landmark" or "landmark" in scene_type:
342
+ # Handle landmark scene types
343
+ landmark_objects = [obj for obj in detected_objects if obj.get("is_landmark", False)]
344
+ if landmark_objects:
345
+ landmark_zones = self._identify_landmark_zones(landmark_objects)
346
+ zones.update(landmark_zones)
347
  else:
348
  # Default zone identification for other scene types
349
  zones.update(self._identify_default_zones(category_regions, detected_objects))
350
 
351
+ # Check for landmark objects even when the scene type is not a landmark type
352
+ if scene_type != "tourist_landmark" and "landmark" not in scene_type:
353
+ landmark_objects = [obj for obj in detected_objects if obj.get("is_landmark", False)]
354
+ if landmark_objects:
355
+ # Add landmark zones without overwriting existing zones
356
+ landmark_zones = self._identify_landmark_zones(landmark_objects)
357
+ # Ensure landmark zones do not overwrite other important zones already identified
358
+ for zone_id, zone_info in landmark_zones.items():
359
+ if zone_id not in zones:
360
+ zones[zone_id] = zone_info
361
+
362
+ # MODIFIED: Enhanced fallback strategy - try simplified identification if no zones found
363
  if not zones:
364
  zones.update(self._identify_default_zones(category_regions, detected_objects))
365
 
366
+ # Final fallback: create basic zones from high-confidence objects
367
+ if not zones:
368
+ zones.update(self._create_basic_zones_from_objects(detected_objects, scene_type))
369
+
370
  return zones
371
 
372
+ def _identify_core_objects_for_scene(self, detected_objects: List[Dict], scene_type: str) -> List[Dict]:
373
  """
374
+ Identify core objects that define a particular scene type.
375
+
376
+ Args:
377
+ detected_objects: List of detected objects
378
+ scene_type: Scene type
379
+
380
+ Returns:
381
+ List of core objects for the scene
382
+ """
383
+ core_objects = []
384
+
385
+ scene_core_mapping = {
386
+ "bedroom": [59], # bed
387
+ "kitchen": [68, 69, 71, 72], # microwave, oven, sink, refrigerator
388
+ "living_room": [57, 58, 62], # couch, potted plant, tv
389
+ "dining_area": [60, 42, 43], # dining table, fork, knife
390
+ "office_workspace": [63, 64, 66, 73] # laptop, mouse, keyboard, book
391
+ }
392
+
393
+ if scene_type in scene_core_mapping:
394
+ core_class_ids = scene_core_mapping[scene_type]
395
+ for obj in detected_objects:
396
+ if obj["class_id"] in core_class_ids and obj.get("confidence", 0) >= 0.4:
397
+ core_objects.append(obj)
398
+
399
+ return core_objects
400
+
401
+ def _get_object_categories(self, detected_objects: List[Dict]) -> set:
402
+ """Get unique object categories from detected objects."""
403
+ object_categories = set()
404
+ for obj in detected_objects:
405
+ category = self._categorize_object(obj)
406
+ if category:
407
+ object_categories.add(category)
408
+ return object_categories
409
+
410
+ def _create_basic_zones_from_objects(self, detected_objects: List[Dict], scene_type: str) -> Dict:
411
+ """
412
+ Create basic functional zones from individual high-confidence objects.
413
+ This is a fallback when standard zone identification fails.
414
 
415
  Args:
 
416
  detected_objects: List of detected objects
417
+ scene_type: Scene type
418
 
419
  Returns:
420
+ Dictionary of basic zones
421
  """
422
  zones = {}
423
 
424
+ # Focus on high-confidence objects
425
+ high_conf_objects = [obj for obj in detected_objects if obj.get("confidence", 0) >= 0.6]
 
 
426
 
427
+ if not high_conf_objects:
428
+ high_conf_objects = detected_objects # Fallback to all objects
 
 
429
 
430
+ # Create zones based on individual important objects
431
+ for i, obj in enumerate(high_conf_objects[:3]): # Limit to top 3 objects
432
+ class_name = obj["class_name"]
433
+ region = obj.get("region", "center")
 
 
434
 
435
+ # Create descriptive zone based on object type
436
+ zone_description = self._get_basic_zone_description(class_name, scene_type)
 
 
437
 
438
+ if zone_description:
439
+ zones[f"functional_area_{i+1}"] = {
440
+ "region": region,
441
+ "objects": [class_name],
442
+ "description": zone_description
443
  }
444
 
445
+ return zones
 
 
446
 
447
+ def _get_basic_zone_description(self, class_name: str, scene_type: str) -> str:
448
+ """Generate basic zone description based on object and scene type."""
449
+
450
+ # Object-specific descriptions
451
+ descriptions = {
452
+ "bed": "Sleeping and rest area",
453
+ "sofa": "Seating and relaxation area",
454
+ "chair": "Seating area",
455
+ "dining table": "Dining and meal area",
456
+ "tv": "Entertainment and media area",
457
+ "laptop": "Work and computing area",
458
+ "potted plant": "Decorative and green space area",
459
+ "refrigerator": "Food storage and kitchen area",
460
+ "car": "Vehicle and transportation area",
461
+ "person": "Activity and social area"
462
+ }
 
 
463
 
464
+ return descriptions.get(class_name, f"Functional area with {class_name}")
 
 
465
 
466
+ def _categorize_object(self, obj: Dict) -> str:
467
+ """
468
+ Categorize detected objects into functional categories for zone identification.
469
+ """
470
+ class_id = obj.get("class_id", -1)
471
+ class_name = obj.get("class_name", "").lower()
472
+
473
+ # Use existing category mapping if available
474
+ if hasattr(self, 'OBJECT_CATEGORIES') and self.OBJECT_CATEGORIES:
475
+ for category, ids in self.OBJECT_CATEGORIES.items():
476
+ if class_id in ids:
477
+ return category
478
+
479
+ # Fallback categorization based on class names for common COCO classes
480
+ furniture_items = ["chair", "couch", "bed", "dining table", "toilet"]
481
+ plant_items = ["potted plant"]
482
+ electronic_items = ["tv", "laptop", "mouse", "remote", "keyboard", "cell phone"]
483
+ vehicle_items = ["bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat"]
484
+ person_items = ["person"]
485
+ kitchen_items = ["bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl",
486
+ "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog",
487
+ "pizza", "donut", "cake", "refrigerator", "oven", "toaster", "sink", "microwave"]
488
+ sports_items = ["frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat",
489
+ "baseball glove", "skateboard", "surfboard", "tennis racket"]
490
+ personal_items = ["handbag", "tie", "suitcase", "umbrella", "backpack"]
491
+
492
+ if any(item in class_name for item in furniture_items):
493
+ return "furniture"
494
+ elif any(item in class_name for item in plant_items):
495
+ return "plant"
496
+ elif any(item in class_name for item in electronic_items):
497
+ return "electronics"
498
+ elif any(item in class_name for item in vehicle_items):
499
+ return "vehicle"
500
+ elif any(item in class_name for item in person_items):
501
+ return "person"
502
+ elif any(item in class_name for item in kitchen_items):
503
+ return "kitchen_items"
504
+ elif any(item in class_name for item in sports_items):
505
+ return "sports"
506
+ elif any(item in class_name for item in personal_items):
507
+ return "personal_items"
508
+ else:
509
+ return "misc"
510
 
511
+ def _evaluate_zone_identification_feasibility(self, detected_objects: List[Dict], scene_type: str) -> bool:
512
+ """
513
+ Flexible feasibility assessment based on object relationships and distribution characteristics.
514
+ """
515
+ if len(detected_objects) < 2:
516
+ return False
517
 
518
+ # Compute the object distribution across confidence levels
519
+ high_conf_objects = [obj for obj in detected_objects if obj.get("confidence", 0) >= 0.6]
520
+ medium_conf_objects = [obj for obj in detected_objects if obj.get("confidence", 0) >= 0.4]
521
+
522
+ # Baseline condition: require a minimum number of reliable objects
523
+ if len(medium_conf_objects) < 2:
524
+ return False
525
+
526
+ # Evaluate functional relationships between the detected objects
527
+ functional_relationships = self._calculate_functional_relationships(detected_objects)
528
+
529
+ # Evaluate the spatial distribution diversity
530
+ spatial_diversity = self._calculate_spatial_diversity(detected_objects)
531
+
532
+ # Composite scoring mechanism
533
+ feasibility_score = 0
534
+
535
+ # Contribution from object count (30% weight)
536
+ object_count_score = min(len(detected_objects) / 5.0, 1.0) * 0.3
537
+
538
+ # Contribution from confidence quality (25% weight)
539
+ confidence_score = len(high_conf_objects) / max(len(detected_objects), 1) * 0.25
540
+
541
+ # Contribution from functional relationships (25% weight)
542
+ relationship_score = functional_relationships * 0.25
543
+
544
+ # Contribution from spatial diversity (20% weight)
545
+ diversity_score = spatial_diversity * 0.20
546
+
547
+ feasibility_score = object_count_score + confidence_score + relationship_score + diversity_score
548
+
549
+ # Dynamic threshold adjusted by scene complexity
550
+ complexity_threshold = self._get_complexity_threshold(scene_type)
551
+
552
+ return feasibility_score >= complexity_threshold
553
+
554
+ def _calculate_functional_relationships(self, detected_objects: List[Dict]) -> float:
555
+ """
556
+ Compute a functional-relationship score between detected objects,
557
+ based on common object combination patterns.
558
+ """
559
+ relationship_pairs = {
560
+ # Furniture combinations
561
+ frozenset([56, 60]): 1.0, # chair + dining table (dining/work area)
562
+ frozenset([57, 62]): 0.9, # couch + tv (living area)
563
+ frozenset([59, 58]): 0.7, # bed + potted plant (bedroom decor)
564
+
565
+ # Work-related combinations
566
+ frozenset([63, 66]): 0.9, # laptop + keyboard (workspace)
567
+ frozenset([63, 64]): 0.8, # laptop + mouse (workspace)
568
+ frozenset([60, 63]): 0.8, # dining table + laptop (workspace)
569
+
570
+ # Kitchen-related combinations
571
+ frozenset([68, 72]): 0.9, # microwave + refrigerator (kitchen)
572
+ frozenset([69, 71]): 0.8, # oven + sink (kitchen)
573
+
574
+ # Dining-related combinations
575
+ frozenset([60, 40]): 0.8, # dining table + wine glass (dining)
576
+ frozenset([60, 41]): 0.8, # dining table + cup (dining)
577
+ frozenset([56, 40]): 0.7, # chair + wine glass (dining)
578
+
579
+ # Traffic-related combinations
580
+ frozenset([2, 9]): 0.8, # car + traffic light (traffic)
581
+ frozenset([0, 9]): 0.7, # person + traffic light (crosswalk)
582
+ }
583
+
584
+ detected_class_ids = set(obj["class_id"] for obj in detected_objects)
585
+ max_possible_score = 0
586
+ actual_score = 0
587
+
588
+ for pair, score in relationship_pairs.items():
589
+ max_possible_score += score
590
+ if pair.issubset(detected_class_ids):
591
+ actual_score += score
592
+
593
+ return actual_score / max_possible_score if max_possible_score > 0 else 0
594
+
595
+ def _calculate_spatial_diversity(self, detected_objects: List[Dict]) -> float:
596
+ """
597
+ Compute the spatial-distribution diversity of the detected objects,
598
+ checking whether objects are spread across different regions rather than concentrated in one.
599
+ """
600
+ regions = set(obj.get("region", "center") for obj in detected_objects)
601
+ unique_regions = len(regions)
602
+
603
+ return min(unique_regions / 2.0, 1.0)
604
+
605
+ def _get_complexity_threshold(self, scene_type: str) -> float:
606
+ """
607
+ Return an appropriate complexity threshold for the scene type,
608
+ balancing how readily zones are partitioned across different scenes.
609
+ """
610
+ # Simpler scenes need a higher score before zones are partitioned
611
+ simple_scenes = ["bedroom", "bathroom", "closet"]
612
+ # More complex scenes may partition zones at a lower score
613
+ complex_scenes = ["living_room", "kitchen", "office_workspace", "dining_area"]
614
+
615
+ if scene_type in simple_scenes:
616
+ return 0.65 # higher threshold to avoid over-segmentation
617
+ elif scene_type in complex_scenes:
618
+ return 0.45 # lower threshold to allow reasonable partitioning
619
+ else:
620
+ return 0.55 # medium threshold as a balanced strategy
621
+
622
+ def _identify_indoor_zones(self, category_regions: Dict, detected_objects: List[Dict], scene_type: str) -> Dict:
623
+ """
624
+ Balanced identification of indoor functional zones,
625
+ using generic object-relationship analysis instead of scene-specific hard-coding.
626
+ """
627
+ zones = {}
628
+
629
+ # Identify the primary functional area (based on object relationships rather than scene type)
630
+ primary_zone = self._identify_primary_functional_area(detected_objects)
631
+ if primary_zone:
632
+ zones["primary_area"] = primary_zone
633
+
634
+ # Create a secondary functional area only with clear evidence and enough objects
635
+ if len(zones) >= 1 and len(detected_objects) >= 6:
636
+ secondary_zone = self._identify_secondary_functional_area(detected_objects, zones)
637
+ if secondary_zone:
638
+ zones["secondary_area"] = secondary_zone
639
 
640
  return zones
641
 
642
+ def _identify_primary_functional_area(self, detected_objects: List[Dict]) -> Dict:
643
+ """
644
+ Identify the primary functional area from the strongest object-relationship combinations,
645
+ using generic logic that handles a variety of indoor scenes.
646
+ """
647
+ # Dining area detection (table and chair combination)
648
+ dining_area = self._detect_functional_combination(
649
+ detected_objects,
650
+ primary_objects=[60], # dining table
651
+ supporting_objects=[56, 40, 41, 42, 43], # chair, wine glass, cup, fork, knife
652
+ min_supporting=2,
653
+ description_template="Dining area with table and seating arrangement"
654
+ )
655
+ if dining_area:
656
+ return dining_area
657
+
658
+ # Seating area detection (sofa/TV combination or a bed)
659
+ seating_area = self._detect_functional_combination(
660
+ detected_objects,
661
+ primary_objects=[57, 59], # sofa, bed
662
+ supporting_objects=[62, 58, 56], # tv, potted plant, chair
663
+ min_supporting=1,
664
+ description_template="Seating and relaxation area"
665
+ )
666
+ if seating_area:
667
+ return seating_area
668
+
669
+ # Work area detection (electronics combined with furniture)
670
+ work_area = self._detect_functional_combination(
671
+ detected_objects,
672
+ primary_objects=[63, 66], # laptop, keyboard
673
+ supporting_objects=[60, 56, 64], # dining table, chair, mouse
674
+ min_supporting=2,
675
+ description_template="Workspace area with electronics and furniture"
676
+ )
677
+ if work_area:
678
+ return work_area
679
+
680
+ return None
681
+
682
+ def _identify_secondary_functional_area(self, detected_objects: List[Dict], existing_zones: Dict) -> Dict:
683
+ """
684
+ Identify a secondary functional area while avoiding overlap with the primary area.
685
+ """
686
+ # Collect regions that are already in use
687
+ used_regions = set(zone["region"] for zone in existing_zones.values())
688
+
689
+ # Decorative area detection (clusters of plants)
690
+ decorative_area = self._detect_functional_combination(
691
+ detected_objects,
692
+ primary_objects=[58], # potted plant
693
+ supporting_objects=[75], # vase
694
+ min_supporting=0,
695
+ min_primary=3, # at least 3 plants required
696
+ description_template="Decorative area with plants and ornamental items",
697
+ exclude_regions=used_regions
698
+ )
699
+ if decorative_area:
700
+ return decorative_area
701
+
702
+ # Storage area detection (kitchen appliance combination)
703
+ storage_area = self._detect_functional_combination(
704
+ detected_objects,
705
+ primary_objects=[72, 68, 69], # refrigerator, microwave, oven
706
+ supporting_objects=[71], # sink
707
+ min_supporting=0,
708
+ min_primary=2,
709
+ description_template="Kitchen appliance and storage area",
710
+ exclude_regions=used_regions
711
+ )
712
+ if storage_area:
713
+ return storage_area
714
+
715
+ return None
716
+
717
+ def _detect_functional_combination(self, detected_objects: List[Dict], primary_objects: List[int],
718
+ supporting_objects: List[int], min_supporting: int,
719
+ description_template: str, min_primary: int = 1,
720
+ exclude_regions: set = None) -> Dict:
721
+ """
722
+ Generic functional-combination detection.
723
+ Determines a functional zone from a combination of primary and supporting objects.
724
+
725
+ Args:
726
+ detected_objects: List of detected objects
727
+ primary_objects: class_id list for primary objects
728
+ supporting_objects: class_id list for supporting objects
729
+ min_supporting: Minimum number of supporting objects required
730
+ description_template: Description template for the zone
731
+ min_primary: Minimum number of primary objects required
732
+ exclude_regions: Set of regions to exclude
733
+
734
+ Returns:
735
+ Dict: Functional zone information, or None if the requirements are not met
736
+ """
737
+ if exclude_regions is None:
738
+ exclude_regions = set()
739
+
740
+ # Collect primary objects
741
+ primary_objs = [obj for obj in detected_objects
742
+ if obj["class_id"] in primary_objects and obj.get("confidence", 0) >= 0.4]
743
+
744
+ # Collect supporting objects
745
+ supporting_objs = [obj for obj in detected_objects
746
+ if obj["class_id"] in supporting_objects and obj.get("confidence", 0) >= 0.4]
747
+
748
+ # Check the minimum count requirements
749
+ if len(primary_objs) < min_primary or len(supporting_objs) < min_supporting:
750
+ return None
751
+
752
+ # Organize objects by region
753
+ region_combinations = {}
754
+ all_relevant_objs = primary_objs + supporting_objs
755
+
756
+ for obj in all_relevant_objs:
757
+ region = obj["region"]
758
+
759
+ # Skip excluded regions
760
+ if region in exclude_regions:
761
+ continue
762
+
763
+ if region not in region_combinations:
764
+ region_combinations[region] = {"primary": [], "supporting": [], "all": []}
765
+
766
+ region_combinations[region]["all"].append(obj)
767
+
768
+ if obj["class_id"] in primary_objects:
769
+ region_combinations[region]["primary"].append(obj)
770
+ else:
771
+ region_combinations[region]["supporting"].append(obj)
772
+
773
+ # Find the best region combination
774
+ best_region = None
775
+ best_score = 0
776
+
777
+ for region, objs in region_combinations.items():
778
+ # Score this region
779
+ primary_count = len(objs["primary"])
780
+ supporting_count = len(objs["supporting"])
781
+
782
+ # Must meet the minimum requirements
783
+ if primary_count < min_primary or supporting_count < min_supporting:
784
+ continue
785
+
786
+ # Compute the combination score (primary objects carry more weight)
787
+ score = primary_count * 2 + supporting_count
788
+
789
+ if score > best_score:
790
+ best_score = score
791
+ best_region = region
792
+
793
+ if best_region is None:
794
+ return None
795
+
796
+ best_combination = region_combinations[best_region]
797
+ all_objects = [obj["class_name"] for obj in best_combination["all"]]
798
+
799
+ return {
800
+ "region": best_region,
801
+ "objects": all_objects,
802
+ "description": description_template
803
+ }
804
+
805
  def _identify_intersection_zones(self, category_regions: Dict, detected_objects: List[Dict], viewpoint: str) -> Dict:
806
  """
807
  Identify functional zones for urban intersections with enhanced spatial awareness.
 
856
 
857
  return zones
858
 
859
+ def _identify_landmark_zones(self, landmark_objects: List[Dict]) -> Dict:
860
+ """
861
+ Identify functional zones related to landmarks.
862
+
863
+ Args:
864
+ landmark_objects: List of objects identified as landmarks
865
+
866
+ Returns:
867
+ Dict: Landmark-related functional zones
868
+ """
869
+ landmark_zones = {}
870
+
871
+ if not landmark_objects:
872
+ print("Warning: No landmark objects provided to _identify_landmark_zones")
873
+ return landmark_zones
874
+
875
+ try:
876
+ for i, landmark in enumerate(landmark_objects):
877
+ if not isinstance(landmark, dict):
878
+ print(f"Warning: Landmark object at index {i} is not a dictionary: {type(landmark)}")
879
+ continue
880
+
881
+ landmark_id = landmark.get("landmark_id")
882
+ if not landmark_id:
883
+ print(f"Warning: Missing landmark_id for landmark at index {i}")
884
+ landmark_id = f"unknown_landmark_{i}"
885
+
886
+ landmark_name = landmark.get("class_name", "Landmark")
887
+ landmark_type = landmark.get("landmark_type", "architectural")
888
+ landmark_region = landmark.get("region", "middle_center")
889
+
890
+ # Create the main viewing zone for the landmark
891
+ zone_id = f"landmark_zone_{i+1}"
892
+ zone_name = f"{landmark_name} Viewing Area"
893
+
894
+ # Adjust the description based on the landmark type
895
+ if landmark_type == "natural":
896
+ zone_description = f"Scenic viewpoint for observing {landmark_name}, a notable natural landmark in {landmark.get('location', 'this area')}."
897
+ primary_function = "Nature observation and photography"
898
+ elif landmark_type == "monument":
899
+ zone_description = f"Viewing area around {landmark_name}, a significant monument in {landmark.get('location', 'this area')}."
900
+ primary_function = "Historical appreciation and cultural tourism"
901
+ else: # architectural
902
+ zone_description = f"Area centered around {landmark_name}, where visitors can observe and appreciate this iconic structure in {landmark.get('location', 'this area')}."
903
+ primary_function = "Architectural tourism and photography"
904
+
905
+ # Determine objects related to the landmark
906
+ related_objects = ["person", "camera", "cell phone", "backpack"]
907
+
908
+ # Create the functional zone
909
+ landmark_zones[zone_id] = {
910
+ "name": zone_name,
911
+ "description": zone_description,
912
+ "objects": ["landmark"] + [obj for obj in related_objects if obj in [o.get("class_name") for o in landmark_objects]],
913
+ "region": landmark_region,
914
+ "primary_function": primary_function
915
+ }
916
+
917
+ # Append the construction year to the description if available
918
+ if "year_built" in landmark:
919
+ landmark_zones[zone_id]["description"] += f" Built in {landmark['year_built']}."
920
+
921
+ # Append the architectural style to the description if available
922
+ if "architectural_style" in landmark:
923
+ landmark_zones[zone_id]["description"] += f" Features {landmark['architectural_style']} architectural style."
924
+
925
+ # Append significance information to the description if available
926
+ if "significance" in landmark:
927
+ landmark_zones[zone_id]["description"] += f" {landmark['significance']}."
928
+
929
+ try:
930
+ # Create a photography spot zone
931
+ photo_region = landmark_region # default to the same region as the landmark
932
+
933
+ # Adjust the photo-spot position based on the landmark location (the photo spot is usually in front of the landmark)
934
+ region_mapping = {
935
+ "top_left": "bottom_right",
936
+ "top_center": "bottom_center",
937
+ "top_right": "bottom_left",
938
+ "middle_left": "middle_right",
939
+ "middle_center": "bottom_center",
940
+ "middle_right": "middle_left",
941
+ "bottom_left": "top_right",
942
+ "bottom_center": "top_center",
943
+ "bottom_right": "top_left"
944
+ }
945
+
946
+ if landmark_region in region_mapping:
947
+ photo_region = region_mapping[landmark_region]
948
+
949
+ landmark_zones[f"photo_spot_{i+1}"] = {
950
+ "name": f"{landmark_name} Photography Spot",
951
+ "description": f"Popular position for photographing {landmark_name} with optimal viewing angle.",
952
+ "objects": ["camera", "person", "cell phone"],
953
+ "region": photo_region,
954
+ "primary_function": "Tourist photography"
955
+ }
956
+ except Exception as e:
957
+ print(f"Error creating photo spot zone: {e}")
958
+
959
+ try:
960
+ # Well-known landmarks may have a souvenir sales area
961
+ if landmark.get("confidence", 0) > 0.7: # high-confidence landmarks are more likely to have a souvenir area
962
+ # Find a suitable souvenir-area position based on the landmark location (usually near, but not on, the landmark)
963
+ adjacent_regions = {
964
+ "top_left": ["top_center", "middle_left"],
965
+ "top_center": ["top_left", "top_right"],
966
+ "top_right": ["top_center", "middle_right"],
967
+ "middle_left": ["top_left", "bottom_left"],
968
+ "middle_center": ["middle_left", "middle_right"],
969
+ "middle_right": ["top_right", "bottom_right"],
970
+ "bottom_left": ["middle_left", "bottom_center"],
971
+ "bottom_center": ["bottom_left", "bottom_right"],
972
+ "bottom_right": ["bottom_center", "middle_right"]
973
+ }
974
+
975
+ if landmark_region in adjacent_regions:
976
+ souvenir_region = adjacent_regions[landmark_region][0] # pick the first adjacent region
977
+
978
+ landmark_zones[f"souvenir_area_{i+1}"] = {
979
+ "name": f"{landmark_name} Souvenir Area",
980
+ "description": f"Area where visitors can purchase souvenirs and memorabilia related to {landmark_name}.",
981
+ "objects": ["person", "handbag", "backpack"],
982
+ "region": souvenir_region,
983
+ "primary_function": "Tourism commerce"
984
+ }
985
+ except Exception as e:
986
+ print(f"Error creating souvenir area zone: {e}")
987
+
988
+ except Exception as e:
989
+ print(f"Error in _identify_landmark_zones: {e}")
990
+ import traceback
991
+ traceback.print_exc()
992
+
993
+ return landmark_zones
994
+
995
  def _analyze_crossing_patterns(self, pedestrians: List[Dict], traffic_lights: List[Dict],
996
  region_distribution: Dict) -> Dict:
997
  """
 
1061
  if not vehicles:
1062
  return traffic_zones
1063
 
1064
+ # Group vehicles by region
1065
  vehicle_regions = {}
1066
  for v in vehicles:
1067
  region = v["region"]
 
1112
 
1113
  def _get_directional_description(self, region: str) -> str:
1114
  """
1115
+ Convert a region name into a compass-style directional description (north, south, east, west).
1116
 
1117
  Args:
1118
  region: Region name from the grid
 
1893
  return max(region_objects_dict.items(),
1894
  key=lambda x: len(x[1]),
1895
  default=("unknown", []))[0]
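Note: the feasibility check added above combines four normalized components (object count 30%, confidence quality 25%, functional relationships 25%, spatial diversity 20%) and compares the sum against a per-scene threshold of 0.45-0.65. A standalone sketch of that arithmetic, detached from the class purely for illustration:

def feasibility_score(num_objects, num_high_conf, relationship_score, num_regions):
    # Mirrors the weighted sum in _evaluate_zone_identification_feasibility.
    object_count_score = min(num_objects / 5.0, 1.0) * 0.30
    confidence_score = (num_high_conf / max(num_objects, 1)) * 0.25
    relationship_component = relationship_score * 0.25
    diversity_score = min(num_regions / 2.0, 1.0) * 0.20
    return object_count_score + confidence_score + relationship_component + diversity_score

# Example: 6 objects, 4 of them high-confidence, strong pairings (0.8),
# spread over 3 regions gives roughly 0.30 + 0.17 + 0.20 + 0.20 = 0.87,
# comfortably above the 0.45 threshold used for complex scenes.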
 
 
video_processor.py CHANGED
@@ -222,7 +222,7 @@ class VideoProcessor:
222
  else:
223
  obj_id = next_object_id
224
  next_object_id += 1
225
-
226
  # Use more prominent colors
227
  bright_colors = [
228
  (0, 0, 255), # red
 
222
  else:
223
  obj_id = next_object_id
224
  next_object_id += 1
225
+
226
  # Use more prominent colors
227
  bright_colors = [
228
  (0, 0, 255), # red