Upload 31 files
- activity_templates.py +56 -0
- app.py +189 -40
- clip_analyzer.py +316 -62
- clip_prompts.py +1 -1
- clip_zero_shot_classifier.py +1415 -0
- enhance_scene_describer.py +1108 -292
- evaluation_metrics.py +7 -8
- image_processor.py +302 -102
- landmark_activities.py +0 -0
- landmark_data.py +0 -0
- lighting_analyzer.py +0 -0
- lighting_conditions.py +40 -0
- llm_enhancer.py +198 -60
- object_template_fillers.py +7 -2
- places365_model.py +492 -0
- requirements.txt +17 -16
- scene_analyzer.py +0 -0
- scene_description.py +92 -21
- scene_detail_templates.py +15 -0
- scene_type.py +123 -0
- spatial_analyzer.py +589 -138
- video_processor.py +1 -1
activity_templates.py
CHANGED
@@ -320,5 +320,61 @@ ACTIVITY_TEMPLATES = {
         "Chef activities",
         "Commercial food handling",
         "Restaurant meal preparation"
+    ],
+    "tourist_landmark": [
+        "Sightseeing",
+        "Photography",
+        "Guided tours",
+        "Learning about landmark history",
+        "Souvenir shopping",
+        "Cultural appreciation",
+        "Architectural observation"
+    ],
+    "natural_landmark": [
+        "Nature photography",
+        "Scenic viewing",
+        "Hiking",
+        "Nature appreciation",
+        "Wildlife watching",
+        "Outdoor recreation",
+        "Environmental education"
+    ],
+    "historical_monument": [
+        "Historical tours",
+        "Cultural heritage appreciation",
+        "Educational visits",
+        "Historical photography",
+        "Learning about past events",
+        "Architectural study",
+        "Heritage tourism"
+    ],
+    "general_indoor_space": [
+        "Engaging in general indoor activities",
+        "Resting or relaxing in an indoor setting",
+        "Possibly having a conversation or reading"
+    ],
+    "generic_street_view": [
+        "People walking or commuting",
+        "Vehicles driving on the road",
+        "Observing street traffic and urban activity",
+        "Waiting at a crosswalk or bus stop (if applicable objects present)"
+    ],
+    "desk_area_workspace": [
+        "Working on a computer or laptop",
+        "Studying or reading documents",
+        "Writing or taking notes",
+        "Participating in an online meeting (if computer present)"
+    ],
+    "outdoor_gathering_spot": [
+        "People socializing outdoors",
+        "Relaxing on a bench or in a park-like setting",
+        "Engaging in light recreational activities",
+        "Having a picnic (if food items or backpacks are present)"
+    ],
+    "kitchen_counter_or_utility_area": [
+        "Preparing food or drinks",
+        "Using kitchen appliances like a microwave or toaster",
+        "Washing dishes or cleaning",
+        "Storing food items"
     ]
 }
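A minimal usage sketch (not part of the upload) of how a scene-type key might be resolved against the extended ACTIVITY_TEMPLATES dict above; the helper name and the fallback to "general_indoor_space" are illustrative assumptions, not code from the repository.

    # Illustrative only: look up candidate activities for a detected scene type.
    from activity_templates import ACTIVITY_TEMPLATES

    def get_candidate_activities(scene_type: str, limit: int = 3) -> list:
        """Return up to `limit` template activities, falling back to a generic indoor key."""
        activities = ACTIVITY_TEMPLATES.get(scene_type) or ACTIVITY_TEMPLATES.get("general_indoor_space", [])
        return activities[:limit]

    print(get_candidate_activities("tourist_landmark"))
    # e.g. ['Sightseeing', 'Photography', 'Guided tours']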
app.py
CHANGED
@@ -19,8 +19,57 @@ from video_processor import VideoProcessor
 from llm_enhancer import LLMEnhancer

 # Initialize Processors with LLM support
-image_processor =
-video_processor =
+image_processor = None
+video_processor = None
+
+def initialize_processors():
+    global image_processor, video_processor
+
+    try:
+        print("Attempting to initialize ImageProcessor with LLM support...")
+        image_processor = ImageProcessor(use_llm=True, llm_model_path="meta-llama/Llama-3.2-3B-Instruct")
+        print("ImageProcessor initialized successfully with LLM")
+
+        # Diagnostic checks
+        if hasattr(image_processor, 'scene_analyzer'):
+            if image_processor.scene_analyzer is not None:
+                print(f"scene_analyzer initialized: {type(image_processor.scene_analyzer)}")
+                if hasattr(image_processor.scene_analyzer, 'use_llm'):
+                    print(f"scene_analyzer.use_llm available: {image_processor.scene_analyzer.use_llm}")
+            else:
+                print("WARNING: scene_analyzer is None after initialization")
+        else:
+            print("WARNING: scene_analyzer attribute not found in image_processor")
+
+        video_processor = VideoProcessor(image_processor)
+        print("VideoProcessor initialized successfully")
+        return True
+
+    except Exception as e:
+        print(f"Error initializing processors with LLM: {e}")
+        import traceback
+        traceback.print_exc()
+
+        # Create fallback processor without LLM
+        try:
+            print("Attempting fallback initialization without LLM...")
+            image_processor = ImageProcessor(use_llm=False, enable_places365=False)
+            video_processor = VideoProcessor(image_processor)
+            print("Fallback processors initialized successfully without LLM and Places365")
+            return True
+
+        except Exception as fallback_error:
+            print(f"Fatal error: Cannot initialize processors: {fallback_error}")
+            import traceback
+            traceback.print_exc()
+            image_processor = None
+            video_processor = None
+            return False
+
+# Initialize processors
+initialization_success = initialize_processors()
+if not initialization_success:
+    print("WARNING: Failed to initialize processors. Application may not function correctly.")

 # Helper Function
 def get_all_classes():
@@ -58,14 +107,93 @@ def get_all_classes():
     return sorted(default_classes.items())

 @spaces.GPU
-def handle_image_upload(image, model_name, confidence_threshold, filter_classes=None, use_llm=True):
+def handle_image_upload(image, model_name, confidence_threshold, filter_classes=None, use_llm=True, enable_landmark=True):
     """Processes a single uploaded image."""
+    # Enhanced safety check for image_processor
+    if image_processor is None:
+        error_msg = "Image processor is not initialized. Please restart the application or check system dependencies."
+        print(f"ERROR: {error_msg}")
+
+        # Create error plot
+        fig, ax = plt.subplots(figsize=(8, 6))
+        ax.text(0.5, 0.5, "Initialization Error\nProcessor Not Available",
+                color="red", ha="center", va="center", fontsize=14, fontweight="bold")
+        ax.axis('off')
+
+        return (None, error_msg, {}, fig, f"<div style='color: red; font-weight: bold;'>Error: {error_msg}</div>",
+                "<div style='color: red;'>Error: System not initialized</div>",
+                [["System Error"]], [["System Error"]], {}, {"time_of_day": "error", "confidence": 0})
+
+    # Additional safety check for processor attributes
+    if not hasattr(image_processor, 'use_llm'):
+        error_msg = "Image processor is corrupted. Missing required attributes."
+        print(f"ERROR: {error_msg}")
+
+        fig, ax = plt.subplots(figsize=(8, 6))
+        ax.text(0.5, 0.5, "Processor Error\nCorrupted State",
+                color="red", ha="center", va="center", fontsize=14, fontweight="bold")
+        ax.axis('off')
+
+        return (None, error_msg, {}, fig, f"<div style='color: red; font-weight: bold;'>Error: {error_msg}</div>",
+                "<div style='color: red;'>Error: Processor corrupted</div>",
+                [["Processor Error"]], [["Processor Error"]], {}, {"time_of_day": "error", "confidence": 0})
+
+    print(f"DIAGNOSTIC: Image upload handled with enable_landmark={enable_landmark}, use_llm={use_llm}")
+    print(f"Processing image with model: {model_name}, confidence: {confidence_threshold}, use_llm: {use_llm}, enable_landmark: {enable_landmark}")
     try:
         image_processor.use_llm = use_llm
+
+        # Make sure scene_analyzer is not None
+        if hasattr(image_processor, 'scene_analyzer') and image_processor.scene_analyzer is not None:
+            if hasattr(image_processor.scene_analyzer, 'use_llm'):
+                image_processor.scene_analyzer.use_llm = use_llm
+                print(f"Updated existing scene_analyzer use_llm setting to: {use_llm}")
+
+            # Check and set landmark detection
+            if hasattr(image_processor.scene_analyzer, 'use_landmark_detection'):
+                # Set all related flags
+                image_processor.scene_analyzer.use_landmark_detection = enable_landmark
+                image_processor.scene_analyzer.enable_landmark = enable_landmark
+
+                # Make sure the processor itself also carries this option
+                image_processor.enable_landmark = enable_landmark
+
+                # Check and set the deeper components
+                if hasattr(image_processor.scene_analyzer, 'scene_describer') and image_processor.scene_analyzer.scene_describer is not None:
+                    image_processor.scene_analyzer.scene_describer.enable_landmark = enable_landmark
+
+                # Check and set the flag on the CLIP analyzer
+                if hasattr(image_processor.scene_analyzer, 'clip_analyzer') and image_processor.scene_analyzer.clip_analyzer is not None:
+                    if hasattr(image_processor.scene_analyzer.clip_analyzer, 'enable_landmark'):
+                        image_processor.scene_analyzer.clip_analyzer.enable_landmark = enable_landmark
+
+                # Check and set the LLM enhancer
+                if hasattr(image_processor.scene_analyzer, 'llm_enhancer') and image_processor.scene_analyzer.llm_enhancer is not None:
+                    if hasattr(image_processor.scene_analyzer.llm_enhancer, 'enable_landmark'):
+                        image_processor.scene_analyzer.llm_enhancer.enable_landmark = enable_landmark
+                        print(f"Updated LLM enhancer enable_landmark to: {enable_landmark}")
+
+                print(f"Updated all landmark detection settings to: {enable_landmark}")
+        else:
+            print("WARNING: scene_analyzer is None or not available")
+            if hasattr(image_processor, 'enable_landmark'):
+                image_processor.enable_landmark = enable_landmark
+
+            # Set the deeper components as well
+            if hasattr(image_processor.scene_analyzer, 'scene_describer'):
+                image_processor.scene_analyzer.scene_describer.enable_landmark = enable_landmark
+
+            # Set the flag on the CLIP analyzer
+            if hasattr(image_processor.scene_analyzer, 'clip_analyzer'):
+                if hasattr(image_processor.scene_analyzer.clip_analyzer, 'enable_landmark'):
+                    image_processor.scene_analyzer.clip_analyzer.enable_landmark = enable_landmark
+
+            # If there is an LLM enhancer, set it too
+            if hasattr(image_processor.scene_analyzer, 'llm_enhancer'):
+                image_processor.scene_analyzer.llm_enhancer.enable_landmark = enable_landmark
+                print(f"Updated LLM enhancer enable_landmark to: {enable_landmark}")
+
+            print(f"Updated all landmark detection settings to: {enable_landmark}")

         class_ids_to_filter = None
         if filter_classes:
@@ -92,11 +220,13 @@ def handle_image_upload(image, model_name, confidence_threshold, filter_classes=
             print(f"Filtering image results for class IDs: {class_ids_to_filter}")

         # Call the existing image processing logic
+        print(f"DEBUG: app.py passes enable_landmark={enable_landmark} to process_image")
         result_image, result_text, stats = image_processor.process_image(
             image,
             model_name,
             confidence_threshold,
-            class_ids_to_filter
+            class_ids_to_filter,
+            enable_landmark
         )

         # Format stats for JSON display
@@ -191,15 +321,13 @@ def handle_image_upload(image, model_name, confidence_threshold, filter_classes=

         print(f"Original scene description (first 50 chars): {scene_desc[:50]}...")

-        #
+        # determine original description
         clean_scene_desc = clean_description(scene_desc)
         print(f"Cleaned scene description (first 50 chars): {clean_scene_desc[:50]}...")

-        # Even if the cleaned description is empty, make sure the original content is still shown
         if not clean_scene_desc.strip():
             clean_scene_desc = scene_desc

-        # Build the HTML for the original description
         scene_desc_html = f"<div>{clean_scene_desc}</div>"

         # Get the LLM-enhanced description and make sure the default is an empty string rather than None, otherwise a NoneType error occurs
@@ -210,18 +338,18 @@ def handle_image_upload(image, model_name, confidence_threshold, filter_classes=
         if not enhanced_description or not enhanced_description.strip():
             print("WARNING: LLM enhanced description is empty!")

-        #
+        # badge & label
         llm_badge = ""
         description_to_show = ""

+        # Show the original description in the "Original Scene Analysis" accordion
         if use_llm and enhanced_description:
             llm_badge = '<span style="display:inline-block; margin-left:8px; padding:3px 10px; border-radius:12px; background: linear-gradient(90deg, #38b2ac, #4299e1); color:white; font-size:0.7rem; font-weight:bold; box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1); border: 1px solid rgba(255, 255, 255, 0.2);">LLM Enhanced</span>'
             description_to_show = enhanced_description
+
         else:
             llm_badge = '<span style="display:inline-block; margin-left:8px; padding:3px 10px; border-radius:12px; background-color:#718096; color:white; font-size:0.7rem; font-weight:bold;">Basic</span>'
             description_to_show = clean_scene_desc
-            # When LLM is not used, the accordion shows no content

         # A badge is added to the title when the LLM description is shown
         scene_description_html = f'''
@@ -271,7 +399,7 @@ def handle_image_upload(image, model_name, confidence_threshold, filter_classes=
             print("WARNING: LLM enhanced description is empty!")

         return (result_image, result_text, formatted_stats, plot_figure,
-                scene_description_html, original_desc_html,
+                scene_description_html, original_desc_html,
                 activities_list_data, safety_data, zones, lighting)

     except Exception as e:
@@ -471,6 +599,12 @@ def create_interface():
                         info="Provides more detailed and natural language descriptions (may increase processing time)"
                     )

+                    use_landmark_detection = gr.Checkbox(
+                        label="Use CLIP for Landmark Detection",
+                        value=False,
+                        info="Detect famous landmarks, monuments, and tourist attractions that standard object detection cannot recognize (increases processing time)"
+                    )
+
                     with gr.Accordion("Filter Classes", open=False):
                         gr.HTML('<div class="section-heading" style="font-size: 1rem;">Common Categories</div>')
                         with gr.Row():
@@ -490,24 +624,39 @@ def create_interface():
                 with gr.Group(elem_classes="how-to-use"):
                     gr.HTML('<div class="section-heading">How to Use (Image)</div>')
                     gr.Markdown("""
+                    1. Upload an image or use the camera
+                    2. *(Optional)* Adjust settings like confidence threshold or model size (n, m = balanced, x = accurate)
+                    3. In **Analysis Settings**, you can:
+                       * Uncheck **Use LLM** to skip enhanced descriptions (faster)
+                       * Check **Use CLIP for Landmark Detection** to identify famous landmarks like museums, monuments, and tourist attractions *(may take longer)*
+                       * Filter object classes to focus on specific types of objects *(optional)*
+                    4. Click **Analyze Image** button
+
+                    **💡 Tip:** For landmark recognition (e.g. Louvre Museum), make sure to enable **CLIP for Landmark Detection** in the settings above.
                     """)
+
+
                 # Image Examples
                 gr.Examples(
                     examples=[
-                        "room_01.jpg",
-                        "room_02.jpg",
-                        "
-                        "
+                        "/content/drive/Othercomputers/我的 MacBook Pro/Learning/VisionScout/test_images/room_01.jpg",
+                        "/content/drive/Othercomputers/我的 MacBook Pro/Learning/VisionScout/test_images/room_02.jpg",
+                        "/content/drive/Othercomputers/我的 MacBook Pro/Learning/VisionScout/test_images/street_04.jpg",
+                        "/content/drive/Othercomputers/我的 MacBook Pro/Learning/VisionScout/test_images/street_05.jpg",
+                        "/content/drive/Othercomputers/我的 MacBook Pro/Learning/VisionScout/test_images/landmark_Louvre_01.jpg",
                     ],
                     inputs=image_input,
                     label="Example Images"
                 )

+                gr.HTML("""
+                    <div style="text-align: center; margin-top: 8px; padding: 6px; background-color: #f8f9fa; border-radius: 4px; border: 1px solid #e2e8f0;">
+                        <p style="font-size: 12px; color: #718096; margin: 0;">
+                            📷 Sample images sourced from <a href="https://unsplash.com" target="_blank" style="color: #3182ce; text-decoration: underline;">Unsplash</a>
+                        </p>
+                    </div>
+                """)
+
             # Right Column: Image Results
             with gr.Column(scale=6, elem_classes="output-panel"):
                 with gr.Tabs(elem_classes="tabs"):
@@ -540,8 +689,8 @@ def create_interface():
                         </p>
                     </div>
                     ''')
-                    image_scene_description_html = gr.HTML(label=None, elem_id="scene_analysis_description_text")
-
+                    image_scene_description_html = gr.HTML(label=None, elem_id="scene_analysis_description_text")
+
                     # The original description is also shown when the LLM-enhanced description is used
                     with gr.Accordion("Original Scene Analysis", open=False, elem_id="original_scene_analysis_accordion"):
                         image_llm_description = gr.HTML(label=None, elem_id="original_scene_description_text")
@@ -709,7 +858,7 @@ def create_interface():

         image_detect_btn.click(
             fn=handle_image_upload,
-            inputs=[image_input, image_model_dropdown, image_confidence, image_class_filter, use_llm],
+            inputs=[image_input, image_model_dropdown, image_confidence, image_class_filter, use_llm, use_landmark_detection],
             outputs=[
                 image_result_image, image_result_text, image_stats_json, image_plot_output,
                 image_scene_description_html, image_llm_description, image_activities_list, image_safety_list, image_zones_json,
@@ -732,18 +881,18 @@ def create_interface():

         # Footer
         gr.HTML("""
+            <div class="footer" style="padding: 25px 0; text-align: center; background: linear-gradient(to right, #f5f9fc, #e1f5fe); border-top: 1px solid #e2e8f0; margin-top: 30px;">
+                <div style="margin-bottom: 15px;">
+                    <p style="font-size: 14px; color: #4A5568; margin: 5px 0;">Powered by YOLOv8, CLIP, Places365, Meta Llama3.2 and Ultralytics • Created with Gradio</p>
+                </div>
+                <div style="display: flex; align-items: center; justify-content: center; gap: 20px; margin-top: 15px;">
+                    <p style="font-family: 'Arial', sans-serif; font-size: 14px; font-weight: 500; letter-spacing: 2px; background: linear-gradient(90deg, #38b2ac, #4299e1); -webkit-background-clip: text; -webkit-text-fill-color: transparent; margin: 0; text-transform: uppercase; display: inline-block;">EXPLORE THE CODE →</p>
+                    <a href="https://github.com/Eric-Chung-0511/Learning-Record/tree/main/Data%20Science%20Projects/VisionScout" target="_blank" style="text-decoration: none;">
+                        <img src="https://img.shields.io/badge/GitHub-VisionScout-4299e1?logo=github&style=for-the-badge">
+                    </a>
+                </div>
+            </div>
+        """)

     return demo

@@ -751,4 +900,4 @@ def create_interface():
 if __name__ == "__main__":
     demo_interface = create_interface()

-    demo_interface.launch()
+    demo_interface.launch(debug=True)
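The handler above repeats the same hasattr/None pattern to push one flag through several optional sub-components (scene_analyzer, scene_describer, clip_analyzer, llm_enhancer). A minimal sketch of that idea as a generic helper, purely illustrative and not part of this commit; the attribute names in the commented example mirror the ones used in the diff.

    # Illustrative only: walk a chain of optional attributes and set a flag wherever it exists.
    def propagate_flag(root, attr_chain, flag_name, value):
        """Return True if flag_name was set on the object at the end of attr_chain."""
        target = root
        for name in attr_chain:
            target = getattr(target, name, None)
            if target is None:
                return False
        if hasattr(target, flag_name):
            setattr(target, flag_name, value)
            return True
        return False

    # Mirrors: image_processor.scene_analyzer.clip_analyzer.enable_landmark = enable_landmark
    # propagate_flag(image_processor, ["scene_analyzer", "clip_analyzer"], "enable_landmark", True)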
clip_analyzer.py
CHANGED
@@ -20,12 +20,12 @@ class CLIPAnalyzer:
     Use CLIP to integrate scene understanding functionality
     """

-    def __init__(self, model_name: str = "ViT-
+    def __init__(self, model_name: str = "ViT-L/14", device: str = None):
         """
         Initialize the CLIP analyzer.

         Args:
-            model_name: CLIP Model name,
+            model_name: CLIP model name, defaults to "ViT-L/14"
             device: use GPU if available
         """
         # Automatically select the device
@@ -55,49 +55,150 @@ class CLIPAnalyzer:
         self._prepare_text_prompts()

     def _prepare_text_prompts(self):
-        """準備所有文本提示的 CLIP
-        self.
+        """Prepare the CLIP features for all text prompts and store them in self.text_features_cache"""
+        self.text_features_cache = {}
+
+        # Basic scene types (SCENE_TYPE_PROMPTS)
+        if hasattr(self, 'scene_type_prompts') and self.scene_type_prompts:
+            scene_texts = [prompt for scene_type, prompt in self.scene_type_prompts.items()]
+            if scene_texts:
+                self.text_features_cache["scene_type_keys"] = list(self.scene_type_prompts.keys())
+                try:
+                    self.text_features_cache["scene_type_tokens"] = clip.tokenize(scene_texts).to(self.device)
+                except Exception as e:
+                    print(f"Warning: Error tokenizing scene_type_prompts: {e}")
+                    self.text_features_cache["scene_type_tokens"] = None  # mark as failed or empty
+            else:
+                self.text_features_cache["scene_type_keys"] = []
+                self.text_features_cache["scene_type_tokens"] = None
+        else:
+            self.text_features_cache["scene_type_keys"] = []
+            self.text_features_cache["scene_type_tokens"] = None
+
+        # Cultural scenes (CULTURAL_SCENE_PROMPTS)
+        # cultural_tokens_dict stores the tokenized prompts
+        cultural_tokens_dict_val = {}
+        if hasattr(self, 'cultural_scene_prompts') and self.cultural_scene_prompts:
+            for scene_type, prompts in self.cultural_scene_prompts.items():
+                if prompts and isinstance(prompts, list) and all(isinstance(p, str) for p in prompts):
+                    try:
+                        cultural_tokens_dict_val[scene_type] = clip.tokenize(prompts).to(self.device)
+                    except Exception as e:
+                        print(f"Warning: Error tokenizing cultural_scene_prompts for {scene_type}: {e}")
+                        cultural_tokens_dict_val[scene_type] = None  # mark as failed or empty
+                else:
+                    cultural_tokens_dict_val[scene_type] = None  # prompts are not valid
+        self.text_features_cache["cultural_tokens_dict"] = cultural_tokens_dict_val
+
+        # Lighting conditions (LIGHTING_CONDITION_PROMPTS)
+        if hasattr(self, 'lighting_condition_prompts') and self.lighting_condition_prompts:
+            lighting_texts = [prompt for cond, prompt in self.lighting_condition_prompts.items()]
+            if lighting_texts:
+                self.text_features_cache["lighting_condition_keys"] = list(self.lighting_condition_prompts.keys())
+                try:
+                    self.text_features_cache["lighting_tokens"] = clip.tokenize(lighting_texts).to(self.device)
+                except Exception as e:
+                    print(f"Warning: Error tokenizing lighting_condition_prompts: {e}")
+                    self.text_features_cache["lighting_tokens"] = None
+            else:
+                self.text_features_cache["lighting_condition_keys"] = []
+                self.text_features_cache["lighting_tokens"] = None
+        else:
+            self.text_features_cache["lighting_condition_keys"] = []
+            self.text_features_cache["lighting_tokens"] = None
+
+        # Specialized scenes (SPECIALIZED_SCENE_PROMPTS)
+        specialized_tokens_dict_val = {}
+        if hasattr(self, 'specialized_scene_prompts') and self.specialized_scene_prompts:
+            for scene_type, prompts in self.specialized_scene_prompts.items():
+                if prompts and isinstance(prompts, list) and all(isinstance(p, str) for p in prompts):
+                    try:
+                        specialized_tokens_dict_val[scene_type] = clip.tokenize(prompts).to(self.device)
+                    except Exception as e:
+                        print(f"Warning: Error tokenizing specialized_scene_prompts for {scene_type}: {e}")
+                        specialized_tokens_dict_val[scene_type] = None
+                else:
+                    specialized_tokens_dict_val[scene_type] = None
+        self.text_features_cache["specialized_tokens_dict"] = specialized_tokens_dict_val
+
+        # Viewpoints (VIEWPOINT_PROMPTS)
+        if hasattr(self, 'viewpoint_prompts') and self.viewpoint_prompts:
+            viewpoint_texts = [prompt for viewpoint, prompt in self.viewpoint_prompts.items()]
+            if viewpoint_texts:
+                self.text_features_cache["viewpoint_keys"] = list(self.viewpoint_prompts.keys())
+                try:
+                    self.text_features_cache["viewpoint_tokens"] = clip.tokenize(viewpoint_texts).to(self.device)
+                except Exception as e:
+                    print(f"Warning: Error tokenizing viewpoint_prompts: {e}")
+                    self.text_features_cache["viewpoint_tokens"] = None
+            else:
+                self.text_features_cache["viewpoint_keys"] = []
+                self.text_features_cache["viewpoint_tokens"] = None
+        else:
+            self.text_features_cache["viewpoint_keys"] = []
+            self.text_features_cache["viewpoint_tokens"] = None
+
+        # Object combinations (OBJECT_COMBINATION_PROMPTS)
+        if hasattr(self, 'object_combination_prompts') and self.object_combination_prompts:
+            object_combination_texts = [prompt for combo, prompt in self.object_combination_prompts.items()]
+            if object_combination_texts:
+                self.text_features_cache["object_combination_keys"] = list(self.object_combination_prompts.keys())
+                try:
+                    self.text_features_cache["object_combination_tokens"] = clip.tokenize(object_combination_texts).to(self.device)
+                except Exception as e:
+                    print(f"Warning: Error tokenizing object_combination_prompts: {e}")
+                    self.text_features_cache["object_combination_tokens"] = None
+            else:
+                self.text_features_cache["object_combination_keys"] = []
+                self.text_features_cache["object_combination_tokens"] = None
+        else:
+            self.text_features_cache["object_combination_keys"] = []
+            self.text_features_cache["object_combination_tokens"] = None
+
+        # Activities (ACTIVITY_PROMPTS)
+        if hasattr(self, 'activity_prompts') and self.activity_prompts:
+            activity_texts = [prompt for activity, prompt in self.activity_prompts.items()]
+            if activity_texts:
+                self.text_features_cache["activity_keys"] = list(self.activity_prompts.keys())
+                try:
+                    self.text_features_cache["activity_tokens"] = clip.tokenize(activity_texts).to(self.device)
+                except Exception as e:
+                    print(f"Warning: Error tokenizing activity_prompts: {e}")
+                    self.text_features_cache["activity_tokens"] = None
+            else:
+                self.text_features_cache["activity_keys"] = []
+                self.text_features_cache["activity_tokens"] = None
+        else:
+            self.text_features_cache["activity_keys"] = []
+            self.text_features_cache["activity_tokens"] = None
+
+        self.scene_type_tokens = self.text_features_cache["scene_type_tokens"]
+        self.lighting_tokens = self.text_features_cache["lighting_tokens"]
+        self.viewpoint_tokens = self.text_features_cache["viewpoint_tokens"]
+        self.object_combination_tokens = self.text_features_cache["object_combination_tokens"]
+        self.activity_tokens = self.text_features_cache["activity_tokens"]
+        self.cultural_tokens_dict = self.text_features_cache["cultural_tokens_dict"]
+        self.specialized_tokens_dict = self.text_features_cache["specialized_tokens_dict"]
+
+        print("CLIP text_features_cache prepared.")
+
+    def analyze_image(self, image, include_cultural_analysis=True, exclude_categories=None, enable_landmark=True, places365_guidance=None):
         """
         Analyze an image and predict its scene type and lighting conditions.

         Args:
             image: input image (PIL Image or numpy array)
             include_cultural_analysis: whether to include detailed analysis of cultural scenes
+            exclude_categories: list of categories to exclude
+            enable_landmark: whether to enable landmark detection
+            places365_guidance: scene guidance provided by Places365 (optional)
+

         Returns:
             Dict: analysis results containing the scene-type predictions and lighting conditions
         """
         try:
+            self.enable_landmark = enable_landmark  # update the instance's enable_landmark state
             # Make sure the image is in PIL format
             if not isinstance(image, Image.Image):
                 if isinstance(image, np.ndarray):
@@ -113,46 +214,127 @@ class CLIPAnalyzer:
             image_features = self.model.encode_image(image_input)
             image_features = image_features / image_features.norm(dim=-1, keepdim=True)

+            places365_focus_areas = []
+            places365_scene_context = ""  # stores the scene description provided by Places365
+
+            if places365_guidance and isinstance(places365_guidance, dict) and places365_guidance.get('confidence', 0) > 0.4:
+                mapped_scene = places365_guidance.get('mapped_scene_type', '')
+                scene_label = places365_guidance.get('scene_label', '')
+                # is_indoor = places365_guidance.get('is_indoor', None)  # unused, can stay commented out
+                attributes = places365_guidance.get('attributes', [])
+
+                places365_scene_context = f"Scene identified by Places365 as {scene_label}"  # update the context description
+
+                # Adjust CLIP analysis focus based on Places365 scene type
+                if mapped_scene in ['kitchen', 'dining_area', 'restaurant']:
+                    places365_focus_areas.extend(['food preparation', 'dining setup', 'kitchen appliances'])
+                elif mapped_scene in ['office_workspace', 'educational_setting', 'library', 'conference_room']:
+                    places365_focus_areas.extend(['work environment', 'professional setting', 'learning space', 'study area'])
+                elif mapped_scene in ['retail_store', 'shopping_mall', 'market', 'supermarket']:  # broadened matching
+                    places365_focus_areas.extend(['commercial space', 'shopping environment', 'retail display', 'goods for sale'])
+                elif mapped_scene in ['park_area', 'beach', 'natural_outdoor_area', 'playground', 'sports_field']:  # broadened matching
+                    places365_focus_areas.extend(['outdoor recreation', 'natural environment', 'leisure activity', 'open space'])
+
+                # Add more generic focus areas based on the attributes
+                if isinstance(attributes, list):  # make sure attributes is a list
+                    if 'commercial' in attributes:
+                        places365_focus_areas.append('business activity')
+                    if 'recreational' in attributes:
+                        places365_focus_areas.append('entertainment or leisure')
+                    if 'residential' in attributes:
+                        places365_focus_areas.append('living space')
+
+                # Deduplicate
+                places365_focus_areas = list(set(places365_focus_areas))
+
+                if places365_focus_areas:  # only print when there really are focus areas
+                    print(f"CLIP analysis guided by Places365: {places365_scene_context}, focus areas: {places365_focus_areas}")
+
+            # Analyze the scene type, passing the enable_landmark parameter and the Places365 guidance
+            scene_scores = self._analyze_scene_type(image_features,
+                                                    enable_landmark=self.enable_landmark,  # use the updated instance attribute
+                                                    places365_focus=places365_focus_areas)
+
+            # If landmark detection is disabled, make sure landmark-related categories are excluded
+            current_exclude_categories = list(exclude_categories) if exclude_categories is not None else []
+            if not self.enable_landmark:  # use the updated instance attribute
+                landmark_related_terms = ["landmark", "monument", "tower", "tourist", "attraction", "historical", "famous", "iconic"]
+                for term in landmark_related_terms:
+                    if term not in current_exclude_categories:
+                        current_exclude_categories.append(term)
+
+            if current_exclude_categories:
+                filtered_scores = {}
+                for scene, score in scene_scores.items():
+                    # Check whether the scene key (usually English) contains any excluded term
+                    if not any(cat.lower() in scene.lower() for cat in current_exclude_categories):
+                        filtered_scores[scene] = score
+
+                if filtered_scores:
+                    total_score = sum(filtered_scores.values())
+                    if total_score > 1e-5:  # avoid dividing by zero or a very small number
+                        scene_scores = {k: v / total_score for k, v in filtered_scores.items()}
+                    else:  # if the total is close to zero, keep as-is or set to zero
+                        scene_scores = {k: 0.0 for k in filtered_scores.keys()}  # or scene_scores = filtered_scores
+                else:  # if no scenes remain after filtering
+                    scene_scores = {k: (0.0 if any(cat.lower() in k.lower() for cat in current_exclude_categories) else v) for k, v in scene_scores.items()}
+                    if not any(s > 1e-5 for s in scene_scores.values()):  # if everything is still zero
+                        scene_scores = {"unknown": 1.0}  # provide a default to avoid an empty dict

-            # Analyze the lighting conditions
             lighting_scores = self._analyze_lighting_condition(image_features)
-
-            # Enhanced analysis for cultural scenes
             cultural_analysis = {}
-            if include_cultural_analysis:
-                for
+            if include_cultural_analysis and self.enable_landmark:  # use the updated instance attribute
+                for scene_type_cultural_key in self.text_features_cache.get("cultural_tokens_dict", {}).keys():
+                    # Make sure scene_type_cultural_key is a key in SCENE_TYPE_PROMPTS, or that a mapping exists
+                    if scene_type_cultural_key in scene_scores and scene_scores[scene_type_cultural_key] > 0.2:
+                        cultural_analysis[scene_type_cultural_key] = self._analyze_cultural_scene(
+                            image_features, scene_type_cultural_key
                         )

             specialized_analysis = {}
-            for
-                if
-                    specialized_analysis[
-                        image_features,
+            for scene_type_specialized_key in self.text_features_cache.get("specialized_tokens_dict", {}).keys():
+                if scene_type_specialized_key in scene_scores and scene_scores[scene_type_specialized_key] > 0.2:
+                    specialized_analysis[scene_type_specialized_key] = self._analyze_specialized_scene(
+                        image_features, scene_type_specialized_key
                     )

             viewpoint_scores = self._analyze_viewpoint(image_features)
             object_combination_scores = self._analyze_object_combinations(image_features)
             activity_scores = self._analyze_activities(image_features)

+            if scene_scores:  # make sure scene_scores is not empty
+                top_scene = max(scene_scores.items(), key=lambda x: x[1])
+                # If landmark detection is disabled, double-check that top_scene is not landmark-related
+                if not self.enable_landmark and any(cat.lower() in top_scene[0].lower() for cat in current_exclude_categories):
+                    non_excluded_scores = {k: v for k, v in scene_scores.items() if not any(cat.lower() in k.lower() for cat in current_exclude_categories)}
+                    if non_excluded_scores:
+                        top_scene = max(non_excluded_scores.items(), key=lambda x: x[1])
+                    else:
+                        top_scene = ("unknown", 0.0)  # or another suitable default
+            else:
+                top_scene = ("unknown", 0.0)
+
             result = {
                 "scene_scores": scene_scores,
-                "top_scene":
-                "lighting_condition": max(lighting_scores.items(), key=lambda x: x[1]),
-                "embedding": image_features.cpu().numpy().tolist()[0]
-                "viewpoint": max(viewpoint_scores.items(), key=lambda x: x[1]),
-                "object_combinations": sorted(object_combination_scores.items(), key=lambda x: x[1], reverse=True)[:3],
-                "activities": sorted(activity_scores.items(), key=lambda x: x[1], reverse=True)[:3]
+                "top_scene": top_scene,
+                "lighting_condition": max(lighting_scores.items(), key=lambda x: x[1]) if lighting_scores else ("unknown", 0.0),
+                "embedding": image_features.cpu().numpy().tolist()[0],  # simplified
+                "viewpoint": max(viewpoint_scores.items(), key=lambda x: x[1]) if viewpoint_scores else ("unknown", 0.0),
+                "object_combinations": sorted(object_combination_scores.items(), key=lambda x: x[1], reverse=True)[:3] if object_combination_scores else [],
+                "activities": sorted(activity_scores.items(), key=lambda x: x[1], reverse=True)[:3] if activity_scores else []
             }

-            if
+            if places365_guidance and isinstance(places365_guidance, dict) and places365_focus_areas:  # check that places365_focus_areas was populated
+                result["places365_guidance"] = {
+                    "scene_context": places365_scene_context,
+                    "focus_areas": places365_focus_areas,  # now carries the guidance-based content
+                    "guided_analysis": True,
+                    "original_places365_scene": places365_guidance.get('scene_label', 'N/A'),
+                    "original_places365_confidence": places365_guidance.get('confidence', 0.0)
+                }
+
+            if cultural_analysis and self.enable_landmark:
                 result["cultural_analysis"] = cultural_analysis

             if specialized_analysis:
@@ -164,15 +346,49 @@ class CLIPAnalyzer:
             print(f"Error analyzing image with CLIP: {e}")
             import traceback
             traceback.print_exc()
-            return {"error": str(e)}
+            return {"error": str(e), "scene_scores": {}, "top_scene": ("error", 0.0)}

+    def _analyze_scene_type(self, image_features: torch.Tensor, enable_landmark: bool = True, places365_focus: List[str] = None) -> Dict[str, float]:
+        """
+        Compute the similarity between the image features and each scene type, optionally excluding landmark-related scenes.
+
+        Args:
+            image_features: image features encoded by CLIP
+            enable_landmark: whether landmark recognition is enabled
+
+        Returns:
+            Dict[str, float]: similarity score per scene type
+        """
         with torch.no_grad():
             # Compute the scene-type text features
             text_features = self.model.encode_text(self.scene_type_tokens)
             text_features = text_features / text_features.norm(dim=-1, keepdim=True)

+            # Apply Places365 guidance if available
+            if places365_focus and len(places365_focus) > 0:
+                # Create enhanced prompts that incorporate Places365 guidance
+                enhanced_prompts = []
+                for scene_type in self.scene_type_prompts.keys():
+                    base_prompt = self.scene_type_prompts[scene_type]
+
+                    # Check if this scene type should be emphasized based on Places365 guidance
+                    scene_lower = scene_type.lower()
+                    should_enhance = False
+
+                    for focus_area in places365_focus:
+                        if any(keyword in scene_lower for keyword in focus_area.split()):
+                            should_enhance = True
+                            enhanced_prompts.append(f"{base_prompt} with {focus_area}")
+                            break
+
+                    if not should_enhance:
+                        enhanced_prompts.append(base_prompt)
+
+                # Re-tokenize and encode enhanced prompts
+                enhanced_tokens = clip.tokenize(enhanced_prompts).to(self.device)
+                text_features = self.model.encode_text(enhanced_tokens)
+                text_features = text_features / text_features.norm(dim=-1, keepdim=True)
+
             # Compute the similarity scores
             similarity = (100 * image_features @ text_features.T).softmax(dim=-1)
             similarity = similarity.cpu().numpy()[0] if self.device == "cuda" else similarity.numpy()[0]
@@ -180,7 +396,36 @@ class CLIPAnalyzer:
             # Build the scene-score dictionary
             scene_scores = {}
             for i, scene_type in enumerate(self.scene_type_prompts.keys()):
+                # If landmark detection is disabled, skip landmark-related scene types
+                if not enable_landmark and scene_type in ["tourist_landmark", "natural_landmark", "historical_monument"]:
+                    scene_scores[scene_type] = 0.0  # set landmark scene scores to zero
+                else:
+                    base_score = float(similarity[i])
+
+                    # Apply Places365 guidance boost if applicable
+                    if places365_focus:
+                        scene_lower = scene_type.lower()
+                        boost_factor = 1.0
+
+                        for focus_area in places365_focus:
+                            if any(keyword in scene_lower for keyword in focus_area.split()):
+                                boost_factor = 1.15  # 15% boost for matching scenes
+                                break
+
+                        scene_scores[scene_type] = base_score * boost_factor
+                    else:
+                        scene_scores[scene_type] = base_score
+
+            # If landmark detection is disabled, re-normalize the remaining scene scores
+            if not enable_landmark:
+                # Collect all non-zero scores
+                non_zero_scores = {k: v for k, v in scene_scores.items() if v > 0}
+                if non_zero_scores:
+                    # Compute the total and normalize
+                    total_score = sum(non_zero_scores.values())
+                    if total_score > 0:
+                        for scene_type in non_zero_scores:
+                            scene_scores[scene_type] = non_zero_scores[scene_type] / total_score

             return scene_scores

@@ -388,3 +633,12 @@ class CLIPAnalyzer:
             result[query] = float(similarity[i])

         return result
+
+    def get_clip_instance(self):
+        """
+        Return the initialized CLIP model instance so other modules can reuse it.
+
+        Returns:
+            tuple: (model instance, preprocess function, device name)
+        """
+        return self.model, self.preprocess, self.device
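A minimal usage sketch (not part of the upload) of driving the Places365-guided analysis added above. The guidance keys mirror the ones read in the diff (confidence, mapped_scene_type, scene_label, attributes); the image path is illustrative only, and the guidance threshold of 0.4 is taken from the diff.

    # Illustrative only: call the new analyze_image signature with Places365 guidance.
    from PIL import Image
    from clip_analyzer import CLIPAnalyzer

    analyzer = CLIPAnalyzer()  # defaults to ViT-L/14 per the new __init__ signature
    image = Image.open("test_images/room_01.jpg")  # hypothetical path

    guidance = {
        "confidence": 0.62,               # must exceed 0.4 for the guidance branch to run
        "mapped_scene_type": "kitchen",
        "scene_label": "kitchen",
        "attributes": ["residential"],
    }

    result = analyzer.analyze_image(image, enable_landmark=False, places365_guidance=guidance)
    print(result["top_scene"])
    print(result.get("places365_guidance", {}).get("focus_areas"))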
clip_prompts.py
CHANGED
@@ -137,7 +137,7 @@ COMPARATIVE_PROMPTS = {
     "asian_vs_western_commercial": [
         "An Asian shopping street with vertical signage and compact multi-level shops.",
         "A Western commercial street with horizontal storefronts and wider sidewalks.",
-        "An East Asian retail area with dense signage in Asian scripts and narrow walkways."
+        "An East Asian retail area with dense signage in Asian scripts and narrow walkways."
         "A Western shopping district with uniform building heights and Latin alphabetic signs."
     ],
     "daytime_vs_nighttime": [
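The recoverable text of the removed and added line is identical in this extraction, so the exact one-line change is not visible here. What the surrounding lines do show is that the last two prompts have no comma between them, so Python's implicit string-literal concatenation joins them into a single prompt. A short illustrative sketch of that behavior (not code from the repository):

    # Illustrative only: adjacent string literals with no comma become one list element.
    prompts = [
        "An East Asian retail area with dense signage in Asian scripts and narrow walkways."
        "A Western shopping district with uniform building heights and Latin alphabetic signs."
    ]
    print(len(prompts))  # 1 -- the two literals are concatenated into a single string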
clip_zero_shot_classifier.py
ADDED
@@ -0,0 +1,1415 @@
import torch
import clip
from PIL import Image
import numpy as np
from typing import List, Dict, Tuple, Optional, Union, Any

from landmark_data import ALL_LANDMARKS, get_all_landmark_prompts

class CLIPZeroShotClassifier:
    """
    Zero-shot classification with a CLIP model, focused on recognizing world-famous landmarks.
    Complements YOLO detection by handling landmark buildings that standard object detection cannot identify.
    """
    def __init__(self, model_name: str = "ViT-L/14", device: str = None):
        """
        Initialize the CLIP zero-shot classifier.

        Args:
            model_name: CLIP model name, defaults to "ViT-L/14"
            device: device to run on; None selects automatically
        """
        # Select the device
        if device is None:
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
        else:
            self.device = device

        print(f"Initializing CLIP Zero-Shot Landmark Classifier ({model_name}) on {self.device}")
        try:
            self.model, self.preprocess = clip.load(model_name, device=self.device)
            print(f"Successfully loaded CLIP model")
        except Exception as e:
            print(f"Error loading CLIP model: {e}")
            raise

        # Load the landmark data
        try:
            self.landmark_data = ALL_LANDMARKS
            self.landmark_prompts = get_all_landmark_prompts()
            print(f"Loaded {len(self.landmark_prompts)} landmark prompts for classification")

            # Precompute the landmark text features
            self.landmark_text_features = self._precompute_text_features(self.landmark_prompts)

            # Build a landmark-ID-to-index mapping for fast lookup
            self.landmark_id_to_index = {landmark_id: i for i, landmark_id in enumerate(ALL_LANDMARKS.keys())}

            # Initialize batch-processing parameters
            self.batch_size = 16  # default batch size
            self.confidence_threshold_multipliers = {
                "close_up": 0.9,   # standard threshold for close-up views
                "partial": 0.6,    # partially visible landmarks get a lower threshold
                "distant": 0.5,    # distant views get an even lower threshold
                "full_image": 0.7  # whole-image classification needs a higher threshold
            }

            self.landmark_type_thresholds = {
                "tower": 0.5,       # tower-style buildings need a higher threshold
                "skyscraper": 0.4,  # skyscrapers use a lower threshold
                "building": 0.55,   # generic buildings get a slightly lower threshold
                "monument": 0.5,    # monuments
                "natural": 0.6      # natural landmarks can use a lower threshold
            }

            # Initialize the results cache
            self.results_cache = {}    # keyed by an image hash
            self.cache_max_size = 100  # maximum number of cached items

        except ImportError:
            print("Warning: landmark_data.py not found. Landmark classification will be limited")
            self.landmark_data = {}
            self.landmark_prompts = []
            self.landmark_text_features = None
            self.landmark_id_to_index = {}
            self.results_cache = {}

    def _get_image_hash(self, image):
        """
        Generate a simple hash for an image, used for caching.

        Args:
            image: PIL Image or numpy array

        Returns:
            str: hash value of the image
        """
        if isinstance(image, np.ndarray):
            # For numpy arrays, downsample and compute a simple hash
            small_img = image[::10, ::10] if image.ndim == 3 else image
            return hash(small_img.tobytes())
        else:
            # For PIL images, resize and convert to bytes
            small_img = image.resize((32, 32))
            return hash(small_img.tobytes())

    def _manage_cache(self):
        """
        Keep the results cache within its size limit.
        """
        if len(self.results_cache) > self.cache_max_size:
            oldest_key = next(iter(self.results_cache))
            del self.results_cache[oldest_key]

    def set_batch_size(self, batch_size: int):
        """
        Set the batch size.

        Args:
            batch_size: new batch size
        """
        self.batch_size = max(1, batch_size)
        print(f"Batch size set to {self.batch_size}")

    def adjust_confidence_threshold(self, detection_type: str, multiplier: float):
        """
        Adjust the confidence-threshold multiplier for a specific detection type.

        Args:
            detection_type: detection type ('close_up', 'partial', 'distant', 'full_image')
            multiplier: confidence-threshold multiplier
        """
        if detection_type in self.confidence_threshold_multipliers:
            self.confidence_threshold_multipliers[detection_type] = max(0.1, min(1.5, multiplier))
            print(f"Adjusted confidence threshold multiplier for {detection_type} to {multiplier}")
        else:
            print(f"Unknown detection type: {detection_type}")

    def _precompute_text_features(self, text_prompts: List[str]) -> torch.Tensor:
        """
        Precompute the CLIP features for the text prompts to improve batch-processing efficiency.

        Args:
            text_prompts: list of text prompts

        Returns:
            torch.Tensor: precomputed text features
        """
        if not text_prompts:
            return None

        with torch.no_grad():
            # Process in batches to avoid CUDA memory issues
            batch_size = 128  # Adjust based on GPU memory
            features_list = []

            for i in range(0, len(text_prompts), batch_size):
                batch_prompts = text_prompts[i:i+batch_size]
                text_tokens = clip.tokenize(batch_prompts).to(self.device)
                batch_features = self.model.encode_text(text_tokens)
                batch_features = batch_features / batch_features.norm(dim=-1, keepdim=True)
+
features_list.append(batch_features)
|
155 |
+
|
156 |
+
# Concatenate all batches
|
157 |
+
if len(features_list) > 1:
|
158 |
+
text_features = torch.cat(features_list, dim=0)
|
159 |
+
else:
|
160 |
+
text_features = features_list[0]
|
161 |
+
|
162 |
+
return text_features
|
163 |
+
|
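The precomputed text features above are consumed the same way throughout this file (classify_image_region, classify_batch_regions, search_entire_image): both embeddings are L2-normalised, so their matrix product is a cosine similarity, which is scaled by 100 and softmaxed into a distribution over the landmark prompts. A minimal sketch of that scoring step with random stand-in tensors (illustration only, not part of the uploaded file):

    import torch

    # Stand-ins for one image embedding and five landmark text embeddings.
    image_features = torch.randn(1, 768)
    text_features = torch.randn(5, 768)

    # L2-normalise so the dot product equals cosine similarity.
    image_features = image_features / image_features.norm(dim=-1, keepdim=True)
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)

    # 100x temperature scaling followed by softmax, as in the methods below.
    similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
    best_idx = similarity[0].argmax().item()
    print(best_idx, float(similarity[0, best_idx]))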
164 |
+
def _perform_pyramid_analysis(self,
|
165 |
+
image: Union[Image.Image, np.ndarray],
|
166 |
+
levels: int = 4,
|
167 |
+
base_threshold: float = 0.25,
|
168 |
+
aspect_ratios: List[float] = [1.0, 0.75, 1.5]) -> Dict[str, Any]:
|
169 |
+
"""
|
170 |
+
Performs multi-scale pyramid analysis on the image to improve landmark detection.
|
171 |
+
|
172 |
+
Args:
|
173 |
+
image: Input image
|
174 |
+
levels: Number of pyramid levels
|
175 |
+
base_threshold: Base confidence threshold
|
176 |
+
aspect_ratios: Different aspect ratios to try (for tall buildings vs wide landscapes)
|
177 |
+
|
178 |
+
Returns:
|
179 |
+
Dict: Results of pyramid analysis
|
180 |
+
"""
|
181 |
+
# Ensure image is PIL format
|
182 |
+
if not isinstance(image, Image.Image):
|
183 |
+
if isinstance(image, np.ndarray):
|
184 |
+
image = Image.fromarray(image)
|
185 |
+
else:
|
186 |
+
raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
|
187 |
+
|
188 |
+
width, height = image.size
|
189 |
+
pyramid_results = []
|
190 |
+
|
191 |
+
# 對每個縮放和縱橫比組合進行處理
|
192 |
+
for level in range(levels):
|
193 |
+
# 計算縮放因子
|
194 |
+
scale_factor = 1.0 - (level * 0.2)
|
195 |
+
|
196 |
+
for aspect_ratio in aspect_ratios:
|
197 |
+
# 計算新尺寸,保持面積近似不變
|
198 |
+
if aspect_ratio != 1.0:
|
199 |
+
# 保持面積近似不變的情況下調整縱橫比
|
200 |
+
new_width = int(width * scale_factor * (1/aspect_ratio)**0.5)
|
201 |
+
new_height = int(height * scale_factor * aspect_ratio**0.5)
|
202 |
+
else:
|
203 |
+
new_width = int(width * scale_factor)
|
204 |
+
new_height = int(height * scale_factor)
|
205 |
+
|
206 |
+
# 調整圖像大小
|
207 |
+
scaled_image = image.resize((new_width, new_height), Image.LANCZOS)
|
208 |
+
|
209 |
+
# 預處理圖像
|
210 |
+
image_input = self.preprocess(scaled_image).unsqueeze(0).to(self.device)
|
211 |
+
|
212 |
+
# 獲取圖像特徵
|
213 |
+
with torch.no_grad():
|
214 |
+
image_features = self.model.encode_image(image_input)
|
215 |
+
image_features = image_features / image_features.norm(dim=-1, keepdim=True)
|
216 |
+
|
217 |
+
# 計算相似度
|
218 |
+
similarity = (100.0 * image_features @ self.landmark_text_features.T).softmax(dim=-1)
|
219 |
+
similarity = similarity.cpu().numpy()[0] if self.device == "cuda" else similarity.numpy()[0]
|
220 |
+
|
221 |
+
# 找到最佳匹配
|
222 |
+
best_idx = similarity.argmax().item()
|
223 |
+
best_score = similarity[best_idx]
|
224 |
+
|
225 |
+
if best_score >= base_threshold:
|
226 |
+
landmark_id = list(self.landmark_data.keys())[best_idx]
|
227 |
+
landmark_info = self.landmark_data[landmark_id]
|
228 |
+
|
229 |
+
pyramid_results.append({
|
230 |
+
"landmark_id": landmark_id,
|
231 |
+
"landmark_name": landmark_info["name"],
|
232 |
+
"confidence": float(best_score),
|
233 |
+
"scale_factor": scale_factor,
|
234 |
+
"aspect_ratio": aspect_ratio,
|
235 |
+
"location": landmark_info["location"]
|
236 |
+
})
|
237 |
+
|
238 |
+
# 按置信度排序
|
239 |
+
pyramid_results.sort(key=lambda x: x["confidence"], reverse=True)
|
240 |
+
|
241 |
+
return {
|
242 |
+
"is_landmark": len(pyramid_results) > 0,
|
243 |
+
"results": pyramid_results,
|
244 |
+
"best_result": pyramid_results[0] if pyramid_results else None
|
245 |
+
}
|
246 |
+
|
247 |
+
def _enhance_features(self, image: Union[Image.Image, np.ndarray]) -> Image.Image:
|
248 |
+
"""
|
249 |
+
Enhances image features to improve landmark detection.
|
250 |
+
|
251 |
+
Args:
|
252 |
+
image: Input image
|
253 |
+
|
254 |
+
Returns:
|
255 |
+
PIL.Image: Enhanced image
|
256 |
+
"""
|
257 |
+
# Ensure image is PIL format
|
258 |
+
if not isinstance(image, Image.Image):
|
259 |
+
if isinstance(image, np.ndarray):
|
260 |
+
image = Image.fromarray(image)
|
261 |
+
else:
|
262 |
+
raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
|
263 |
+
|
264 |
+
# Convert to numpy for processing
|
265 |
+
img_array = np.array(image)
|
266 |
+
|
267 |
+
# Skip processing for grayscale images
|
268 |
+
if len(img_array.shape) < 3:
|
269 |
+
return image
|
270 |
+
|
271 |
+
# Apply adaptive contrast enhancement
|
272 |
+
# Convert to LAB color space
|
273 |
+
try:
|
274 |
+
from skimage import color, exposure
|
275 |
+
# Convert to LAB color space
|
276 |
+
if img_array.shape[2] == 4: # Handle RGBA
|
277 |
+
img_array = img_array[:,:,:3]
|
278 |
+
|
279 |
+
lab = color.rgb2lab(img_array[:,:,:3] / 255.0)
|
280 |
+
l_channel = lab[:,:,0]
|
281 |
+
|
282 |
+
# Enhance contrast of L channel
|
283 |
+
p2, p98 = np.percentile(l_channel, (2, 98))
|
284 |
+
l_channel_enhanced = exposure.rescale_intensity(l_channel, in_range=(p2, p98))
|
285 |
+
|
286 |
+
# Replace L channel and convert back to RGB
|
287 |
+
lab[:,:,0] = l_channel_enhanced
|
288 |
+
enhanced_img = color.lab2rgb(lab) * 255.0
|
289 |
+
enhanced_img = enhanced_img.astype(np.uint8)
|
290 |
+
|
291 |
+
return Image.fromarray(enhanced_img)
|
292 |
+
except ImportError:
|
293 |
+
print("Warning: skimage not available for feature enhancement")
|
294 |
+
return image
|
295 |
+
except Exception as e:
|
296 |
+
print(f"Error in feature enhancement: {e}")
|
297 |
+
return image
|
298 |
+
|
299 |
+
def _determine_landmark_type(self, landmark_id):
|
300 |
+
"""
|
301 |
+
自動判斷地標類型,基於地標數據和命名
|
302 |
+
|
303 |
+
Returns:
|
304 |
+
str: 地標類型,用於調整閾值
|
305 |
+
"""
|
306 |
+
if not landmark_id:
|
307 |
+
return "building" # 預設類型
|
308 |
+
|
309 |
+
# 獲取地標詳細數據
|
310 |
+
landmark_data = self.landmark_data if hasattr(self, 'landmark_data') else {}
|
311 |
+
landmark_info = landmark_data.get(landmark_id, {})
|
312 |
+
|
313 |
+
# 獲取地標相關文本
|
314 |
+
landmark_id_lower = landmark_id.lower()
|
315 |
+
landmark_name = landmark_info.get("name", "").lower()
|
316 |
+
landmark_location = landmark_info.get("location", "").lower()
|
317 |
+
landmark_aliases = [alias.lower() for alias in landmark_info.get("aliases", [])]
|
318 |
+
|
319 |
+
# 合併所有文本數據用於特徵判斷
|
320 |
+
combined_text = " ".join([landmark_id_lower, landmark_name] + landmark_aliases)
|
321 |
+
|
322 |
+
# 地標類型的特色特徵
|
323 |
+
type_features = {
|
324 |
+
"skyscraper": ["skyscraper", "tall", "tower", "高樓", "摩天", "大厦", "タワー"],
|
325 |
+
"tower": ["tower", "bell", "clock", "塔", "鐘樓", "タワー", "campanile"],
|
326 |
+
"monument": ["monument", "memorial", "statue", "紀念", "雕像", "像", "memorial"],
|
327 |
+
"natural": ["mountain", "lake", "canyon", "falls", "beach", "山", "湖", "峽谷", "瀑布", "海灘"],
|
328 |
+
"temple": ["temple", "shrine", "寺", "神社", "廟"],
|
329 |
+
"palace": ["palace", "castle", "宮", "城", "皇宮", "宫殿"],
|
330 |
+
"distinctive": ["unique", "leaning", "slanted", "傾斜", "斜", "獨特", "傾く"]
|
331 |
+
}
|
332 |
+
|
333 |
+
# 檢查是否位於亞洲地區
|
334 |
+
asian_regions = ["china", "japan", "korea", "taiwan", "singapore", "vietnam", "thailand",
|
335 |
+
"hong kong", "中國", "日本", "韓國", "台灣", "新加坡", "越南", "泰國", "香港"]
|
336 |
+
is_asian = any(region in landmark_location for region in asian_regions)
|
337 |
+
|
338 |
+
# 判斷地標類型
|
339 |
+
best_type = None
|
340 |
+
max_matches = 0
|
341 |
+
|
342 |
+
for type_name, features in type_features.items():
|
343 |
+
# 計算特徵詞匹配數量
|
344 |
+
matches = sum(1 for feature in features if feature in combined_text)
|
345 |
+
if matches > max_matches:
|
346 |
+
max_matches = matches
|
347 |
+
best_type = type_name
|
348 |
+
|
349 |
+
# 處理亞洲地區特例
|
350 |
+
if is_asian and best_type == "tower":
|
351 |
+
best_type = "skyscraper" # 亞洲地區的塔型建築閾值較低
|
352 |
+
|
353 |
+
# 特例處理:檢測傾斜建築
|
354 |
+
if any(term in combined_text for term in ["leaning", "slanted", "tilt", "inclined", "斜", "傾斜"]):
|
355 |
+
return "distinctive" # 傾斜建築需要特殊處理
|
356 |
+
|
357 |
+
return best_type if best_type and max_matches > 0 else "building" # 預設為一般建築
|
358 |
+
|
359 |
+
def classify_image_region(self,
|
360 |
+
image: Union[Image.Image, np.ndarray],
|
361 |
+
box: List[float],
|
362 |
+
threshold: float = 0.25,
|
363 |
+
detection_type: str = "close_up") -> Dict[str, Any]:
|
364 |
+
"""
|
365 |
+
對圖像的特定區域進行地標分類,具有增強的多尺度和部分識別能力
|
366 |
+
|
367 |
+
Args:
|
368 |
+
image: 原始圖像 (PIL Image 或 numpy數組)
|
369 |
+
box: 邊界框 [x1, y1, x2, y2]
|
370 |
+
threshold: 基礎分類置信度閾值
|
371 |
+
detection_type: 檢測類型,影響置信度調整
|
372 |
+
|
373 |
+
Returns:
|
374 |
+
Dict: 地標分類結果
|
375 |
+
"""
|
376 |
+
# 確保圖像是PIL格式
|
377 |
+
if not isinstance(image, Image.Image):
|
378 |
+
if isinstance(image, np.ndarray):
|
379 |
+
image = Image.fromarray(image)
|
380 |
+
else:
|
381 |
+
raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
|
382 |
+
|
383 |
+
# 生成圖像區域的hash用於快取
|
384 |
+
region_key = (self._get_image_hash(image), tuple(box), detection_type)
|
385 |
+
if region_key in self.results_cache:
|
386 |
+
return self.results_cache[region_key]
|
387 |
+
|
388 |
+
# 裁剪區域
|
389 |
+
x1, y1, x2, y2 = map(int, box)
|
390 |
+
cropped_image = image.crop((x1, y1, x2, y2))
|
391 |
+
enhanced_image = self._enhance_features(cropped_image)
|
392 |
+
|
393 |
+
# 分析視角信息
|
394 |
+
viewpoint_info = self._analyze_viewpoint(enhanced_image)
|
395 |
+
dominant_viewpoint = viewpoint_info["dominant_viewpoint"]
|
396 |
+
|
397 |
+
# 計算區域信息
|
398 |
+
region_width = x2 - x1
|
399 |
+
region_height = y2 - y1
|
400 |
+
image_width, image_height = image.size
|
401 |
+
|
402 |
+
# 根據區域大小判斷可能的檢測類型
|
403 |
+
region_area_ratio = (region_width * region_height) / (image_width * image_height)
|
404 |
+
if detection_type == "auto":
|
405 |
+
if region_area_ratio > 0.5:
|
406 |
+
detection_type = "close_up"
|
407 |
+
elif region_area_ratio > 0.2:
|
408 |
+
detection_type = "partial"
|
409 |
+
else:
|
410 |
+
detection_type = "distant"
|
411 |
+
|
412 |
+
# 根據視角調整檢測類型
|
413 |
+
if dominant_viewpoint == "close_up" and detection_type != "close_up":
|
414 |
+
detection_type = "close_up"
|
415 |
+
elif dominant_viewpoint == "distant" and detection_type != "distant":
|
416 |
+
detection_type = "distant"
|
417 |
+
elif dominant_viewpoint == "angled_view":
|
418 |
+
detection_type = "partial" # 角度視圖可能是部分可見
|
419 |
+
|
420 |
+
# 調整置信度閾值
|
421 |
+
base_multiplier = self.confidence_threshold_multipliers.get(detection_type, 1.0)
|
422 |
+
adjusted_threshold = threshold * base_multiplier
|
423 |
+
|
424 |
+
# 調整多尺度處理的尺度範圍和縱橫比 - 增強對傾斜建築的支持
|
425 |
+
scales = [1.0] # 默認尺度
|
426 |
+
|
427 |
+
# 基於視角選擇合適的尺度和縱橫比
|
428 |
+
if detection_type in ["partial", "distant"]:
|
429 |
+
scales = [0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3] # 標準範圍
|
430 |
+
|
431 |
+
# 如果是特殊視角,進一步調整尺度和縱橫比 - 新增
|
432 |
+
if dominant_viewpoint in ["angled_view", "low_angle"]:
|
433 |
+
scales = [0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4] # 更寬的範圍
|
434 |
+
|
435 |
+
# 準備縱橫比 - 同時支持水平和垂直地標
|
436 |
+
aspect_ratios = [1.0, 0.8, 1.2] # 標準縱橫比
|
437 |
+
|
438 |
+
# 針對可能的傾斜建築增加更多縱橫比 - 新增
|
439 |
+
if dominant_viewpoint in ["angled_view", "unique_feature"]:
|
440 |
+
aspect_ratios = [0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.5] # 更多樣的縱橫比
|
441 |
+
|
442 |
+
best_result = {
|
443 |
+
"landmark_id": None,
|
444 |
+
"landmark_name": None,
|
445 |
+
"confidence": 0.0,
|
446 |
+
"is_landmark": False
|
447 |
+
}
|
448 |
+
|
449 |
+
# 多尺度和縱橫比分析
|
450 |
+
for scale in scales:
|
451 |
+
for aspect_ratio in aspect_ratios:
|
452 |
+
# 縮放裁剪區域
|
453 |
+
current_width, current_height = cropped_image.size
|
454 |
+
|
455 |
+
# 計算新尺寸,保持面積不變但調整縱橫比
|
456 |
+
if aspect_ratio != 1.0:
|
457 |
+
new_width = int(current_width * scale * (1/aspect_ratio)**0.5)
|
458 |
+
new_height = int(current_height * scale * aspect_ratio**0.5)
|
459 |
+
else:
|
460 |
+
new_width = int(current_width * scale)
|
461 |
+
new_height = int(current_height * scale)
|
462 |
+
|
463 |
+
# 確保尺寸至少為1像素
|
464 |
+
new_width = max(1, new_width)
|
465 |
+
new_height = max(1, new_height)
|
466 |
+
|
467 |
+
# 縮放圖像
|
468 |
+
try:
|
469 |
+
scaled_image = cropped_image.resize((new_width, new_height), Image.LANCZOS)
|
470 |
+
except Exception as e:
|
471 |
+
print(f"Failed to resize image to {new_width}x{new_height}: {e}")
|
472 |
+
continue
|
473 |
+
|
474 |
+
# 預處理裁剪圖像
|
475 |
+
try:
|
476 |
+
image_input = self.preprocess(scaled_image).unsqueeze(0).to(self.device)
|
477 |
+
except Exception as e:
|
478 |
+
print(f"Failed to preprocess image: {e}")
|
479 |
+
continue
|
480 |
+
|
481 |
+
# 獲取圖像特徵
|
482 |
+
with torch.no_grad():
|
483 |
+
try:
|
484 |
+
image_features = self.model.encode_image(image_input)
|
485 |
+
image_features = image_features / image_features.norm(dim=-1, keepdim=True)
|
486 |
+
|
487 |
+
# 計算與地標提示的相似度
|
488 |
+
similarity = (100.0 * image_features @ self.landmark_text_features.T).softmax(dim=-1)
|
489 |
+
similarity = similarity.cpu().numpy()[0] if self.device == "cuda" else similarity.numpy()[0]
|
490 |
+
|
491 |
+
# 找到最佳匹配
|
492 |
+
best_idx = similarity.argmax().item()
|
493 |
+
best_score = similarity[best_idx]
|
494 |
+
|
495 |
+
# 如果當前尺度結果更好,則更新
|
496 |
+
if best_score > best_result["confidence"]:
|
497 |
+
landmark_id = list(self.landmark_data.keys())[best_idx]
|
498 |
+
landmark_info = self.landmark_data[landmark_id]
|
499 |
+
|
500 |
+
best_result = {
|
501 |
+
"landmark_id": landmark_id,
|
502 |
+
"landmark_name": landmark_info["name"],
|
503 |
+
"location": landmark_info["location"],
|
504 |
+
"confidence": float(best_score),
|
505 |
+
"is_landmark": best_score >= adjusted_threshold,
|
506 |
+
"scale_used": scale,
|
507 |
+
"aspect_ratio_used": aspect_ratio,
|
508 |
+
"viewpoint": dominant_viewpoint
|
509 |
+
}
|
510 |
+
|
511 |
+
# 添加額外可用信息
|
512 |
+
for key in ["year_built", "architectural_style", "significance"]:
|
513 |
+
if key in landmark_info:
|
514 |
+
best_result[key] = landmark_info[key]
|
515 |
+
except Exception as e:
|
516 |
+
print(f"Error in calculating similarity: {e}")
|
517 |
+
continue
|
518 |
+
|
519 |
+
# 只有在有識別出地標ID且信心度足夠高時才應用地標類型閾值調整
|
520 |
+
if best_result["landmark_id"]:
|
521 |
+
landmark_type = self._determine_landmark_type(best_result["landmark_id"])
|
522 |
+
|
523 |
+
# 檢測是否為特殊類型的建築如斜塔
|
524 |
+
if landmark_type == "distinctive":
|
525 |
+
# 特殊建築的閾值降低25%
|
526 |
+
type_multiplier = 0.75
|
527 |
+
else:
|
528 |
+
# 使用已有的類型閾值
|
529 |
+
type_multiplier = self.landmark_type_thresholds.get(landmark_type, 1.0) / 0.5
|
530 |
+
|
531 |
+
# 更新判斷是否為地標的標準
|
532 |
+
final_threshold = adjusted_threshold * type_multiplier
|
533 |
+
best_result["is_landmark"] = best_result["confidence"] >= final_threshold
|
534 |
+
best_result["landmark_type"] = landmark_type # 添加地標類型信息
|
535 |
+
best_result["threshold_applied"] = final_threshold # 記錄應用的閾值
|
536 |
+
|
537 |
+
# 快取結果
|
538 |
+
self.results_cache[region_key] = best_result
|
539 |
+
self._manage_cache()
|
540 |
+
|
541 |
+
return best_result
|
542 |
+
|
543 |
+
def classify_batch_regions(self,
|
544 |
+
image: Union[Image.Image, np.ndarray],
|
545 |
+
boxes: List[List[float]],
|
546 |
+
threshold: float = 0.28) -> List[Dict[str, Any]]:
|
547 |
+
"""
|
548 |
+
批量處理多個圖像區域,提高效率
|
549 |
+
|
550 |
+
Args:
|
551 |
+
image: 原始圖像
|
552 |
+
boxes: 邊界框列表
|
553 |
+
threshold: 置信度閾值
|
554 |
+
|
555 |
+
Returns:
|
556 |
+
List[Dict]: 分類結果列表
|
557 |
+
"""
|
558 |
+
if self.landmark_text_features is None:
|
559 |
+
return [{"is_landmark": False, "confidence": 0.0} for _ in boxes]
|
560 |
+
|
561 |
+
# 確保圖像是PIL格式
|
562 |
+
if not isinstance(image, Image.Image):
|
563 |
+
if isinstance(image, np.ndarray):
|
564 |
+
image = Image.fromarray(image)
|
565 |
+
else:
|
566 |
+
raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
|
567 |
+
|
568 |
+
# 無框可處理時
|
569 |
+
if not boxes:
|
570 |
+
return []
|
571 |
+
|
572 |
+
# 裁剪並預處理所有區域
|
573 |
+
cropped_inputs = []
|
574 |
+
for box in boxes:
|
575 |
+
x1, y1, x2, y2 = map(int, box)
|
576 |
+
cropped_image = image.crop((x1, y1, x2, y2))
|
577 |
+
processed_image = self.preprocess(cropped_image).unsqueeze(0)
|
578 |
+
cropped_inputs.append(processed_image)
|
579 |
+
|
580 |
+
# batch process
|
581 |
+
batch_tensor = torch.cat(cropped_inputs).to(self.device)
|
582 |
+
|
583 |
+
# batch encoding
|
584 |
+
with torch.no_grad():
|
585 |
+
image_features = self.model.encode_image(batch_tensor)
|
586 |
+
image_features = image_features / image_features.norm(dim=-1, keepdim=True)
|
587 |
+
|
588 |
+
# 計算相似度
|
589 |
+
similarity = (100.0 * image_features @ self.landmark_text_features.T).softmax(dim=-1)
|
590 |
+
similarity = similarity.cpu().numpy() if self.device == "cuda" else similarity.numpy()
|
591 |
+
|
592 |
+
# 處理每個區域的結果
|
593 |
+
results = []
|
594 |
+
for i, sim in enumerate(similarity):
|
595 |
+
best_idx = sim.argmax().item()
|
596 |
+
best_score = sim[best_idx]
|
597 |
+
|
598 |
+
if best_score >= threshold:
|
599 |
+
landmark_id = list(self.landmark_data.keys())[best_idx]
|
600 |
+
landmark_info = self.landmark_data[landmark_id]
|
601 |
+
|
602 |
+
results.append({
|
603 |
+
"landmark_id": landmark_id,
|
604 |
+
"landmark_name": landmark_info["name"],
|
605 |
+
"location": landmark_info["location"],
|
606 |
+
"confidence": float(best_score),
|
607 |
+
"is_landmark": True,
|
608 |
+
"box": boxes[i]
|
609 |
+
})
|
610 |
+
else:
|
611 |
+
results.append({
|
612 |
+
"landmark_id": None,
|
613 |
+
"landmark_name": None,
|
614 |
+
"confidence": float(best_score),
|
615 |
+
"is_landmark": False,
|
616 |
+
"box": boxes[i]
|
617 |
+
})
|
618 |
+
|
619 |
+
return results
|
620 |
+
|
621 |
+
def search_entire_image(self,
|
622 |
+
image: Union[Image.Image, np.ndarray],
|
623 |
+
threshold: float = 0.35,
|
624 |
+
detailed_analysis: bool = False) -> Dict[str, Any]:
|
625 |
+
"""
|
626 |
+
檢查整張圖像是否包含地標,具有增強的分析能力
|
627 |
+
|
628 |
+
Args:
|
629 |
+
image: 原始圖像
|
630 |
+
threshold: 置信度閾值
|
631 |
+
detailed_analysis: 是否進行詳細分析,包括多區域檢測
|
632 |
+
|
633 |
+
Returns:
|
634 |
+
Dict: 地標分類結果
|
635 |
+
"""
|
636 |
+
# 確保圖像是PIL格式
|
637 |
+
if not isinstance(image, Image.Image):
|
638 |
+
if isinstance(image, np.ndarray):
|
639 |
+
image = Image.fromarray(image)
|
640 |
+
else:
|
641 |
+
raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
|
642 |
+
|
643 |
+
# 檢查快取
|
644 |
+
image_key = (self._get_image_hash(image), "entire_image", detailed_analysis)
|
645 |
+
if image_key in self.results_cache:
|
646 |
+
return self.results_cache[image_key]
|
647 |
+
|
648 |
+
# 調整閾值
|
649 |
+
adjusted_threshold = threshold * self.confidence_threshold_multipliers.get("full_image", 1.0)
|
650 |
+
|
651 |
+
# 預處理圖像
|
652 |
+
image_input = self.preprocess(image).unsqueeze(0).to(self.device)
|
653 |
+
|
654 |
+
# 獲取圖像特徵
|
655 |
+
with torch.no_grad():
|
656 |
+
image_features = self.model.encode_image(image_input)
|
657 |
+
image_features = image_features / image_features.norm(dim=-1, keepdim=True)
|
658 |
+
|
659 |
+
# 計算與地標提示的相似度
|
660 |
+
similarity = (100.0 * image_features @ self.landmark_text_features.T).softmax(dim=-1)
|
661 |
+
similarity = similarity.cpu().numpy()[0] if self.device == "cuda" else similarity.numpy()[0]
|
662 |
+
|
663 |
+
# 找到最佳匹配
|
664 |
+
best_idx = similarity.argmax().item()
|
665 |
+
best_score = similarity[best_idx]
|
666 |
+
|
667 |
+
# top3 landmark
|
668 |
+
top_indices = similarity.argsort()[-3:][::-1]
|
669 |
+
top_landmarks = []
|
670 |
+
|
671 |
+
for idx in top_indices:
|
672 |
+
score = similarity[idx]
|
673 |
+
landmark_id = list(self.landmark_data.keys())[idx]
|
674 |
+
landmark_info = self.landmark_data[landmark_id]
|
675 |
+
|
676 |
+
landmark_result = {
|
677 |
+
"landmark_id": landmark_id,
|
678 |
+
"landmark_name": landmark_info["name"],
|
679 |
+
"location": landmark_info["location"],
|
680 |
+
"confidence": float(score)
|
681 |
+
}
|
682 |
+
|
683 |
+
# 添加額外可用信息
|
684 |
+
if "year_built" in landmark_info:
|
685 |
+
landmark_result["year_built"] = landmark_info["year_built"]
|
686 |
+
if "architectural_style" in landmark_info:
|
687 |
+
landmark_result["architectural_style"] = landmark_info["architectural_style"]
|
688 |
+
if "significance" in landmark_info:
|
689 |
+
landmark_result["significance"] = landmark_info["significance"]
|
690 |
+
|
691 |
+
top_landmarks.append(landmark_result)
|
692 |
+
|
693 |
+
# main result
|
694 |
+
result = {}
|
695 |
+
if best_score >= adjusted_threshold:
|
696 |
+
landmark_id = list(self.landmark_data.keys())[best_idx]
|
697 |
+
landmark_info = self.landmark_data[landmark_id]
|
698 |
+
|
699 |
+
# 應用地標類型特定閾值
|
700 |
+
landmark_type = self._determine_landmark_type(landmark_id)
|
701 |
+
type_multiplier = self.landmark_type_thresholds.get(landmark_type, 1.0) / 0.5
|
702 |
+
final_threshold = adjusted_threshold * type_multiplier
|
703 |
+
|
704 |
+
if best_score >= final_threshold:
|
705 |
+
result = {
|
706 |
+
"landmark_id": landmark_id,
|
707 |
+
"landmark_name": landmark_info["name"],
|
708 |
+
"location": landmark_info["location"],
|
709 |
+
"confidence": float(best_score),
|
710 |
+
"is_landmark": True,
|
711 |
+
"landmark_type": landmark_type,
|
712 |
+
"top_landmarks": top_landmarks
|
713 |
+
}
|
714 |
+
|
715 |
+
# 添加額外可用信息
|
716 |
+
if "year_built" in landmark_info:
|
717 |
+
result["year_built"] = landmark_info["year_built"]
|
718 |
+
if "architectural_style" in landmark_info:
|
719 |
+
result["architectural_style"] = landmark_info["architectural_style"]
|
720 |
+
if "significance" in landmark_info:
|
721 |
+
result["significance"] = landmark_info["significance"]
|
722 |
+
else:
|
723 |
+
result = {
|
724 |
+
"landmark_id": None,
|
725 |
+
"landmark_name": None,
|
726 |
+
"confidence": float(best_score),
|
727 |
+
"is_landmark": False,
|
728 |
+
"top_landmarks": top_landmarks
|
729 |
+
}
|
730 |
+
|
731 |
+
# 如果請求詳細分析且是地標,進一步分析圖像區域
|
732 |
+
if detailed_analysis and result.get("is_landmark", False):
|
733 |
+
# 創建不同區域進行更深入分析
|
734 |
+
width, height = image.size
|
735 |
+
regions = [
|
736 |
+
# 中心區域
|
737 |
+
[width * 0.25, height * 0.25, width * 0.75, height * 0.75],
|
738 |
+
# 左半部
|
739 |
+
[0, 0, width * 0.5, height],
|
740 |
+
# 右半部
|
741 |
+
[width * 0.5, 0, width, height],
|
742 |
+
# 上半部
|
743 |
+
[0, 0, width, height * 0.5],
|
744 |
+
# 下半部
|
745 |
+
[0, height * 0.5, width, height]
|
746 |
+
]
|
747 |
+
|
748 |
+
region_results = []
|
749 |
+
for i, box in enumerate(regions):
|
750 |
+
region_result = self.classify_image_region(
|
751 |
+
image,
|
752 |
+
box,
|
753 |
+
threshold=threshold * 0.9,
|
754 |
+
detection_type="partial"
|
755 |
+
)
|
756 |
+
if region_result["is_landmark"]:
|
757 |
+
region_result["region_name"] = ["center", "left", "right", "top", "bottom"][i]
|
758 |
+
region_results.append(region_result)
|
759 |
+
|
760 |
+
# 添加區域分析結果
|
761 |
+
if region_results:
|
762 |
+
result["region_analyses"] = region_results
|
763 |
+
|
764 |
+
# 快取結果
|
765 |
+
self.results_cache[image_key] = result
|
766 |
+
self._manage_cache()
|
767 |
+
|
768 |
+
return result
|
769 |
+
|
770 |
+
def enhanced_landmark_detection(self,
|
771 |
+
image: Union[Image.Image, np.ndarray],
|
772 |
+
threshold: float = 0.3) -> Dict[str, Any]:
|
773 |
+
"""
|
774 |
+
Enhanced landmark detection using multiple analysis techniques.
|
775 |
+
|
776 |
+
Args:
|
777 |
+
image: Input image
|
778 |
+
threshold: Base confidence threshold
|
779 |
+
|
780 |
+
Returns:
|
781 |
+
Dict: Comprehensive landmark detection results
|
782 |
+
"""
|
783 |
+
# Ensure image is PIL format
|
784 |
+
if not isinstance(image, Image.Image):
|
785 |
+
if isinstance(image, np.ndarray):
|
786 |
+
image = Image.fromarray(image)
|
787 |
+
else:
|
788 |
+
raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
|
789 |
+
|
790 |
+
# Phase 1: Analyze viewpoint to adjust detection parameters
|
791 |
+
viewpoint_info = self._analyze_viewpoint(image)
|
792 |
+
viewpoint = viewpoint_info["dominant_viewpoint"]
|
793 |
+
|
794 |
+
# Adjust threshold based on viewpoint
|
795 |
+
if viewpoint == "distant":
|
796 |
+
adjusted_threshold = threshold * 0.7 # Lower threshold for distant views
|
797 |
+
elif viewpoint == "close_up":
|
798 |
+
adjusted_threshold = threshold * 1.1 # Higher threshold for close-ups
|
799 |
+
else:
|
800 |
+
adjusted_threshold = threshold
|
801 |
+
|
802 |
+
# Phase 2: Perform multi-scale pyramid analysis
|
803 |
+
pyramid_results = self._perform_pyramid_analysis(image, levels=3, base_threshold=adjusted_threshold)
|
804 |
+
|
805 |
+
# Phase 3: Perform grid-based region analysis
|
806 |
+
grid_results = []
|
807 |
+
width, height = image.size
|
808 |
+
|
809 |
+
# Create adaptive grid based on viewpoint
|
810 |
+
if viewpoint == "distant":
|
811 |
+
grid_size = 3 # Coarser grid for distant views
|
812 |
+
elif viewpoint == "close_up":
|
813 |
+
grid_size = 5 # Finer grid for close-ups
|
814 |
+
else:
|
815 |
+
grid_size = 4 # Default grid size
|
816 |
+
|
817 |
+
# Generate grid regions
|
818 |
+
for i in range(grid_size):
|
819 |
+
for j in range(grid_size):
|
820 |
+
box = [
|
821 |
+
width * (j/grid_size),
|
822 |
+
height * (i/grid_size),
|
823 |
+
width * ((j+1)/grid_size),
|
824 |
+
height * ((i+1)/grid_size)
|
825 |
+
]
|
826 |
+
|
827 |
+
# Apply feature enhancement
|
828 |
+
region_result = self.classify_image_region(
|
829 |
+
image,
|
830 |
+
box,
|
831 |
+
threshold=adjusted_threshold,
|
832 |
+
detection_type="auto"
|
833 |
+
)
|
834 |
+
|
835 |
+
if region_result["is_landmark"]:
|
836 |
+
region_result["grid_position"] = (i, j)
|
837 |
+
grid_results.append(region_result)
|
838 |
+
|
839 |
+
# Phase 4: Cross-validate and combine results
|
840 |
+
all_detections = []
|
841 |
+
|
842 |
+
# Add pyramid results
|
843 |
+
if pyramid_results["is_landmark"] and pyramid_results["best_result"]:
|
844 |
+
all_detections.append({
|
845 |
+
"source": "pyramid",
|
846 |
+
"landmark_id": pyramid_results["best_result"]["landmark_id"],
|
847 |
+
"landmark_name": pyramid_results["best_result"]["landmark_name"],
|
848 |
+
"confidence": pyramid_results["best_result"]["confidence"],
|
849 |
+
"scale_factor": pyramid_results["best_result"].get("scale_factor", 1.0)
|
850 |
+
})
|
851 |
+
|
852 |
+
# Add grid results
|
853 |
+
for result in grid_results:
|
854 |
+
all_detections.append({
|
855 |
+
"source": "grid",
|
856 |
+
"landmark_id": result["landmark_id"],
|
857 |
+
"landmark_name": result["landmark_name"],
|
858 |
+
"confidence": result["confidence"],
|
859 |
+
"grid_position": result.get("grid_position", (0, 0))
|
860 |
+
})
|
861 |
+
|
862 |
+
# Search entire image
|
863 |
+
full_image_result = self.search_entire_image(image, threshold=adjusted_threshold)
|
864 |
+
if full_image_result and full_image_result.get("is_landmark", False):
|
865 |
+
all_detections.append({
|
866 |
+
"source": "full_image",
|
867 |
+
"landmark_id": full_image_result["landmark_id"],
|
868 |
+
"landmark_name": full_image_result["landmark_name"],
|
869 |
+
"confidence": full_image_result["confidence"]
|
870 |
+
})
|
871 |
+
|
872 |
+
# Group by landmark_id and calculate aggregate confidence
|
873 |
+
landmark_groups = {}
|
874 |
+
for detection in all_detections:
|
875 |
+
landmark_id = detection["landmark_id"]
|
876 |
+
if landmark_id not in landmark_groups:
|
877 |
+
landmark_groups[landmark_id] = {
|
878 |
+
"landmark_id": landmark_id,
|
879 |
+
"landmark_name": detection["landmark_name"],
|
880 |
+
"detections": [],
|
881 |
+
"sources": set()
|
882 |
+
}
|
883 |
+
|
884 |
+
landmark_groups[landmark_id]["detections"].append(detection)
|
885 |
+
landmark_groups[landmark_id]["sources"].add(detection["source"])
|
886 |
+
|
887 |
+
# Calculate aggregate confidence for each landmark
|
888 |
+
for landmark_id, group in landmark_groups.items():
|
889 |
+
detections = group["detections"]
|
890 |
+
|
891 |
+
# Base confidence is the maximum confidence from any source
|
892 |
+
max_confidence = max(d["confidence"] for d in detections)
|
893 |
+
|
894 |
+
# Bonus for detection from multiple sources
|
895 |
+
source_count = len(group["sources"])
|
896 |
+
source_bonus = min(0.15, (source_count - 1) * 0.05) # Up to 15% bonus
|
897 |
+
|
898 |
+
# Consistency bonus for multiple detections of the same landmark
|
899 |
+
detection_count = len(detections)
|
900 |
+
consistency_bonus = min(0.1, (detection_count - 1) * 0.02) # Up to 10% bonus
|
901 |
+
|
902 |
+
# Calculate final confidence
|
903 |
+
aggregate_confidence = min(1.0, max_confidence + source_bonus + consistency_bonus)
|
904 |
+
|
905 |
+
group["confidence"] = aggregate_confidence
|
906 |
+
group["detection_count"] = detection_count
|
907 |
+
group["source_count"] = source_count
|
908 |
+
|
909 |
+
# Sort landmarks by confidence
|
910 |
+
sorted_landmarks = sorted(
|
911 |
+
landmark_groups.values(),
|
912 |
+
key=lambda x: x["confidence"],
|
913 |
+
reverse=True
|
914 |
+
)
|
915 |
+
|
916 |
+
return {
|
917 |
+
"is_landmark_scene": len(sorted_landmarks) > 0,
|
918 |
+
"detected_landmarks": sorted_landmarks,
|
919 |
+
"viewpoint_info": viewpoint_info,
|
920 |
+
"primary_landmark": sorted_landmarks[0] if sorted_landmarks else None
|
921 |
+
}
|
922 |
+
|
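To make the aggregation rule above concrete: suppose the same landmark is found by the pyramid, grid, and full-image passes (three sources, four detections in total) with a best single score of 0.62. A quick check of the arithmetic, using the same formulas as the method (toy numbers, not from the uploaded file):

    # Aggregate confidence = best score + multi-source bonus + consistency bonus, capped at 1.0.
    max_confidence = 0.62
    source_count = 3
    detection_count = 4

    source_bonus = min(0.15, (source_count - 1) * 0.05)         # 0.10
    consistency_bonus = min(0.1, (detection_count - 1) * 0.02)   # 0.06
    aggregate = min(1.0, max_confidence + source_bonus + consistency_bonus)
    print(round(aggregate, 2))  # 0.78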
923 |
+
def _analyze_architectural_features(self, image):
|
924 |
+
"""
|
925 |
+
Analyzes the architectural features of a structure in the image without hardcoding specific landmarks.
|
926 |
+
|
927 |
+
Args:
|
928 |
+
image: Input image
|
929 |
+
|
930 |
+
Returns:
|
931 |
+
Dict: Architectural feature analysis results
|
932 |
+
"""
|
933 |
+
# Define universal architectural feature prompts that apply to all types of landmarks
|
934 |
+
architecture_prompts = {
|
935 |
+
"tall_structure": "a tall vertical structure standing alone",
|
936 |
+
"tiered_building": "a building with multiple stacked tiers or segments",
|
937 |
+
"historical_structure": "a building with historical architectural elements",
|
938 |
+
"modern_design": "a modern structure with contemporary architectural design",
|
939 |
+
"segmented_exterior": "a structure with visible segmented or sectioned exterior",
|
940 |
+
"viewing_platform": "a tall structure with observation area at the top",
|
941 |
+
"time_display": "a structure with timepiece features",
|
942 |
+
"glass_facade": "a building with prominent glass exterior surfaces",
|
943 |
+
"memorial_structure": "a monument or memorial structure",
|
944 |
+
"ancient_construction": "ancient constructed elements or archaeological features",
|
945 |
+
"natural_landmark": "a natural geographic formation or landmark",
|
946 |
+
"slanted_design": "a structure with non-vertical or leaning profile"
|
947 |
+
}
|
948 |
+
|
949 |
+
# Calculate similarity scores against universal architectural patterns
|
950 |
+
context_scores = self.calculate_similarity_scores(image, architecture_prompts)
|
951 |
+
|
952 |
+
# Determine most relevant architectural features
|
953 |
+
top_features = sorted(context_scores.items(), key=lambda x: x[1], reverse=True)[:3]
|
954 |
+
|
955 |
+
# Calculate feature confidence
|
956 |
+
context_confidence = sum(score for _, score in top_features) / 3
|
957 |
+
|
958 |
+
# Determine primary architectural category based on top features
|
959 |
+
architectural_categories = {
|
960 |
+
"tower": ["tall_structure", "viewing_platform", "time_display"],
|
961 |
+
"skyscraper": ["tall_structure", "modern_design", "glass_facade"],
|
962 |
+
"historical": ["historical_structure", "ancient_construction", "memorial_structure"],
|
963 |
+
"natural": ["natural_landmark"],
|
964 |
+
"distinctive": ["tiered_building", "segmented_exterior", "slanted_design"]
|
965 |
+
}
|
966 |
+
|
967 |
+
# Score each category based on the top features
|
968 |
+
category_scores = {}
|
969 |
+
for category, features in architectural_categories.items():
|
970 |
+
category_score = 0
|
971 |
+
for feature, score in context_scores.items():
|
972 |
+
if feature in features:
|
973 |
+
category_score += score
|
974 |
+
category_scores[category] = category_score
|
975 |
+
|
976 |
+
primary_category = max(category_scores.items(), key=lambda x: x[1])[0]
|
977 |
+
|
978 |
+
return {
|
979 |
+
"architectural_features": top_features,
|
980 |
+
"context_confidence": context_confidence,
|
981 |
+
"primary_category": primary_category,
|
982 |
+
"category_scores": category_scores
|
983 |
+
}
|
984 |
+
|
985 |
+
def intelligent_landmark_search(self,
|
986 |
+
image: Union[Image.Image, np.ndarray],
|
987 |
+
yolo_boxes: Optional[List[List[float]]] = None,
|
988 |
+
base_threshold: float = 0.25) -> Dict[str, Any]:
|
989 |
+
"""
|
990 |
+
對圖像進行智能地標搜索,綜合整張圖像分析和區域分析
|
991 |
+
|
992 |
+
Args:
|
993 |
+
image: 原始圖像
|
994 |
+
yolo_boxes: YOLO檢測到的邊界框 (可選)
|
995 |
+
base_threshold: 基礎置信度閾值
|
996 |
+
|
997 |
+
Returns:
|
998 |
+
Dict: 包含所有檢測結果的綜合分析
|
999 |
+
"""
|
1000 |
+
# 確保圖像是PIL格式
|
1001 |
+
if not isinstance(image, Image.Image):
|
1002 |
+
if isinstance(image, np.ndarray):
|
1003 |
+
image = Image.fromarray(image)
|
1004 |
+
else:
|
1005 |
+
raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
|
1006 |
+
|
1007 |
+
# No YOLO 框時,可以稍微降低閾值以提高召回率
|
1008 |
+
actual_threshold = base_threshold * 0.85 if yolo_boxes is None or len(yolo_boxes) == 0 else base_threshold
|
1009 |
+
|
1010 |
+
# 首先對整張圖像進行分析
|
1011 |
+
try:
|
1012 |
+
full_image_result = self.search_entire_image(
|
1013 |
+
image,
|
1014 |
+
threshold=actual_threshold,
|
1015 |
+
detailed_analysis=True # 確保詳細分析開啟
|
1016 |
+
)
|
1017 |
+
|
1018 |
+
# No YOLO 框,則進行多尺度分析以提高檢測機會
|
1019 |
+
if (yolo_boxes is None or len(yolo_boxes) == 0) and (not full_image_result or not full_image_result.get("is_landmark", False)):
|
1020 |
+
print("No YOLO boxes provided, attempting multi-scale pyramid analysis")
|
1021 |
+
try:
|
1022 |
+
if hasattr(self, '_perform_pyramid_analysis'):
|
1023 |
+
pyramid_results = self._perform_pyramid_analysis(
|
1024 |
+
image,
|
1025 |
+
levels=4,
|
1026 |
+
base_threshold=actual_threshold,
|
1027 |
+
aspect_ratios=[1.0, 0.75, 1.5, 0.5, 2.0]
|
1028 |
+
)
|
1029 |
+
|
1030 |
+
if pyramid_results and pyramid_results.get("is_landmark", False) and pyramid_results.get("best_result", {}).get("confidence", 0) > actual_threshold:
|
1031 |
+
# 使用金字塔分析結果增強或替代全圖結果
|
1032 |
+
if not full_image_result or not full_image_result.get("is_landmark", False):
|
1033 |
+
full_image_result = {
|
1034 |
+
"is_landmark": True,
|
1035 |
+
"landmark_id": pyramid_results["best_result"]["landmark_id"],
|
1036 |
+
"landmark_name": pyramid_results["best_result"]["landmark_name"],
|
1037 |
+
"confidence": pyramid_results["best_result"]["confidence"],
|
1038 |
+
"location": pyramid_results["best_result"].get("location", "Unknown Location")
|
1039 |
+
}
|
1040 |
+
print(f"Pyramid analysis detected landmark: {pyramid_results['best_result']['landmark_name']} with confidence {pyramid_results['best_result']['confidence']:.3f}")
|
1041 |
+
else:
|
1042 |
+
print("Pyramid analysis not available, skipping multi-scale detection")
|
1043 |
+
except Exception as e:
|
1044 |
+
print(f"Error in pyramid analysis: {e}")
|
1045 |
+
except Exception as e:
|
1046 |
+
print(f"Error in search_entire_image: {e}")
|
1047 |
+
import traceback
|
1048 |
+
traceback.print_exc()
|
1049 |
+
full_image_result = None
|
1050 |
+
|
1051 |
+
# 初始化結果字典
|
1052 |
+
result = {
|
1053 |
+
"full_image_analysis": full_image_result if full_image_result else {},
|
1054 |
+
"is_landmark_scene": False, # 默認值
|
1055 |
+
"detected_landmarks": []
|
1056 |
+
}
|
1057 |
+
|
1058 |
+
# 上下文感知比較,處理接近的排名結果
|
1059 |
+
if full_image_result and "top_landmarks" in full_image_result and len(full_image_result["top_landmarks"]) >= 2:
|
1060 |
+
top_landmarks = full_image_result["top_landmarks"]
|
1061 |
+
|
1062 |
+
# 檢查前兩個結果是否非常接近(信心度差異小於 0.1)
|
1063 |
+
if len(top_landmarks) >= 2 and abs(top_landmarks[0]["confidence"] - top_landmarks[1]["confidence"]) < 0.1:
|
1064 |
+
# 對於接近的結果,使用通用建築特徵分析進行區分
|
1065 |
+
try:
|
1066 |
+
# 分析建築特徵
|
1067 |
+
if hasattr(self, '_analyze_architectural_features'):
|
1068 |
+
architectural_analysis = self._analyze_architectural_features(image)
|
1069 |
+
top_features = architectural_analysis.get("architectural_features", [])
|
1070 |
+
primary_category = architectural_analysis.get("primary_category", "")
|
1071 |
+
|
1072 |
+
# 根據建築特徵調整地標置信度
|
1073 |
+
for i, landmark in enumerate(top_landmarks[:2]):
|
1074 |
+
if i >= len(top_landmarks):
|
1075 |
+
continue
|
1076 |
+
|
1077 |
+
landmark_id = landmark.get("landmark_id", "").lower()
|
1078 |
+
confidence_boost = 0
|
1079 |
+
|
1080 |
+
# 使用主要建築類別來調整置信度,使用通用條件而非特定地標名稱
|
1081 |
+
if primary_category == "tower" and any(term in landmark_id for term in ["tower", "spire", "needle"]):
|
1082 |
+
confidence_boost += 0.05
|
1083 |
+
elif primary_category == "skyscraper" and any(term in landmark_id for term in ["building", "skyscraper", "tall"]):
|
1084 |
+
confidence_boost += 0.05
|
1085 |
+
elif primary_category == "historical" and any(term in landmark_id for term in ["monument", "castle", "palace", "temple"]):
|
1086 |
+
confidence_boost += 0.05
|
1087 |
+
elif primary_category == "distinctive" and any(term in landmark_id for term in ["unusual", "unique", "special", "famous"]):
|
1088 |
+
confidence_boost += 0.05
|
1089 |
+
|
1090 |
+
# 根據特定特徵進一步微調,使用通用特徵描述而非特定地標
|
1091 |
+
for feature, score in top_features:
|
1092 |
+
if feature == "time_display" and "clock" in landmark_id:
|
1093 |
+
confidence_boost += 0.03
|
1094 |
+
elif feature == "segmented_exterior" and "segmented" in landmark_id:
|
1095 |
+
confidence_boost += 0.03
|
1096 |
+
elif feature == "slanted_design" and "leaning" in landmark_id:
|
1097 |
+
confidence_boost += 0.03
|
1098 |
+
|
1099 |
+
# 應用信心度調整
|
1100 |
+
if confidence_boost > 0 and i < len(top_landmarks):
|
1101 |
+
top_landmarks[i]["confidence"] += confidence_boost
|
1102 |
+
print(f"Boosted {landmark['landmark_name']} confidence by {confidence_boost:.2f} based on architectural features ({primary_category})")
|
1103 |
+
|
1104 |
+
# 重新排序
|
1105 |
+
top_landmarks.sort(key=lambda x: x["confidence"], reverse=True)
|
1106 |
+
full_image_result["top_landmarks"] = top_landmarks
|
1107 |
+
if top_landmarks:
|
1108 |
+
full_image_result["landmark_id"] = top_landmarks[0]["landmark_id"]
|
1109 |
+
full_image_result["landmark_name"] = top_landmarks[0]["landmark_name"]
|
1110 |
+
full_image_result["confidence"] = top_landmarks[0]["confidence"]
|
1111 |
+
full_image_result["location"] = top_landmarks[0].get("location", "Unknown Location")
|
1112 |
+
except Exception as e:
|
1113 |
+
print(f"Error in architectural feature analysis: {e}")
|
1114 |
+
import traceback
|
1115 |
+
traceback.print_exc()
|
1116 |
+
|
1117 |
+
if full_image_result and full_image_result.get("is_landmark", False):
|
1118 |
+
result["is_landmark_scene"] = True
|
1119 |
+
landmark_id = full_image_result.get("landmark_id", "unknown")
|
1120 |
+
|
1121 |
+
# extract landmark info
|
1122 |
+
landmark_specific_info = self._extract_landmark_specific_info(landmark_id)
|
1123 |
+
|
1124 |
+
landmark_info = {
|
1125 |
+
"landmark_id": landmark_id,
|
1126 |
+
"landmark_name": full_image_result.get("landmark_name", "Unknown Landmark"),
|
1127 |
+
"confidence": full_image_result.get("confidence", 0.0),
|
1128 |
+
"location": full_image_result.get("location", "Unknown Location"),
|
1129 |
+
"region_type": "full_image",
|
1130 |
+
"box": [0, 0, getattr(image, 'width', 0), getattr(image, 'height', 0)]
|
1131 |
+
}
|
1132 |
+
|
1133 |
+
# 整合地標特定info,確保正確的名稱被使用
|
1134 |
+
landmark_info.update(landmark_specific_info)
|
1135 |
+
|
1136 |
+
# 如果特定信息中有更準確的地標名稱,使用它
|
1137 |
+
if landmark_specific_info.get("landmark_name"):
|
1138 |
+
landmark_info["landmark_name"] = landmark_specific_info["landmark_name"]
|
1139 |
+
|
1140 |
+
result["detected_landmarks"].append(landmark_info)
|
1141 |
+
|
1142 |
+
# 確保地標特定活動被正確設置為主要結果
|
1143 |
+
if landmark_specific_info.get("has_specific_activities", False):
|
1144 |
+
result["primary_landmark_activities"] = landmark_specific_info.get("landmark_specific_activities", [])
|
1145 |
+
print(f"Set primary landmark activities: {len(result['primary_landmark_activities'])} activities for {landmark_info['landmark_name']}")
|
1146 |
+
|
1147 |
+
# 如果提供了YOLO邊界框,分析這些區域
|
1148 |
+
if yolo_boxes and len(yolo_boxes) > 0:
|
1149 |
+
for box in yolo_boxes:
|
1150 |
+
try:
|
1151 |
+
if hasattr(self, 'classify_image_region'):
|
1152 |
+
box_result = self.classify_image_region(
|
1153 |
+
image,
|
1154 |
+
box,
|
1155 |
+
threshold=base_threshold,
|
1156 |
+
detection_type="auto"
|
1157 |
+
)
|
1158 |
+
|
1159 |
+
# 如果檢測到地標
|
1160 |
+
if box_result and box_result.get("is_landmark", False):
|
1161 |
+
# 檢查是否與已檢測的地標重複
|
1162 |
+
is_duplicate = False
|
1163 |
+
for existing in result["detected_landmarks"]:
|
1164 |
+
if existing.get("landmark_id") == box_result.get("landmark_id"):
|
1165 |
+
# 如果新的置信度更高,則更新
|
1166 |
+
if box_result.get("confidence", 0) > existing.get("confidence", 0):
|
1167 |
+
existing.update({
|
1168 |
+
"confidence": box_result.get("confidence", 0),
|
1169 |
+
"region_type": "yolo_box",
|
1170 |
+
"box": box
|
1171 |
+
})
|
1172 |
+
is_duplicate = True
|
1173 |
+
break
|
1174 |
+
|
1175 |
+
# 如果不是重複的,添加到列表
|
1176 |
+
if not is_duplicate:
|
1177 |
+
result["detected_landmarks"].append({
|
1178 |
+
"landmark_id": box_result.get("landmark_id", "unknown"),
|
1179 |
+
"landmark_name": box_result.get("landmark_name", "Unknown Landmark"),
|
1180 |
+
"confidence": box_result.get("confidence", 0.0),
|
1181 |
+
"location": box_result.get("location", "Unknown Location"),
|
1182 |
+
"region_type": "yolo_box",
|
1183 |
+
"box": box
|
1184 |
+
})
|
1185 |
+
except Exception as e:
|
1186 |
+
print(f"Error in analyzing YOLO box: {e}")
|
1187 |
+
continue
|
1188 |
+
|
1189 |
+
# 最後,執行額外的網格搜索以捕獲可能被遺漏的地標
|
1190 |
+
# 但只有在尚未發現地標或僅發現低置信度地標時
|
1191 |
+
should_do_grid_search = (
|
1192 |
+
len(result["detected_landmarks"]) == 0 or
|
1193 |
+
max([landmark.get("confidence", 0) for landmark in result["detected_landmarks"]], default=0) < 0.5
|
1194 |
+
)
|
1195 |
+
|
1196 |
+
if should_do_grid_search and hasattr(self, 'classify_image_region'):
|
1197 |
+
try:
|
1198 |
+
# 創建5x5網格
|
1199 |
+
width, height = getattr(image, 'size', (getattr(image, 'width', 0), getattr(image, 'height', 0)))
|
1200 |
+
if not isinstance(width, (int, float)) or width <= 0:
|
1201 |
+
width = getattr(image, 'width', 0)
|
1202 |
+
if not isinstance(height, (int, float)) or height <= 0:
|
1203 |
+
height = getattr(image, 'height', 0)
|
1204 |
+
|
1205 |
+
if width > 0 and height > 0:
|
1206 |
+
grid_boxes = []
|
1207 |
+
for i in range(5):
|
1208 |
+
for j in range(5):
|
1209 |
+
grid_boxes.append([
|
1210 |
+
width * (j/5), height * (i/5),
|
1211 |
+
width * ((j+1)/5), height * ((i+1)/5)
|
1212 |
+
])
|
1213 |
+
|
1214 |
+
# 分析每個網格區域
|
1215 |
+
for box in grid_boxes:
|
1216 |
+
try:
|
1217 |
+
grid_result = self.classify_image_region(
|
1218 |
+
image,
|
1219 |
+
box,
|
1220 |
+
threshold=base_threshold * 0.9, # 稍微降低網格搜索閾值
|
1221 |
+
detection_type="partial"
|
1222 |
+
)
|
1223 |
+
|
1224 |
+
# 如果檢測到地標
|
1225 |
+
if grid_result and grid_result.get("is_landmark", False):
|
1226 |
+
# 檢查是否與已檢測的地標重複
|
1227 |
+
is_duplicate = False
|
1228 |
+
for existing in result["detected_landmarks"]:
|
1229 |
+
if existing.get("landmark_id") == grid_result.get("landmark_id"):
|
1230 |
+
is_duplicate = True
|
1231 |
+
break
|
1232 |
+
|
1233 |
+
# 如果不是重複的,添加到列表
|
1234 |
+
if not is_duplicate:
|
1235 |
+
result["detected_landmarks"].append({
|
1236 |
+
"landmark_id": grid_result.get("landmark_id", "unknown"),
|
1237 |
+
"landmark_name": grid_result.get("landmark_name", "Unknown Landmark"),
|
1238 |
+
"confidence": grid_result.get("confidence", 0.0),
|
1239 |
+
"location": grid_result.get("location", "Unknown Location"),
|
1240 |
+
"region_type": "grid",
|
1241 |
+
"box": box
|
1242 |
+
})
|
1243 |
+
except Exception as e:
|
1244 |
+
print(f"Error in analyzing grid region: {e}")
|
1245 |
+
continue
|
1246 |
+
except Exception as e:
|
1247 |
+
print(f"Error in grid search: {e}")
|
1248 |
+
import traceback
|
1249 |
+
traceback.print_exc()
|
1250 |
+
|
1251 |
+
# 按置信度排序檢測結果
|
1252 |
+
result["detected_landmarks"].sort(key=lambda x: x.get("confidence", 0), reverse=True)
|
1253 |
+
|
1254 |
+
# 更新整體場景類型判斷
|
1255 |
+
if len(result["detected_landmarks"]) > 0:
|
1256 |
+
result["is_landmark_scene"] = True
|
1257 |
+
result["primary_landmark"] = result["detected_landmarks"][0]
|
1258 |
+
|
1259 |
+
# 添加 clip_analysis_on_full_image 結果,以便給 LLM 提供更多上下文
|
1260 |
+
if full_image_result and "clip_analysis" in full_image_result:
|
1261 |
+
result["clip_analysis_on_full_image"] = full_image_result["clip_analysis"]
|
1262 |
+
|
1263 |
+
return result
|
1264 |
+
|
1265 |
+
def _extract_landmark_specific_info(self, landmark_id: str) -> Dict[str, Any]:
|
1266 |
+
"""
|
1267 |
+
提取特定地標的詳細信息,包括特色模板和活動建議
|
1268 |
+
|
1269 |
+
Args:
|
1270 |
+
landmark_id: 地標ID
|
1271 |
+
|
1272 |
+
Returns:
|
1273 |
+
Dict: 地標特定信息
|
1274 |
+
"""
|
1275 |
+
if not landmark_id or landmark_id == "unknown":
|
1276 |
+
return {"has_specific_activities": False}
|
1277 |
+
|
1278 |
+
specific_info = {"has_specific_activities": False}
|
1279 |
+
|
1280 |
+
# 從 ALL_LANDMARKS 或 self.landmark_data 中提取基本信息
|
1281 |
+
landmark_data_source = None
|
1282 |
+
|
1283 |
+
# 優先嘗試從類屬性獲取
|
1284 |
+
if hasattr(self, 'landmark_data') and self.landmark_data and landmark_id in self.landmark_data:
|
1285 |
+
landmark_data_source = self.landmark_data[landmark_id]
|
1286 |
+
print(f"Using landmark data from class attribute for {landmark_id}")
|
1287 |
+
else:
|
1288 |
+
try:
|
1289 |
+
if landmark_id in ALL_LANDMARKS:
|
1290 |
+
landmark_data_source = ALL_LANDMARKS[landmark_id]
|
1291 |
+
print(f"Using landmark data from ALL_LANDMARKS for {landmark_id}")
|
1292 |
+
except ImportError:
|
1293 |
+
print("Warning: Could not import ALL_LANDMARKS from landmark_data")
|
1294 |
+
except Exception as e:
|
1295 |
+
print(f"Error accessing ALL_LANDMARKS: {e}")
|
1296 |
+
|
1297 |
+
# 處理地標基本數據
|
1298 |
+
if landmark_data_source:
|
1299 |
+
# 提取正確的地標名稱
|
1300 |
+
if "name" in landmark_data_source:
|
1301 |
+
specific_info["landmark_name"] = landmark_data_source["name"]
|
1302 |
+
|
1303 |
+
# 提取所有可用的 prompts 作為特色模板
|
1304 |
+
if "prompts" in landmark_data_source:
|
1305 |
+
specific_info["feature_templates"] = landmark_data_source["prompts"][:5]
|
1306 |
+
specific_info["primary_template"] = landmark_data_source["prompts"][0]
|
1307 |
+
|
1308 |
+
# 提取別名info
|
1309 |
+
if "aliases" in landmark_data_source:
|
1310 |
+
specific_info["aliases"] = landmark_data_source["aliases"]
|
1311 |
+
|
1312 |
+
# 提取位置信息
|
1313 |
+
if "location" in landmark_data_source:
|
1314 |
+
specific_info["location"] = landmark_data_source["location"]
|
1315 |
+
|
1316 |
+
# 提取其他相關信息
|
1317 |
+
for key in ["year_built", "architectural_style", "significance", "description"]:
|
1318 |
+
if key in landmark_data_source:
|
1319 |
+
specific_info[key] = landmark_data_source[key]
|
1320 |
+
|
1321 |
+
# 嘗試從 LANDMARK_ACTIVITIES 中提取活動建議
|
1322 |
+
try:
|
+
from landmark_activities import LANDMARK_ACTIVITIES
|
1323 |
+
if landmark_id in LANDMARK_ACTIVITIES:
|
1324 |
+
activities = LANDMARK_ACTIVITIES[landmark_id]
|
1325 |
+
specific_info["landmark_specific_activities"] = activities
|
1326 |
+
specific_info["has_specific_activities"] = True
|
1327 |
+
print(f"Found {len(activities)} specific activities for landmark {landmark_id}")
|
1328 |
+
else:
|
1329 |
+
print(f"No specific activities found for landmark {landmark_id} in LANDMARK_ACTIVITIES")
|
1330 |
+
specific_info["has_specific_activities"] = False
|
1331 |
+
except ImportError:
|
1332 |
+
print("Warning: Could not import LANDMARK_ACTIVITIES from landmark_activities")
|
1333 |
+
specific_info["has_specific_activities"] = False
|
1334 |
+
except Exception as e:
|
1335 |
+
print(f"Error loading landmark activities for {landmark_id}: {e}")
|
1336 |
+
specific_info["has_specific_activities"] = False
|
1337 |
+
|
1338 |
+
return specific_info
|
1339 |
+
|
1340 |
+
def _analyze_viewpoint(self, image: Union[Image.Image, np.ndarray]) -> Dict[str, float]:
|
1341 |
+
"""
|
1342 |
+
Analyzes the image viewpoint to adjust detection parameters.
|
1343 |
+
|
1344 |
+
Args:
|
1345 |
+
image: Input image
|
1346 |
+
|
1347 |
+
Returns:
|
1348 |
+
Dict: Viewpoint analysis results
|
1349 |
+
"""
|
1350 |
+
viewpoint_prompts = {
|
1351 |
+
"aerial_view": "an aerial view from above looking down",
|
1352 |
+
"street_level": "a street level view looking up at a tall structure",
|
1353 |
+
"eye_level": "an eye-level horizontal view of a landmark",
|
1354 |
+
"distant": "a distant view of a landmark on the horizon",
|
1355 |
+
"close_up": "a close-up detailed view of architectural features",
|
1356 |
+
"interior": "an interior view inside a structure"
|
1357 |
+
}
|
1358 |
+
|
1359 |
+
# Calculate similarity scores
|
1360 |
+
viewpoint_scores = self.calculate_similarity_scores(image, viewpoint_prompts)
|
1361 |
+
|
1362 |
+
# Find dominant viewpoint
|
1363 |
+
dominant_viewpoint = max(viewpoint_scores.items(), key=lambda x: x[1])
|
1364 |
+
|
1365 |
+
return {
|
1366 |
+
"viewpoint_scores": viewpoint_scores,
|
1367 |
+
"dominant_viewpoint": dominant_viewpoint[0],
|
1368 |
+
"confidence": dominant_viewpoint[1]
|
1369 |
+
}
|
1370 |
+
|
1371 |
+
def calculate_similarity_scores(self, image: Union[Image.Image, np.ndarray],
|
1372 |
+
prompts: Dict[str, str]) -> Dict[str, float]:
|
1373 |
+
"""
|
1374 |
+
計算圖像與一組特定提示之間的相似度分數
|
1375 |
+
|
1376 |
+
Args:
|
1377 |
+
image: 輸入圖像
|
1378 |
+
prompts: 提示詞字典 {名稱: 提示文本}
|
1379 |
+
|
1380 |
+
Returns:
|
1381 |
+
Dict[str, float]: 每個提示的相似度分數
|
1382 |
+
"""
|
1383 |
+
# 確保圖像是PIL格式
|
1384 |
+
if not isinstance(image, Image.Image):
|
1385 |
+
if isinstance(image, np.ndarray):
|
1386 |
+
image = Image.fromarray(image)
|
1387 |
+
else:
|
1388 |
+
raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
|
1389 |
+
|
1390 |
+
# 預處理圖像
|
1391 |
+
image_input = self.preprocess(image).unsqueeze(0).to(self.device)
|
1392 |
+
|
1393 |
+
# 獲取圖像特徵
|
1394 |
+
with torch.no_grad():
|
1395 |
+
image_features = self.model.encode_image(image_input)
|
1396 |
+
image_features = image_features / image_features.norm(dim=-1, keepdim=True)
|
1397 |
+
|
1398 |
+
# 計算與每個提示的相似度
|
1399 |
+
scores = {}
|
1400 |
+
prompt_texts = list(prompts.values())
|
1401 |
+
prompt_tokens = clip.tokenize(prompt_texts).to(self.device)
|
1402 |
+
|
1403 |
+
with torch.no_grad():
|
1404 |
+
prompt_features = self.model.encode_text(prompt_tokens)
|
1405 |
+
prompt_features = prompt_features / prompt_features.norm(dim=-1, keepdim=True)
|
1406 |
+
|
1407 |
+
# calculate similarity
|
1408 |
+
similarity = (100.0 * image_features @ prompt_features.T).softmax(dim=-1)
|
1409 |
+
similarity = similarity.cpu().numpy()[0] if self.device == "cuda" else similarity.numpy()[0]
|
1410 |
+
|
1411 |
+
# 填充結果字典
|
1412 |
+
for i, (name, _) in enumerate(prompts.items()):
|
1413 |
+
scores[name] = float(similarity[i])
|
1414 |
+
|
1415 |
+
return scores
|
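A minimal usage sketch for the new classifier, assuming the OpenAI clip package, PyTorch, and the bundled landmark_data.py are importable in the Space; the image path and the empty YOLO box list are placeholders:

    from PIL import Image
    from clip_zero_shot_classifier import CLIPZeroShotClassifier

    image = Image.open("street_scene.jpg").convert("RGB")  # placeholder path

    # Loads ViT-L/14 by default and precomputes the landmark prompt features once.
    classifier = CLIPZeroShotClassifier()

    # Whole-image check with per-region follow-up analysis.
    full_result = classifier.search_entire_image(image, threshold=0.35, detailed_analysis=True)
    print(full_result.get("landmark_name"), full_result.get("confidence"))

    # Combined search; yolo_boxes would normally come from the YOLO detector as [x1, y1, x2, y2] lists.
    combined = classifier.intelligent_landmark_search(image, yolo_boxes=None, base_threshold=0.25)
    if combined["is_landmark_scene"]:
        top = combined["primary_landmark"]
        print(f"{top['landmark_name']} ({top['confidence']:.2f}) via {top['region_type']}")

As the class docstring notes, the classifier is meant to complement YOLO detection rather than replace it; passing detected boxes through yolo_boxes lets it re-score those regions as landmark candidates.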
enhance_scene_describer.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
import os
|
2 |
import re
|
3 |
import json
|
|
|
4 |
import random
|
5 |
import numpy as np
|
6 |
from typing import Dict, List, Tuple, Any, Optional
|
@@ -12,6 +13,7 @@ from lighting_conditions import LIGHTING_CONDITIONS
|
|
12 |
from viewpoint_templates import VIEWPOINT_TEMPLATES
|
13 |
from cultural_templates import CULTURAL_TEMPLATES
|
14 |
from confifence_templates import CONFIDENCE_TEMPLATES
|
|
|
15 |
|
16 |
class EnhancedSceneDescriber:
|
17 |
"""
|
@@ -21,7 +23,7 @@ class EnhancedSceneDescriber:
|
|
21 |
detection results and scene classification.
|
22 |
"""
|
23 |
|
24 |
-
def __init__(self, templates_db: Optional[Dict] = None, scene_types: Optional[Dict] = None):
|
25 |
"""
|
26 |
Initialize the enhanced scene describer.
|
27 |
|
@@ -29,6 +31,15 @@ class EnhancedSceneDescriber:
|
|
29 |
templates_db: Optional custom templates database
|
30 |
scene_types: Dictionary of scene type definitions
|
31 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
# Load or use provided scene types
|
33 |
self.scene_types = scene_types or self._load_default_scene_types()
|
34 |
|
@@ -57,7 +68,7 @@ class EnhancedSceneDescriber:
|
|
57 |
"""
|
58 |
templates = {}
|
59 |
|
60 |
-
#
|
61 |
templates["scene_detail_templates"] = SCENE_DETAIL_TEMPLATES
|
62 |
templates["object_template_fillers"] = OBJECT_TEMPLATE_FILLERS
|
63 |
templates["viewpoint_templates"] = VIEWPOINT_TEMPLATES
|
@@ -100,19 +111,19 @@ class EnhancedSceneDescriber:
|
|
100 |
"low": "This might be {description}, but the confidence is low. {details}"
|
101 |
}
|
102 |
|
103 |
-
# 場景細節模板
|
104 |
if "scene_detail_templates" not in templates:
|
105 |
templates["scene_detail_templates"] = {
|
106 |
"default": ["A space with various objects."]
|
107 |
}
|
108 |
|
109 |
-
#
|
110 |
if "object_template_fillers" not in templates:
|
111 |
templates["object_template_fillers"] = {
|
112 |
"default": ["various items"]
|
113 |
}
|
114 |
|
115 |
-
#
|
116 |
if "viewpoint_templates" not in templates:
|
117 |
# 使用簡化版的默認視角模板
|
118 |
templates["viewpoint_templates"] = {
|
@@ -147,6 +158,7 @@ class EnhancedSceneDescriber:
|
|
147 |
"unknown": "The lighting conditions are not easily determined."
|
148 |
}
|
149 |
|
|
|
150 |
def _initialize_viewpoint_parameters(self):
|
151 |
"""
|
152 |
Initialize parameters used for viewpoint detection.
|
@@ -165,232 +177,444 @@ class EnhancedSceneDescriber:
            "elevated_top_threshold": 0.3  # Few objects at top of frame
        }

-        """
-        and additional contextual information.
-        This is the main entry point that replaces the original _generate_scene_description.
-
-        Args:
-            scene_type:
-            detected_objects:
-            confidence:
-            lighting_info:
-            functional_zones:
-
-        Returns:
-            str:
-        """
-        elif confidence > 0.5:
-            confidence_level = "medium"
-        else:
-            confidence_level = "low"
-            base_description = "An aerial view showing the layout and movement patterns from above"
-        elif scene_type in self.scene_types:
-            base_description = self.scene_types[scene_type].get("description", "A scene")
-        else:
-            base_description = "A scene"
-        if people_objs:
-            people_count = len(people_objs)
-            if people_count > 5:
-                people_phrase = f"numerous people ({people_count})"
-            else:
-                people_phrase = f"{people_count} {'people' if people_count > 1 else 'person'}"
-            description = self._smart_append(description, f"The scene includes {people_phrase}")
-        if scene_details:
-            # Use smart_append to ensure proper formatting between base description and details
-            description = self._smart_append(description, scene_details)
-        lighting_description = ""
        if lighting_info and "time_of_day" in lighting_info:
            lighting_type = lighting_info["time_of_day"]
            if lighting_type in self.templates.get("lighting_templates", {}):
                lighting_description = self.templates["lighting_templates"][lighting_type]
-        if lighting_description and lighting_description not in description:
-            description = self._smart_append(description, lighting_description)
-        # Process viewpoint information
        if viewpoint != "eye_level" and viewpoint in self.templates.get("viewpoint_templates", {}):
            viewpoint_template = self.templates["viewpoint_templates"][viewpoint]

            prefix = viewpoint_template.get('prefix', '')
            if prefix and not description.startswith(prefix):
                if description and description[0].isupper():
-                    # Maintain the flow by lowercasing the first letter after the prefix
                    description = prefix + description[0].lower() + description[1:]
                else:
                    description = prefix + description

-            if viewpoint == "aerial":
-                scene_elements = "the crossing patterns and pedestrian movement"
-            else:
-                scene_elements = "objects and layout"
            viewpoint_desc = viewpoint_template.get("observation", "").format(
            )

-            # Add viewpoint observation if not already included
            if viewpoint_desc and viewpoint_desc not in description:
                description = self._smart_append(description, viewpoint_desc)

        if functional_zones and len(functional_zones) > 0:
            zones_desc = self._describe_functional_zones(functional_zones)
            if zones_desc:
                description = self._smart_append(description, zones_desc)

-        for match in matches:
-            # Extract the number from the match
-            number_match = re.search(r'\d+', match)
-            if number_match:
-                try:
-                    people_mentioned = int(number_match.group())
-                    # If the mentioned count is less than total, remove the entire sentence
-                    if people_mentioned < people_count:
-                        # Split description into sentences
-                        sentences = re.split(r'(?<=[.!?])\s+', filtered_description)
-                        # Remove sentences containing the match
-                        filtered_sentences = []
-                        for sentence in sentences:
-                            if match not in sentence:
-                                filtered_sentences.append(sentence)
-                        # Recombine the description
-                        filtered_description = " ".join(filtered_sentences)
-                except ValueError:
-                    # Failed number conversion, continue processing
-                    continue

-        if skip_block and line.strip() == "":
-            skip_block = False

-        # If the line does not need to be skipped, add it to the result
-        if not skip_block:
-            clean_description.append(line)

    def _smart_append(self, current_text: str, new_fragment: str) -> str:
        """
@@ -424,13 +648,17 @@ class EnhancedSceneDescriber:
            (new_fragment.startswith("A ") or new_fragment.startswith("An ")):
            return current_text + ". " + new_fragment

        # Decide how to join the texts
        if ends_with_sentence:
            # After a sentence, start with uppercase and add proper spacing
            joined_text = current_text + " " + (new_fragment[0].upper() + new_fragment[1:])
        elif ends_with_comma:
            # After a comma, maintain flow with lowercase unless it's a proper noun or special case
-            if new_fragment.startswith(('I ', 'I\'', 'A ', 'An ', 'The ')) or new_fragment[0].isupper():
                joined_text = current_text + " " + new_fragment
            else:
                joined_text = current_text + " " + new_fragment[0].lower() + new_fragment[1:]
@@ -440,7 +668,7 @@ class EnhancedSceneDescriber:
        else:
            # For other cases, decide based on the content
            if self._is_related_phrases(current_text, new_fragment):
-                if new_fragment.startswith(('I ', 'I\'', 'A ', 'An ', 'The ')) or new_fragment[0].isupper():
                    joined_text = current_text + ", " + new_fragment
                else:
                    joined_text = current_text + ", " + new_fragment[0].lower() + new_fragment[1:]
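The _smart_append hunks above adjust how a new fragment is cased and joined depending on the punctuation that ends the current text. As a rough illustration of that joining rule, a simplified standalone sketch (not the class's exact logic):

def smart_append(current: str, fragment: str) -> str:
    # Join two description fragments with punctuation-aware casing (simplified sketch)
    if not current:
        return fragment[0].upper() + fragment[1:] if fragment else fragment
    if not fragment:
        return current
    if current.rstrip().endswith((".", "!", "?")):
        # After a full sentence, start the fragment with a capital letter
        return current + " " + fragment[0].upper() + fragment[1:]
    if current.rstrip().endswith(","):
        # After a comma, keep the flow lowercase unless the fragment starts a proper noun
        keep_case = fragment.startswith(("I ", "A ", "An ", "The ")) or fragment[0].isupper()
        return current + " " + (fragment if keep_case else fragment[0].lower() + fragment[1:])
    # Otherwise link the two phrases with a comma
    return current + ", " + fragment[0].lower() + fragment[1:]

print(smart_append("A city street at dusk.", "several pedestrians are crossing"))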
@@ -489,88 +717,78 @@ class EnhancedSceneDescriber:

        return False

    def _format_final_description(self, text: str) -> str:
        """
        Format the final description text to ensure correct punctuation,
        capitalization, and spacing.
-
-        Args:
-            text: The text to format
-
-        Returns:
-            str: The properly formatted text
        """
-        if not text:
            return ""

-        text = re.sub(r'(An\s[^.!?]+?)\s+(An?\s)', r'\1. \2', text, flags=re.IGNORECASE)
-        text = re.sub(r'([a-zA-Z])with', r'\1 with', text)  # add a space between a word and "with"
-        text = re.sub(r'plants(and|with|or)', r'plants \1', text)  # fix run-ons such as "plantsand"

        def fix_capitalization_after_comma(match):
-        text = re.sub(r'(?<=[.!?]\s)' + phrase, replacement, text)
-        # Adjust the term mid-sentence while keeping sentence-initial capitalization
-        text = re.sub(r'(?<=,\s)' + phrase, replacement, text)
-        text = re.sub(r',{2,}', ',', text)  # collapse multiple commas into one
-        if text and not text[-1] in '.!?':
-            text += '.'
-        return text

    def _is_intersection(self, detected_objects: List[Dict]) -> bool:
        """
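The removed formatter relied on a series of regex passes, and the rewritten version later in this diff does the same kind of cleanup. A compact sketch of that normalization style (the rules are simplified for illustration and are not the module's exact patterns):

import re

def tidy_description(text: str) -> str:
    # Normalize spacing, punctuation, and capitalization of a generated sentence
    if not text or not text.strip():
        return ""
    text = re.sub(r"\s{2,}", " ", text.strip())            # collapse repeated whitespace
    text = re.sub(r"\s*([.,;:!?])\s*", r"\1 ", text).strip()  # one space after punctuation, none before
    text = re.sub(r"[.!?]{2,}", ".", text)                  # collapse doubled sentence enders
    text = re.sub(r",{2,}", ",", text)                      # collapse doubled commas
    text = re.sub(r"([.!?]\s+)([a-z])",                     # capitalize after a sentence end
                  lambda m: m.group(1) + m.group(2).upper(), text)
    if text and text[-1] not in ".!?":
        text += "."
    return text[0].upper() + text[1:]

print(tidy_description("a quiet street ,with several cars  parked nearby"))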
@@ -652,65 +870,585 @@ class EnhancedSceneDescriber:

        return base_desc

    def _generate_scene_details(self,
        """
        Generate detailed description based on scene type and detected objects.

        Args:
-            scene_type: Identified scene type
-            detected_objects: List of detected objects
-            lighting_info: Optional lighting condition information
-            viewpoint: Detected viewpoint (aerial, eye_level, etc.)

        Returns:
-            str: Detailed scene description
        """
-        # Get scene-specific templates
        scene_details = ""
        scene_templates = self.templates.get("scene_detail_templates", {})

        else:
-        # Select a random template from the list
        if templates_list:
            detail_template = random.choice(templates_list)
-            # Fill the template with object information
            scene_details = self._fill_detail_template(
                detail_template,
                detected_objects,
-                scene_type
-            # Use default templates if specific ones aren't available
-            if "default" in scene_templates:
-                detail_template = random.choice(scene_templates["default"])
-                scene_details = self._fill_detail_template(
-                    detail_template,
-                    detected_objects,
-                    "default"
            )
        else:

-        return scene_details

-    def _fill_detail_template(self, template: str, detected_objects: List[Dict], scene_type: str) -> str:
        """
        Fill a template with specific details based on detected objects.

@@ -731,6 +1469,41 @@ class EnhancedSceneDescriber:
        # Get object template fillers
        fillers = self.templates.get("object_template_fillers", {})

        # Set default values for every variable that can appear in a template
        default_replacements = {
            # Indoor-related
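The filler logic around this hunk substitutes placeholders such as {furniture} with phrases drawn from filler lists, falling back to defaults when a key has no fillers. A small sketch of that mechanism (the placeholder names and filler values below are illustrative, not the module's full tables):

import random
import re

def fill_template(template: str, fillers: dict, defaults: dict) -> str:
    # Replace each {placeholder} with a random filler, or a default if none is defined
    def replace(match: re.Match) -> str:
        key = match.group(1)
        options = fillers.get(key)
        if options:
            return random.choice(options)
        return defaults.get(key, "various items")
    return re.sub(r"\{(\w+)\}", replace, template)

template = "A room with {furniture} and {electronics}."
fillers = {"furniture": ["a sofa and a coffee table", "two armchairs"]}
defaults = {"electronics": "a television"}
print(fill_template(template, fillers, defaults))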
@@ -910,6 +1683,36 @@ class EnhancedSceneDescriber:
            "knowledge_transfer": "learning exchanges"
        }

        # For each placeholder, try to fill with appropriate content
        for placeholder in placeholders:
            if placeholder in fillers:
@@ -1137,7 +1940,7 @@ class EnhancedSceneDescriber:
        if not detected_objects:
            return "eye_level"  # default

        top_region_count = 0
        bottom_region_count = 0
        total_objects = len(detected_objects)
@@ -1153,29 +1956,29 @@ class EnhancedSceneDescriber:
        crosswalk_pattern_detected = False

        for obj in detected_objects:
            region = obj["region"]
            if "top" in region:
                top_region_count += 1
            elif "bottom" in region:
                bottom_region_count += 1

            if "normalized_area" in obj:
                sizes.append(obj["normalized_area"])

            if "normalized_size" in obj:
                width, height = obj["normalized_size"]
                if width > 0:
                    height_width_ratios.append(height / width)

            if obj["class_id"] == 0:  # person
                if "normalized_center" in obj:
                    people_positions.append(obj["normalized_center"])

        # Check for a clear vertical and horizontal distribution of pedestrians
        people_objs = [obj for obj in detected_objects if obj["class_id"] == 0]  # person
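These counts feed a simple viewpoint heuristic: when most detections sit toward the bottom of the frame and little is at the top, the camera is likely elevated or aerial. A rough sketch of that kind of rule (the thresholds below are illustrative, not the module's tuned values):

from typing import Dict, List

def estimate_viewpoint(objects: List[Dict]) -> str:
    # Classify the camera viewpoint from where detections fall in the frame
    if not objects:
        return "eye_level"
    top = sum(1 for o in objects if "top" in o.get("region", ""))
    bottom = sum(1 for o in objects if "bottom" in o.get("region", ""))
    total = len(objects)
    if bottom / total > 0.65 and top / total < 0.2:
        return "elevated"
    if top / total > 0.65 and bottom / total < 0.2:
        return "low_angle"
    return "eye_level"

sample = [{"region": "bottom_left"}, {"region": "bottom_center"}, {"region": "middle_right"}]
print(estimate_viewpoint(sample))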
@@ -1194,7 +1997,7 @@ class EnhancedSceneDescriber:
            y_range = max(y_coords) - min(y_coords)

            # Try to detect a cross-shaped distribution
            if x_range > 0.5 and y_range > 0.5 and 0.7 < (x_range / y_range) < 1.3:

                # Compute distances to the center point
@@ -1391,7 +2194,6 @@ class EnhancedSceneDescriber:
        description = description.replace("a bed in the room", "a bed")

        # Handle repeated object lists
-        # Look for patterns of the form "item, item, item"
        object_lists = re.findall(r'with ([^\.]+?)(?:\.|\band\b)', description)

        for obj_list in object_lists:
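The pass above scans "with X, Y, Z" style lists in the generated sentence and removes duplicated items. A tiny sketch of that dedup step (regex and phrasing simplified for illustration):

import re

def dedupe_object_list(description: str) -> str:
    # Collapse repeated items inside "with a, b, a" style lists
    def clean(match: re.Match) -> str:
        items = [i.strip() for i in match.group(1).split(",")]
        seen, unique = set(), []
        for item in items:
            if item and item not in seen:
                seen.add(item)
                unique.append(item)
        return "with " + ", ".join(unique)
    return re.sub(r"with ([^\.]+)", clean, description)

print(dedupe_object_list("A room with a chair, a lamp, a chair."))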
@@ -1441,6 +2243,20 @@ class EnhancedSceneDescriber:
        if not functional_zones:
            return ""

        # Count the total number of people in the scene
        total_people_count = 0
        people_by_zone = {}
@@ -1480,12 +2296,12 @@ class EnhancedSceneDescriber:

        # Generate the summary description
        summary = ""
-        max_mentioned_people = 0

        # If the total people count is significant and not yet mentioned in the main description, add it
        if total_people_count > 5:
            summary = f"The scene contains a significant number of pedestrians ({total_people_count} people). "
-            max_mentioned_people = total_people_count

        # Process each zone's description, keeping people-count information consistent
        processed_zones = []
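The zone summary keeps the people counts it reports consistent with the scene-wide total. A minimal sketch of that aggregation step (the zone structure and field names are assumptions for illustration):

from typing import Dict, List

def summarize_people_by_zone(zones: Dict[str, Dict]) -> str:
    # Count people per zone, then report a total plus the busiest zone
    people_by_zone: Dict[str, int] = {}
    for name, info in zones.items():
        objects: List[str] = info.get("objects", [])
        people_by_zone[name] = sum(1 for o in objects if o == "person")
    total = sum(people_by_zone.values())
    if total == 0:
        return "No people are visible in the identified zones."
    busiest = max(people_by_zone, key=people_by_zone.get)
    return (f"The scene contains {total} people overall, "
            f"with most of them in the {busiest.replace('_', ' ')}.")

zones = {"crossing_area": {"objects": ["person", "person", "car"]},
         "waiting_area": {"objects": ["person", "bench"]}}
print(summarize_people_by_zone(zones))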
@@ -1494,7 +2310,7 @@ class EnhancedSceneDescriber:
            zone_desc = zone_info.get("description", "a functional zone")
            zone_people_count = people_by_zone.get(zone_name, 0)

            contains_people_info = "with" in zone_desc and ("person" in zone_desc.lower() or "people" in zone_desc.lower())

            # If the description contains people-count information and the count is smaller than the largest count already mentioned, adjust the description

 import os
 import re
 import json
+import logging
 import random
 import numpy as np
 from typing import Dict, List, Tuple, Any, Optional

 from viewpoint_templates import VIEWPOINT_TEMPLATES
 from cultural_templates import CULTURAL_TEMPLATES
 from confifence_templates import CONFIDENCE_TEMPLATES
+from landmark_data import ALL_LANDMARKS

 class EnhancedSceneDescriber:
     """

     detection results and scene classification.
     """

+    def __init__(self, templates_db: Optional[Dict] = None, scene_types: Optional[Dict] = None, spatial_analyzer_instance: Optional[Any] = None):
         """
         Initialize the enhanced scene describer.

             templates_db: Optional custom templates database
             scene_types: Dictionary of scene type definitions
         """
+        self.logger = logging.getLogger(self.__class__.__name__)  # Use class name for logger
+        self.logger.setLevel(logging.INFO)  # Or your desired logging level
+        # Optional: Add a handler if not configured globally
+        if not self.logger.hasHandlers():
+            handler = logging.StreamHandler()
+            formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+            handler.setFormatter(formatter)
+            self.logger.addHandler(handler)
+
         # Load or use provided scene types
         self.scene_types = scene_types or self._load_default_scene_types()

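For orientation, a hypothetical way this class might be driven from calling code; the constructor arguments and detection fields below are assumptions pieced together from the signatures visible in this diff, not a documented API:

# Hypothetical usage sketch of EnhancedSceneDescriber (field names assumed from this diff)
from enhance_scene_describer import EnhancedSceneDescriber

describer = EnhancedSceneDescriber()

detected_objects = [
    {"class_id": 0, "class_name": "person", "confidence": 0.91,
     "region": "bottom_center", "normalized_area": 0.04},
    {"class_id": 2, "class_name": "car", "confidence": 0.88,
     "region": "middle_left", "normalized_area": 0.12},
]

description = describer.generate_description(
    scene_type="generic_street_view",
    detected_objects=detected_objects,
    confidence=0.72,
    lighting_info={"time_of_day": "day_clear", "is_indoor": False},
    functional_zones={},
    enable_landmark=False,
)
print(description)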
69 |
templates = {}
|
70 |
|
71 |
+
# 載入事先準備的模板
|
72 |
templates["scene_detail_templates"] = SCENE_DETAIL_TEMPLATES
|
73 |
templates["object_template_fillers"] = OBJECT_TEMPLATE_FILLERS
|
74 |
templates["viewpoint_templates"] = VIEWPOINT_TEMPLATES
|
|
|
111 |
"low": "This might be {description}, but the confidence is low. {details}"
|
112 |
}
|
113 |
|
114 |
+
# 場景細節模板
|
115 |
if "scene_detail_templates" not in templates:
|
116 |
templates["scene_detail_templates"] = {
|
117 |
"default": ["A space with various objects."]
|
118 |
}
|
119 |
|
120 |
+
# 物體填充模板,用於生成物體描述
|
121 |
if "object_template_fillers" not in templates:
|
122 |
templates["object_template_fillers"] = {
|
123 |
"default": ["various items"]
|
124 |
}
|
125 |
|
126 |
+
# 視角模板,雖然現在從專門模組導入,但可作為備份
|
127 |
if "viewpoint_templates" not in templates:
|
128 |
# 使用簡化版的默認視角模板
|
129 |
templates["viewpoint_templates"] = {
|
|
|
158 |
"unknown": "The lighting conditions are not easily determined."
|
159 |
}
|
160 |
|
161 |
+
|
162 |
def _initialize_viewpoint_parameters(self):
|
163 |
"""
|
164 |
Initialize parameters used for viewpoint detection.
|
|
|
177 |
"elevated_top_threshold": 0.3 # Few objects at top of frame
|
178 |
}
|
179 |
|
180 |
+
def _generate_landmark_description(self,
|
181 |
+
scene_type: str,
|
182 |
+
detected_objects: List[Dict],
|
183 |
+
confidence: float,
|
184 |
+
lighting_info: Optional[Dict] = None,
|
185 |
+
functional_zones: Optional[Dict] = None,
|
186 |
+
landmark_objects: Optional[List[Dict]] = None) -> str:
|
187 |
"""
|
188 |
+
生成包含地標信息的場景描述
|
|
|
|
|
|
|
189 |
|
190 |
Args:
|
191 |
+
scene_type: 識別的場景類型
|
192 |
+
detected_objects: 檢測到的物體列表
|
193 |
+
confidence: 場景分類置信度
|
194 |
+
lighting_info: 照明條件信息(可選)
|
195 |
+
functional_zones: 功能區域信息(可選)
|
196 |
+
landmark_objects: 識別為地標的物體列表(可選)
|
197 |
|
198 |
Returns:
|
199 |
+
str: 包含地標信息的自然語言場景描述
|
200 |
"""
|
201 |
+
# 如果沒有提供地標物體,則從檢測物體中篩選
|
202 |
+
if landmark_objects is None:
|
203 |
+
landmark_objects = [obj for obj in detected_objects if obj.get("is_landmark", False)]
|
204 |
+
|
205 |
+
# 如果沒有地標,退回到標準描述
|
206 |
+
if not landmark_objects:
|
207 |
+
if scene_type in ["tourist_landmark", "natural_landmark", "historical_monument"]:
|
208 |
+
# 場景類型是地標但沒有具體地標物體
|
209 |
+
base_description = "A scenic area that appears to be a tourist destination, though specific landmarks are not clearly identifiable."
|
210 |
+
else:
|
211 |
+
# 使用標準方法生成基本描述
|
212 |
+
return self._format_final_description(self._generate_scene_details(
|
213 |
+
scene_type,
|
214 |
+
detected_objects,
|
215 |
+
lighting_info,
|
216 |
+
self._detect_viewpoint(detected_objects)
|
217 |
+
))
|
218 |
+
else:
|
219 |
+
# 獲取主要地標(信心度最高的)
|
220 |
+
primary_landmark = max(landmark_objects, key=lambda x: x.get("confidence", 0))
|
221 |
+
landmark_name = primary_landmark.get("class_name", "landmark")
|
222 |
+
landmark_location = primary_landmark.get("location", "")
|
223 |
+
|
224 |
+
# 根據地標類型選擇適當的描述模板
|
225 |
+
if scene_type == "natural_landmark" or primary_landmark.get("landmark_type") == "natural":
|
226 |
+
base_description = f"A natural landmark scene featuring {landmark_name} in {landmark_location}."
|
227 |
+
elif scene_type == "historical_monument" or primary_landmark.get("landmark_type") == "monument":
|
228 |
+
base_description = f"A historical monument scene showcasing {landmark_name}, a significant landmark in {landmark_location}."
|
229 |
else:
|
230 |
+
base_description = f"A tourist landmark scene centered around {landmark_name}, an iconic structure in {landmark_location}."
|
231 |
|
232 |
+
# 加地標的額外信息
|
233 |
+
landmark_details = []
|
234 |
+
for landmark in landmark_objects:
|
235 |
+
details = []
|
236 |
|
237 |
+
# 加建造年份
|
238 |
+
if "year_built" in landmark:
|
239 |
+
details.append(f"built in {landmark['year_built']}")
|
|
|
|
|
|
|
|
|
240 |
|
241 |
+
# 加建築風格
|
242 |
+
if "architectural_style" in landmark:
|
243 |
+
details.append(f"featuring {landmark['architectural_style']} architectural style")
|
|
|
|
|
|
|
|
|
|
|
244 |
|
245 |
+
# 加重要性
|
246 |
+
if "significance" in landmark:
|
247 |
+
details.append(landmark["significance"])
|
|
|
|
|
|
|
|
|
248 |
|
249 |
+
# 如果有詳細信息,加到描述中
|
250 |
+
if details:
|
251 |
+
landmark_details.append(f"{landmark['class_name']} ({', '.join(details)})")
|
252 |
|
253 |
+
# 將詳細信息添加到基本描述中
|
254 |
+
if landmark_details:
|
255 |
+
description = base_description + " " + "The scene features " + ", ".join(landmark_details) + "."
|
256 |
+
else:
|
257 |
+
description = base_description
|
258 |
|
259 |
+
# 獲取視角
|
260 |
+
viewpoint = self._detect_viewpoint(detected_objects)
|
|
|
|
|
|
|
|
|
|
|
|
|
261 |
|
262 |
+
# 生成人員活動描述
|
263 |
+
people_count = len([obj for obj in detected_objects if obj["class_id"] == 0]) # 人的類別ID通常為0
|
|
|
264 |
|
265 |
+
if people_count > 0:
|
266 |
+
if people_count == 1:
|
267 |
+
people_description = "There is one person in the scene, likely a tourist or visitor."
|
268 |
+
elif people_count < 5:
|
269 |
+
people_description = f"There are {people_count} people in the scene, possibly tourists visiting the landmark."
|
270 |
+
else:
|
271 |
+
people_description = f"The scene includes a group of {people_count} people, indicating this is a popular tourist destination."
|
272 |
|
273 |
+
description = self._smart_append(description, people_description)
|
|
|
|
|
|
|
274 |
|
275 |
+
# 添加照明信息
|
|
|
276 |
if lighting_info and "time_of_day" in lighting_info:
|
277 |
lighting_type = lighting_info["time_of_day"]
|
278 |
if lighting_type in self.templates.get("lighting_templates", {}):
|
279 |
lighting_description = self.templates["lighting_templates"][lighting_type]
|
280 |
+
description = self._smart_append(description, lighting_description)
|
281 |
|
282 |
+
# 添加視角描述
|
|
|
|
|
|
|
|
|
283 |
if viewpoint != "eye_level" and viewpoint in self.templates.get("viewpoint_templates", {}):
|
284 |
viewpoint_template = self.templates["viewpoint_templates"][viewpoint]
|
285 |
|
286 |
+
# 添加視角前綴
|
287 |
prefix = viewpoint_template.get('prefix', '')
|
288 |
if prefix and not description.startswith(prefix):
|
289 |
+
# 保持句子流暢性
|
290 |
if description and description[0].isupper():
|
|
|
291 |
description = prefix + description[0].lower() + description[1:]
|
292 |
else:
|
293 |
description = prefix + description
|
294 |
|
295 |
+
# 添加視角觀察描述
|
|
|
|
|
|
|
|
|
|
|
296 |
viewpoint_desc = viewpoint_template.get("observation", "").format(
|
297 |
+
scene_elements="the landmark and surrounding area"
|
298 |
)
|
299 |
|
|
|
300 |
if viewpoint_desc and viewpoint_desc not in description:
|
301 |
description = self._smart_append(description, viewpoint_desc)
|
302 |
|
303 |
+
# 添加功能區域描述
|
304 |
if functional_zones and len(functional_zones) > 0:
|
305 |
zones_desc = self._describe_functional_zones(functional_zones)
|
306 |
if zones_desc:
|
307 |
description = self._smart_append(description, zones_desc)
|
308 |
|
309 |
+
# 描述可能的活動
|
310 |
+
landmark_activities = []
|
311 |
+
|
312 |
+
# 根據地標類型生成通用活動
|
313 |
+
if scene_type == "natural_landmark" or any(obj.get("landmark_type") == "natural" for obj in landmark_objects):
|
314 |
+
landmark_activities = [
|
315 |
+
"nature photography",
|
316 |
+
"scenic viewing",
|
317 |
+
"hiking or walking",
|
318 |
+
"guided nature tours",
|
319 |
+
"outdoor appreciation"
|
320 |
+
]
|
321 |
+
elif scene_type == "historical_monument" or any(obj.get("landmark_type") == "monument" for obj in landmark_objects):
|
322 |
+
landmark_activities = [
|
323 |
+
"historical sightseeing",
|
324 |
+
"educational tours",
|
325 |
+
"cultural appreciation",
|
326 |
+
"photography of historical architecture",
|
327 |
+
"learning about historical significance"
|
328 |
+
]
|
329 |
+
else:
|
330 |
+
landmark_activities = [
|
331 |
+
"sightseeing",
|
332 |
+
"taking photographs",
|
333 |
+
"guided tours",
|
334 |
+
"cultural tourism",
|
335 |
+
"souvenir shopping"
|
336 |
]
|
337 |
|
338 |
+
# 添加活動描述
|
339 |
+
if landmark_activities:
|
340 |
+
activities_text = "Common activities at this location include " + ", ".join(landmark_activities[:3]) + "."
|
341 |
+
description = self._smart_append(description, activities_text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
342 |
|
343 |
+
# 最後格式化描述
|
344 |
+
return self._format_final_description(description)
|
345 |
|
346 |
+
def filter_landmark_references(self, text, enable_landmark=True):
|
347 |
+
"""
|
348 |
+
動態過濾文本中的地標引用
|
349 |
|
350 |
+
Args:
|
351 |
+
text: 需要過濾的文本
|
352 |
+
enable_landmark: 是否啟用地標功能
|
353 |
|
354 |
+
Returns:
|
355 |
+
str: 過濾後的文本
|
356 |
+
"""
|
357 |
+
if enable_landmark or not text:
|
358 |
+
return text
|
359 |
+
|
360 |
+
try:
|
361 |
+
# 動態收集所有地標名稱和位置
|
362 |
+
landmark_names = []
|
363 |
+
locations = []
|
364 |
+
|
365 |
+
for landmark_id, info in ALL_LANDMARKS.items():
|
366 |
+
# 收集地標名稱及其別名
|
367 |
+
landmark_names.append(info["name"])
|
368 |
+
landmark_names.extend(info.get("aliases", []))
|
369 |
+
|
370 |
+
# 收集地理位置
|
371 |
+
if "location" in info:
|
372 |
+
location = info["location"]
|
373 |
+
locations.append(location)
|
374 |
+
|
375 |
+
# 處理分離的城市和國家名稱
|
376 |
+
parts = location.split(",")
|
377 |
+
if len(parts) >= 1:
|
378 |
+
locations.append(parts[0].strip())
|
379 |
+
if len(parts) >= 2:
|
380 |
+
locations.append(parts[1].strip())
|
381 |
+
|
382 |
+
# 使用正則表達式動態替換所有地標名稱
|
383 |
+
import re
|
384 |
+
for name in landmark_names:
|
385 |
+
if name and len(name) > 2: # 避免過短的名稱
|
386 |
+
text = re.sub(r'\b' + re.escape(name) + r'\b', "tall structure", text, flags=re.IGNORECASE)
|
387 |
+
|
388 |
+
# 動態替換所有位置引用
|
389 |
+
for location in locations:
|
390 |
+
if location and len(location) > 2:
|
391 |
+
# 替換常見位置表述模式
|
392 |
+
text = re.sub(r'in ' + re.escape(location), "in the urban area", text, flags=re.IGNORECASE)
|
393 |
+
text = re.sub(r'of ' + re.escape(location), "of the urban area", text, flags=re.IGNORECASE)
|
394 |
+
text = re.sub(r'\b' + re.escape(location) + r'\b', "the urban area", text, flags=re.IGNORECASE)
|
395 |
+
|
396 |
+
except ImportError:
|
397 |
+
# 如果無法導入,使用基本模式
|
398 |
+
pass
|
399 |
+
|
400 |
+
# 通用地標描述模式替換
|
401 |
+
landmark_patterns = [
|
402 |
+
(r'a (tourist|popular|famous) landmark', r'an urban structure'),
|
403 |
+
(r'an iconic structure in ([A-Z][a-zA-Z\s,]+)', r'an urban structure in the area'),
|
404 |
+
(r'a famous (monument|tower|landmark) in ([A-Z][a-zA-Z\s,]+)', r'an urban structure in the area'),
|
405 |
+
(r'(centered|built|located|positioned) around the ([A-Z][a-zA-Z\s]+? (Tower|Monument|Landmark))', r'located in this area'),
|
406 |
+
(r'(sightseeing|guided tours|cultural tourism) (at|around|near) (this landmark|the [A-Z][a-zA-Z\s]+)', r'\1 in this area'),
|
407 |
+
(r'this (famous|iconic|historic|well-known) (landmark|monument|tower|structure)', r'this urban structure'),
|
408 |
+
(r'([A-Z][a-zA-Z\s]+) Tower', r'tall structure'),
|
409 |
+
(r'a (tower|structure) in ([A-Z][a-zA-Z\s,]+)', r'a \1 in the area'),
|
410 |
+
(r'landmark scene', r'urban scene'),
|
411 |
+
(r'tourist destination', r'urban area'),
|
412 |
+
(r'tourist attraction', r'urban area')
|
413 |
+
]
|
414 |
+
|
415 |
+
for pattern, replacement in landmark_patterns:
|
416 |
+
text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
|
417 |
|
418 |
+
return text
|
|
|
|
|
419 |
|
|
|
|
|
|
|
420 |
|
421 |
+
def generate_description(self, scene_type: str, detected_objects: List[Dict], confidence: float,
|
422 |
+
lighting_info: Dict, functional_zones: List[str], enable_landmark: bool = True,
|
423 |
+
scene_scores: Optional[Dict] = None, spatial_analysis: Optional[Dict] = None,
|
424 |
+
image_dimensions: Optional[Dict] = None, places365_info: Optional[Dict] = None,
|
425 |
+
object_statistics: Optional[Dict] = None) -> str:
|
426 |
+
"""
|
427 |
+
Generate enhanced scene description based on detection results, scene type,
|
428 |
+
and additional contextual information.
|
429 |
+
This version ensures that the main scene_details (from the first call)
|
430 |
+
is properly integrated and not overwritten by a simplified second call.
|
431 |
+
"""
|
432 |
+
# Handle unknown scene type or very low confidence as an early exit
|
433 |
+
if scene_type == "unknown" or confidence < 0.4:
|
434 |
+
# _generate_generic_description should also ideally use image_dimensions if it does spatial reasoning
|
435 |
+
generic_desc = self._generate_generic_description(detected_objects, lighting_info)
|
436 |
+
return self._format_final_description(generic_desc)
|
437 |
+
|
438 |
+
# Filter out landmark objects if landmark detection is disabled for this run
|
439 |
+
current_detected_objects = detected_objects
|
440 |
+
if not enable_landmark:
|
441 |
+
current_detected_objects = [obj for obj in detected_objects if not obj.get("is_landmark", False)]
|
442 |
+
|
443 |
+
# Log Places365 context if available
|
444 |
+
places365_context = ""
|
445 |
+
if places365_info and places365_info.get('confidence', 0) > 0.3:
|
446 |
+
scene_label = places365_info.get('scene_label', '')
|
447 |
+
attributes = places365_info.get('attributes', [])
|
448 |
+
is_indoor = places365_info.get('is_indoor', None)
|
449 |
+
|
450 |
+
if scene_label:
|
451 |
+
places365_context = f"Scene context: {scene_label}"
|
452 |
+
if attributes:
|
453 |
+
places365_context += f" with characteristics: {', '.join(attributes[:3])}"
|
454 |
+
if is_indoor is not None:
|
455 |
+
indoor_outdoor = "indoor" if is_indoor else "outdoor"
|
456 |
+
places365_context += f" ({indoor_outdoor} environment)"
|
457 |
+
|
458 |
+
print(f"Enhanced description incorporating Places365 context: {places365_context}")
|
459 |
+
|
460 |
+
landmark_objects_in_scene = [obj for obj in current_detected_objects if obj.get("is_landmark", False)]
|
461 |
+
has_landmark_in_scene = len(landmark_objects_in_scene) > 0
|
462 |
+
|
463 |
+
# If landmark processing is enabled and it's a landmark scene or landmarks are detected
|
464 |
+
if enable_landmark and (scene_type in ["tourist_landmark", "natural_landmark", "historical_monument"] or has_landmark_in_scene):
|
465 |
+
landmark_desc = self._generate_landmark_description(
|
466 |
+
scene_type,
|
467 |
+
current_detected_objects, # Pass potentially filtered list
|
468 |
+
confidence,
|
469 |
+
lighting_info,
|
470 |
+
functional_zones,
|
471 |
+
landmark_objects_in_scene # Pass the explicitly filtered landmark objects
|
472 |
+
)
|
473 |
+
return self._format_final_description(landmark_desc)
|
474 |
+
|
475 |
+
# **[Start of main description construction for non-landmark or landmark-disabled everyday scenes]**
|
476 |
+
|
477 |
+
# Detect viewpoint based on current (potentially filtered) objects
|
478 |
+
viewpoint = self._detect_viewpoint(current_detected_objects)
|
479 |
+
current_scene_type = scene_type # Use a mutable variable for scene_type if it can change
|
480 |
+
|
481 |
+
# Process aerial viewpoint scene types (may re-assign current_scene_type)
|
482 |
+
if viewpoint == "aerial":
|
483 |
+
if "intersection" in current_scene_type.lower() or self._is_intersection(current_detected_objects): # Use lower for robustness
|
484 |
+
current_scene_type = "aerial_view_intersection"
|
485 |
+
elif any(keyword in current_scene_type.lower() for keyword in ["commercial", "shopping", "retail"]):
|
486 |
+
current_scene_type = "aerial_view_commercial_area"
|
487 |
+
elif any(keyword in current_scene_type.lower() for keyword in ["plaza", "square"]):
|
488 |
+
current_scene_type = "aerial_view_plaza"
|
489 |
+
else: # Default aerial if specific not matched
|
490 |
+
current_scene_type = "aerial_view_general" # Or use a specific default like aerial_view_intersection
|
491 |
+
|
492 |
+
# Detect cultural context (only for non-aerial viewpoints)
|
493 |
+
cultural_context = None
|
494 |
+
if viewpoint != "aerial":
|
495 |
+
cultural_context = self._detect_cultural_context(current_scene_type, current_detected_objects)
|
496 |
+
|
497 |
+
# Get base description for the (potentially updated) scene type
|
498 |
+
base_description = "A scene" # Default initialization
|
499 |
+
if viewpoint == "aerial":
|
500 |
+
# Check if current_scene_type (which might be an aerial type) has a base description
|
501 |
+
if current_scene_type in self.scene_types:
|
502 |
+
base_description = self.scene_types[current_scene_type].get("description", "An aerial view showing the layout and movement patterns from above")
|
503 |
+
else:
|
504 |
+
base_description = "An aerial view showing the layout and movement patterns from above"
|
505 |
+
elif current_scene_type in self.scene_types:
|
506 |
+
base_description = self.scene_types[current_scene_type].get("description", "A scene")
|
507 |
+
|
508 |
+
# spatial analysis, and image dimensions. This is where dynamic description or template filling happens.
|
509 |
+
core_scene_details = self._generate_scene_details(
|
510 |
+
current_scene_type, # Use the potentially updated scene_type
|
511 |
+
current_detected_objects,
|
512 |
+
lighting_info,
|
513 |
+
viewpoint,
|
514 |
+
spatial_analysis=spatial_analysis, # Pass this through
|
515 |
+
image_dimensions=image_dimensions, # Pass this through
|
516 |
+
places365_info=places365_info, # Pass Places365 info
|
517 |
+
object_statistics=object_statistics # Pass object statistics
|
518 |
+
)
|
519 |
+
|
520 |
+
# Start with the base description derived from SCENE_TYPES or a default.
|
521 |
+
description = base_description
|
522 |
+
if core_scene_details and core_scene_details.strip() != "": # Ensure core_scene_details is not empty
|
523 |
+
# If base_description is generic like "A scene", consider replacing it or appending smartly.
|
524 |
+
if base_description.lower() == "a scene" and len(core_scene_details) > len(base_description):
|
525 |
+
description = core_scene_details # Prioritize dynamic/template-filled details if base is too generic
|
526 |
+
else:
|
527 |
+
description = self._smart_append(description, core_scene_details)
|
528 |
+
elif not core_scene_details and not description: # If both are empty, use a generic fallback
|
529 |
+
description = self._generate_generic_description(current_detected_objects, lighting_info)
|
530 |
+
|
531 |
+
|
532 |
+
# Append secondary description from scene type template, if any
|
533 |
+
if current_scene_type in self.scene_types and "secondary_description" in self.scene_types[current_scene_type]:
|
534 |
+
secondary_desc = self.scene_types[current_scene_type]["secondary_description"]
|
535 |
+
if secondary_desc:
|
536 |
+
description = self._smart_append(description, secondary_desc)
|
537 |
+
|
538 |
+
# Append people count information
|
539 |
+
people_objs = [obj for obj in current_detected_objects if obj.get("class_id") == 0]
|
540 |
+
if people_objs:
|
541 |
+
people_count = len(people_objs)
|
542 |
+
|
543 |
+
if people_count == 1: people_phrase = "a single person"
|
544 |
+
elif people_count > 1 and people_count <= 3: people_phrase = f"{people_count} people" # Accurate for small counts
|
545 |
+
elif people_count > 3 and people_count <=7: people_phrase = "several people"
|
546 |
+
else: people_phrase = "multiple people" # For larger counts, or use "numerous"
|
547 |
+
|
548 |
+
# Only add if not already well covered in core_scene_details or base_description
|
549 |
+
if "person" not in description.lower() and "people" not in description.lower() and "pedestrian" not in description.lower():
|
550 |
+
description = self._smart_append(description, f"The scene includes {people_phrase}.")
|
551 |
+
|
552 |
+
# Append cultural context
|
553 |
+
if cultural_context and viewpoint != "aerial": # Already checked viewpoint
|
554 |
+
cultural_elements = self._generate_cultural_elements(cultural_context)
|
555 |
+
if cultural_elements:
|
556 |
+
description = self._smart_append(description, cultural_elements)
|
557 |
+
|
558 |
+
# Append lighting information
|
559 |
+
lighting_description_text = ""
|
560 |
+
if lighting_info and "time_of_day" in lighting_info:
|
561 |
+
lighting_type = lighting_info["time_of_day"]
|
562 |
+
lighting_desc_template = self.templates.get("lighting_templates", {}).get(lighting_type)
|
563 |
+
if lighting_desc_template:
|
564 |
+
lighting_description_text = lighting_desc_template
|
565 |
+
if lighting_description_text and lighting_description_text.lower() not in description.lower():
|
566 |
+
description = self._smart_append(description, lighting_description_text)
|
567 |
+
|
568 |
+
# Append viewpoint information (if not eye-level)
|
569 |
+
if viewpoint != "eye_level" and viewpoint in self.templates.get("viewpoint_templates", {}):
|
570 |
+
viewpoint_template = self.templates["viewpoint_templates"][viewpoint]
|
571 |
+
prefix = viewpoint_template.get('prefix', '')
|
572 |
+
observation_template = viewpoint_template.get("observation", "")
|
573 |
+
|
574 |
+
# Determine scene_elements for the observation template
|
575 |
+
scene_elements_for_vp = "the overall layout and objects" # Generic default
|
576 |
+
if viewpoint == "aerial":
|
577 |
+
scene_elements_for_vp = "crossing patterns and general layout"
|
578 |
+
|
579 |
+
viewpoint_observation_text = observation_template.format(scene_elements=scene_elements_for_vp)
|
580 |
+
|
581 |
+
# Combine prefix and observation carefully
|
582 |
+
full_viewpoint_text = ""
|
583 |
+
if prefix:
|
584 |
+
full_viewpoint_text = prefix.strip() + " "
|
585 |
+
if viewpoint_observation_text and viewpoint_observation_text[0].islower():
|
586 |
+
full_viewpoint_text += viewpoint_observation_text
|
587 |
+
elif viewpoint_observation_text:
|
588 |
+
full_viewpoint_text = prefix + viewpoint_observation_text[0].lower() + viewpoint_observation_text[1:] if description else prefix + viewpoint_observation_text
|
589 |
+
|
590 |
+
elif viewpoint_observation_text: # No prefix, but observation exists
|
591 |
+
full_viewpoint_text = viewpoint_observation_text[0].upper() + viewpoint_observation_text[1:]
|
592 |
+
|
593 |
+
|
594 |
+
if full_viewpoint_text and full_viewpoint_text.lower() not in description.lower():
|
595 |
+
description = self._smart_append(description, full_viewpoint_text)
|
596 |
+
|
597 |
+
|
598 |
+
# Append functional zones information
|
599 |
+
if functional_zones and len(functional_zones) > 0:
|
600 |
+
zones_desc_text = self._describe_functional_zones(functional_zones)
|
601 |
+
if zones_desc_text:
|
602 |
+
description = self._smart_append(description, zones_desc_text)
|
603 |
+
|
604 |
+
final_formatted_description = self._format_final_description(description)
|
605 |
+
|
606 |
+
if not enable_landmark:
|
607 |
+
final_formatted_description = self.filter_landmark_references(final_formatted_description, enable_landmark=False)
|
608 |
+
|
609 |
+
# If after all processing, description is empty, fallback to a very generic one.
|
610 |
+
if not final_formatted_description.strip() or final_formatted_description.strip() == ".":
|
611 |
+
self.logger.warning(f"Description for scene_type '{current_scene_type}' became empty after processing. Falling back.")
|
612 |
+
final_formatted_description = self._format_final_description(
|
613 |
+
self._generate_generic_description(current_detected_objects, lighting_info)
|
614 |
+
)
|
615 |
+
|
616 |
+
return final_formatted_description
|
617 |
+
|
618 |
|
619 |
def _smart_append(self, current_text: str, new_fragment: str) -> str:
|
620 |
"""
|
|
|
648 |
(new_fragment.startswith("A ") or new_fragment.startswith("An ")):
|
649 |
return current_text + ". " + new_fragment
|
650 |
|
651 |
+
# 檢查新片段是否包含地標名稱(通常為專有名詞)
|
652 |
+
has_landmark_name = any(word[0].isupper() for word in new_fragment.split()
|
653 |
+
if len(word) > 2 and not word.startswith(("A ", "An ", "The ")))
|
654 |
+
|
655 |
# Decide how to join the texts
|
656 |
if ends_with_sentence:
|
657 |
# After a sentence, start with uppercase and add proper spacing
|
658 |
joined_text = current_text + " " + (new_fragment[0].upper() + new_fragment[1:])
|
659 |
elif ends_with_comma:
|
660 |
# After a comma, maintain flow with lowercase unless it's a proper noun or special case
|
661 |
+
if new_fragment.startswith(('I ', 'I\'', 'A ', 'An ', 'The ')) or new_fragment[0].isupper() or has_landmark_name:
|
662 |
joined_text = current_text + " " + new_fragment
|
663 |
else:
|
664 |
joined_text = current_text + " " + new_fragment[0].lower() + new_fragment[1:]
|
|
|
668 |
else:
|
669 |
# For other cases, decide based on the content
|
670 |
if self._is_related_phrases(current_text, new_fragment):
|
671 |
+
if new_fragment.startswith(('I ', 'I\'', 'A ', 'An ', 'The ')) or new_fragment[0].isupper() or has_landmark_name:
|
672 |
joined_text = current_text + ", " + new_fragment
|
673 |
else:
|
674 |
joined_text = current_text + ", " + new_fragment[0].lower() + new_fragment[1:]
|
|
|
717 |
|
718 |
return False
|
719 |
|
720 |
+
|
721 |
def _format_final_description(self, text: str) -> str:
|
722 |
"""
|
723 |
Format the final description text to ensure correct punctuation,
|
724 |
capitalization, and spacing.
|
|
|
|
|
|
|
|
|
|
|
|
|
725 |
"""
|
726 |
+
if not text or not text.strip(): # Also check if text is just whitespace
|
|
|
|
|
727 |
return ""
|
728 |
|
729 |
+
# Trim leading/trailing whitespace first
|
730 |
+
text = text.strip()
|
|
|
731 |
|
732 |
+
# 1. Handle consecutive "A/An" segments (potentially split them into sentences)
|
733 |
+
text = re.sub(r'(A\s+[^.!?]+?[\w\.])\s+(A\s+)', r'\1. \2', text, flags=re.IGNORECASE)
|
734 |
+
text = re.sub(r'(An\s+[^.!?]+?[\w\.])\s+(An?\s+)', r'\1. \2', text, flags=re.IGNORECASE)
|
735 |
|
736 |
+
# 2. Ensure first character of the entire text is uppercase
|
737 |
+
if text:
|
738 |
+
text = text[0].upper() + text[1:]
|
739 |
|
740 |
+
# 3. Normalize whitespace: multiple spaces to one
|
741 |
+
text = re.sub(r'\s{2,}', ' ', text)
|
|
|
|
|
742 |
|
743 |
+
# 4. Capitalize after sentence-ending punctuation (. ! ?)
|
744 |
+
def capitalize_after_punctuation(match):
|
745 |
+
return match.group(1) + match.group(2).upper()
|
746 |
+
text = re.sub(r'([.!?]\s+)([a-z])', capitalize_after_punctuation, text)
|
747 |
|
748 |
+
# 5. Handle capitalization after commas (your existing robust logic is good)
|
749 |
def fix_capitalization_after_comma(match):
|
750 |
+
leading_comma_space = match.group(1) # (,\s+)
|
751 |
+
word_after_comma = match.group(2) # ([A-Z][a-zA-Z]*)
|
752 |
+
|
753 |
+
proper_nouns_exceptions = ["I", "I'm", "I've", "I'd", "I'll",
|
754 |
+
"Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday",
|
755 |
+
"January", "February", "March", "April", "May", "June", "July",
|
756 |
+
"August", "September", "October", "November", "December"]
|
757 |
+
|
758 |
+
if word_after_comma in proper_nouns_exceptions:
|
759 |
+
return match.group(0)
|
760 |
+
# If the word looks like a proper noun (e.g., multi-word capitalized, or a known location/brand)
|
761 |
+
# This heuristic can be tricky. For simplicity, if it's already capitalized and not a common word, keep it.
|
762 |
+
if len(word_after_comma) > 2 and word_after_comma[0].isupper() and word_after_comma.lower() not in ["this", "that", "these", "those", "they", "their", "then", "thus"]:
|
763 |
+
return match.group(0) # Keep it if it looks like a proper noun already
|
764 |
+
|
765 |
+
return leading_comma_space + word_after_comma[0].lower() + word_after_comma[1:]
|
766 |
+
text = re.sub(r'(,\s+)([A-Z][a-zA-Z\'\-]+)', fix_capitalization_after_comma, text) # Added hyphen and apostrophe to word
|
767 |
+
|
768 |
+
# 6. Correct spacing around punctuation
|
769 |
+
text = re.sub(r'\s*([.,;:!?])\s*', r'\1 ', text) # Ensures one space AFTER punctuation, none before
|
770 |
+
text = text.replace(' .', '.').replace(' ,', ',') # Clean up potential space before period/comma from previous rule
|
771 |
+
|
772 |
+
# 7. Consolidate multiple sentence-ending punctuations (e.g., "!!", "?.", ".?")
|
773 |
+
text = re.sub(r'[.!?]{2,}', '.', text) # Convert multiple to a single period
|
774 |
+
text = re.sub(r',+', ',', text) # Multiple commas to one
|
775 |
+
|
776 |
+
# 8. Ensure text ends with a single sentence-ending punctuation mark
|
777 |
+
text = text.strip() # Remove trailing whitespace before checking last char
|
778 |
+
if text and not text[-1] in '.!?':
|
779 |
+
text += '.'
|
780 |
|
781 |
+
# 9. Remove any leading punctuation or extra spaces that might have been introduced
|
782 |
+
text = re.sub(r'^[.,;:!?\s]+', '', text)
|
|
|
|
|
|
|
783 |
|
784 |
+
# 10. Final check for first letter capitalization
|
785 |
+
if text:
|
786 |
+
text = text[0].upper() + text[1:]
|
787 |
|
788 |
+
# 11. Remove space before final punctuation mark if accidentally added by rule 7
|
789 |
+
text = re.sub(r'\s+([.!?])$', r'\1', text)
|
|
|
790 |
|
791 |
+
return text.strip() # Final strip
|
|
|
|
|
|
|
|
|
792 |
|
793 |
def _is_intersection(self, detected_objects: List[Dict]) -> bool:
|
794 |
"""
|
|
|
870 |
|
871 |
return base_desc
|
872 |
|
873 |
+
def _get_prominent_objects(self, detected_objects: List[Dict], min_prominence_score: float = 0.1, max_categories_to_return: int = 5, max_total_objects: int = 7) -> List[Dict]:
|
874 |
+
"""
|
875 |
+
Helper function to get the most prominent objects.
|
876 |
+
Prioritizes high-confidence, large objects, and ensures a diversity of object types.
|
877 |
+
|
878 |
+
Args:
|
879 |
+
detected_objects: List of detected objects.
|
880 |
+
min_prominence_score: Minimum score for an object to be considered initially.
|
881 |
+
max_categories_to_return: Max number of different object categories to prioritize.
|
882 |
+
max_total_objects: Overall cap on the number of prominent objects returned.
|
883 |
+
|
884 |
+
Returns:
|
885 |
+
List of prominent detected objects.
|
886 |
+
"""
|
887 |
+
if not detected_objects:
|
888 |
+
return []
|
889 |
+
|
890 |
+
scored_objects = []
|
891 |
+
for obj in detected_objects:
|
892 |
+
area = obj.get("normalized_area", 0.0) + 1e-6
|
893 |
+
confidence = obj.get("confidence", 0.0)
|
894 |
+
|
895 |
+
# Base score: area and confidence are key
|
896 |
+
score = (area * 0.65) + (confidence * 0.35) # Slightly more weight to area
|
897 |
+
|
898 |
+
# Bonus for generally important object classes (in a generic way)
|
899 |
+
# This is a simple heuristic. More advanced would be context-dependent.
|
900 |
+
# For example, 'person' is often more salient.
|
901 |
+
# Avoid hardcoding specific class_ids here if possible, or use broad categories if available.
|
902 |
+
# For simplicity, we'll keep the landmark bonus for now.
|
903 |
+
if obj.get("class_name") == "person": # Example: person is generally prominent
|
904 |
+
score += 0.1
|
905 |
+
if obj.get("is_landmark"): # Landmarks are always prominent
|
906 |
+
score += 0.5
|
907 |
+
|
908 |
+
if score >= min_prominence_score:
|
909 |
+
scored_objects.append((obj, score))
|
910 |
+
|
911 |
+
if not scored_objects:
|
912 |
+
return []
|
913 |
+
|
914 |
+
# Sort by score in descending order
|
915 |
+
scored_objects.sort(key=lambda x: x[1], reverse=True)
|
916 |
+
|
917 |
+
# Prioritize diversity of object categories first
|
918 |
+
prominent_by_category = {}
|
919 |
+
final_prominent_objects = []
|
920 |
+
|
921 |
+
for obj, score in scored_objects:
|
922 |
+
category = obj.get("class_name", "unknown")
|
923 |
+
if category not in prominent_by_category:
|
924 |
+
if len(prominent_by_category) < max_categories_to_return:
|
925 |
+
prominent_by_category[category] = obj
|
926 |
+
final_prominent_objects.append(obj)
|
927 |
+
|
928 |
+
elif len(final_prominent_objects) < max_total_objects and obj not in final_prominent_objects:
|
929 |
+
if score > 0.3:
|
930 |
+
final_prominent_objects.append(obj)
|
931 |
+
|
932 |
+
# If still under max_total_objects, fill with highest scored remaining objects regardless of category
|
933 |
+
if len(final_prominent_objects) < max_total_objects:
|
934 |
+
for obj, score in scored_objects:
|
935 |
+
if len(final_prominent_objects) >= max_total_objects:
|
936 |
+
break
|
937 |
+
if obj not in final_prominent_objects:
|
938 |
+
final_prominent_objects.append(obj)
|
939 |
+
|
940 |
+
# Re-sort the final list by original prominence score to maintain order
|
941 |
+
final_prominent_objects_with_scores = []
|
942 |
+
for obj in final_prominent_objects:
|
943 |
+
for original_obj, original_score in scored_objects:
|
944 |
+
if obj is original_obj: # Check for object identity
|
945 |
+
final_prominent_objects_with_scores.append((obj, original_score))
|
946 |
+
break
|
947 |
+
|
948 |
+
final_prominent_objects_with_scores.sort(key=lambda x: x[1], reverse=True)
|
949 |
+
|
950 |
+
return [obj for obj, score in final_prominent_objects_with_scores[:max_total_objects]]
|
951 |
+
|
952 |
+
|
953 |
+
def _format_object_list_for_description(self,
|
954 |
+
objects: List[Dict],
|
955 |
+
use_indefinite_article_for_one: bool = False,
|
956 |
+
count_threshold_for_generalization: int = -1, # Default to -1 for precise counts
|
957 |
+
max_types_to_list: int = 5
|
958 |
+
) -> str:
|
959 |
+
"""
|
960 |
+
Formats a list of detected objects into a human-readable string with counts.
|
961 |
+
Args:
|
962 |
+
objects: List of object dictionaries, each expected to have 'class_name'.
|
963 |
+
use_indefinite_article_for_one: If True, uses "a/an" for single items. If False, uses "one".
|
964 |
+
count_threshold_for_generalization: If count exceeds this, use general terms. -1 means precise counts.
|
965 |
+
max_types_to_list: Maximum number of different object types to include in the list.
|
966 |
+
"""
|
967 |
+
if not objects:
|
968 |
+
return "no specific objects clearly identified"
|
969 |
+
|
970 |
+
counts: Dict[str, int] = {}
|
971 |
+
for obj in objects:
|
972 |
+
name = obj.get("class_name", "unknown object")
|
973 |
+
if name == "unknown object" or not name: # Skip unknown or empty names
|
974 |
+
continue
|
975 |
+
counts[name] = counts.get(name, 0) + 1
|
976 |
+
|
977 |
+
if not counts:
|
978 |
+
return "no specific objects clearly identified"
|
979 |
+
|
980 |
+
descriptions = []
|
981 |
+
# Sort by count (desc) then name (asc) for consistent output order
|
982 |
+
# Limit the number of distinct object types being listed
|
983 |
+
sorted_counts = sorted(counts.items(), key=lambda item: (-item[1], item[0]))[:max_types_to_list]
|
984 |
+
|
985 |
+
|
986 |
+
for name, count in sorted_counts:
|
987 |
+
if count == 1:
|
988 |
+
if use_indefinite_article_for_one:
|
989 |
+
if name[0].lower() in 'aeiou':
|
990 |
+
descriptions.append(f"an {name}")
|
991 |
+
else:
|
992 |
+
descriptions.append(f"a {name}")
|
993 |
+
else:
|
994 |
+
descriptions.append(f"one {name}") # Output "one car" instead of "a car"
|
995 |
+
else: # count > 1
|
996 |
+
plural_name = name
|
997 |
+
if name.endswith("y") and not name.lower().endswith(("ay", "ey", "iy", "oy", "uy")):
|
998 |
+
plural_name = name[:-1] + "ies"
|
999 |
+
elif name.endswith(("s", "sh", "ch", "x", "z")):
|
1000 |
+
plural_name = name + "es"
|
1001 |
+
elif not name.endswith("s"): # Avoid double 's' like "buss"
|
1002 |
+
plural_name = name + "s"
|
1003 |
+
|
1004 |
+
if count_threshold_for_generalization != -1 and count > count_threshold_for_generalization:
|
1005 |
+
if count <= count_threshold_for_generalization + 3:
|
1006 |
+
descriptions.append(f"several {plural_name}")
|
1007 |
+
else:
|
1008 |
+
descriptions.append(f"many {plural_name}")
|
1009 |
+
else: # Use exact count (e.g., "6 cars")
|
1010 |
+
descriptions.append(f"{count} {plural_name}")
|
1011 |
+
|
1012 |
+
if not descriptions:
|
1013 |
+
return "no specific objects clearly identified"
|
1014 |
+
|
1015 |
+
if len(descriptions) == 1:
|
1016 |
+
return descriptions[0]
|
1017 |
+
elif len(descriptions) == 2:
|
1018 |
+
return f"{descriptions[0]} and {descriptions[1]}"
|
1019 |
+
else:
|
1020 |
+
# Oxford comma for lists of 3 or more.
|
1021 |
+
return ", ".join(descriptions[:-1]) + f", and {descriptions[-1]}"
|
1022 |
+
|
1023 |
+
def _get_spatial_description(self, obj: Dict, image_width: Optional[int] = None, image_height: Optional[int] = None) -> str:
|
1024 |
+
"""
|
1025 |
+
Generates a brief spatial description for an object.
|
1026 |
+
(This is a new helper function)
|
1027 |
+
"""
|
1028 |
+
region = obj.get("region")
|
1029 |
+
if region:
|
1030 |
+
# Convert region name to more descriptive terms
|
1031 |
+
region_map = {
|
1032 |
+
"top_left": "in the top-left", "top_center": "at the top-center", "top_right": "in the top-right",
|
1033 |
+
"middle_left": "on the middle-left side", "middle_center": "in the center", "middle_right": "on the middle-right side",
|
1034 |
+
"bottom_left": "in the bottom-left", "bottom_center": "at the bottom-center", "bottom_right": "in the bottom-right"
|
1035 |
+
}
|
1036 |
+
# More general terms if exact region is not critical
|
1037 |
+
if "top" in region: general_v_pos = "towards the top"
|
1038 |
+
elif "bottom" in region: general_v_pos = "towards the bottom"
|
1039 |
+
else: general_v_pos = "in the middle vertically"
|
1040 |
+
|
1041 |
+
if "left" in region: general_h_pos = "towards the left"
|
1042 |
+
elif "right" in region: general_h_pos = "towards the right"
|
1043 |
+
else: general_h_pos = "in the center horizontally"
|
1044 |
+
|
1045 |
+
# Prioritize specific region if available, else use general
|
1046 |
+
specific_desc = region_map.get(region, "")
|
1047 |
+
if specific_desc:
|
1048 |
+
return f"{specific_desc} of the frame"
|
1049 |
+
else:
|
1050 |
+
return f"{general_v_pos} and {general_h_pos} of the frame"
|
1051 |
+
|
1052 |
+
# Fallback if region info is not detailed enough or missing
|
1053 |
+
# We can use normalized_center if available
|
1054 |
+
norm_center = obj.get("normalized_center")
|
1055 |
+
if norm_center and image_width and image_height: # Check if image_width/height are provided
|
1056 |
+
x_norm, y_norm = norm_center
|
1057 |
+
h_pos = "left" if x_norm < 0.4 else "right" if x_norm > 0.6 else "center"
|
1058 |
+
v_pos = "top" if y_norm < 0.4 else "bottom" if y_norm > 0.6 else "middle"
|
1059 |
+
|
1060 |
+
if h_pos == "center" and v_pos == "middle":
|
1061 |
+
return "near the center of the image"
|
1062 |
+
return f"in the {v_pos}-{h_pos} area of the image"
|
1063 |
+
|
1064 |
+
return "in the scene" # Generic fallback
|
1065 |
+
|
1066 |
+
|
1067 |
+
def _generate_dynamic_everyday_description(self,
|
1068 |
+
detected_objects: List[Dict],
|
1069 |
+
lighting_info: Optional[Dict] = None,
|
1070 |
+
viewpoint: str = "eye_level",
|
1071 |
+
spatial_analysis: Optional[Dict] = None,
|
1072 |
+
image_dimensions: Optional[Tuple[int, int]] = None,
|
1073 |
+
places365_info: Optional[Dict] = None,
|
1074 |
+
object_statistics: Optional[Dict] = None
|
1075 |
+
) -> str:
|
1076 |
+
"""
|
1077 |
+
Dynamically generates a description for everyday scenes based on ALL relevant detected_objects,
|
1078 |
+
their counts, and context.
|
1079 |
+
It aims to describe the overall scene first, then details of object groups including accurate counts.
|
1080 |
+
"""
|
1081 |
+
description_segments = []
|
1082 |
+
image_width, image_height = image_dimensions if image_dimensions else (None, None)
|
1083 |
+
|
1084 |
+
if hasattr(self, 'logger'):
|
1085 |
+
self.logger.info(f"DynamicDesc: Start. Total Raw Objects: {len(detected_objects)}, View: {viewpoint}, Light: {lighting_info is not None}")
|
1086 |
+
|
1087 |
+
# 1. Overall Ambiance (Lighting and Viewpoint)
|
1088 |
+
ambiance_parts = []
|
1089 |
+
if lighting_info:
|
1090 |
+
time_of_day = lighting_info.get("time_of_day", "unknown lighting")
|
1091 |
+
is_indoor = lighting_info.get("is_indoor")
|
1092 |
+
ambiance_statement = "This is"
|
1093 |
+
if is_indoor is True: ambiance_statement += " an indoor scene"
|
1094 |
+
elif is_indoor is False: ambiance_statement += " an outdoor scene"
|
1095 |
+
else: ambiance_statement += " a scene"
|
1096 |
+
lighting_map = self.templates.get("lighting_templates", {})
|
1097 |
+
readable_lighting_base = lighting_map.get(time_of_day, f"with {time_of_day.replace('_', ' ')} lighting conditions")
|
1098 |
+
readable_lighting = readable_lighting_base.lower().replace("the scene is captured", "").replace("the scene has", "").strip()
|
1099 |
+
ambiance_statement += f", likely {readable_lighting}."
|
1100 |
+
ambiance_parts.append(ambiance_statement)
|
1101 |
+
|
1102 |
+
if viewpoint and viewpoint != "eye_level":
|
1103 |
+
vp_templates = self.templates.get("viewpoint_templates", {})
|
1104 |
+
if viewpoint in vp_templates:
|
1105 |
+
vp_prefix = vp_templates[viewpoint].get("prefix", "").strip()
|
1106 |
+
if vp_prefix:
|
1107 |
+
if not ambiance_parts:
|
1108 |
+
ambiance_parts.append(f"{vp_prefix.capitalize()} the general layout of the scene is observed.")
|
1109 |
+
else:
|
1110 |
+
ambiance_parts[-1] = ambiance_parts[-1].rstrip('.') + f", viewed {vp_templates[viewpoint].get('short_desc', viewpoint)}."
|
1111 |
+
|
1112 |
+
if ambiance_parts:
|
1113 |
+
description_segments.append(" ".join(ambiance_parts))
|
1114 |
+
|
1115 |
+
# 2. Describe ALL detected objects, grouped by class, with accurate counts and locations
|
1116 |
+
if not detected_objects:
|
1117 |
+
# This part remains, but the conditions to reach here might change based on confident_objects check
|
1118 |
+
if not description_segments:
|
1119 |
+
description_segments.append("A general scene is visible, but no specific objects were clearly identified.")
|
1120 |
+
else:
|
1121 |
+
description_segments.append("Within this setting, no specific objects were clearly identified.")
|
1122 |
+
else:
|
1123 |
+
objects_by_class: Dict[str, List[Dict]] = {}
|
1124 |
+
|
1125 |
+
# keeping 0.25 as a placeholder
|
1126 |
+
confidence_filter_threshold = getattr(self, 'confidence_threshold_for_description', 0.25)
|
1127 |
+
confident_objects = [obj for obj in detected_objects if obj.get("confidence", 0) >= confidence_filter_threshold]
|
1128 |
+
|
1129 |
+
if not confident_objects:
|
1130 |
+
# This message is more appropriate if objects existed but none met confidence
|
1131 |
+
no_confident_obj_msg = "While some elements might be present, no objects were identified with sufficient confidence for a detailed description."
|
1132 |
+
if not description_segments: description_segments.append(no_confident_obj_msg)
|
1133 |
+
else: description_segments.append(no_confident_obj_msg.lower().capitalize()) # Append as a new sentence
|
1134 |
+
else:
|
1135 |
+
if object_statistics:
|
1136 |
+
# Use precomputed statistics with a dynamic confidence-threshold strategy
|
1137 |
+
for class_name, stats in object_statistics.items():
|
1138 |
+
count = stats.get("count", 0)
|
1139 |
+
avg_confidence = stats.get("avg_confidence", 0)
|
1140 |
+
|
1141 |
+
# Dynamically adjust the confidence threshold: decorative items use a lower threshold
|
1142 |
+
dynamic_threshold = confidence_filter_threshold
|
1143 |
+
if class_name in ["potted plant", "vase", "clock", "book"]:
|
1144 |
+
dynamic_threshold = max(0.15, confidence_filter_threshold * 0.6)
|
1145 |
+
elif count >= 3: # lower the threshold for items detected in large numbers
|
1146 |
+
dynamic_threshold = max(0.2, confidence_filter_threshold * 0.8)
|
1147 |
+
|
1148 |
+
if count > 0 and avg_confidence >= dynamic_threshold:
|
1149 |
+
matching_objects = [obj for obj in confident_objects if obj.get("class_name") == class_name]
|
1150 |
+
if not matching_objects:
|
1151 |
+
# If none remain among the high-confidence objects, search the original list
|
1152 |
+
matching_objects = [obj for obj in detected_objects
|
1153 |
+
if obj.get("class_name") == class_name and obj.get("confidence", 0) >= dynamic_threshold]
|
1154 |
+
|
1155 |
+
if matching_objects:
|
1156 |
+
actual_count = min(stats["count"], len(matching_objects))
|
1157 |
+
objects_by_class[class_name] = matching_objects[:actual_count]
|
1158 |
+
else:
|
1159 |
+
# The fallback logic also uses the dynamic threshold
|
1160 |
+
for obj in confident_objects:
|
1161 |
+
name = obj.get("class_name", "unknown object")
|
1162 |
+
if name == "unknown object" or not name: continue
|
1163 |
+
if name not in objects_by_class:
|
1164 |
+
objects_by_class[name] = []
|
1165 |
+
objects_by_class[name].append(obj)
|
1166 |
+
|
1167 |
+
if not objects_by_class: # Should be rare if confident_objects was not empty and had valid names
|
1168 |
+
description_segments.append("No common objects were confidently identified for detailed description.")
|
1169 |
+
else:
|
1170 |
+
def sort_key_object_groups(item_tuple: Tuple[str, List[Dict]]):
|
1171 |
+
class_name_key, obj_group_list = item_tuple
|
1172 |
+
priority = 3 # default priority
|
1173 |
+
count = len(obj_group_list)
|
1174 |
+
|
1175 |
+
# Dynamic priority: based on scene relevance and object count
|
1176 |
+
if class_name_key == "person":
|
1177 |
+
priority = 0
|
1178 |
+
elif class_name_key in ["dining table", "chair", "sofa", "bed"]:
|
1179 |
+
priority = 1 # primary indoor furniture
|
1180 |
+
elif class_name_key in ["car", "bus", "truck", "traffic light"]:
|
1181 |
+
priority = 2 # traffic-related objects
|
1182 |
+
elif count >= 3: # raise the priority of classes detected in large numbers
|
1183 |
+
priority = max(1, priority - 1)
|
1184 |
+
elif class_name_key in ["potted plant", "vase", "clock", "book"] and count >= 2:
|
1185 |
+
priority = 2 # raise the priority of decorative items once there are several of them
|
1186 |
+
|
1187 |
+
avg_area = sum(o.get("normalized_area", 0.0) for o in obj_group_list) / len(obj_group_list) if obj_group_list else 0
|
1188 |
+
|
1189 |
+
# Add a quantity weight: multiple objects of the same class matter more
|
1190 |
+
quantity_bonus = min(count / 5.0, 1.0) # bonus capped at 1.0
|
1191 |
+
|
1192 |
+
return (priority, -len(obj_group_list), -avg_area, -quantity_bonus)
|
1193 |
+
|
1194 |
+
# Remove duplicate detections
|
1195 |
+
deduplicated_objects_by_class = {}
|
1196 |
+
processed_positions = []
|
1197 |
+
|
1198 |
+
for class_name, group_of_objects in objects_by_class.items():
|
1199 |
+
unique_objects = []
|
1200 |
+
|
1201 |
+
for obj in group_of_objects:
|
1202 |
+
obj_position = obj.get("normalized_center", [0.5, 0.5])
|
1203 |
+
is_duplicate = False
|
1204 |
+
|
1205 |
+
# Check whether this object overlaps the position of an already-processed object
|
1206 |
+
for processed_pos in processed_positions:
|
1207 |
+
position_distance = abs(obj_position[0] - processed_pos[0]) + abs(obj_position[1] - processed_pos[1])
|
1208 |
+
if position_distance < 0.15: # position-overlap threshold
|
1209 |
+
is_duplicate = True
|
1210 |
+
break
|
1211 |
+
|
1212 |
+
if not is_duplicate:
|
1213 |
+
unique_objects.append(obj)
|
1214 |
+
processed_positions.append(obj_position)
|
1215 |
+
|
1216 |
+
if unique_objects:
|
1217 |
+
deduplicated_objects_by_class[class_name] = unique_objects
|
1218 |
+
|
1219 |
+
objects_by_class = deduplicated_objects_by_class
|
1220 |
+
|
1221 |
+
sorted_object_groups = sorted(objects_by_class.items(), key=sort_key_object_groups)
|
1222 |
+
|
1223 |
+
object_clauses = [] # Stores individual object group descriptions
|
1224 |
+
|
1225 |
+
for class_name, group_of_objects in sorted_object_groups:
|
1226 |
+
count = len(group_of_objects)
|
1227 |
+
if count == 0: continue
|
1228 |
+
|
1229 |
+
# Use the statistics to ensure accurate count descriptions
|
1230 |
+
if object_statistics and class_name in object_statistics:
|
1231 |
+
actual_count = object_statistics[class_name]["count"]
|
1232 |
+
# Generate the description from the actual counted quantity
|
1233 |
+
if actual_count == 1:
|
1234 |
+
formatted_name_with_exact_count = f"one {class_name}"
|
1235 |
+
else:
|
1236 |
+
plural_form = f"{class_name}s" if not class_name.endswith('s') else class_name
|
1237 |
+
formatted_name_with_exact_count = f"{actual_count} {plural_form}"
|
1238 |
+
else:
|
1239 |
+
# Fall back to the original formatting logic
|
1240 |
+
formatted_name_with_exact_count = self._format_object_list_for_description(
|
1241 |
+
[group_of_objects[0]] * count,
|
1242 |
+
use_indefinite_article_for_one=False,
|
1243 |
+
count_threshold_for_generalization=-1
|
1244 |
+
)
|
1245 |
+
|
1246 |
+
if formatted_name_with_exact_count == "no specific objects clearly identified" or not formatted_name_with_exact_count:
|
1247 |
+
continue
|
1248 |
+
|
1249 |
+
# Determine collective location for the group
|
1250 |
+
location_description_suffix = "" # e.g., "is in the center" or "are in the west area"
|
1251 |
+
if count == 1:
|
1252 |
+
location_description_suffix = f"is {self._get_spatial_description(group_of_objects[0], image_width, image_height)}"
|
1253 |
+
else:
|
1254 |
+
distinct_regions = sorted(list(set(obj.get("region", "unknown_region") for obj in group_of_objects)))
|
1255 |
+
known_regions = [r for r in distinct_regions if r != "unknown_region"]
|
1256 |
+
if not known_regions and "unknown_region" in distinct_regions:
|
1257 |
+
location_description_suffix = "are visible in the scene"
|
1258 |
+
elif len(known_regions) == 1:
|
1259 |
+
location_description_suffix = f"are primarily in the {known_regions[0].replace('_', ' ')} area"
|
1260 |
+
elif len(known_regions) == 2:
|
1261 |
+
location_description_suffix = f"are mainly across the {known_regions[0].replace('_',' ')} and {known_regions[1].replace('_',' ')} areas"
|
1262 |
+
elif len(known_regions) > 2:
|
1263 |
+
location_description_suffix = "are distributed in various parts of the scene"
|
1264 |
+
else:
|
1265 |
+
location_description_suffix = "are visible in the scene"
|
1266 |
+
|
1267 |
+
# Capitalize the object description (e.g., "Six cars")
|
1268 |
+
formatted_name_capitalized = formatted_name_with_exact_count[0].upper() + formatted_name_with_exact_count[1:]
|
1269 |
+
object_clauses.append(f"{formatted_name_capitalized} {location_description_suffix}")
|
1270 |
+
|
1271 |
+
if object_clauses:
|
1272 |
+
# Join object clauses into one or more sentences.
|
1273 |
+
if not description_segments: # If no ambiance, start with the first object clause.
|
1274 |
+
if object_clauses:
|
1275 |
+
first_clause = object_clauses.pop(0) # Take the first one out
|
1276 |
+
description_segments.append(first_clause + ".")
|
1277 |
+
else: # Ambiance exists, prepend with "The scene features..." or similar
|
1278 |
+
if object_clauses:
|
1279 |
+
description_segments.append("The scene features:") # Or "Key elements include:"
|
1280 |
+
|
1281 |
+
# Add remaining object clauses as separate points or a continuous sentence
|
1282 |
+
# For now, let's join them into a single continuous sentence string to be added.
|
1283 |
+
if object_clauses: # If there are more clauses after the first (or after "The scene features:")
|
1284 |
+
joined_object_clauses = ". ".join(object_clauses)
|
1285 |
+
if joined_object_clauses and not joined_object_clauses.endswith("."):
|
1286 |
+
joined_object_clauses += "."
|
1287 |
+
description_segments.append(joined_object_clauses)
|
1288 |
+
|
1289 |
+
elif not description_segments : # No ambiance and no describable objects after filtering
|
1290 |
+
return "The image depicts a scene, but specific objects could not be described with confidence or detail."
|
1291 |
+
|
1292 |
+
# --- Final assembly and formatting ---
|
1293 |
+
# Join all collected segments. _smart_append might be better if parts are not full sentences.
|
1294 |
+
# Since we aim for full sentences in segments, simple join then format.
|
1295 |
+
raw_description = ""
|
1296 |
+
for i, segment in enumerate(filter(None, description_segments)):
|
1297 |
+
segment = segment.strip()
|
1298 |
+
if not segment: continue
|
1299 |
+
|
1300 |
+
if not raw_description: # First non-empty segment
|
1301 |
+
raw_description = segment
|
1302 |
+
else:
|
1303 |
+
if not raw_description.endswith(('.', '!', '?')):
|
1304 |
+
raw_description += "."
|
1305 |
+
raw_description += " " + (segment[0].upper() + segment[1:] if len(segment) > 1 else segment.upper())
|
1306 |
+
|
1307 |
+
if raw_description and not raw_description.endswith(('.', '!', '?')):
|
1308 |
+
raw_description += "."
|
1309 |
+
|
1310 |
+
final_description = self._format_final_description(raw_description) # Crucial for final polish
|
1311 |
+
|
1312 |
+
if not final_description or len(final_description.strip()) < 20:
|
1313 |
+
# Fallback if description is too short or empty after processing
|
1314 |
+
# Use a more informative fallback if confident_objects existed
|
1315 |
+
if 'confident_objects' in locals() and confident_objects:
|
1316 |
+
return "The scene contains several detected objects, but a detailed textual description could not be fully constructed."
|
1317 |
+
else:
|
1318 |
+
return "A general scene is depicted with no objects identified with high confidence."
|
1319 |
+
|
1320 |
+
return final_description
|
1321 |
+
|
1322 |
+
|
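The position-based deduplication inside the method above amounts to the following sketch (Manhattan distance on normalized centers with the 0.15 threshold used above; note that in the original code the seen positions are shared across classes):

from typing import Dict, List, Tuple

def dedup_by_position(objects: List[Dict], threshold: float = 0.15) -> List[Dict]:
    kept: List[Dict] = []
    seen: List[Tuple[float, float]] = []
    for obj in objects:
        x, y = obj.get("normalized_center", (0.5, 0.5))
        # Skip detections whose center is too close to one we already kept.
        if any(abs(x - px) + abs(y - py) < threshold for px, py in seen):
            continue
        kept.append(obj)
        seen.append((x, y))
    return kept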
1323 |
def _generate_scene_details(self,
|
1324 |
+
scene_type: str,
|
1325 |
+
detected_objects: List[Dict],
|
1326 |
+
lighting_info: Optional[Dict] = None,
|
1327 |
+
viewpoint: str = "eye_level",
|
1328 |
+
spatial_analysis: Optional[Dict] = None,
|
1329 |
+
image_dimensions: Optional[Tuple[int, int]] = None,
|
1330 |
+
places365_info: Optional[Dict] = None,
|
1331 |
+
object_statistics: Optional[Dict] = None
|
1332 |
+
) -> str:
|
1333 |
"""
|
1334 |
Generate detailed description based on scene type and detected objects.
|
1335 |
+
Enhanced to handle everyday scenes dynamically with accurate object counting.
|
1336 |
|
1337 |
Args:
|
1338 |
+
scene_type: Identified scene type.
|
1339 |
+
detected_objects: List of detected objects.
|
1340 |
+
lighting_info: Optional lighting condition information.
|
1341 |
+
viewpoint: Detected viewpoint (aerial, eye_level, etc.).
|
1342 |
+
spatial_analysis: Optional results from SpatialAnalyzer.
|
1343 |
+
image_dimensions: Optional tuple of (image_width, image_height).
|
1344 |
+
places365_info: Optional Places365 scene classification results.
|
1345 |
+
object_statistics: Optional detailed object statistics with counts and confidence.
|
1346 |
|
1347 |
Returns:
|
1348 |
+
str: Detailed scene description.
|
1349 |
"""
|
|
|
1350 |
scene_details = ""
|
1351 |
scene_templates = self.templates.get("scene_detail_templates", {})
|
1352 |
|
1353 |
+
# List of scene types considered "everyday" or generic
|
1354 |
+
everyday_scene_types = [
|
1355 |
+
"general_indoor_space", "generic_street_view",
|
1356 |
+
"desk_area_workspace", "outdoor_gathering_spot",
|
1357 |
+
"kitchen_counter_or_utility_area", "unknown"
|
1358 |
+
]
|
1359 |
|
1360 |
+
# Extract Places365 attributes for enhanced description
|
1361 |
+
places365_attributes = []
|
1362 |
+
scene_specific_details = ""
|
1363 |
+
|
1364 |
+
if places365_info and places365_info.get('confidence', 0) > 0.4:
|
1365 |
+
attributes = places365_info.get('attributes', [])
|
1366 |
+
scene_label = places365_info.get('scene_label', '')
|
1367 |
+
|
1368 |
+
# Filter relevant attributes for description enhancement
|
1369 |
+
relevant_attributes = [attr for attr in attributes if attr in [
|
1370 |
+
'natural_lighting', 'artificial_lighting', 'commercial', 'residential',
|
1371 |
+
'workplace', 'recreational', 'educational', 'open_space', 'enclosed_space'
|
1372 |
+
]]
|
1373 |
+
places365_attributes = relevant_attributes[:2]
|
1374 |
+
|
1375 |
+
# Generate scene-specific contextual details using object statistics
|
1376 |
+
if object_statistics:
|
1377 |
+
if 'commercial' in attributes and object_statistics.get('person', {}).get('count', 0) > 0:
|
1378 |
+
person_count = object_statistics['person']['count']
|
1379 |
+
if person_count == 1:
|
1380 |
+
scene_specific_details = "This appears to be an active commercial environment with a customer present."
|
1381 |
+
else:
|
1382 |
+
scene_specific_details = f"This appears to be an active commercial environment with {person_count} people present."
|
1383 |
+
elif 'residential' in attributes and scene_type in ['living_room', 'bedroom', 'kitchen']:
|
1384 |
+
scene_specific_details = "The setting suggests a comfortable residential living space."
|
1385 |
+
elif 'workplace' in attributes and any(object_statistics.get(obj, {}).get('count', 0) > 0
|
1386 |
+
for obj in ['laptop', 'keyboard', 'monitor']):
|
1387 |
+
scene_specific_details = "The environment indicates an active workspace or office setting."
|
1388 |
else:
|
1389 |
+
# Fallback to original logic if object_statistics not available
|
1390 |
+
if 'commercial' in attributes and any(obj['class_name'] in ['person', 'chair', 'table'] for obj in detected_objects):
|
1391 |
+
scene_specific_details = "This appears to be an active commercial environment with customer activity."
|
1392 |
+
elif 'residential' in attributes and scene_type in ['living_room', 'bedroom', 'kitchen']:
|
1393 |
+
scene_specific_details = "The setting suggests a comfortable residential living space."
|
1394 |
+
elif 'workplace' in attributes and any(obj['class_name'] in ['laptop', 'keyboard', 'monitor'] for obj in detected_objects):
|
1395 |
+
scene_specific_details = "The environment indicates an active workspace or office setting."
|
1396 |
+
|
1397 |
+
# Determine scene description approach
|
1398 |
+
is_confident_specific_scene = scene_type not in everyday_scene_types and scene_type in scene_templates
|
1399 |
+
treat_as_everyday = scene_type in everyday_scene_types
|
1400 |
+
|
1401 |
+
if hasattr(self, 'enable_landmark') and not self.enable_landmark:
|
1402 |
+
if scene_type not in ["kitchen", "bedroom", "living_room", "office_workspace", "dining_area", "professional_kitchen"]:
|
1403 |
+
treat_as_everyday = True
|
1404 |
+
|
1405 |
+
if treat_as_everyday or not is_confident_specific_scene:
|
1406 |
+
# Generate dynamic description for everyday scenes with object statistics
|
1407 |
+
self.logger.info(f"Generating dynamic description for scene_type: {scene_type}")
|
1408 |
+
scene_details = self._generate_dynamic_everyday_description(
|
1409 |
+
detected_objects,
|
1410 |
+
lighting_info,
|
1411 |
+
viewpoint,
|
1412 |
+
spatial_analysis,
|
1413 |
+
image_dimensions,
|
1414 |
+
places365_info,
|
1415 |
+
object_statistics # Pass object statistics to dynamic description
|
1416 |
+
)
|
1417 |
+
elif scene_type in scene_templates:
|
1418 |
+
# Use template-based description with enhanced object information
|
1419 |
+
self.logger.info(f"Using template for scene_type: {scene_type}")
|
1420 |
+
viewpoint_key = f"{scene_type}_{viewpoint}"
|
1421 |
+
templates_list = scene_templates.get(viewpoint_key, scene_templates.get(scene_type, []))
|
1422 |
|
|
|
1423 |
if templates_list:
|
1424 |
detail_template = random.choice(templates_list)
|
|
|
|
|
1425 |
scene_details = self._fill_detail_template(
|
1426 |
detail_template,
|
1427 |
detected_objects,
|
1428 |
+
scene_type,
|
1429 |
+
places365_info,
|
1430 |
+
object_statistics # Pass object statistics to template filling
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1431 |
)
|
1432 |
else:
|
1433 |
+
scene_details = self._generate_dynamic_everyday_description(
|
1434 |
+
detected_objects, lighting_info, viewpoint, spatial_analysis,
|
1435 |
+
image_dimensions, places365_info, object_statistics
|
1436 |
+
)
|
1437 |
+
else:
|
1438 |
+
# Fallback to dynamic description with object statistics
|
1439 |
+
self.logger.info(f"No specific template for {scene_type}, generating dynamic description.")
|
1440 |
+
scene_details = self._generate_dynamic_everyday_description(
|
1441 |
+
detected_objects, lighting_info, viewpoint, spatial_analysis,
|
1442 |
+
image_dimensions, places365_info, object_statistics
|
1443 |
+
)
|
1444 |
+
|
1445 |
+
# Filter out landmark references if landmark detection is disabled
|
1446 |
+
if hasattr(self, 'enable_landmark') and not self.enable_landmark:
|
1447 |
+
scene_details = self.filter_landmark_references(scene_details, enable_landmark=False)
|
1448 |
|
1449 |
+
return scene_details if scene_details else "A scene with some visual elements."
|
1450 |
|
1451 |
+
def _fill_detail_template(self, template: str, detected_objects: List[Dict], scene_type: str, places365_info: Optional[Dict] = None, object_statistics: Optional[Dict] = None) -> str:
|
1452 |
"""
|
1453 |
Fill a template with specific details based on detected objects.
|
1454 |
|
|
|
1469 |
# Get object template fillers
|
1470 |
fillers = self.templates.get("object_template_fillers", {})
|
1471 |
|
1472 |
+
# Build more accurate template fillers from the per-object statistics
|
1473 |
+
statistics_based_replacements = {}
|
1474 |
+
if object_statistics:
|
1475 |
+
# Generate concrete object descriptions from the statistics
|
1476 |
+
for class_name, stats in object_statistics.items():
|
1477 |
+
count = stats.get("count", 0)
|
1478 |
+
if count > 0:
|
1479 |
+
# Generate statistics-based descriptions for common object classes
|
1480 |
+
if class_name == "potted plant":
|
1481 |
+
if count == 1:
|
1482 |
+
statistics_based_replacements["plant_elements"] = "a potted plant"
|
1483 |
+
elif count <= 3:
|
1484 |
+
statistics_based_replacements["plant_elements"] = f"{count} potted plants"
|
1485 |
+
else:
|
1486 |
+
statistics_based_replacements["plant_elements"] = f"multiple potted plants ({count} total)"
|
1487 |
+
|
1488 |
+
elif class_name == "chair":
|
1489 |
+
if count == 1:
|
1490 |
+
statistics_based_replacements["seating"] = "a chair"
|
1491 |
+
elif count <= 4:
|
1492 |
+
statistics_based_replacements["seating"] = f"{count} chairs"
|
1493 |
+
else:
|
1494 |
+
statistics_based_replacements["seating"] = f"numerous chairs ({count} total)"
|
1495 |
+
|
1496 |
+
elif class_name == "person":
|
1497 |
+
if count == 1:
|
1498 |
+
statistics_based_replacements["people_and_vehicles"] = "a person"
|
1499 |
+
statistics_based_replacements["pedestrian_flow"] = "an individual walking"
|
1500 |
+
elif count <= 5:
|
1501 |
+
statistics_based_replacements["people_and_vehicles"] = f"{count} people"
|
1502 |
+
statistics_based_replacements["pedestrian_flow"] = f"{count} people walking"
|
1503 |
+
else:
|
1504 |
+
statistics_based_replacements["people_and_vehicles"] = f"many people ({count} individuals)"
|
1505 |
+
statistics_based_replacements["pedestrian_flow"] = f"a crowd of {count} people"
|
1506 |
+
|
1507 |
# Set default values for every possible template variable
|
1508 |
default_replacements = {
|
1509 |
# Indoor-related defaults
|
|
|
1683 |
"knowledge_transfer": "learning exchanges"
|
1684 |
}
|
1685 |
|
1686 |
+
# Merge the statistics-based replacements into the defaults
|
1687 |
+
default_replacements.update(statistics_based_replacements)
|
1688 |
+
|
1689 |
+
# Add Places365-specific template variables
|
1690 |
+
places365_scene_context = ""
|
1691 |
+
places365_atmosphere = ""
|
1692 |
+
|
1693 |
+
if places365_info and places365_info.get('confidence', 0) > 0.35:
|
1694 |
+
scene_label = places365_info.get('scene_label', '').replace('_', ' ')
|
1695 |
+
attributes = places365_info.get('attributes', [])
|
1696 |
+
|
1697 |
+
if scene_label and scene_label != scene_type:
|
1698 |
+
places365_scene_context = f"characteristic of a {scene_label}"
|
1699 |
+
|
1700 |
+
if 'natural_lighting' in attributes:
|
1701 |
+
places365_atmosphere = "with natural illumination"
|
1702 |
+
elif 'artificial_lighting' in attributes:
|
1703 |
+
places365_atmosphere = "under artificial lighting"
|
1704 |
+
|
1705 |
+
# Update default_replacements with Places365 context
|
1706 |
+
if places365_scene_context:
|
1707 |
+
default_replacements["places365_context"] = places365_scene_context
|
1708 |
+
else:
|
1709 |
+
default_replacements["places365_context"] = ""
|
1710 |
+
|
1711 |
+
if places365_atmosphere:
|
1712 |
+
default_replacements["places365_atmosphere"] = places365_atmosphere
|
1713 |
+
else:
|
1714 |
+
default_replacements["places365_atmosphere"] = ""
|
1715 |
+
|
1716 |
# For each placeholder, try to fill with appropriate content
|
1717 |
for placeholder in placeholders:
|
1718 |
if placeholder in fillers:
|
|
|
1940 |
if not detected_objects:
|
1941 |
return "eye_level" # default
|
1942 |
|
1943 |
+
# Extract spatial position and size information
|
1944 |
top_region_count = 0
|
1945 |
bottom_region_count = 0
|
1946 |
total_objects = len(detected_objects)
|
|
|
1956 |
crosswalk_pattern_detected = False
|
1957 |
|
1958 |
for obj in detected_objects:
|
1959 |
+
# Count objects in the top or bottom regions
|
1960 |
region = obj["region"]
|
1961 |
if "top" in region:
|
1962 |
top_region_count += 1
|
1963 |
elif "bottom" in region:
|
1964 |
bottom_region_count += 1
|
1965 |
|
1966 |
+
# Compute the normalized size (area)
|
1967 |
if "normalized_area" in obj:
|
1968 |
sizes.append(obj["normalized_area"])
|
1969 |
|
1970 |
+
# Compute the height-to-width ratio
|
1971 |
if "normalized_size" in obj:
|
1972 |
width, height = obj["normalized_size"]
|
1973 |
if width > 0:
|
1974 |
height_width_ratios.append(height / width)
|
1975 |
|
1976 |
+
# Collect the positions of people
|
1977 |
if obj["class_id"] == 0: # 人
|
1978 |
if "normalized_center" in obj:
|
1979 |
people_positions.append(obj["normalized_center"])
|
1980 |
|
1981 |
+
# Dedicated detection logic for crosswalk intersections
|
1982 |
# Check for a clear vertical and horizontal distribution of pedestrians
|
1983 |
people_objs = [obj for obj in detected_objects if obj["class_id"] == 0] # people
|
1984 |
|
|
|
1997 |
y_range = max(y_coords) - min(y_coords)
|
1998 |
|
1999 |
# Try to detect a cross-shaped distribution
|
2000 |
+
# If both the x and y ranges are large and of similar size, this is likely an intersection
|
2001 |
if x_range > 0.5 and y_range > 0.5 and 0.7 < (x_range / y_range) < 1.3:
|
2002 |
|
2003 |
# Compute distances to the center point
|
|
|
2194 |
description = description.replace("a bed in the room", "a bed")
|
2195 |
|
2196 |
# Handle repeated object lists
|
|
|
2197 |
object_lists = re.findall(r'with ([^\.]+?)(?:\.|\band\b)', description)
|
2198 |
|
2199 |
for obj_list in object_lists:
|
|
|
2243 |
if not functional_zones:
|
2244 |
return ""
|
2245 |
|
2246 |
+
# Handle the different possible types of the functional_zones parameter
|
2247 |
+
if isinstance(functional_zones, list):
|
2248 |
+
# If it is a list, convert it to dictionary format
|
2249 |
+
zones_dict = {}
|
2250 |
+
for i, zone in enumerate(functional_zones):
|
2251 |
+
if isinstance(zone, dict) and 'name' in zone:
|
2252 |
+
zone_name = zone['name']
|
2253 |
+
else:
|
2254 |
+
zone_name = f"zone_{i}"
|
2255 |
+
zones_dict[zone_name] = zone if isinstance(zone, dict) else {"description": str(zone)}
|
2256 |
+
functional_zones = zones_dict
|
2257 |
+
elif not isinstance(functional_zones, dict):
|
2258 |
+
return ""
|
2259 |
+
|
2260 |
# Count the total number of people in the scene
|
2261 |
total_people_count = 0
|
2262 |
people_by_zone = {}
|
|
|
2296 |
|
2297 |
# Generate the summary description
|
2298 |
summary = ""
|
2299 |
+
max_mentioned_people = 0 # track the largest people count mentioned so far
|
2300 |
|
2301 |
# If the total people count is significant and not yet mentioned in the main description, add it
|
2302 |
if total_people_count > 5:
|
2303 |
summary = f"The scene contains a significant number of pedestrians ({total_people_count} people). "
|
2304 |
+
max_mentioned_people = total_people_count # update the largest people count mentioned so far
|
2305 |
|
2306 |
# Process each zone's description, keeping the people counts consistent
|
2307 |
processed_zones = []
|
|
|
2310 |
zone_desc = zone_info.get("description", "a functional zone")
|
2311 |
zone_people_count = people_by_zone.get(zone_name, 0)
|
2312 |
|
2313 |
+
# Check whether the description already contains people-count information
|
2314 |
contains_people_info = "with" in zone_desc and ("person" in zone_desc.lower() or "people" in zone_desc.lower())
|
2315 |
|
2316 |
# If the description contains a people count smaller than the largest count already mentioned, adjust the description
|
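For clarity, the list-to-dict normalization of functional_zones shown above behaves roughly like this stand-alone sketch (the input shapes are assumptions based on the code):

def normalize_zones(functional_zones):
    if isinstance(functional_zones, dict):
        return functional_zones
    if isinstance(functional_zones, list):
        zones = {}
        for i, zone in enumerate(functional_zones):
            name = zone["name"] if isinstance(zone, dict) and "name" in zone else f"zone_{i}"
            zones[name] = zone if isinstance(zone, dict) else {"description": str(zone)}
        return zones
    return {}

# normalize_zones(["seating area"]) -> {"zone_0": {"description": "seating area"}}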
evaluation_metrics.py
CHANGED
@@ -138,7 +138,7 @@ class EvaluationMetrics:
|
|
138 |
# Create empty plot if error
|
139 |
fig, ax = plt.subplots(figsize=figsize)
|
140 |
ax.text(0.5, 0.5, viz_data["error"],
|
141 |
-
ha='center', va='center', fontsize=14
|
142 |
ax.set_xlim(0, 1)
|
143 |
ax.set_ylim(0, 1)
|
144 |
ax.axis('off')
|
@@ -148,7 +148,7 @@ class EvaluationMetrics:
|
|
148 |
# Create empty plot if no data
|
149 |
fig, ax = plt.subplots(figsize=figsize)
|
150 |
ax.text(0.5, 0.5, "No detection data available",
|
151 |
-
ha='center', va='center', fontsize=14
|
152 |
ax.set_xlim(0, 1)
|
153 |
ax.set_ylim(0, 1)
|
154 |
ax.axis('off')
|
@@ -163,7 +163,6 @@ class EvaluationMetrics:
|
|
163 |
colors = [item["color"] for item in class_data]
|
164 |
|
165 |
# Create figure and horizontal bar chart with improved styling
|
166 |
-
plt.rcParams['font.family'] = 'Arial'
|
167 |
fig, ax = plt.subplots(figsize=figsize)
|
168 |
|
169 |
# Set background color to white
|
@@ -181,15 +180,15 @@ class EvaluationMetrics:
|
|
181 |
conf = class_data[i]["average_confidence"]
|
182 |
ax.text(width + 0.3, bar.get_y() + bar.get_height()/2,
|
183 |
f"{width:.0f} (conf: {conf:.2f})",
|
184 |
-
va='center', fontsize=12
|
185 |
|
186 |
# Customize axis and labels with larger fonts
|
187 |
ax.set_yticks(y_pos)
|
188 |
-
ax.set_yticklabels(class_names, fontsize=14
|
189 |
ax.invert_yaxis() # Labels read top-to-bottom
|
190 |
-
ax.set_xlabel('Count', fontsize=14
|
191 |
ax.set_title(f'Objects Detected: {viz_data["total_objects"]} Total',
|
192 |
-
fontsize=16,
|
193 |
|
194 |
# Add grid for better readability
|
195 |
ax.set_axisbelow(True)
|
@@ -204,7 +203,7 @@ class EvaluationMetrics:
|
|
204 |
f"Average Confidence: {viz_data['average_confidence']:.2f}\n"
|
205 |
f"Unique Classes: {len(viz_data['class_data'])}"
|
206 |
)
|
207 |
-
plt.figtext(0.02, 0.02, summary_text, fontsize=12,
|
208 |
bbox=dict(facecolor='white', alpha=0.9, boxstyle='round,pad=0.5',
|
209 |
edgecolor='#E5E7EB'))
|
210 |
|
|
|
138 |
# Create empty plot if error
|
139 |
fig, ax = plt.subplots(figsize=figsize)
|
140 |
ax.text(0.5, 0.5, viz_data["error"],
|
141 |
+
ha='center', va='center', fontsize=14)
|
142 |
ax.set_xlim(0, 1)
|
143 |
ax.set_ylim(0, 1)
|
144 |
ax.axis('off')
|
|
|
148 |
# Create empty plot if no data
|
149 |
fig, ax = plt.subplots(figsize=figsize)
|
150 |
ax.text(0.5, 0.5, "No detection data available",
|
151 |
+
ha='center', va='center', fontsize=14)
|
152 |
ax.set_xlim(0, 1)
|
153 |
ax.set_ylim(0, 1)
|
154 |
ax.axis('off')
|
|
|
163 |
colors = [item["color"] for item in class_data]
|
164 |
|
165 |
# Create figure and horizontal bar chart with improved styling
|
|
|
166 |
fig, ax = plt.subplots(figsize=figsize)
|
167 |
|
168 |
# Set background color to white
|
|
|
180 |
conf = class_data[i]["average_confidence"]
|
181 |
ax.text(width + 0.3, bar.get_y() + bar.get_height()/2,
|
182 |
f"{width:.0f} (conf: {conf:.2f})",
|
183 |
+
va='center', fontsize=12)
|
184 |
|
185 |
# Customize axis and labels with larger fonts
|
186 |
ax.set_yticks(y_pos)
|
187 |
+
ax.set_yticklabels(class_names, fontsize=14)
|
188 |
ax.invert_yaxis() # Labels read top-to-bottom
|
189 |
+
ax.set_xlabel('Count', fontsize=14)
|
190 |
ax.set_title(f'Objects Detected: {viz_data["total_objects"]} Total',
|
191 |
+
fontsize=16, fontweight='bold')
|
192 |
|
193 |
# Add grid for better readability
|
194 |
ax.set_axisbelow(True)
|
|
|
203 |
f"Average Confidence: {viz_data['average_confidence']:.2f}\n"
|
204 |
f"Unique Classes: {len(viz_data['class_data'])}"
|
205 |
)
|
206 |
+
plt.figtext(0.02, 0.02, summary_text, fontsize=12,
|
207 |
bbox=dict(facecolor='white', alpha=0.9, boxstyle='round,pad=0.5',
|
208 |
edgecolor='#E5E7EB'))
|
209 |
|
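The evaluation_metrics.py hunks above mainly close matplotlib calls that previously lacked their closing parentheses; the corrected call pattern looks like this minimal, self-contained example (the figure content and output filename are illustrative):

import matplotlib
matplotlib.use("Agg")  # headless backend so the example runs without a display
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(8, 6))
ax.text(0.5, 0.5, "No detection data available",
        ha='center', va='center', fontsize=14)  # closing parenthesis restored
ax.set_xlabel('Count', fontsize=14)
ax.set_title('Objects Detected: 0 Total', fontsize=16, fontweight='bold')
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)
fig.savefig("detection_summary.png")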
image_processor.py
CHANGED
@@ -13,6 +13,7 @@ from visualization_helper import VisualizationHelper
|
|
13 |
from evaluation_metrics import EvaluationMetrics
|
14 |
from lighting_analyzer import LightingAnalyzer
|
15 |
from scene_analyzer import SceneAnalyzer
|
|
|
16 |
|
17 |
class ImageProcessor:
|
18 |
"""
|
@@ -20,13 +21,76 @@ class ImageProcessor:
|
|
20 |
Separates processing logic from UI components
|
21 |
"""
|
22 |
|
23 |
-
def __init__(self, use_llm=True, llm_model_path=None):
|
24 |
"""Initialize the image processor with required components"""
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
|
31 |
def get_model_instance(self, model_name: str, confidence: float = 0.25, iou: float = 0.25) -> DetectionModel:
|
32 |
"""
|
@@ -53,48 +117,74 @@ class ImageProcessor:
|
|
53 |
|
54 |
return self.model_instances[model_name]
|
55 |
|
56 |
-
def analyze_scene(self, detection_result: Any, lighting_info: Optional[Dict] = None) -> Dict:
|
57 |
"""
|
58 |
Perform scene analysis on detection results
|
59 |
|
60 |
Args:
|
61 |
detection_result: Object detection result from YOLOv8
|
62 |
lighting_info: Lighting condition analysis results (optional)
|
|
|
|
|
63 |
|
64 |
Returns:
|
65 |
Dictionary containing scene analysis results
|
66 |
"""
|
|
|
67 |
try:
|
68 |
-
#
|
69 |
-
|
|
|
|
|
|
|
|
|
70 |
self.scene_analyzer = SceneAnalyzer(
|
71 |
-
class_names=
|
72 |
use_llm=self.use_llm,
|
|
|
|
|
73 |
llm_model_path=self.llm_model_path
|
74 |
)
|
75 |
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
80 |
|
81 |
-
# Perform scene analysis with lighting info
|
82 |
scene_analysis = self.scene_analyzer.analyze(
|
83 |
detection_result=detection_result,
|
84 |
lighting_info=lighting_info,
|
85 |
class_confidence_threshold=0.35,
|
86 |
-
scene_confidence_threshold=0.6
|
|
|
|
|
87 |
)
|
88 |
|
89 |
return scene_analysis
|
|
|
90 |
except Exception as e:
|
91 |
print(f"Error in scene analysis: {str(e)}")
|
92 |
import traceback
|
93 |
traceback.print_exc()
|
|
|
|
|
94 |
return {
|
95 |
"scene_type": "unknown",
|
96 |
"confidence": 0.0,
|
97 |
"description": f"Error during scene analysis: {str(e)}",
|
|
|
98 |
"objects_present": [],
|
99 |
"object_count": 0,
|
100 |
"regions": {},
|
@@ -103,146 +193,256 @@ class ImageProcessor:
|
|
103 |
"lighting_conditions": lighting_info or {"time_of_day": "unknown", "confidence": 0.0}
|
104 |
}
|
105 |
|
106 |
-
def analyze_lighting_conditions(self, image):
|
107 |
"""
|
108 |
-
|
109 |
|
110 |
Args:
|
111 |
image: Input image
|
|
|
112 |
|
113 |
Returns:
|
114 |
Dict: Lighting analysis results
|
115 |
"""
|
116 |
-
return self.lighting_analyzer.analyze(image)
|
117 |
|
118 |
-
def
|
119 |
"""
|
120 |
-
|
121 |
|
122 |
Args:
|
123 |
-
image: Input image (
|
124 |
-
model_name: Name of the model to use
|
125 |
-
confidence_threshold: Confidence threshold for detection
|
126 |
-
filter_classes: Optional list of classes to filter results
|
127 |
|
128 |
Returns:
|
129 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
130 |
"""
|
131 |
-
# Get model instance
|
132 |
model_instance = self.get_model_instance(model_name, confidence_threshold)
|
|
|
|
|
133 |
|
134 |
-
# Initialize key variables
|
135 |
result = None
|
136 |
-
|
137 |
temp_path = None
|
|
|
138 |
|
139 |
try:
|
140 |
-
# Processing input image
|
141 |
if isinstance(image, np.ndarray):
|
142 |
-
|
143 |
-
|
144 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
145 |
else:
|
146 |
-
|
147 |
-
|
|
|
148 |
elif image is None:
|
149 |
return None, "No image provided. Please upload an image.", {}
|
150 |
else:
|
151 |
-
|
|
|
|
|
|
|
152 |
|
153 |
-
#
|
154 |
-
|
155 |
|
156 |
-
|
157 |
-
|
|
|
158 |
temp_filename = f"temp_{uuid.uuid4().hex}.jpg"
|
159 |
temp_path = os.path.join(temp_dir, temp_filename)
|
160 |
-
|
161 |
|
162 |
-
# Object detection
|
163 |
result = model_instance.detect(temp_path)
|
164 |
|
165 |
-
if result is None:
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
|
|
|
|
170 |
|
171 |
-
#
|
|
|
172 |
spatial_metrics = EvaluationMetrics.calculate_distance_metrics(result)
|
173 |
-
|
|
|
|
|
|
|
174 |
|
175 |
-
# Add lighting information
|
176 |
-
stats["lighting_conditions"] = lighting_info
|
177 |
-
|
178 |
-
# Apply filter if specified
|
179 |
if filter_classes and len(filter_classes) > 0:
|
180 |
-
# Get classes, boxes, confidence
|
181 |
classes = result.boxes.cls.cpu().numpy().astype(int)
|
182 |
confs = result.boxes.conf.cpu().numpy()
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
filtered_stats = {
|
190 |
-
"total_objects": int(np.sum(mask)),
|
191 |
-
"class_statistics": {},
|
192 |
-
"average_confidence": float(np.mean(confs[mask])) if np.any(mask) else 0,
|
193 |
-
"spatial_metrics": stats["spatial_metrics"],
|
194 |
"lighting_conditions": lighting_info
|
195 |
}
|
196 |
-
|
197 |
-
|
198 |
names = result.names
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
self.color_mapper
|
|
|
215 |
)
|
216 |
|
217 |
-
|
218 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
219 |
)
|
220 |
|
221 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
222 |
|
223 |
-
|
224 |
-
|
225 |
-
|
|
|
|
|
226 |
|
227 |
-
|
228 |
-
|
|
|
229 |
|
230 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
231 |
|
232 |
except Exception as e:
|
233 |
-
error_message = f"Error
|
234 |
import traceback
|
235 |
traceback.print_exc()
|
236 |
-
|
237 |
-
return None, error_message, {}
|
238 |
-
|
239 |
finally:
|
240 |
if temp_path and os.path.exists(temp_path):
|
241 |
-
try:
|
242 |
-
|
243 |
-
except Exception as e:
|
244 |
-
print(f"Cannot delete temp files {temp_path}: {str(e)}")
|
245 |
-
|
246 |
|
247 |
def format_result_text(self, stats: Dict) -> str:
|
248 |
"""
|
@@ -281,7 +481,7 @@ class ImageProcessor:
|
|
281 |
else:
|
282 |
lines.append("No class information available.")
|
283 |
|
284 |
-
#
|
285 |
if "spatial_metrics" in stats and "spatial_distribution" in stats["spatial_metrics"]:
|
286 |
lines.append("Object Distribution:")
|
287 |
|
|
|
13 |
from evaluation_metrics import EvaluationMetrics
|
14 |
from lighting_analyzer import LightingAnalyzer
|
15 |
from scene_analyzer import SceneAnalyzer
|
16 |
+
from places365_model import Places365Model
|
17 |
|
18 |
class ImageProcessor:
|
19 |
"""
|
|
|
21 |
Separates processing logic from UI components
|
22 |
"""
|
23 |
|
24 |
+
def __init__(self, use_llm=True, llm_model_path=None, enable_places365=True, places365_model_name='resnet50_places365'):
|
25 |
"""Initialize the image processor with required components"""
|
26 |
+
print(f"Initializing ImageProcessor with use_llm={use_llm}, enable_places365={enable_places365}")
|
27 |
+
|
28 |
+
try:
|
29 |
+
# Initialize basic components first
|
30 |
+
self.use_llm = use_llm
|
31 |
+
self.llm_model_path = llm_model_path
|
32 |
+
self.enable_places365 = enable_places365
|
33 |
+
self.model_instances = {}
|
34 |
+
|
35 |
+
# Initialize ColorMapper
|
36 |
+
self.color_mapper = ColorMapper()
|
37 |
+
print("ColorMapper initialized successfully")
|
38 |
+
|
39 |
+
# Initialize LightingAnalyzer
|
40 |
+
self.lighting_analyzer = LightingAnalyzer()
|
41 |
+
print("LightingAnalyzer initialized successfully")
|
42 |
+
|
43 |
+
# Initialize Places365 model if enabled
|
44 |
+
self.places365_model = None
|
45 |
+
if self.enable_places365:
|
46 |
+
try:
|
47 |
+
self.places365_model = Places365Model(
|
48 |
+
model_name=places365_model_name,
|
49 |
+
device=None
|
50 |
+
)
|
51 |
+
print(f"Places365 model initialized successfully with {places365_model_name}")
|
52 |
+
except Exception as e:
|
53 |
+
print(f"Warning: Failed to initialize Places365 model: {e}")
|
54 |
+
print("Continuing without Places365 analysis")
|
55 |
+
self.enable_places365 = False
|
56 |
+
self.places365_model = None
|
57 |
+
|
58 |
+
# Initialize SceneAnalyzer with error handling
|
59 |
+
self.scene_analyzer = None
|
60 |
+
self.class_names = None # Will be set when first model is loaded
|
61 |
+
|
62 |
+
try:
|
63 |
+
# Initialize SceneAnalyzer without class_names (will be set later)
|
64 |
+
self.scene_analyzer = SceneAnalyzer(
|
65 |
+
class_names=None,
|
66 |
+
use_llm=self.use_llm,
|
67 |
+
use_clip=True,
|
68 |
+
enable_landmark=True,
|
69 |
+
llm_model_path=self.llm_model_path
|
70 |
+
)
|
71 |
+
print("SceneAnalyzer initialized successfully")
|
72 |
+
|
73 |
+
# Verify critical components
|
74 |
+
if self.scene_analyzer is not None:
|
75 |
+
print(f"SceneAnalyzer status - spatial_analyzer: {hasattr(self.scene_analyzer, 'spatial_analyzer')}, "
|
76 |
+
f"descriptor: {hasattr(self.scene_analyzer, 'descriptor')}, "
|
77 |
+
f"scene_describer: {hasattr(self.scene_analyzer, 'scene_describer')}")
|
78 |
+
else:
|
79 |
+
print("WARNING: scene_analyzer is None after initialization")
|
80 |
+
|
81 |
+
except Exception as e:
|
82 |
+
print(f"Error initializing SceneAnalyzer: {e}")
|
83 |
+
import traceback
|
84 |
+
traceback.print_exc()
|
85 |
+
self.scene_analyzer = None
|
86 |
+
|
87 |
+
print("ImageProcessor initialization completed successfully")
|
88 |
+
|
89 |
+
except Exception as e:
|
90 |
+
print(f"Critical error during ImageProcessor initialization: {e}")
|
91 |
+
import traceback
|
92 |
+
traceback.print_exc()
|
93 |
+
raise RuntimeError(f"Failed to initialize ImageProcessor: {str(e)}")
|
94 |
|
95 |
def get_model_instance(self, model_name: str, confidence: float = 0.25, iou: float = 0.25) -> DetectionModel:
|
96 |
"""
|
|
|
117 |
|
118 |
return self.model_instances[model_name]
|
119 |
|
120 |
+
def analyze_scene(self, detection_result: Any, lighting_info: Optional[Dict] = None, enable_landmark=True, places365_info=None) -> Dict:
|
121 |
"""
|
122 |
Perform scene analysis on detection results
|
123 |
|
124 |
Args:
|
125 |
detection_result: Object detection result from YOLOv8
|
126 |
lighting_info: Lighting condition analysis results (optional)
|
127 |
+
enable_landmark: Whether to enable landmark detection
|
128 |
+
places365_info: Places365 analysis results (optional)
|
129 |
|
130 |
Returns:
|
131 |
Dictionary containing scene analysis results
|
132 |
"""
|
133 |
+
print(f"DEBUG: analyze_scene received enable_landmark={enable_landmark}")
|
134 |
try:
|
135 |
+
# Check if detection_result has valid names
|
136 |
+
class_names = getattr(detection_result, 'names', None) if detection_result else None
|
137 |
+
|
138 |
+
# Initialize or reinitialize scene analyzer if needed
|
139 |
+
if self.scene_analyzer is None:
|
140 |
+
print("Scene analyzer not initialized, creating new instance")
|
141 |
self.scene_analyzer = SceneAnalyzer(
|
142 |
+
class_names=class_names,
|
143 |
use_llm=self.use_llm,
|
144 |
+
use_clip=True,
|
145 |
+
enable_landmark=enable_landmark,
|
146 |
llm_model_path=self.llm_model_path
|
147 |
)
|
148 |
|
149 |
+
if self.scene_analyzer is None:
|
150 |
+
raise ValueError("Failed to create SceneAnalyzer instance")
|
151 |
+
else:
|
152 |
+
# Update existing scene analyzer settings
|
153 |
+
self.scene_analyzer.enable_landmark = enable_landmark
|
154 |
+
|
155 |
+
# Update class names if available and different
|
156 |
+
if class_names and self.scene_analyzer.class_names != class_names:
|
157 |
+
self.scene_analyzer.class_names = class_names
|
158 |
+
if hasattr(self.scene_analyzer, 'spatial_analyzer') and self.scene_analyzer.spatial_analyzer:
|
159 |
+
self.scene_analyzer.spatial_analyzer.class_names = class_names
|
160 |
+
|
161 |
+
# Update landmark detection settings in child components
|
162 |
+
if hasattr(self.scene_analyzer, 'spatial_analyzer') and self.scene_analyzer.spatial_analyzer:
|
163 |
+
self.scene_analyzer.spatial_analyzer.enable_landmark = enable_landmark
|
164 |
|
165 |
+
# Perform scene analysis with lighting info and Places365 context
|
166 |
scene_analysis = self.scene_analyzer.analyze(
|
167 |
detection_result=detection_result,
|
168 |
lighting_info=lighting_info,
|
169 |
class_confidence_threshold=0.35,
|
170 |
+
scene_confidence_threshold=0.6,
|
171 |
+
enable_landmark=enable_landmark,
|
172 |
+
places365_info=places365_info
|
173 |
)
|
174 |
|
175 |
return scene_analysis
|
176 |
+
|
177 |
except Exception as e:
|
178 |
print(f"Error in scene analysis: {str(e)}")
|
179 |
import traceback
|
180 |
traceback.print_exc()
|
181 |
+
|
182 |
+
# Return a valid default result
|
183 |
return {
|
184 |
"scene_type": "unknown",
|
185 |
"confidence": 0.0,
|
186 |
"description": f"Error during scene analysis: {str(e)}",
|
187 |
+
"enhanced_description": "Scene analysis could not be completed due to an error.",
|
188 |
"objects_present": [],
|
189 |
"object_count": 0,
|
190 |
"regions": {},
|
|
|
193 |
"lighting_conditions": lighting_info or {"time_of_day": "unknown", "confidence": 0.0}
|
194 |
}
|
195 |
|
196 |
+
def analyze_lighting_conditions(self, image, places365_info: Optional[Dict] = None):
|
197 |
"""
|
198 |
+
Analyze lighting conditions, taking Places365 scene information into account.
|
199 |
|
200 |
Args:
|
201 |
image: Input image
|
202 |
+
places365_info: Places365 scene analysis results, used for the override logic
|
203 |
|
204 |
Returns:
|
205 |
Dict: Lighting analysis results
|
206 |
"""
|
207 |
+
return self.lighting_analyzer.analyze(image, places365_info=places365_info)
|
208 |
|
209 |
+
def analyze_places365_scene(self, image):
|
210 |
"""
|
211 |
+
Analyze scene using Places365 model.
|
212 |
|
213 |
Args:
|
214 |
+
image: Input image (PIL Image)
|
|
|
|
|
|
|
215 |
|
216 |
Returns:
|
217 |
+
Dict: Places365 analysis results or None if disabled/failed
|
218 |
+
"""
|
219 |
+
if not self.enable_places365 or self.places365_model is None:
|
220 |
+
return None
|
221 |
+
|
222 |
+
try:
|
223 |
+
if not isinstance(image, Image.Image):
|
224 |
+
if isinstance(image, np.ndarray):
|
225 |
+
image = Image.fromarray(image)
|
226 |
+
else:
|
227 |
+
print(f"Warning: Cannot process image of type {type(image)} for Places365")
|
228 |
+
return None
|
229 |
+
|
230 |
+
places365_result = self.places365_model.predict(image)
|
231 |
+
|
232 |
+
if places365_result and places365_result.get('confidence', 0) > 0.1:
|
233 |
+
print(f"Places365 detected: {places365_result['scene_label']} "
|
234 |
+
f"(mapped: {places365_result['mapped_scene_type']}) "
|
235 |
+
f"confidence: {places365_result['confidence']:.3f}")
|
236 |
+
return places365_result
|
237 |
+
else:
|
238 |
+
print("Places365 analysis failed or low confidence")
|
239 |
+
return None
|
240 |
+
|
241 |
+
except Exception as e:
|
242 |
+
print(f"Error in Places365 analysis: {str(e)}")
|
243 |
+
return None
|
244 |
+
|
245 |
+
def process_image(self, image: Any, model_name: str, confidence_threshold: float, filter_classes: Optional[List[int]] = None, enable_landmark: bool = True) -> Tuple[Any, str, Dict]:
|
246 |
+
"""
|
247 |
+
Process an image for object detection and scene analysis.
|
248 |
+
Args:
|
249 |
+
image: Input image (numpy array or PIL Image).
|
250 |
+
model_name: Name of the model to use.
|
251 |
+
confidence_threshold: Confidence threshold for detection.
|
252 |
+
filter_classes: Optional list of classes to filter results.
|
253 |
+
enable_landmark: Whether to enable landmark detection for this run.
|
254 |
+
Returns:
|
255 |
+
Tuple of (result_image_pil, result_text, stats_data_with_scene_analysis).
|
256 |
"""
|
|
|
257 |
model_instance = self.get_model_instance(model_name, confidence_threshold)
|
258 |
+
if model_instance is None:
|
259 |
+
return None, f"Failed to load model: {model_name}. Please check model configuration.", {}
|
260 |
|
|
|
261 |
result = None
|
262 |
+
stats_data = {}
|
263 |
temp_path = None
|
264 |
+
pil_image_for_processing = None # Use this to store the consistently processed PIL image
|
265 |
|
266 |
try:
|
|
|
267 |
if isinstance(image, np.ndarray):
|
268 |
+
if image.ndim == 3 and image.shape[2] == 3: # RGB or BGR
|
269 |
+
# Assuming BGR from OpenCV, convert to RGB for PIL standard
|
270 |
+
image_rgb_np = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
|
271 |
+
pil_image_for_processing = Image.fromarray(image_rgb_np)
|
272 |
+
elif image.ndim == 3 and image.shape[2] == 4: # RGBA or BGRA
|
273 |
+
image_rgba_np = cv2.cvtColor(image, cv2.COLOR_BGRA2RGBA) # Ensure RGBA
|
274 |
+
pil_image_for_processing = Image.fromarray(image_rgba_np).convert("RGB") # Convert to RGB
|
275 |
+
elif image.ndim == 2: # Grayscale
|
276 |
+
pil_image_for_processing = Image.fromarray(image).convert("RGB")
|
277 |
else:
|
278 |
+
pil_image_for_processing = Image.fromarray(image) # Hope for the best
|
279 |
+
elif isinstance(image, Image.Image):
|
280 |
+
pil_image_for_processing = image.copy() # Use a copy
|
281 |
elif image is None:
|
282 |
return None, "No image provided. Please upload an image.", {}
|
283 |
else:
|
284 |
+
return None, f"Unsupported image type: {type(image)}. Please provide a NumPy array or PIL Image.", {}
|
285 |
+
|
286 |
+
if pil_image_for_processing.mode != "RGB": # Ensure final image is RGB
|
287 |
+
pil_image_for_processing = pil_image_for_processing.convert("RGB")
|
288 |
|
289 |
+
# Add Places365 scene analysis parallel to lighting analysis
|
290 |
+
places365_info = self.analyze_places365_scene(pil_image_for_processing)
|
291 |
|
292 |
+
lighting_info = self.analyze_lighting_conditions(pil_image_for_processing, places365_info=places365_info)
|
293 |
+
|
294 |
+
temp_dir = tempfile.gettempdir()
|
295 |
temp_filename = f"temp_{uuid.uuid4().hex}.jpg"
|
296 |
temp_path = os.path.join(temp_dir, temp_filename)
|
297 |
+
pil_image_for_processing.save(temp_path, format="JPEG")
|
298 |
|
|
|
299 |
result = model_instance.detect(temp_path)
|
300 |
|
301 |
+
if result is None or not hasattr(result, 'boxes'):
|
302 |
+
scene_analysis_no_yolo = self.analyze_scene(result, lighting_info, enable_landmark=enable_landmark, places365_info=places365_info)
|
303 |
+
desc_no_yolo = scene_analysis_no_yolo.get("enhanced_description", scene_analysis_no_yolo.get("description", "Detection failed, scene context analysis attempted."))
|
304 |
+
stats_data["scene_analysis"] = scene_analysis_no_yolo
|
305 |
+
if places365_info:
|
306 |
+
stats_data["places365_analysis"] = places365_info
|
307 |
+
return pil_image_for_processing, desc_no_yolo, stats_data
|
308 |
|
309 |
+
# Basic statistics
|
310 |
+
stats_data = EvaluationMetrics.calculate_basic_stats(result)
|
311 |
spatial_metrics = EvaluationMetrics.calculate_distance_metrics(result)
|
312 |
+
stats_data["spatial_metrics"] = spatial_metrics
|
313 |
+
stats_data["lighting_conditions"] = lighting_info
|
314 |
+
if places365_info:
|
315 |
+
stats_data["places365_analysis"] = places365_info
|
316 |
|
|
|
|
|
|
|
|
|
317 |
if filter_classes and len(filter_classes) > 0:
|
|
|
318 |
classes = result.boxes.cls.cpu().numpy().astype(int)
|
319 |
confs = result.boxes.conf.cpu().numpy()
|
320 |
+
mask = np.isin(classes, filter_classes)
|
321 |
+
filtered_stats_data = {
|
322 |
+
"total_objects": int(np.sum(mask)), "class_statistics": {},
|
323 |
+
"average_confidence": float(np.mean(confs[mask])) if np.any(mask) else 0.0,
|
324 |
+
"spatial_metrics": stats_data.get("spatial_metrics",{}),
|
|
|
|
|
|
|
|
|
|
|
|
|
325 |
"lighting_conditions": lighting_info
|
326 |
}
|
327 |
+
if places365_info:
|
328 |
+
filtered_stats_data["places365_analysis"] = places365_info
|
329 |
names = result.names
|
330 |
+
class_conf_sums = {}
|
331 |
+
for cls_id_int, conf_val in zip(classes[mask], confs[mask]):
|
332 |
+
cls_name = names[cls_id_int]
|
333 |
+
if cls_name not in filtered_stats_data["class_statistics"]:
|
334 |
+
filtered_stats_data["class_statistics"][cls_name] = {"count": 0}
|
335 |
+
class_conf_sums[cls_name] = 0.0
|
336 |
+
filtered_stats_data["class_statistics"][cls_name]["count"] += 1 # 累計統計資訊
|
337 |
+
class_conf_sums[cls_name] += conf_val
|
338 |
+
for cls_name_stat, data_stat in filtered_stats_data["class_statistics"].items():
|
339 |
+
data_stat["average_confidence"] = round(class_conf_sums[cls_name_stat] / data_stat["count"] if data_stat["count"] > 0 else 0.0, 4)
|
340 |
+
stats_data = filtered_stats_data
|
341 |
+
|
342 |
+
viz_data = EvaluationMetrics.generate_visualization_data(result, self.color_mapper.get_all_colors())
|
343 |
+
|
344 |
+
result_image_pil = VisualizationHelper.visualize_detection(
|
345 |
+
temp_path, result, color_mapper=self.color_mapper,
|
346 |
+
figsize=(12, 12), return_pil=True, filter_classes=filter_classes
|
347 |
)
|
348 |
|
349 |
+
result_text_summary = EvaluationMetrics.format_detection_summary(viz_data)
|
350 |
+
|
351 |
+
# Pass the enable_landmark parameter from function signature
|
352 |
+
# Initialize or update scene analyzer if needed
|
353 |
+
if self.scene_analyzer is None:
|
354 |
+
print("Creating SceneAnalyzer in process_image")
|
355 |
+
self.scene_analyzer = SceneAnalyzer(
|
356 |
+
class_names=result.names if result else None,
|
357 |
+
use_llm=self.use_llm,
|
358 |
+
use_clip=True,
|
359 |
+
enable_landmark=enable_landmark,
|
360 |
+
llm_model_path=self.llm_model_path
|
361 |
+
)
|
362 |
+
|
363 |
+
if self.scene_analyzer is None:
|
364 |
+
print("ERROR: Failed to create SceneAnalyzer in process_image")
|
365 |
+
else:
|
366 |
+
# Update existing scene analyzer with current settings
|
367 |
+
if result and hasattr(result, 'names'):
|
368 |
+
self.scene_analyzer.class_names = result.names
|
369 |
+
if hasattr(self.scene_analyzer, 'spatial_analyzer') and self.scene_analyzer.spatial_analyzer:
|
370 |
+
self.scene_analyzer.spatial_analyzer.class_names = result.names
|
371 |
+
|
372 |
+
self.scene_analyzer.enable_landmark = enable_landmark
|
373 |
+
if hasattr(self.scene_analyzer, 'spatial_analyzer') and self.scene_analyzer.spatial_analyzer:
|
374 |
+
self.scene_analyzer.spatial_analyzer.enable_landmark = enable_landmark
|
375 |
+
|
376 |
+
# Perform scene analysis using the existing analyze_scene method
|
377 |
+
scene_analysis_result = self.analyze_scene(
|
378 |
+
detection_result=result,
|
379 |
+
lighting_info=lighting_info,
|
380 |
+
enable_landmark=enable_landmark,
|
381 |
+
places365_info=places365_info
|
382 |
)
|
383 |
|
384 |
+
stats_data["scene_analysis"] = scene_analysis_result
|
385 |
+
|
386 |
+
final_result_text = result_text_summary
|
387 |
+
|
388 |
+
# Use enable_landmark parameter for landmark block
|
389 |
+
if enable_landmark and "detected_landmarks" in scene_analysis_result:
|
390 |
+
landmarks_detected = scene_analysis_result.get("detected_landmarks", [])
|
391 |
+
if not landmarks_detected and scene_analysis_result.get("primary_landmark"):
|
392 |
+
primary_lm = scene_analysis_result.get("primary_landmark")
|
393 |
+
if isinstance(primary_lm, dict): landmarks_detected = [primary_lm]
|
394 |
+
|
395 |
+
if landmarks_detected:
|
396 |
+
final_result_text += "\n\n--- Detected Landmarks ---\n"
|
397 |
+
# Ensure drawing on the correct PIL image
|
398 |
+
img_to_draw_on = result_image_pil.copy() # Draw on a copy
|
399 |
+
img_for_drawing_cv2 = cv2.cvtColor(np.array(img_to_draw_on), cv2.COLOR_RGB2BGR)
|
400 |
+
|
401 |
+
for landmark_item in landmarks_detected:
|
402 |
+
if not isinstance(landmark_item, dict): continue
|
403 |
+
|
404 |
+
# Use .get() for all potentially missing keys, to be safe
|
405 |
+
landmark_name_disp = landmark_item.get("class_name", landmark_item.get("name", "N/A"))
|
406 |
+
landmark_loc_disp = landmark_item.get("location", "N/A")
|
407 |
+
landmark_conf_disp = landmark_item.get("confidence", 0.0)
|
408 |
+
|
409 |
+
final_result_text += f"• {landmark_name_disp} ({landmark_loc_disp}, confidence: {landmark_conf_disp:.2f})\n"
|
410 |
|
411 |
+
if "box" in landmark_item:
|
412 |
+
box = landmark_item["box"]
|
413 |
+
pt1 = (int(box[0]), int(box[1])); pt2 = (int(box[2]), int(box[3]))
|
414 |
+
color_lm = (255, 0, 255); thickness_lm = 3 # Magenta BGR
|
415 |
+
cv2.rectangle(img_for_drawing_cv2, pt1, pt2, color_lm, thickness_lm)
|
416 |
|
417 |
+
label_lm = f"{landmark_name_disp} ({landmark_conf_disp:.2f})"
|
418 |
+
font_scale_lm = 0.6; font_thickness_lm = 1
|
419 |
+
(w_text, h_text), baseline = cv2.getTextSize(label_lm, cv2.FONT_HERSHEY_SIMPLEX, font_scale_lm, font_thickness_lm)
|
420 |
|
421 |
+
# Label position logic (simplified for brevity)
|
422 |
+
label_y_pos = pt1[1] - baseline - 3
|
423 |
+
if label_y_pos < h_text : # If label goes above image, put it below box
|
424 |
+
label_y_pos = pt2[1] + h_text + baseline + 3
|
425 |
+
|
426 |
+
label_bg_pt1 = (pt1[0], label_y_pos - h_text - baseline)
|
427 |
+
label_bg_pt2 = (pt1[0] + w_text, label_y_pos + baseline)
|
428 |
+
|
429 |
+
cv2.rectangle(img_for_drawing_cv2, label_bg_pt1, label_bg_pt2, color_lm, -1)
|
430 |
+
cv2.putText(img_for_drawing_cv2, label_lm, (pt1[0], label_y_pos),
|
431 |
+
cv2.FONT_HERSHEY_SIMPLEX, font_scale_lm, (255,255,255), font_thickness_lm, cv2.LINE_AA)
|
432 |
+
|
433 |
+
result_image_pil = Image.fromarray(cv2.cvtColor(img_for_drawing_cv2, cv2.COLOR_BGR2RGB))
|
434 |
+
|
435 |
+
return result_image_pil, final_result_text, stats_data
|
436 |
|
437 |
except Exception as e:
|
438 |
+
error_message = f"Error in ImageProcessor.process_image: {str(e)}"
|
439 |
import traceback
|
440 |
traceback.print_exc()
|
441 |
+
return pil_image_for_processing if pil_image_for_processing else None, error_message, {}
|
|
|
|
|
442 |
finally:
|
443 |
if temp_path and os.path.exists(temp_path):
|
444 |
+
try: os.remove(temp_path)
|
445 |
+
except Exception as e: print(f"Warning: Cannot delete temp file {temp_path}: {str(e)}")
|
|
|
|
|
|
|
446 |
|
447 |
def format_result_text(self, stats: Dict) -> str:
|
448 |
"""
|
|
|
481 |
else:
|
482 |
lines.append("No class information available.")
|
483 |
|
484 |
+
# 添加空間資訊
|
485 |
if "spatial_metrics" in stats and "spatial_distribution" in stats["spatial_metrics"]:
|
486 |
lines.append("Object Distribution:")
|
487 |
|
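The landmark-label placement above can be read as one small, reusable pattern: measure the text with cv2.getTextSize, flip the label below the box when it would leave the top of the image, and draw a filled background before the white text. The sketch below is a minimal, self-contained illustration of that pattern; the helper name and default arguments are illustrative and not part of the repository.

import cv2
import numpy as np

def draw_landmark_label(img_bgr: np.ndarray, box, text: str,
                        color=(255, 0, 255), font_scale=0.6, thickness=1) -> None:
    """Draw a box plus a filled label that stays inside the image (illustrative helper)."""
    pt1, pt2 = (int(box[0]), int(box[1])), (int(box[2]), int(box[3]))
    cv2.rectangle(img_bgr, pt1, pt2, color, 3)

    (w_text, h_text), baseline = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, font_scale, thickness)
    y = pt1[1] - baseline - 3                    # default: label above the box
    if y < h_text:                               # would fall off the top edge
        y = pt2[1] + h_text + baseline + 3       # place it below the box instead

    cv2.rectangle(img_bgr, (pt1[0], y - h_text - baseline), (pt1[0] + w_text, y + baseline), color, -1)
    cv2.putText(img_bgr, text, (pt1[0], y), cv2.FONT_HERSHEY_SIMPLEX,
                font_scale, (255, 255, 255), thickness, cv2.LINE_AA)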
landmark_activities.py
ADDED
The diff for this file is too large to render. See raw diff.

landmark_data.py
ADDED
The diff for this file is too large to render. See raw diff.

lighting_analyzer.py
CHANGED
The diff for this file is too large to render. See raw diff.
lighting_conditions.py
CHANGED
@@ -12,6 +12,36 @@ LIGHTING_CONDITIONS = {
         "bright": "The scene has the diffused bright lighting of an overcast day.",
         "medium": "The scene has even, soft lighting typical of a cloudy day.",
         "dim": "The scene has the muted lighting of a heavily overcast day."
     },
+    "day_cloudy_gray": {
+        "general": "The scene is captured during an overcast day with muted gray lighting.",
+        "bright": "The scene has bright but diffused gray daylight from heavy cloud cover.",
+        "medium": "The scene has even, muted lighting typical of a gray, overcast day.",
+        "dim": "The scene has subdued lighting under thick gray clouds."
+    },
+    "indoor_residential_natural": {
+        "general": "The scene is captured in a residential setting with natural window lighting.",
+        "bright": "The residential space is brightly lit with abundant natural light from windows.",
+        "medium": "The home interior has comfortable natural lighting complemented by artificial sources.",
+        "dim": "The residential space has soft natural lighting creating a cozy atmosphere."
+    },
+    "indoor_designer_residential": {
+        "general": "The scene is captured in a well-designed residential space with curated lighting.",
+        "bright": "The residential interior features bright, designer lighting creating an elegant atmosphere.",
+        "medium": "The home space has thoughtfully planned lighting balancing aesthetics and functionality.",
+        "dim": "The residential area has sophisticated mood lighting enhancing the design elements."
+    },
+    "indoor_bright_natural_mix": {
+        "general": "The scene is captured indoors with a blend of natural and artificial lighting.",
+        "bright": "The indoor space combines bright natural window light with artificial illumination.",
+        "medium": "The interior has balanced mixed lighting from windows and electric sources.",
+        "dim": "The indoor area has gentle mixed lighting creating comfortable illumination."
+    },
+    "indoor_restaurant_bar": {
+        "general": "The scene is captured inside a restaurant or bar with characteristic warm lighting.",
+        "bright": "The dining establishment is well-lit with warm illumination emphasizing ambiance.",
+        "medium": "The restaurant/bar has moderate warm lighting creating a comfortable social atmosphere.",
+        "dim": "The establishment features soft, warm lighting creating an intimate dining or social atmosphere."
+    },
     "sunset/sunrise": {
         "general": "The scene is captured during golden hour with warm lighting.",
@@ -81,6 +111,10 @@
         "beach_lighting": "sun-drenched",
         "sports_venue_lighting": "arena-lit",
         "professional_kitchen_lighting": "kitchen-task lit",
+        "day_cloudy_gray": "gray-lit",
+        "indoor_residential_natural": "naturally-lit residential",
+        "indoor_designer_residential": "designer-lit residential",
+        "indoor_bright_natural_mix": "mixed-lit indoor",
         "unknown": ""
     },
     "activity_modifiers": {
@@ -127,5 +161,11 @@
         "bright": "The space blends bright natural and artificial light sources across indoor-outdoor boundaries.",
         "medium": "The area combines moderate indoor lighting with outdoor illumination in a balanced way.",
         "dim": "The transition space features subtle lighting gradients between indoor and outdoor zones."
+    },
+    "stadium_or_floodlit_area": {
+        "general": "The scene is captured under powerful floodlights creating uniform bright illumination.",
+        "bright": "The area is intensely illuminated by floodlights, similar to stadium conditions.",
+        "medium": "The space has even, powerful lighting typical of sports facilities or outdoor events.",
+        "dim": "The area has moderate floodlight illumination providing consistent lighting across the space."
     }
 }
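These templates are keyed by lighting condition and then by brightness level. A minimal sketch of how a consumer such as the lighting analyzer might resolve a sentence from this table is shown below; the helper name, the fallback order, and the assumption that the condition dicts sit at the top level of LIGHTING_CONDITIONS (rather than nested under a section key) are all illustrative, not code from this commit.

from lighting_conditions import LIGHTING_CONDITIONS

def describe_lighting(condition: str, brightness: str = "general",
                      table: dict = LIGHTING_CONDITIONS) -> str:
    """Resolve a lighting sentence, falling back to the condition's 'general' entry."""
    levels = table.get(condition, {})
    if not isinstance(levels, dict):
        return ""
    return levels.get(brightness) or levels.get("general", "")

# describe_lighting("day_cloudy_gray", "dim")
# -> "The scene has subdued lighting under thick gray clouds."  (given the top-level layout assumed above)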
llm_enhancer.py
CHANGED
@@ -19,7 +19,6 @@ class LLMEnhancer:
                  top_p: float = 0.85):
         """
         Initialize the LLM enhancer.
         Args:
             model_path: Path or Hugging Face ID of the LLM; defaults to Llama 3.2
             tokenizer_path: Path of the tokenizer, usually the same as model_path
@@ -38,7 +37,7 @@ class LLMEnhancer:
         self.model_path = model_path or "meta-llama/Llama-3.2-3B-Instruct"
         self.tokenizer_path = tokenizer_path or self.model_path

+        # check device
         self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
         self.logger.info(f"Using device: {self.device}")
@@ -50,7 +49,7 @@ class LLMEnhancer:
         self.model = None
         self.tokenizer = None

+        # track the number of model calls
         self.call_count = 0

         self._initialize_prompts()
@@ -124,17 +123,12 @@ class LLMEnhancer:
         self.enhance_description_template = """
 <|system|>
 You are an expert visual analyst. Your task is to improve the readability and fluency of scene descriptions using STRICT factual accuracy.
 Your **top priority is to avoid hallucination** or fabrication. You are working in a computer vision pipeline using object detection (YOLO) and image embeddings. You MUST treat the input object list as a whitelist. Do not speculate beyond this list.
 </|system|>
 <|user|>
 Rewrite the following scene description to be fluent and clear. DO NOT add any objects, events, or spatial relationships that are not explicitly present in the original or object list.
 ORIGINAL:
 {original_description}
 CRITICAL RULES:
 1. NEVER assume room type, object function, or scene purpose unless directly stated.
 2. NEVER invent object types. You are limited to: {object_list}
@@ -143,60 +137,51 @@ class LLMEnhancer:
 5. You MAY describe confirmed materials, colors, and composition style if visually obvious and non-speculative.
 6. Write 2–4 complete, well-structured sentences with punctuation.
 7. Final output MUST be a single fluent paragraph of 60–200 words (not longer).
+8. Begin your response directly with the scene description. Do NOT include any introductory phrases, explanations, or formatting indicators.
+9. Ensure grammatical completeness in all sentences. Each sentence must have a complete subject and predicate structure.
+10. Vary sentence structures naturally while maintaining grammatical accuracy. Avoid incomplete phrases or dangling modifiers.
+11. Limit repetition of descriptive verbs and spatial indicators to maintain text diversity and readability.
+12. Create natural spatial flow by connecting object descriptions organically rather than listing positions mechanically.
+13. Use transitional phrases to connect ideas smoothly, varying expression patterns throughout the description.
+14. End with a conclusive observation about atmosphere, style, or overall impression rather than restating layout information.
+15. When describing quantities or arrangements, use only information explicitly confirmed by the object detection system.
 </|user|>
 <|assistant|>
 """

         # prompt for error detection
         self.verify_detection_template = """
 Task: You are an advanced vision system that verifies computer vision detections for accuracy.
 Analyze the following detection results and identify any potential errors or inconsistencies:
 SCENE TYPE: {scene_type}
 SCENE NAME: {scene_name}
 CONFIDENCE: {confidence:.2f}
 DETECTED OBJECTS: {detected_objects}
 CLIP ANALYSIS RESULTS:
 {clip_analysis}
 Possible Errors to Check:
 1. Objects misidentified (e.g., architectural elements labeled as vehicles)
 2. Cultural elements misunderstood (e.g., Asian temple structures labeled as boats)
 3. Objects that seem out of place for this type of scene
 4. Inconsistencies between different detection systems
 If you find potential errors, list them clearly with explanations. If the detections seem reasonable, state that they appear accurate.
 Verification Results:
 """

         # prompt for handling the no-detection case
         self.no_detection_template = """
 Task: You are an advanced scene understanding system analyzing an image where standard object detection failed to identify specific objects.
 Based on advanced image embeddings (CLIP analysis), we have the following information:
 MOST LIKELY SCENE: {top_scene} (confidence: {top_confidence:.2f})
 VIEWPOINT: {viewpoint}
 LIGHTING: {lighting_condition}
 CULTURAL ANALYSIS: {cultural_analysis}
 Create a detailed description of what might be in this scene, considering:
 1. The most likely type of location or setting
 2. Possible architectural or natural elements present
 3. The lighting and atmosphere
 4. Potential cultural or regional characteristics
 Your description should be natural, flowing, and offer insights into what the image likely contains despite the lack of specific object detection.
 Scene Description:
 """
@@ -300,7 +285,7 @@ class LLMEnhancer:
         self.logger.info("Model not loaded, no context to reset")

     def _remove_introduction_sentences(self, response: str) -> str:
+        """Remove introductory sentences."""
         # identify common introductory patterns
         intro_patterns = [
             r'^Here is the (?:rewritten|enhanced) .*?description:',
@@ -318,7 +303,7 @@ class LLMEnhancer:
         return response

     def enhance_description(self, scene_data: Dict[str, Any]) -> str:
+        """Scene description enhancer: handles all scene types, preserves viewpoint and lighting information, and acts as the main entry point reusable by other classes."""
         try:
             # reset the context
             self.reset_context()
@@ -332,7 +317,7 @@ class LLMEnhancer:
             if not original_desc:
                 return "No original description provided."

+            # get the scene type and normalize it
             scene_type = scene_data.get("scene_type", "unknown scene")
             scene_type = self._clean_scene_type(scene_type)
@@ -357,16 +342,28 @@ class LLMEnhancer:
                 if confidence >= high_confidence_threshold:
                     filtered_objects.append(obj)

+            # prefer the object statistics passed in; compute them only if missing
+            object_statistics = scene_data.get("object_statistics", {})
             object_counts = {}

+            if object_statistics:
+                # use the pre-computed statistics to keep counts accurate
+                for class_name, stats in object_statistics.items():
+                    if stats.get("count", 0) > 0 and stats.get("avg_confidence", 0) >= high_confidence_threshold:
+                        object_counts[class_name] = stats["count"]
+            else:
+                # fall back to the original counting approach
+                for obj in filtered_objects:
+                    class_name = obj.get("class_name", "")
+                    if class_name not in object_counts:
+                        object_counts[class_name] = 0
+                    object_counts[class_name] += 1
+
+            # format the objects into a more precise description
+            high_confidence_objects = ", ".join([
+                f"{count} {obj}{'s' if count > 1 else ''}"
+                for obj, count in object_counts.items()
+            ])

             # if there are no high-confidence objects, fall back to keywords from the original description
             if not high_confidence_objects:
@@ -399,6 +396,29 @@ class LLMEnhancer:
             response = self._generate_llm_response(prompt)

+            is_landmark_only = (
+                scene_data.get("scene_type") in ["tourist_landmark", "natural_landmark", "historical_monument"] and
+                (not scene_data.get("detected_objects") or len(scene_data.get("detected_objects", [])) <= 1)
+            )
+
+            # adjust the logic for the landmark-only case
+            if is_landmark_only:
+                # make sure the original description is not empty
+                original_desc = scene_data.get("original_description", "")
+                if not original_desc or len(original_desc.strip()) < 10:
+                    # generate a basic description from the scene type and landmark info
+                    scene_type = scene_data.get("scene_type", "unknown")
+                    scene_name = scene_data.get("scene_name", "Unknown")
+                    if "primary_landmark" in scene_data:
+                        landmark_name = scene_data["primary_landmark"].get("name", "unnamed landmark")
+                        original_desc = f"A {scene_type.replace('_', ' ')} scene featuring {landmark_name}."
+                    else:
+                        original_desc = f"A {scene_type.replace('_', ' ')} scene."
+
+                # update the scene data
+                scene_data["original_description"] = original_desc
+
+            # stricter criteria for checking response completeness (unchanged)
             is_incomplete = (
                 len(response) < 100 or  # too short
                 (len(response) < 200 and "." not in response[-30:]) or  # no proper punctuation at the end
@@ -442,7 +462,15 @@ class LLMEnhancer:
             if perspective and perspective.lower() not in result.lower():
                 result = f"{perspective}, {result[0].lower()}{result[1:]}"

+            final_result = str(result)
+            if not final_result or len(final_result.strip()) < 20:
+                self.logger.warning(f"WARNING: LLM enhanced description is empty or too short!")
+                self.logger.info(f"Original description: {original_desc[:50]}...")
+                self.logger.info(f"Input data: scene_type={scene_data.get('scene_type')}, objects={len(scene_data.get('detected_objects', []))}")
+            else:
+                self.logger.info(f"LLM enhanced description generated successfully ({len(final_result)} chars)")
+
+            return final_result

         except Exception as e:
             self.logger.error(f"Enhancement failed: {str(e)}")
@@ -451,7 +479,7 @@ class LLMEnhancer:
             return original_desc  # return the original description on any error

     def _verify_factual_accuracy(self, original: str, generated: str, object_list: str) -> str:
+        """Verify that the generated description contains no information absent from the original description or object list, and detect repetitive wording."""

         # combine the original description and the object list into the authorized vocabulary source
         authorized_content = original.lower() + " " + object_list.lower()
@@ -475,6 +503,55 @@ class LLMEnhancer:
             pattern = re.compile(r'\b' + term + r'\b', re.IGNORECASE)
             generated = pattern.sub(replacement, generated)

+        # check for repeated descriptive vocabulary
+        repetitive_patterns = [
+            (r'\b(visible)\b.*?\b(visible)\b', 'Multiple uses of "visible" detected'),
+            (r'\b(positioned)\b.*?\b(positioned)\b', 'Multiple uses of "positioned" detected'),
+            (r'\b(located)\b.*?\b(located)\b', 'Multiple uses of "located" detected'),
+            (r'\b(situated)\b.*?\b(situated)\b', 'Multiple uses of "situated" detected'),
+            (r'\b(appears)\b.*?\b(appears)\b', 'Multiple uses of "appears" detected'),
+            (r'\b(features)\b.*?\b(features)\b', 'Multiple uses of "features" detected'),
+            (r'\bThis\s+(\w+)\s+.*?\bThis\s+\1\b', 'Repetitive sentence structure detected')
+        ]
+
+        # replacement dictionary offering varied expressions
+        replacement_dict = {
+            'visible': ['present', 'evident', 'apparent', 'observable'],
+            'positioned': ['arranged', 'placed', 'set', 'organized'],
+            'located': ['found', 'placed', 'situated', 'established'],
+            'situated': ['placed', 'positioned', 'arranged', 'set'],
+            'appears': ['seems', 'looks', 'presents', 'exhibits'],
+            'features': ['includes', 'contains', 'displays', 'showcases']
+        }
+
+        for pattern, issue in repetitive_patterns:
+            matches = list(re.finditer(pattern, generated, re.IGNORECASE | re.DOTALL))
+            if matches:
+                self.logger.warning(f"Text quality issue detected: {issue}")
+
+                # substitute the specific repeated word
+                for word in replacement_dict.keys():
+                    if word in issue.lower():
+                        word_pattern = re.compile(r'\b' + word + r'\b', re.IGNORECASE)
+                        word_matches = list(word_pattern.finditer(generated))
+
+                        # keep the first occurrence, replace subsequent ones
+                        for i, match in enumerate(word_matches[1:], 1):
+                            if i <= len(replacement_dict[word]):
+                                replacement = replacement_dict[word][(i-1) % len(replacement_dict[word])]
+
+                                # preserve the original capitalization
+                                if match.group().isupper():
+                                    replacement = replacement.upper()
+                                elif match.group().istitle():
+                                    replacement = replacement.capitalize()
+
+                                # perform the replacement
+                                generated = generated[:match.start()] + replacement + generated[match.end():]
+                                # recompute the positions of the remaining matches
+                                word_matches = list(word_pattern.finditer(generated))
+                        break
+
         return generated
@@ -486,14 +563,12 @@ class LLMEnhancer:
                               confidence: float) -> Dict[str, Any]:
         """
         Verify and possibly correct the YOLO detection results.
         Args: detected_objects (objects detected by YOLO), clip_analysis (CLIP analysis results),
             scene_type (identified scene type), scene_name, confidence (scene classification confidence)
         Returns:
             Dict: dictionary containing verification results and suggestions
         """
@@ -520,7 +595,7 @@ class LLMEnhancer:
         result = {
             "verification_text": verification_result,
             "has_errors": "appear accurate" not in verification_result.lower(),
+            "corrected_objects": None
         }

         return result
@@ -567,10 +642,8 @@ class LLMEnhancer:
     def handle_no_detection(self, clip_analysis: Dict[str, Any]) -> str:
         """
         Handle the case where YOLO detects no objects.
         Args: clip_analysis (CLIP analysis results)
         Returns:
             str: generated scene description
         """
@@ -603,10 +676,8 @@ class LLMEnhancer:
     def _clean_input_text(self, text: str) -> str:
         """
         Perform general formatting cleanup on the input text, handling common formatting issues.
         Args: text (input text)
         Returns: the cleaned text
         """
@@ -635,13 +706,11 @@ class LLMEnhancer:
     def _fact_check_description(self, original_desc: str, enhanced_desc: str, scene_type: str, detected_objects: List[str]) -> str:
         """
         Verify and possibly correct the enhanced description to make sure factual accuracy is preserved.
         Args: original_desc (original scene description), enhanced_desc (enhanced description to verify),
             scene_type, detected_objects (list of detected object names)
         Returns: the fact-checked description
         """
@@ -842,13 +911,14 @@ class LLMEnhancer:
         # set model-specific parameters for Llama models
         if "llama" in self.model_path.lower():
             generation_params.update({
+                "temperature": 0.35,  # keep this low so the model does not become too opinionated
                 "max_new_tokens": 600,
                 "do_sample": True,
+                "top_p": 0.75,
+                "repetition_penalty": 1.5,  # penalty weight that discourages repeated wording
+                "num_beams": 5,
+                "length_penalty": 1,
+                "no_repeat_ngram_size": 3
             })

         else:
@@ -885,9 +955,9 @@ class LLMEnhancer:
             if response.startswith(input_text):
                 response = response[len(input_text):].strip()

+            # make sure an empty response is never returned
             if not response or len(response.strip()) < 10:
+                self.logger.warning("response is too short or empty")
                 return "No detailed description could be generated."

             return response
@@ -902,10 +972,8 @@ class LLMEnhancer:
         """
         Clean the LLM response to ensure the output contains only clean descriptive text.
         Sometimes it will not only display the description but display tags, notes...etc
         Args: response (original response from the LLM)
         Returns: cleaned description text
         """
@@ -939,13 +1007,27 @@ class LLMEnhancer:
         for marker in section_markers:
             response = re.sub(marker, '', response, flags=re.IGNORECASE)

+        # 2.5. Deal with "Here is..." style prefixes
+        intro_prefixes = [
+            r'^Here\s+is\s+(?:a\s+|the\s+)?(?:rewritten\s+|enhanced\s+)?scene\s+description.*?:\s*',
+            r'^The\s+(?:rewritten\s+|enhanced\s+)?(?:scene\s+)?description\s+is.*?:\s*',
+            r'^Here\'s\s+(?:a\s+|the\s+)?(?:rewritten\s+|enhanced\s+)?description.*?:\s*'
+        ]
+
+        for prefix_pattern in intro_prefixes:
+            response = re.sub(prefix_pattern, '', response, flags=re.IGNORECASE)
+
         # 3. Remove common prefixes and suffixes
         prefixes_to_remove = [
             "Enhanced Description:",
             "Scene Description:",
             "Description:",
             "Here is the enhanced description:",
+            "Here's the enhanced description:",
+            "Here is a rewritten scene description that adheres to the provided critical rules:",
+            "Here is the rewritten scene description:",
+            "Here's a rewritten scene description:",
+            "The rewritten scene description is as follows:"
         ]

         for prefix in prefixes_to_remove:
@@ -1004,6 +1086,49 @@ class LLMEnhancer:
         # Recombine unique sentences
         response = ' '.join(unique_sentences)

+        # 9.5. Advanced repetition detection and replacement
+        repetitive_descriptors = ['visible', 'positioned', 'located', 'situated', 'appears', 'features', 'shows', 'displays']
+        word_usage_count = {}
+
+        # Count occurrences of each repetitive descriptor
+        for word in repetitive_descriptors:
+            count = len(re.findall(r'\b' + word + r'\b', response, re.IGNORECASE))
+            if count > 1:
+                word_usage_count[word] = count
+
+        # Replace excessive repetitions with varied alternatives
+        replacement_alternatives = {
+            'visible': ['present', 'evident', 'apparent', 'observable'],
+            'positioned': ['arranged', 'placed', 'set', 'organized'],
+            'located': ['found', 'placed', 'situated', 'established'],
+            'situated': ['placed', 'positioned', 'arranged', 'set'],
+            'appears': ['seems', 'looks', 'presents', 'exhibits'],
+            'features': ['includes', 'contains', 'displays', 'showcases'],
+            'shows': ['reveals', 'presents', 'exhibits', 'demonstrates'],
+            'displays': ['presents', 'exhibits', 'shows', 'reveals']
+        }
+
+        for word, count in word_usage_count.items():
+            if count > 1 and word in replacement_alternatives:
+                # Find all occurrences
+                pattern = re.compile(r'\b' + word + r'\b', re.IGNORECASE)
+                matches = list(pattern.finditer(response))
+
+                # Replace subsequent occurrences (keep the first one)
+                for i, match in enumerate(matches[1:], 1):
+                    if i <= len(replacement_alternatives[word]):
+                        replacement = replacement_alternatives[word][(i-1) % len(replacement_alternatives[word])]
+                        # Maintain the original case pattern
+                        if match.group().isupper():
+                            replacement = replacement.upper()
+                        elif match.group().istitle():
+                            replacement = replacement.capitalize()
+
+                        response = response[:match.start()] + replacement + response[match.end():]
+                        # Update the remaining match positions
+                        offset = len(replacement) - len(match.group())
+                        matches = list(pattern.finditer(response))
+
         # 10. Ensure word count is within limits (50-150 words)
         words = response.split()
         if len(words) > 200:
@@ -1035,7 +1160,20 @@ class LLMEnhancer:
             # Remove the last preposition or conjunction
             response = " ".join(words[:-1]) + "."

+        # 12. Grammar completeness check
+        incomplete_patterns = [
+            r'\b(fine|the)\s+(the\s+)?(?:urban|area|scene)\b(?!\s+\w)',  # detect incomplete phrases
+            r'\b(and|or|but|with|from|in|at|on)\s*[.!?]',  # a preposition immediately followed by the end of the sentence
+            r'\b\w+\s+\1\b'  # repeated-word detection
+        ]
+
+        for pattern in incomplete_patterns:
+            if re.search(pattern, response, re.IGNORECASE):
+                # remove or correct the problematic fragment
+                response = re.sub(pattern, '', response, flags=re.IGNORECASE)
+                response = re.sub(r'\s{2,}', ' ', response)  # clean up extra spaces
+
+        # 13. Ensure we haven't over-filtered
         if not response or len(response) < 40:
             # Try to get the first meaningful paragraph from the original response
             paragraphs = [p for p in original_response.split('\n\n') if p.strip()]
@@ -1052,7 +1190,7 @@ class LLMEnhancer:
             # If still no good content, return a simple message
             return "Unable to generate a valid enhanced description."

+        # 14. Final cleaning - catch any missed special cases
         response = re.sub(r'</?\|.*?\|>', '', response)  # Any remaining tags
         response = re.sub(r'\(.*?\)', '', response)  # Any remaining parenthetical content
         response = re.sub(r'Note:.*?(?=\n|$)', '', response, flags=re.IGNORECASE)  # Any remaining notes
@@ -1064,7 +1202,7 @@ class LLMEnhancer:
         if response and response[0].islower():
             response = response[0].upper() + response[1:]

+        # 15. Unify formatting - make sure the output is always a single paragraph
         response = re.sub(r'\s*\n\s*', ' ', response)  # replace all newlines with spaces
         response = ' '.join(response.split())
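For reference, the Llama-specific parameters added above map directly onto the transformers generate API. The following is a minimal, self-contained sketch of a call using the same values; the model name matches the default used by LLMEnhancer, but the loading code around it is illustrative rather than copied from the class.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Llama-3.2-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")

inputs = tokenizer("Rewrite the following scene description ...", return_tensors="pt").to(model.device)
outputs = model.generate(
    **inputs,
    max_new_tokens=600,
    do_sample=True,
    temperature=0.35,          # low temperature keeps the model close to the source facts
    top_p=0.75,
    repetition_penalty=1.5,    # discourages repeated wording
    num_beams=5,
    length_penalty=1.0,
    no_repeat_ngram_size=3,    # forbids any 3-gram from repeating verbatim
)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))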
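The repetition handling in _verify_factual_accuracy and _clean_response keeps the first occurrence of an overused descriptor and rewrites later ones with rotating alternatives. A compact way to express the same idea in a single regex pass is sketched below; this illustrates the technique only and is not the code the module actually runs.

import re

ALTERNATIVES = {
    "visible": ["present", "evident", "apparent", "observable"],
    "positioned": ["arranged", "placed", "set", "organized"],
}

def vary_descriptor(text: str, word: str) -> str:
    """Keep the first occurrence of `word`; replace later ones with rotating alternatives."""
    seen = 0
    def swap(match: re.Match) -> str:
        nonlocal seen
        seen += 1
        if seen == 1:
            return match.group(0)                      # leave the first occurrence untouched
        repl = ALTERNATIVES[word][(seen - 2) % len(ALTERNATIVES[word])]
        src = match.group(0)
        return repl.upper() if src.isupper() else repl.capitalize() if src.istitle() else repl
    return re.compile(r"\b" + word + r"\b", re.IGNORECASE).sub(swap, text)

# vary_descriptor("A table is visible and a chair is visible.", "visible")
# -> "A table is visible and a chair is present."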
object_template_fillers.py
CHANGED
@@ -74,5 +74,10 @@ OBJECT_TEMPLATE_FILLERS = {
     "playing_surface": ["marked court boundaries", "manicured field turf", "running tracks", "competition equipment", "sports field markers"],
     "construction_equipment": ["tower cranes", "excavators", "cement mixers", "scaffolding structures", "construction barriers"],
     "medical_elements": ["examination furniture", "monitoring equipment", "sanitation stations", "privacy screens", "medical supply carts"],
-    "educational_furniture": ["student desks", "lecture podiums", "laboratory benches", "learning stations", "collaborative workspace tables"]
-}
+    "educational_furniture": ["student desks", "lecture podiums", "laboratory benches", "learning stations", "collaborative workspace tables"],
+
+    "landmark_features": ["distinctive architecture", "iconic structural elements", "famous design features", "recognized silhouette", "impressive proportions"],
+    "tourist_activities": ["sightseeing", "guided tours", "photography", "cultural exploration", "souvenir shopping"],
+    "outdoor_activities": ["nature photography", "hiking", "scenic viewing", "wildlife observation", "outdoor exploration"],
+    "historical_elements": ["cultural heritage", "historical events", "architectural periods", "traditional craftsmanship", "significant achievements"]
+}
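These filler lists are meant to be dropped into sentence templates with named placeholders. A minimal sketch of that substitution is shown below; the template string and the helper are illustrative, since the actual templates live in the scene-description modules, and it assumes every placeholder in a template has a matching key in OBJECT_TEMPLATE_FILLERS.

import random
from object_template_fillers import OBJECT_TEMPLATE_FILLERS

def fill_template(template: str) -> str:
    """Replace each {placeholder} with a random filler phrase from OBJECT_TEMPLATE_FILLERS."""
    fillers = {key: random.choice(options) for key, options in OBJECT_TEMPLATE_FILLERS.items()}
    return template.format_map(fillers)

# fill_template("The site features {landmark_features} popular for {tourist_activities}.")
# -> "The site features iconic structural elements popular for guided tours."  (one possible output)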
places365_model.py
ADDED
@@ -0,0 +1,492 @@
+import torch
+import torch.nn as nn
+import torchvision.transforms as transforms
+import numpy as np
+from PIL import Image
+from typing import Dict, List, Tuple, Optional, Any
+import logging
+
+class Places365Model:
+    """
+    Places365 scene classification model wrapper for scene understanding integration.
+    Provides scene classification and scene attribute prediction capabilities.
+    """
+
+    def __init__(self, model_name: str = 'resnet50_places365', device: Optional[str] = None):
+        """
+        Initialize the Places365 model with a configurable architecture and device.
+        Args: model_name (model architecture name, defaults to resnet50), device (target device for inference, auto-detected if None)
+        """
+        self.logger = logging.getLogger(self.__class__.__name__)
+
+        # Device configuration with fallback logic
+        if device is None:
+            self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        else:
+            self.device = device
+
+        self.model_name = model_name
+        self.model = None
+        self.scene_classes = []
+        self.scene_attributes = []
+
+        # Model configuration mapping
+        self.model_configs = {
+            'resnet18_places365': {'arch': 'resnet18', 'num_classes': 365,
+                                   'url': 'http://places2.csail.mit.edu/models_places365/resnet18_places365.pth.tar'},
+            'resnet50_places365': {'arch': 'resnet50', 'num_classes': 365,
+                                   'url': 'http://places2.csail.mit.edu/models_places365/resnet50_places365.pth.tar'},
+            'densenet161_places365': {'arch': 'densenet161', 'num_classes': 365,
+                                      'url': 'http://places2.csail.mit.edu/models_places365/densenet161_places365.pth.tar'}
+        }
+
+        self._load_model()
+        self._load_class_names()
+        self._setup_scene_mapping()
+
+    def _load_model(self):
+        """Load and initialize the Places365 model."""
+        try:
+            if self.model_name not in self.model_configs:
+                raise ValueError(f"Unsupported model name: {self.model_name}")
+
+            config = self.model_configs[self.model_name]
+
+            # Import model architecture
+            if config['arch'].startswith('resnet'):
+                import torchvision.models as models
+                if config['arch'] == 'resnet18':
+                    self.model = models.resnet18(num_classes=config['num_classes'])
+                elif config['arch'] == 'resnet50':
+                    self.model = models.resnet50(num_classes=config['num_classes'])
+            elif config['arch'] == 'densenet161':
+                import torchvision.models as models
+                self.model = models.densenet161(num_classes=config['num_classes'])
+
+            # Load pretrained weights
+            checkpoint = torch.hub.load_state_dict_from_url(
+                config['url'],
+                map_location=self.device,
+                progress=True
+            )
+
+            # Handle different checkpoint formats
+            if 'state_dict' in checkpoint:
+                state_dict = checkpoint['state_dict']
+                # Remove 'module.' prefix if present
+                state_dict = {k.replace('module.', ''): v for k, v in state_dict.items()}
+            else:
+                state_dict = checkpoint
+
+            self.model.load_state_dict(state_dict)
+            self.model.to(self.device)
+            self.model.eval()
+
+            self.logger.info(f"Places365 model {self.model_name} loaded successfully on {self.device}")
+
+        except Exception as e:
+            self.logger.error(f"Error loading Places365 model: {str(e)}")
+            raise
+
+    def _load_class_names(self):
+        """Load Places365 class names and scene attributes."""
+        try:
+            # Load scene class names (365 categories)
+            import urllib.request
+
+            class_url = 'https://raw.githubusercontent.com/csailvision/places365/master/categories_places365.txt'
+            class_file = urllib.request.urlopen(class_url)
+
+            self.scene_classes = []
+            for line in class_file:
+                class_name = line.decode('utf-8').strip().split(' ')[0][3:]  # Remove /x/ prefix
+                self.scene_classes.append(class_name)
+
+            # Load scene attributes (optional, for enhanced description)
+            attr_url = 'https://raw.githubusercontent.com/csailvision/places365/master/labels_sunattribute.txt'
+            try:
+                attr_file = urllib.request.urlopen(attr_url)
+                self.scene_attributes = []
+                for line in attr_file:
+                    attr_name = line.decode('utf-8').strip()
+                    self.scene_attributes.append(attr_name)
+            except:
+                self.logger.warning("Scene attributes not loaded, continuing with basic classification")
+                self.scene_attributes = []
+
+            self.logger.info(f"Loaded {len(self.scene_classes)} scene classes and {len(self.scene_attributes)} attributes")
+
+        except Exception as e:
+            self.logger.error(f"Error loading class names: {str(e)}")
+            # Fallback to basic class names if download fails
+            self.scene_classes = [f"scene_class_{i}" for i in range(365)]
+            self.scene_attributes = []
+
+    def _setup_scene_mapping(self):
+        """Set up the mapping from Places365 classes to common scene types."""
+        # Map Places365 categories to the generic scene types used by the system
+        self.scene_type_mapping = {
+            # Indoor scenes
+            'living_room': 'living_room', 'bedroom': 'bedroom', 'kitchen': 'kitchen',
+            'dining_room': 'dining_area', 'bathroom': 'bathroom', 'office': 'office_workspace',
+            'conference_room': 'office_workspace', 'classroom': 'educational_setting',
+            'library': 'library', 'restaurant': 'restaurant', 'cafe': 'cafe', 'bar': 'bar',
+            'hotel_room': 'hotel_room', 'hospital_room': 'medical_facility', 'gym': 'gym',
+            'supermarket': 'retail_store', 'clothing_store': 'retail_store',
+
+            # Outdoor urban scenes
+            'street': 'city_street', 'crosswalk': 'intersection', 'parking_lot': 'parking_lot',
+            'gas_station': 'gas_station', 'bus_station': 'bus_stop', 'train_station': 'train_station',
+            'airport_terminal': 'airport', 'subway_station': 'subway_station', 'bridge': 'bridge',
+            'highway': 'highway', 'downtown': 'commercial_district', 'shopping_mall': 'shopping_mall',
+
+            # Natural outdoor scenes
+            'park': 'park_area', 'beach': 'beach', 'forest': 'forest', 'mountain': 'mountain',
+            'lake': 'lake', 'river': 'river', 'ocean': 'ocean', 'desert': 'desert',
+            'field': 'field', 'garden': 'garden',
+
+            # Landmark and tourist areas
+            'castle': 'historical_monument', 'palace': 'historical_monument', 'temple': 'temple',
+            'church': 'church', 'mosque': 'mosque', 'museum': 'museum', 'art_gallery': 'art_gallery',
+            'tower': 'tourist_landmark', 'monument': 'historical_monument',
+
+            # Sports and entertainment
+            'stadium': 'stadium', 'basketball_court': 'sports_field', 'tennis_court': 'sports_field',
+            'swimming_pool': 'swimming_pool', 'playground': 'playground',
+            'amusement_park': 'amusement_park', 'theater': 'theater', 'concert_hall': 'concert_hall',
+
+            # Transportation
+            'airplane_cabin': 'airplane_cabin', 'train_interior': 'train_interior', 'car_interior': 'car_interior',
+
+            # Construction and industrial
+            'construction_site': 'construction_site', 'factory': 'factory', 'warehouse': 'warehouse'
+        }
+
+        # Indoor/outdoor classification helper
+        self.indoor_classes = {
+            'living_room', 'bedroom', 'kitchen', 'dining_room', 'bathroom', 'office',
+            'conference_room', 'classroom', 'library', 'restaurant', 'cafe', 'bar',
+            'hotel_room', 'hospital_room', 'gym', 'supermarket', 'clothing_store',
+            'airplane_cabin', 'train_interior', 'car_interior', 'theater', 'concert_hall',
+            'museum', 'art_gallery', 'shopping_mall'
+        }
+
+        self.outdoor_classes = {
+            'street', 'crosswalk', 'parking_lot', 'gas_station', 'bus_station',
+            'train_station', 'airport_terminal', 'bridge', 'highway', 'downtown',
+            'park', 'beach', 'forest', 'mountain', 'lake', 'river', 'ocean',
+            'desert', 'field', 'garden', 'stadium', 'basketball_court', 'tennis_court',
+            'swimming_pool', 'playground', 'amusement_park', 'construction_site',
+            'factory', 'warehouse', 'castle', 'palace', 'temple', 'church', 'mosque',
+            'tower', 'monument'
+        }
+
+    def preprocess(self, image_pil: Image.Image) -> torch.Tensor:
+        """Preprocess a PIL image for Places365 model inference and return the image tensor."""
+        # Places365 standard preprocessing
+        transform = transforms.Compose([
+            transforms.Resize((256, 256)),
+            transforms.CenterCrop(224),
+            transforms.ToTensor(),
+            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+        ])
+
+        # Convert to RGB if needed
+        if image_pil.mode != 'RGB':
+            image_pil = image_pil.convert('RGB')
+
+        # Apply preprocessing
+        input_tensor = transform(image_pil).unsqueeze(0)
+        return input_tensor.to(self.device)
+
+    def predict(self, image_pil: Image.Image) -> Dict[str, Any]:
+        """Predict scene classification and attributes for the input image; returns a dict of predictions and confidence scores."""
+        try:
+            # Preprocess image
+            input_tensor = self.preprocess(image_pil)
+
+            # Model inference
+            with torch.no_grad():
+                outputs = self.model(input_tensor)
+                probabilities = torch.nn.functional.softmax(outputs, dim=1)
+
+            # Return the most likely entries
+            top_k = min(10, len(self.scene_classes))  # Configurable top-k
+            top_probs, top_indices = torch.topk(probabilities, top_k, dim=1)
+
+            # Extract results
+            top_probs = top_probs.cpu().numpy()[0]
+            top_indices = top_indices.cpu().numpy()[0]
+
+            # Build prediction results
+            predictions = []
+            for i in range(top_k):
+                class_idx = top_indices[i]
+                confidence = float(top_probs[i])
+                scene_class = self.scene_classes[class_idx]
+
+                predictions.append({
+                    'class_name': scene_class,
+                    'class_index': class_idx,
+                    'confidence': confidence
+                })
+
+            # Get primary prediction
+            primary_prediction = predictions[0]
+            primary_class = primary_prediction['class_name']
+
+            # Determine whether the scene is indoor or outdoor
+            is_indoor = self._classify_indoor_outdoor(primary_class)
+
+            # Map to common scene type
+            mapped_scene_type = self._map_places365_to_scene_types(primary_class)
+
+            # Determine scene attributes (basic inference based on class)
+            scene_attributes = self._infer_scene_attributes(primary_class)
+
+            result = {
+                'scene_label': primary_class,
+                'mapped_scene_type': mapped_scene_type,
+                'confidence': primary_prediction['confidence'],
+                'is_indoor': is_indoor,
+                'attributes': scene_attributes,
+                'top_predictions': predictions,
+                'all_probabilities': probabilities.cpu().numpy()[0].tolist()
+            }
+
+            return result
+
+        except Exception as e:
+            self.logger.error(f"Error in Places365 prediction: {str(e)}")
+            return {
+                'scene_label': 'unknown',
+                'mapped_scene_type': 'unknown',
+                'confidence': 0.0,
+                'is_indoor': None,
+                'attributes': [],
+                'top_predictions': [],
+                'error': str(e)
+            }
+
+    def _classify_indoor_outdoor(self, scene_class: str) -> Optional[bool]:
+        """Classify a Places365 class as indoor (True), outdoor (False), or uncertain (None)."""
+        if scene_class in self.indoor_classes:
+            return True
+        elif scene_class in self.outdoor_classes:
+            return False
+        else:
+            # For ambiguous classes, use heuristics
+            indoor_keywords = ['room', 'office', 'store', 'shop', 'hall', 'interior', 'indoor']
+            outdoor_keywords = ['street', 'road', 'park', 'field', 'beach', 'mountain', 'outdoor']
+
+            scene_lower = scene_class.lower()
+            if any(keyword in scene_lower for keyword in indoor_keywords):
+                return True
+            elif any(keyword in scene_lower for keyword in outdoor_keywords):
+                return False
+            else:
+                return None
+
+    def _map_places365_to_scene_types(self, places365_class: str) -> str:
+        """Map a Places365 class to the common scene type used by the system."""
+        # Direct mapping lookup
+        if places365_class in self.scene_type_mapping:
+            return self.scene_type_mapping[places365_class]
+
+        # Fuzzy matching for similar classes
+        places365_lower = places365_class.lower()
+
+        # Indoor fuzzy matching
+        if any(keyword in places365_lower for keyword in ['living', 'bedroom', 'kitchen']):
+            return 'general_indoor_space'
+        elif any(keyword in places365_lower for keyword in ['office', 'conference', 'meeting']):
+            return 'office_workspace'
+        elif any(keyword in places365_lower for keyword in ['dining', 'restaurant', 'cafe']):
+            return 'dining_area'
+        elif any(keyword in places365_lower for keyword in ['store', 'shop', 'market']):
+            return 'retail_store'
+        elif any(keyword in places365_lower for keyword in ['school', 'class', 'library']):
+            return 'educational_setting'
+
+        # Outdoor fuzzy matching
+        elif any(keyword in places365_lower for keyword in ['street', 'road', 'crosswalk']):
+            return 'city_street'
+        elif any(keyword in places365_lower for keyword in ['park', 'garden', 'plaza']):
+            return 'park_area'
+        elif any(keyword in places365_lower for keyword in ['beach', 'ocean', 'lake']):
+            return 'beach'
+        elif any(keyword in places365_lower for keyword in ['mountain', 'forest', 'desert']):
+            return 'natural_outdoor_area'
+        elif any(keyword in places365_lower for keyword in ['parking', 'garage']):
+            return 'parking_lot'
+        elif any(keyword in places365_lower for keyword in ['station', 'terminal', 'airport']):
+            return 'transportation_hub'
+
+        # Landmark fuzzy matching
+        elif any(keyword in places365_lower for keyword in ['castle', 'palace', 'monument', 'temple']):
+            return 'historical_monument'
+        elif any(keyword in places365_lower for keyword in ['tower', 'landmark']):
+            return 'tourist_landmark'
+        elif any(keyword in places365_lower for keyword in ['museum', 'gallery']):
+            return 'cultural_venue'
+
+        # Default fallback based on indoor/outdoor
+        is_indoor = self._classify_indoor_outdoor(places365_class)
+        if is_indoor is True:
+            return 'general_indoor_space'
+        elif is_indoor is False:
+            return 'generic_street_view'
+        else:
+            return 'unknown'
+
+    def _infer_scene_attributes(self, scene_class: str) -> List[str]:
+        """Infer basic scene attributes from a Places365 class name."""
+        attributes = []
+        scene_lower = scene_class.lower()
+
+        # Lighting attributes
+        if any(keyword in scene_lower for keyword in ['outdoor', 'street', 'park', 'beach']):
+            attributes.append('natural_lighting')
+        elif any(keyword in scene_lower for keyword in ['indoor', 'room', 'office']):
+            attributes.append('artificial_lighting')
+
+        # Functional attributes
+        if any(keyword in scene_lower for keyword in ['commercial', 'store', 'shop', 'restaurant']):
+            attributes.append('commercial')
+        elif any(keyword in scene_lower for keyword in ['residential', 'home', 'living', 'bedroom']):
+            attributes.append('residential')
+        elif any(keyword in scene_lower for keyword in ['office', 'conference', 'meeting']):
+            attributes.append('workplace')
+        elif any(keyword in scene_lower for keyword in ['recreation', 'park', 'playground', 'stadium']):
+            attributes.append('recreational')
+        elif any(keyword in scene_lower for keyword in ['educational', 'school', 'library', 'classroom']):
+            attributes.append('educational')
+
+        # Spatial attributes
+        if any(keyword in scene_lower for keyword in ['open', 'field', 'plaza', 'stadium']):
+            attributes.append('open_space')
+        elif any(keyword in scene_lower for keyword in ['enclosed', 'room', 'interior']):
+            attributes.append('enclosed_space')
+
+        return attributes
+
+    def get_scene_probabilities(self, image_pil: Image.Image) -> Dict[str, float]:
+        """
+        Get probability distribution over all scene classes.
|
469 |
+
|
470 |
+
Args:
|
471 |
+
image_pil: Input PIL image
|
472 |
+
|
473 |
+
Returns:
|
474 |
+
Dict mapping scene class names to probabilities
|
475 |
+
"""
|
476 |
+
try:
|
477 |
+
input_tensor = self.preprocess(image_pil)
|
478 |
+
|
479 |
+
with torch.no_grad():
|
480 |
+
outputs = self.model(input_tensor)
|
481 |
+
probabilities = torch.nn.functional.softmax(outputs, dim=1)
|
482 |
+
|
483 |
+
probs = probabilities.cpu().numpy()[0]
|
484 |
+
|
485 |
+
return {
|
486 |
+
self.scene_classes[i]: float(probs[i])
|
487 |
+
for i in range(len(self.scene_classes))
|
488 |
+
}
|
489 |
+
|
490 |
+
except Exception as e:
|
491 |
+
self.logger.error(f"Error getting scene probabilities: {str(e)}")
|
492 |
+
return {}
|
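For orientation, here is a minimal sketch of how the prediction dictionary built above might be consumed downstream. The class name `Places365Model`, its zero-argument constructor, and the image path are assumptions for illustration; only the result keys come from the code above.

# Hypothetical usage sketch; class name, constructor, and image path are assumed.
from PIL import Image
from places365_model import Places365Model  # assumed class name

model = Places365Model()
result = model.predict_scene(Image.open("example.jpg"))

if result.get("confidence", 0.0) >= 0.5:
    # is_indoor is True, False, or None when uncertain
    setting = {True: "indoor", False: "outdoor", None: "uncertain"}[result["is_indoor"]]
    print(f"{result['scene_label']} -> {result['mapped_scene_type']} ({setting})")
    print("attributes:", ", ".join(result["attributes"]) or "none")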
requirements.txt
CHANGED
@@ -1,16 +1,17 @@
-torch>=2.0.0
-torchvision>=0.15.0
-ultralytics>=8.0.0
-opencv-python>=4.7.0
-pillow>=9.4.0
-numpy>=1.23.5
-matplotlib>=3.7.0
-gradio>=3.32.0
-git+https://github.com/openai/CLIP.git
-yt-dlp>=2023.3.4
-requests>=2.28.1
-transformers
-accelerate
-bitsandbytes
-sentencepiece
-huggingface_hub>=0.19.0
+# torch>=2.0.0
+# torchvision>=0.15.0
+# ultralytics>=8.0.0
+# opencv-python>=4.7.0
+# pillow>=9.4.0
+# numpy>=1.23.5
+# matplotlib>=3.7.0
+# gradio>=3.32.0
+# git+https://github.com/openai/CLIP.git
+# yt-dlp>=2023.3.4
+# requests>=2.28.1
+# transformers
+# accelerate
+# bitsandbytes
+# sentencepiece
+# huggingface_hub>=0.19.0
+# urllib3>=1.26.0
scene_analyzer.py
CHANGED
The diff for this file is too large to render. See raw diff.
scene_description.py
CHANGED
@@ -59,7 +59,7 @@ class SceneDescriptor:
             "low": "This might be {description}, but the confidence is low. {details}"
         }

-        #
+        # Provide only the most basic templates as a fallback
         self.scene_detail_templates = {
             "default": ["A space with various objects."]
         }
@@ -105,53 +105,90 @@ class SceneDescriptor:
         return alternatives

-    def _infer_possible_activities(self, scene_type: str, detected_objects: List[Dict]) -> List[str]:
+    def _infer_possible_activities(self, scene_type: str, detected_objects: List[Dict], enable_landmark: bool = True, scene_scores: Optional[Dict] = None) -> List[str]:
         """
         Infer possible activities based on scene type and detected objects.

         Args:
             scene_type: Identified scene type
             detected_objects: List of detected objects
+            enable_landmark: Whether landmark detection is enabled
+            scene_scores: Optional dictionary of scene type scores

         Returns:
             List of possible activities
         """
         activities = []

+        # Dynamically replace landmark scene types when landmark detection is disabled
+        if not enable_landmark and scene_type in ["tourist_landmark", "natural_landmark", "historical_monument"]:
+            alternative_scene_type = self._get_alternative_scene_type(scene_type, detected_objects, scene_scores)
+            print(f"Replacing landmark scene type '{scene_type}' with '{alternative_scene_type}' for activity inference")
+            scene_type = alternative_scene_type
+
+        # Process aerial view scenes
         if scene_type.startswith("aerial_view_"):
             if scene_type == "aerial_view_intersection":
+                # Use predefined intersection activities
                 activities.extend(self.activity_templates.get("aerial_view_intersection", []))
+
+                # Add pedestrian and vehicle specific activities
                 pedestrians = [obj for obj in detected_objects if obj["class_id"] == 0]
                 vehicles = [obj for obj in detected_objects if obj["class_id"] in [2, 5, 7]]  # Car, bus, truck
+
                 if pedestrians and vehicles:
                     activities.append("Waiting for an opportunity to cross the street")
                     activities.append("Obeying traffic signals")
+
             elif scene_type == "aerial_view_commercial_area":
                 activities.extend(self.activity_templates.get("aerial_view_commercial_area", []))
+
             elif scene_type == "aerial_view_plaza":
                 activities.extend(self.activity_templates.get("aerial_view_plaza", []))
+
             else:
+                # Handle other undefined aerial view scenes
                 aerial_activities = [
+                    "Street crossing",
+                    "Waiting for signals",
+                    "Following traffic rules",
                     "Pedestrian movement"
                 ]
                 activities.extend(aerial_activities)

+        # Add scene-specific activities from templates
         if scene_type in self.activity_templates:
             activities.extend(self.activity_templates[scene_type])
         elif "default" in self.activity_templates:
             activities.extend(self.activity_templates["default"])

+        # Filter out landmark-related activities when landmark detection is disabled
+        if not enable_landmark:
+            filtered_activities = []
+            landmark_keywords = ["sightseeing", "landmark", "tourist", "monument", "historical",
+                                 "guided tour", "photography", "cultural tourism", "heritage"]
+
+            for activity in activities:
+                if not any(keyword in activity.lower() for keyword in landmark_keywords):
+                    filtered_activities.append(activity)
+
+            activities = filtered_activities
+
+            # If we filtered out all activities, add some generic ones based on scene type
+            if not activities:
+                generic_activities = {
+                    "city_street": ["Walking", "Commuting", "Shopping"],
+                    "intersection": ["Crossing the street", "Waiting for traffic signals"],
+                    "commercial_district": ["Shopping", "Walking", "Dining"],
+                    "pedestrian_area": ["Walking", "Socializing", "Shopping"],
+                    "park_area": ["Relaxing", "Walking", "Exercise"],
+                    "outdoor_natural_area": ["Walking", "Nature observation", "Relaxation"],
+                    "urban_architecture": ["Walking", "Urban exploration", "Photography"]
+                }
+
+                activities.extend(generic_activities.get(scene_type, ["Walking", "Observing surroundings"]))
+
+        # Add activities based on detected objects
         detected_class_ids = [obj["class_id"] for obj in detected_objects]

         # Add activities based on specific object combinations
@@ -181,8 +218,48 @@ class SceneDescriptor:
         if 24 in detected_class_ids or 26 in detected_class_ids:  # Backpack or handbag
             activities.append("Carrying personal items")

+        # Add more person count-dependent activities
+        person_count = detected_class_ids.count(0)
+        if person_count > 3:
+            activities.append("Group gathering")
+        elif person_count > 1:
+            activities.append("Social interaction")
+
+        # Add additional activities based on significant objects
+        if 43 in detected_class_ids:  # cup
+            activities.append("Drinking beverages")
+
+        if 32 in detected_class_ids:  # sports ball
+            activities.append("Playing sports")
+
+        if 25 in detected_class_ids:  # umbrella
+            activities.append("Sheltering from weather")
+
+        # Add location-specific activities based on environment objects
+        if any(furniture in detected_class_ids for furniture in [56, 57, 58, 59, 60]):  # furniture items
+            activities.append("Using indoor facilities")
+
+        if any(outdoor_item in detected_class_ids for outdoor_item in [13, 14, 15]):  # bench, outdoor items
+            activities.append("Enjoying outdoor spaces")
+
+        # Remove duplicates and ensure reasonable number of activities
+        unique_activities = list(set(activities))
+
+        # Limit to reasonable number (maximum 8 activities)
+        if len(unique_activities) > 8:
+            # Prioritize more specific activities over general ones
+            general_activities = ["Walking", "Observing surroundings", "Commuting", "Using indoor facilities"]
+            specific_activities = [a for a in unique_activities if a not in general_activities]
+
+            # Take all specific activities first, then fill with general ones if needed
+            if len(specific_activities) <= 8:
+                result = specific_activities + general_activities[:8-len(specific_activities)]
+            else:
+                result = specific_activities[:8]
+        else:
+            result = unique_activities
+
+        return result

     def _identify_safety_concerns(self, detected_objects: List[Dict], scene_type: str) -> List[str]:
         """
@@ -198,8 +275,6 @@ class SceneDescriptor:
         concerns = []
         detected_class_ids = [obj["class_id"] for obj in detected_objects]

-        # ORIGINAL SAFETY CONCERNS LOGIC
-
         # General safety concerns
         if 42 in detected_class_ids or 43 in detected_class_ids:  # Fork or knife
             concerns.append("Sharp utensils present")
@@ -232,8 +307,6 @@ class SceneDescriptor:
             if obj["region"] in ["top_left", "top_center", "top_right"] and obj["normalized_area"] > 0.05:
                 concerns.append(f"Elevated {obj['class_name']} might be unstable")

-        # NEW SAFETY CONCERNS LOGIC FOR ADDITIONAL SCENE TYPES
-
         # Upscale dining safety concerns
         if scene_type == "upscale_dining":
             # Check for fragile items
@@ -295,7 +368,6 @@ class SceneDescriptor:
             concerns.append("Two-wheeled vehicles in pedestrian areas")

         # Check for potential trip hazards
-        # We can't directly detect this, but can infer from context
         if scene_type == "asian_commercial_street" and "bottom" in " ".join([obj["region"] for obj in detected_objects if obj["class_id"] == 0]):
             # If people are in bottom regions, they might be walking on uneven surfaces
             concerns.append("Potential uneven walking surfaces in commercial area")
@@ -324,7 +396,6 @@ class SceneDescriptor:
             concerns.append("Busy traffic area potentially without visible traffic signals in view")

         # Time of day considerations
-        # We don't have direct time data, but can infer from vehicle lights
         vehicle_objs = [obj for obj in detected_objects if obj["class_id"] in [2, 5, 7]]
         if vehicle_objs and any("lighting_conditions" in obj for obj in detected_objects):
             # If vehicles are present and it might be evening/night
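A short illustration of the new `enable_landmark` switch above: with landmark detection disabled, landmark scene types are remapped and sightseeing/tourism activities are filtered out before the list is capped at eight entries. The constructor call and the detection dictionaries below are assumptions for illustration.

# Illustration only; SceneDescriptor's constructor arguments are assumed.
from scene_description import SceneDescriptor

descriptor = SceneDescriptor()
detections = [
    {"class_id": 0, "class_name": "person", "confidence": 0.81, "region": "bottom_center"},
    {"class_id": 2, "class_name": "car", "confidence": 0.77, "region": "middle_right"},
]
activities = descriptor._infer_possible_activities(
    "tourist_landmark", detections, enable_landmark=False
)
print(activities)  # generic street-level activities, no sightseeing entries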
scene_detail_templates.py
CHANGED
@@ -200,4 +200,19 @@ SCENE_DETAIL_TEMPLATES = {
         "This professional culinary area contains {kitchen_equipment} arranged in stations for {food_preparation}.",
         "An industrial kitchen featuring {kitchen_equipment} designed for efficient {food_preparation}."
     ],
+    "tourist_landmark": [
+        "This notable landmark attracts visitors who come to see {landmark_features} and experience {tourist_activities}.",
+        "A famous landmark site where tourists can observe {landmark_features} and engage in {tourist_activities}.",
+        "This iconic landmark showcases {landmark_features} and is a popular destination for {tourist_activities}."
+    ],
+    "natural_landmark": [
+        "This natural landmark features {landmark_features} and offers opportunities for {outdoor_activities}.",
+        "A scenic natural formation with {landmark_features} where visitors enjoy {outdoor_activities}.",
+        "This impressive natural landmark displays {landmark_features} and attracts nature enthusiasts for {outdoor_activities}."
+    ],
+    "historical_monument": [
+        "This historical monument exhibits {landmark_features} and has significance related to {historical_elements}.",
+        "An important historical site featuring {landmark_features} and representing {historical_elements}.",
+        "This heritage monument showcases {landmark_features} and commemorates {historical_elements}."
+    ]
 }
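These templates are plain `str.format` strings, so filling one only requires supplying the named placeholders. A minimal sketch follows; the filler phrases are invented examples, not values produced by the system.

# Sketch of filling a landmark template; the filler phrases are invented for illustration.
import random
from scene_detail_templates import SCENE_DETAIL_TEMPLATES

template = random.choice(SCENE_DETAIL_TEMPLATES["tourist_landmark"])
print(template.format(
    landmark_features="its wrought-iron lattice tower and observation decks",
    tourist_activities="guided tours and photography",
))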
scene_type.py
CHANGED
@@ -384,4 +384,127 @@ SCENE_TYPES = {
         "minimum_required": 3,
         "description": "A commercial kitchen with professional cooking equipment and food preparation areas"
     },
+    "tourist_landmark": {
+        "name": "Tourist Landmark",
+        "required_objects": [0],  # person
+        "optional_objects": [24, 26, 67],  # backpack, handbag, cell phone
+        "minimum_required": 0,  # people may be absent, but it is still a landmark
+        "description": "A location featuring a famous landmark with tourist activity",
+        "priority": 1.2  # raised priority
+    },
+    "natural_landmark": {
+        "name": "Natural Landmark",
+        "required_objects": [0],  # person
+        "optional_objects": [24, 26, 67],  # backpack, handbag, cell phone
+        "minimum_required": 0,
+        "description": "A natural landmark site with scenic views",
+        "priority": 1.2
+    },
+    "historical_monument": {
+        "name": "Historical Monument",
+        "required_objects": [0],  # person
+        "optional_objects": [24, 26, 67],  # backpack, handbag, cell phone
+        "minimum_required": 0,
+        "description": "A historical monument or heritage site",
+        "priority": 1.2
+    },
+    "general_indoor_space": {
+        "name": "General Indoor Space",
+        "required_objects": [],  # No strict required objects, depends on combination
+        "optional_objects": [
+            56,  # chair
+            57,  # couch
+            58,  # potted plant
+            59,  # bed
+            60,  # dining table
+            61,  # toilet
+            62,  # tv
+            63,  # laptop
+            66,  # keyboard
+            67,  # cell phone
+            73,  # book
+            74,  # clock
+            75,  # vase
+            39,  # bottle
+            41,  # cup
+        ],
+        "minimum_required": 2,  # Needs at least a few common indoor items
+        "description": "An indoor area with various common household or functional items.",
+        "priority": 0.8  # Lower priority than more specific scenes
+    },
+    "generic_street_view": {
+        "name": "Generic Street View",
+        "required_objects": [],  # More about the combination
+        "optional_objects": [
+            0,   # person
+            1,   # bicycle
+            2,   # car
+            3,   # motorcycle
+            5,   # bus
+            7,   # truck
+            9,   # traffic light
+            10,  # fire hydrant
+            11,  # stop sign
+            13,  # bench
+            # Consider adding building if YOLO detects it (not a standard COCO class for YOLOv8, but some custom models might)
+        ],
+        "minimum_required": 2,  # e.g., a car and a person, or multiple vehicles
+        "description": "An outdoor street view, likely in an urban or suburban setting, with vehicles and/or pedestrians.",
+        "priority": 0.85
+    },
+    "desk_area_workspace": {
+        "name": "Desk Area / Workspace",
+        "required_objects": [
+            63,  # laptop or 62 (tv as monitor) or 66 (keyboard)
+        ],
+        "optional_objects": [
+            56,  # chair
+            60,  # dining table (often used as a desk)
+            64,  # mouse
+            66,  # keyboard
+            73,  # book
+            41,  # cup
+            67,  # cell phone
+            74,  # clock
+        ],
+        "minimum_required": 2,  # e.g., laptop and chair, or table and keyboard
+        "description": "A workspace or desk area, typically featuring a computer and related accessories.",
+        "priority": 0.9
+    },
+    "outdoor_gathering_spot": {
+        "name": "Outdoor Gathering Spot",
+        "required_objects": [
+            0,  # person
+        ],
+        "optional_objects": [
+            13,  # bench
+            32,  # sports ball
+            24,  # backpack
+            25,  # umbrella
+            29,  # frisbee
+            33,  # kite
+            58,  # potted plant (if in a more structured park area)
+        ],
+        "minimum_required": 2,  # e.g., person and bench, or multiple people
+        "description": "An outdoor area where people might gather for leisure or activity.",
+        "priority": 0.8
+    },
+    "kitchen_counter_or_utility_area": {
+        "name": "Kitchen Counter or Utility Area",
+        "required_objects": [],
+        "optional_objects": [
+            39,  # bottle
+            41,  # cup
+            44,  # spoon
+            45,  # bowl
+            68,  # microwave
+            69,  # oven
+            70,  # toaster
+            71,  # sink
+            72,  # refrigerator
+        ],
+        "minimum_required": 2,  # e.g., sink and microwave, or refrigerator and bottles
+        "description": "An area likely used for food preparation or kitchen utilities.",
+        "priority": 0.9
+    }
 }
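Each entry combines `required_objects`, `optional_objects`, `minimum_required`, and an optional `priority` multiplier. The sketch below shows one naive way such a specification could be scored against a set of detected COCO class IDs; it is illustrative only, since the real scoring logic lives in scene_analyzer.py, whose diff is not rendered above, and the helper function here is hypothetical.

# Illustrative only; naive_scene_score is a hypothetical helper, not the project's scorer.
from scene_type import SCENE_TYPES

def naive_scene_score(scene_key: str, detected_class_ids: set) -> float:
    spec = SCENE_TYPES[scene_key]
    relevant = set(spec["required_objects"]) | set(spec["optional_objects"])
    matched = relevant & detected_class_ids
    if len(matched) < spec["minimum_required"]:
        return 0.0
    base = len(matched) / max(len(relevant), 1)   # fraction of relevant objects seen
    return base * spec.get("priority", 1.0)       # landmark types get a 1.2x boost

# Laptop, keyboard, chair, and table strongly suggest a desk area.
print(naive_scene_score("desk_area_workspace", {56, 60, 63, 66}))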
spatial_analyzer.py
CHANGED
@@ -282,19 +282,29 @@ class SpatialAnalyzer:
         # Group objects by category and region
         category_regions = {}

+        if not getattr(self, 'enable_landmark', True):
+            detected_objects = [obj for obj in detected_objects if not obj.get("is_landmark", False)]
+
+            # Filter landmark-related scene types
+            if scene_type in ["tourist_landmark", "natural_landmark", "historical_monument"]:
+                scene_type = "city_street"
+
+        # MODIFIED: Smart threshold evaluation instead of fixed values
+        should_identify = self._evaluate_zone_identification_feasibility(detected_objects, scene_type)
+
+        if not should_identify:
+            return {}
+
+        # MODIFIED: Build category_regions mapping (was missing in original)
         for obj in detected_objects:
+            category = self._categorize_object(obj)
+            if not category:
+                continue
+
             if category not in category_regions:
                 category_regions[category] = {}

+            region = obj.get("region", "center")
             if region not in category_regions[category]:
                 category_regions[category][region] = []

@@ -328,156 +338,470 @@ class SpatialAnalyzer:
         elif scene_type == "upscale_dining":
             # Upscale dining specific logic
             zones.update(self._identify_upscale_dining_zones(category_regions, detected_objects))
+        elif scene_type == "tourist_landmark" or "landmark" in scene_type:
+            # Handle landmark scene types
+            landmark_objects = [obj for obj in detected_objects if obj.get("is_landmark", False)]
+            if landmark_objects:
+                landmark_zones = self._identify_landmark_zones(landmark_objects)
+                zones.update(landmark_zones)
         else:
             # Default zone identification for other scene types
             zones.update(self._identify_default_zones(category_regions, detected_objects))

+        # Check for landmark objects even when the scene type is not a landmark type
+        if scene_type != "tourist_landmark" and "landmark" not in scene_type:
+            landmark_objects = [obj for obj in detected_objects if obj.get("is_landmark", False)]
+            if landmark_objects:
+                # Add landmark zones without overwriting existing zones
+                landmark_zones = self._identify_landmark_zones(landmark_objects)
+                # Make sure landmark zones do not overwrite other important zones already identified
+                for zone_id, zone_info in landmark_zones.items():
+                    if zone_id not in zones:
+                        zones[zone_id] = zone_info
+
+        # MODIFIED: Enhanced fallback strategy - try simplified identification if no zones found
         if not zones:
             zones.update(self._identify_default_zones(category_regions, detected_objects))

+            # Final fallback: create basic zones from high-confidence objects
+            if not zones:
+                zones.update(self._create_basic_zones_from_objects(detected_objects, scene_type))
+
         return zones

+    def _identify_core_objects_for_scene(self, detected_objects: List[Dict], scene_type: str) -> List[Dict]:
+        """
+        Identify core objects that define a particular scene type.
+
+        Args:
+            detected_objects: List of detected objects
+            scene_type: Scene type
+
+        Returns:
+            List of core objects for the scene
+        """
+        core_objects = []
+
+        scene_core_mapping = {
+            "bedroom": [59],  # bed
+            "kitchen": [68, 69, 71, 72],  # microwave, oven, sink, refrigerator
+            "living_room": [57, 58, 62],  # sofa, chair, tv
+            "dining_area": [60, 46, 47],  # dining table, fork, knife
+            "office_workspace": [63, 64, 66, 73]  # laptop, mouse, keyboard, book
+        }
+
+        if scene_type in scene_core_mapping:
+            core_class_ids = scene_core_mapping[scene_type]
+            for obj in detected_objects:
+                if obj["class_id"] in core_class_ids and obj.get("confidence", 0) >= 0.4:
+                    core_objects.append(obj)
+
+        return core_objects
+
+    def _get_object_categories(self, detected_objects: List[Dict]) -> set:
+        """Get unique object categories from detected objects."""
+        object_categories = set()
+        for obj in detected_objects:
+            category = self._categorize_object(obj)
+            if category:
+                object_categories.add(category)
+        return object_categories
+
+    def _create_basic_zones_from_objects(self, detected_objects: List[Dict], scene_type: str) -> Dict:
+        """
+        Create basic functional zones from individual high-confidence objects.
+        This is a fallback when standard zone identification fails.
+
+        Args:
+            detected_objects: List of detected objects
+            scene_type: Scene type
+
+        Returns:
+            Dictionary of basic zones
+        """
+        zones = {}
+
+        # Focus on high-confidence objects
+        high_conf_objects = [obj for obj in detected_objects if obj.get("confidence", 0) >= 0.6]
+
+        if not high_conf_objects:
+            high_conf_objects = detected_objects  # Fallback to all objects
+
+        # Create zones based on individual important objects
+        for i, obj in enumerate(high_conf_objects[:3]):  # Limit to top 3 objects
+            class_name = obj["class_name"]
+            region = obj.get("region", "center")
+
+            # Create descriptive zone based on object type
+            zone_description = self._get_basic_zone_description(class_name, scene_type)
+
+            if zone_description:
+                zones[f"functional_area_{i+1}"] = {
+                    "region": region,
+                    "objects": [class_name],
+                    "description": zone_description
+                }
+
+        return zones
+
+    def _get_basic_zone_description(self, class_name: str, scene_type: str) -> str:
+        """Generate basic zone description based on object and scene type."""
+
+        # Object-specific descriptions
+        descriptions = {
+            "bed": "Sleeping and rest area",
+            "sofa": "Seating and relaxation area",
+            "chair": "Seating area",
+            "dining table": "Dining and meal area",
+            "tv": "Entertainment and media area",
+            "laptop": "Work and computing area",
+            "potted plant": "Decorative and green space area",
+            "refrigerator": "Food storage and kitchen area",
+            "car": "Vehicle and transportation area",
+            "person": "Activity and social area"
+        }
+
+        return descriptions.get(class_name, f"Functional area with {class_name}")
+
+    def _categorize_object(self, obj: Dict) -> str:
+        """
+        Categorize detected objects into functional categories for zone identification.
+        """
+        class_id = obj.get("class_id", -1)
+        class_name = obj.get("class_name", "").lower()
+
+        # Use existing category mapping if available
+        if hasattr(self, 'OBJECT_CATEGORIES') and self.OBJECT_CATEGORIES:
+            for category, ids in self.OBJECT_CATEGORIES.items():
+                if class_id in ids:
+                    return category
+
+        # Fallback categorization based on class names for common COCO classes
+        furniture_items = ["chair", "couch", "bed", "dining table", "toilet"]
+        plant_items = ["potted plant"]
+        electronic_items = ["tv", "laptop", "mouse", "remote", "keyboard", "cell phone"]
+        vehicle_items = ["bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat"]
+        person_items = ["person"]
+        kitchen_items = ["bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl",
+                         "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog",
+                         "pizza", "donut", "cake", "refrigerator", "oven", "toaster", "sink", "microwave"]
+        sports_items = ["frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat",
+                        "baseball glove", "skateboard", "surfboard", "tennis racket"]
+        personal_items = ["handbag", "tie", "suitcase", "umbrella", "backpack"]
+
+        if any(item in class_name for item in furniture_items):
+            return "furniture"
+        elif any(item in class_name for item in plant_items):
+            return "plant"
+        elif any(item in class_name for item in electronic_items):
+            return "electronics"
+        elif any(item in class_name for item in vehicle_items):
+            return "vehicle"
+        elif any(item in class_name for item in person_items):
+            return "person"
+        elif any(item in class_name for item in kitchen_items):
+            return "kitchen_items"
+        elif any(item in class_name for item in sports_items):
+            return "sports"
+        elif any(item in class_name for item in personal_items):
+            return "personal_items"
+        else:
+            return "misc"
+
+    def _evaluate_zone_identification_feasibility(self, detected_objects: List[Dict], scene_type: str) -> bool:
+        """
+        Flexible feasibility assessment based on object relationships and spatial distribution.
+        """
+        if len(detected_objects) < 2:
+            return False
+
+        # Count objects at different confidence levels
+        high_conf_objects = [obj for obj in detected_objects if obj.get("confidence", 0) >= 0.6]
+        medium_conf_objects = [obj for obj in detected_objects if obj.get("confidence", 0) >= 0.4]
+
+        # Basic condition: require a minimum number of reasonably confident objects
+        if len(medium_conf_objects) < 2:
+            return False
+
+        # Evaluate functional relationships between objects
+        functional_relationships = self._calculate_functional_relationships(detected_objects)
+
+        # Evaluate spatial distribution diversity
+        spatial_diversity = self._calculate_spatial_diversity(detected_objects)
+
+        # Combined scoring mechanism
+        feasibility_score = 0
+
+        # Object count contribution (30% weight)
+        object_count_score = min(len(detected_objects) / 5.0, 1.0) * 0.3
+
+        # Confidence quality contribution (25% weight)
+        confidence_score = len(high_conf_objects) / max(len(detected_objects), 1) * 0.25
+
+        # Functional relationship contribution (25% weight)
+        relationship_score = functional_relationships * 0.25
+
+        # Spatial diversity contribution (20% weight)
+        diversity_score = spatial_diversity * 0.20
+
+        feasibility_score = object_count_score + confidence_score + relationship_score + diversity_score
+
+        # Dynamic threshold adjusted by scene complexity
+        complexity_threshold = self._get_complexity_threshold(scene_type)
+
+        return feasibility_score >= complexity_threshold
+
+    def _calculate_functional_relationships(self, detected_objects: List[Dict]) -> float:
+        """
+        Compute a functional-relationship score between objects,
+        based on common object combination patterns.
+        """
+        relationship_pairs = {
+            # Furniture combinations
+            frozenset([56, 60]): 1.0,  # chair + dining table (dining/work area)
+            frozenset([57, 62]): 0.9,  # sofa + tv (living area)
+            frozenset([59, 58]): 0.7,  # bed + potted plant (bedroom decor)
+
+            # Work-related combinations
+            frozenset([63, 66]): 0.9,  # laptop + keyboard (workspace)
+            frozenset([63, 64]): 0.8,  # laptop + mouse (workspace)
+            frozenset([60, 63]): 0.8,  # dining table + laptop (workspace)
+
+            # Kitchen-related combinations
+            frozenset([68, 72]): 0.9,  # microwave + refrigerator (kitchen)
+            frozenset([69, 71]): 0.8,  # oven + sink (kitchen)
+
+            # Dining-related combinations
+            frozenset([60, 40]): 0.8,  # dining table + wine glass (dining)
+            frozenset([60, 41]): 0.8,  # dining table + cup (dining)
+            frozenset([56, 40]): 0.7,  # chair + wine glass (dining)
+
+            # Traffic-related combinations
+            frozenset([2, 9]): 0.8,    # car + traffic light (traffic)
+            frozenset([0, 9]): 0.7,    # person + traffic light (crosswalk)
+        }
+
+        detected_class_ids = set(obj["class_id"] for obj in detected_objects)
+        max_possible_score = 0
+        actual_score = 0
+
+        for pair, score in relationship_pairs.items():
+            max_possible_score += score
+            if pair.issubset(detected_class_ids):
+                actual_score += score
+
+        return actual_score / max_possible_score if max_possible_score > 0 else 0
+
+    def _calculate_spatial_diversity(self, detected_objects: List[Dict]) -> float:
+        """
+        Compute the diversity of the objects' spatial distribution.
+        Checks whether objects are spread over different regions rather than
+        concentrated in a single one.
+        """
+        regions = set(obj.get("region", "center") for obj in detected_objects)
+        unique_regions = len(regions)
+
+        return min(unique_regions / 2.0, 1.0)
+
+    def _get_complexity_threshold(self, scene_type: str) -> float:
+        """
+        Return an appropriate complexity threshold for the scene type,
+        balancing zone-splitting needs across different scenes.
+        """
+        # Simpler scenes need a higher score before zones are split
+        simple_scenes = ["bedroom", "bathroom", "closet"]
+        # More complex scenes may split zones at a lower score
+        complex_scenes = ["living_room", "kitchen", "office_workspace", "dining_area"]
+
+        if scene_type in simple_scenes:
+            return 0.65  # Higher threshold to avoid over-segmentation
+        elif scene_type in complex_scenes:
+            return 0.45  # Lower threshold to allow reasonable segmentation
+        else:
+            return 0.55  # Medium threshold as a balanced strategy
+
+    def _identify_indoor_zones(self, category_regions: Dict, detected_objects: List[Dict], scene_type: str) -> Dict:
+        """
+        Balanced indoor functional zone identification.
+        Uses generic object-relationship analysis and avoids scene-specific hard coding.
+        """
+        zones = {}
+
+        # Identify the primary functional area (based on object relationships rather than scene type)
+        primary_zone = self._identify_primary_functional_area(detected_objects)
+        if primary_zone:
+            zones["primary_area"] = primary_zone
+
+        # Create a secondary functional area only with clear evidence and enough objects
+        if len(zones) >= 1 and len(detected_objects) >= 6:
+            secondary_zone = self._identify_secondary_functional_area(detected_objects, zones)
+            if secondary_zone:
+                zones["secondary_area"] = secondary_zone

         return zones

+    def _identify_primary_functional_area(self, detected_objects: List[Dict]) -> Dict:
+        """
+        Identify the primary functional area based on the strongest object combination.
+        Uses generic logic to handle a variety of indoor scenes.
+        """
+        # Dining area detection (table and chair combination)
+        dining_area = self._detect_functional_combination(
+            detected_objects,
+            primary_objects=[60],  # dining table
+            supporting_objects=[56, 40, 41, 42, 43],  # chair, wine glass, cup, fork, knife
+            min_supporting=2,
+            description_template="Dining area with table and seating arrangement"
+        )
+        if dining_area:
+            return dining_area
+
+        # Rest area detection (sofa/tv combination or bed)
+        seating_area = self._detect_functional_combination(
+            detected_objects,
+            primary_objects=[57, 59],  # sofa, bed
+            supporting_objects=[62, 58, 56],  # tv, potted plant, chair
+            min_supporting=1,
+            description_template="Seating and relaxation area"
+        )
+        if seating_area:
+            return seating_area
+
+        # Work area detection (electronics and furniture combination)
+        work_area = self._detect_functional_combination(
+            detected_objects,
+            primary_objects=[63, 66],  # laptop, keyboard
+            supporting_objects=[60, 56, 64],  # dining table, chair, mouse
+            min_supporting=2,
+            description_template="Workspace area with electronics and furniture"
+        )
+        if work_area:
+            return work_area
+
+        return None
+
+    def _identify_secondary_functional_area(self, detected_objects: List[Dict], existing_zones: Dict) -> Dict:
+        """
+        Identify a secondary functional area while avoiding overlap with the primary one.
+        """
+        # Collect regions already in use
+        used_regions = set(zone["region"] for zone in existing_zones.values())
+
+        # Decorative area detection (clusters of plants)
+        decorative_area = self._detect_functional_combination(
+            detected_objects,
+            primary_objects=[58],  # potted plant
+            supporting_objects=[75],  # vase
+            min_supporting=0,
+            min_primary=3,  # At least 3 plants required
+            description_template="Decorative area with plants and ornamental items",
+            exclude_regions=used_regions
+        )
+        if decorative_area:
+            return decorative_area
+
+        # Storage area detection (kitchen appliance combination)
+        storage_area = self._detect_functional_combination(
+            detected_objects,
+            primary_objects=[72, 68, 69],  # refrigerator, microwave, oven
+            supporting_objects=[71],  # sink
+            min_supporting=0,
+            min_primary=2,
+            description_template="Kitchen appliance and storage area",
+            exclude_regions=used_regions
+        )
+        if storage_area:
+            return storage_area
+
+        return None
+
+    def _detect_functional_combination(self, detected_objects: List[Dict], primary_objects: List[int],
+                                       supporting_objects: List[int], min_supporting: int,
+                                       description_template: str, min_primary: int = 1,
+                                       exclude_regions: set = None) -> Dict:
+        """
+        Generic functional-combination detection.
+        Judges functional areas from combinations of primary and supporting objects.
+
+        Args:
+            detected_objects: List of detected objects
+            primary_objects: class_id list of primary objects
+            supporting_objects: class_id list of supporting objects
+            min_supporting: Minimum number of supporting objects required
+            description_template: Description template
+            min_primary: Minimum number of primary objects required
+            exclude_regions: Set of regions to exclude
+
+        Returns:
+            Dict: Functional area info, or None if the requirements are not met
+        """
+        if exclude_regions is None:
+            exclude_regions = set()
+
+        # Collect primary objects
+        primary_objs = [obj for obj in detected_objects
+                        if obj["class_id"] in primary_objects and obj.get("confidence", 0) >= 0.4]
+
+        # Collect supporting objects
+        supporting_objs = [obj for obj in detected_objects
+                           if obj["class_id"] in supporting_objects and obj.get("confidence", 0) >= 0.4]
+
+        # Check the minimum count requirements
+        if len(primary_objs) < min_primary or len(supporting_objs) < min_supporting:
+            return None
+
+        # Organize objects by region
+        region_combinations = {}
+        all_relevant_objs = primary_objs + supporting_objs
+
+        for obj in all_relevant_objs:
+            region = obj["region"]
+
+            # Skip excluded regions
+            if region in exclude_regions:
+                continue
+
+            if region not in region_combinations:
+                region_combinations[region] = {"primary": [], "supporting": [], "all": []}
+
+            region_combinations[region]["all"].append(obj)
+
+            if obj["class_id"] in primary_objects:
+                region_combinations[region]["primary"].append(obj)
+            else:
+                region_combinations[region]["supporting"].append(obj)
+
+        # Find the best region combination
+        best_region = None
+        best_score = 0
+
+        for region, objs in region_combinations.items():
+            # Score this region
+            primary_count = len(objs["primary"])
+            supporting_count = len(objs["supporting"])
+
+            # Must satisfy the minimum requirements
+            if primary_count < min_primary or supporting_count < min_supporting:
+                continue
+
+            # Combination score (primary objects carry more weight)
+            score = primary_count * 2 + supporting_count
+
+            if score > best_score:
+                best_score = score
+                best_region = region
+
+        if best_region is None:
+            return None
+
+        best_combination = region_combinations[best_region]
+        all_objects = [obj["class_name"] for obj in best_combination["all"]]
+
+        return {
+            "region": best_region,
+            "objects": all_objects,
+            "description": description_template
+        }
+
     def _identify_intersection_zones(self, category_regions: Dict, detected_objects: List[Dict], viewpoint: str) -> Dict:
         """
         Identify functional zones for urban intersections with enhanced spatial awareness.
@@ -532,6 +856,142 @@ class SpatialAnalyzer:

         return zones

+    def _identify_landmark_zones(self, landmark_objects: List[Dict]) -> Dict:
+        """
+        Identify functional zones related to landmarks.
+
+        Args:
+            landmark_objects: List of objects identified as landmarks
+
+        Returns:
+            Dict: Landmark-related functional zones
+        """
+        landmark_zones = {}
+
+        if not landmark_objects:
+            print("Warning: No landmark objects provided to _identify_landmark_zones")
+            return landmark_zones
+
+        try:
+            for i, landmark in enumerate(landmark_objects):
+                if not isinstance(landmark, dict):
+                    print(f"Warning: Landmark object at index {i} is not a dictionary: {type(landmark)}")
+                    continue
+
+                landmark_id = landmark.get("landmark_id")
+                if not landmark_id:
+                    print(f"Warning: Missing landmark_id for landmark at index {i}")
+                    landmark_id = f"unknown_landmark_{i}"
+
+                landmark_name = landmark.get("class_name", "Landmark")
+                landmark_type = landmark.get("landmark_type", "architectural")
+                landmark_region = landmark.get("region", "middle_center")
+
+                # Create the main viewing zone for the landmark
+                zone_id = f"landmark_zone_{i+1}"
+                zone_name = f"{landmark_name} Viewing Area"
+
+                # Adjust the description according to the landmark type
+                if landmark_type == "natural":
+                    zone_description = f"Scenic viewpoint for observing {landmark_name}, a notable natural landmark in {landmark.get('location', 'this area')}."
+                    primary_function = "Nature observation and photography"
+                elif landmark_type == "monument":
+                    zone_description = f"Viewing area around {landmark_name}, a significant monument in {landmark.get('location', 'this area')}."
+                    primary_function = "Historical appreciation and cultural tourism"
+                else:  # architectural
+                    zone_description = f"Area centered around {landmark_name}, where visitors can observe and appreciate this iconic structure in {landmark.get('location', 'this area')}."
+                    primary_function = "Architectural tourism and photography"
+
+                # Determine objects related to the landmark
+                related_objects = ["person", "camera", "cell phone", "backpack"]
+
+                # Create the functional zone
+                landmark_zones[zone_id] = {
+                    "name": zone_name,
+                    "description": zone_description,
+                    "objects": ["landmark"] + [obj for obj in related_objects if obj in [o.get("class_name") for o in landmark_objects]],
+                    "region": landmark_region,
+                    "primary_function": primary_function
+                }
+
+                # Append construction year to the description if available
+                if "year_built" in landmark:
+                    landmark_zones[zone_id]["description"] += f" Built in {landmark['year_built']}."
+
+                # Append architectural style to the description if available
+                if "architectural_style" in landmark:
+                    landmark_zones[zone_id]["description"] += f" Features {landmark['architectural_style']} architectural style."
+
+                # Append significance to the description if available
+                if "significance" in landmark:
+                    landmark_zones[zone_id]["description"] += f" {landmark['significance']}."
+
+                try:
+                    # Create a photography spot
+                    photo_region = landmark_region  # Default to the same region as the landmark
+
+                    # Adjust the photo spot position relative to the landmark (usually in front of it)
+                    region_mapping = {
+                        "top_left": "bottom_right",
+                        "top_center": "bottom_center",
+                        "top_right": "bottom_left",
+                        "middle_left": "middle_right",
+                        "middle_center": "bottom_center",
+                        "middle_right": "middle_left",
+                        "bottom_left": "top_right",
+                        "bottom_center": "top_center",
+                        "bottom_right": "top_left"
+                    }
+
+                    if landmark_region in region_mapping:
+                        photo_region = region_mapping[landmark_region]
+
+                    landmark_zones[f"photo_spot_{i+1}"] = {
+                        "name": f"{landmark_name} Photography Spot",
+                        "description": f"Popular position for photographing {landmark_name} with optimal viewing angle.",
+                        "objects": ["camera", "person", "cell phone"],
+                        "region": photo_region,
+                        "primary_function": "Tourist photography"
+                    }
+                except Exception as e:
+                    print(f"Error creating photo spot zone: {e}")
+
+                try:
+                    # Famous landmarks may also have a souvenir area
+                    if landmark.get("confidence", 0) > 0.7:  # High-confidence landmarks are more likely to have one
+                        # Pick a suitable spot near the landmark (adjacent to, but not on, the landmark itself)
+                        adjacent_regions = {
+                            "top_left": ["top_center", "middle_left"],
+                            "top_center": ["top_left", "top_right"],
+                            "top_right": ["top_center", "middle_right"],
+                            "middle_left": ["top_left", "bottom_left"],
+                            "middle_center": ["middle_left", "middle_right"],
+                            "middle_right": ["top_right", "bottom_right"],
+                            "bottom_left": ["middle_left", "bottom_center"],
+                            "bottom_center": ["bottom_left", "bottom_right"],
+                            "bottom_right": ["bottom_center", "middle_right"]
+                        }
+
+                        if landmark_region in adjacent_regions:
+                            souvenir_region = adjacent_regions[landmark_region][0]  # Pick the first adjacent region
+
+                            landmark_zones[f"souvenir_area_{i+1}"] = {
+                                "name": f"{landmark_name} Souvenir Area",
+                                "description": f"Area where visitors can purchase souvenirs and memorabilia related to {landmark_name}.",
+                                "objects": ["person", "handbag", "backpack"],
+                                "region": souvenir_region,
+                                "primary_function": "Tourism commerce"
+                            }
+                except Exception as e:
+                    print(f"Error creating souvenir area zone: {e}")
+
+        except Exception as e:
+            print(f"Error in _identify_landmark_zones: {e}")
+            import traceback
+            traceback.print_exc()
+
+        return landmark_zones
+
     def _analyze_crossing_patterns(self, pedestrians: List[Dict], traffic_lights: List[Dict],
                                    region_distribution: Dict) -> Dict:
         """
@@ -601,7 +1061,7 @@ class SpatialAnalyzer:
         if not vehicles:
             return traffic_zones

-        #
+        # Group all vehicles into a single traffic zone
         vehicle_regions = {}
         for v in vehicles:
             region = v["region"]
@@ -652,7 +1112,7 @@ class SpatialAnalyzer:

     def _get_directional_description(self, region: str) -> str:
         """
-
+        Convert a grid region into a compass-style direction description (north, south, east, west).

         Args:
             region: Region name from the grid
@@ -1433,12 +1893,3 @@ class SpatialAnalyzer:
         return max(region_objects_dict.items(),
                    key=lambda x: len(x[1]),
                    default=("unknown", []))[0]
-
-    def _find_main_region(self, region_objects_dict: Dict) -> str:
-        """Find the main region with the most objects"""
-        if not region_objects_dict:
-            return "unknown"
-
-        return max(region_objects_dict.items(),
-                   key=lambda x: len(x[1]),
-                   default=("unknown", []))[0]
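To make the new feasibility gate above concrete, here is a toy sketch of the scoring on a small detection list. The constructor call and the use of the underscored helpers directly are assumptions for illustration; in the application these are invoked internally during functional-zone identification.

# Toy example of the feasibility scoring; SpatialAnalyzer's constructor arguments are assumed.
from spatial_analyzer import SpatialAnalyzer

analyzer = SpatialAnalyzer()
detections = [
    {"class_id": 60, "class_name": "dining table", "confidence": 0.82, "region": "middle_center"},
    {"class_id": 56, "class_name": "chair", "confidence": 0.74, "region": "middle_left"},
    {"class_id": 41, "class_name": "cup", "confidence": 0.55, "region": "middle_center"},
]

# Table + chair + cup gives strong functional relationships and two regions,
# so the score should clear the 0.45 threshold used for dining_area.
print(analyzer._evaluate_zone_identification_feasibility(detections, "dining_area"))
print(analyzer._identify_primary_functional_area(detections))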
video_processor.py
CHANGED
@@ -222,7 +222,7 @@ class VideoProcessor:
         else:
             obj_id = next_object_id
             next_object_id += 1
-
+
         # Use more prominent colors
         bright_colors = [
             (0, 0, 255),  # red