Spaces:
Running
on
Zero
Running
on
Zero
import torch | |
import clip | |
from PIL import Image | |
import numpy as np | |
from typing import List, Dict, Tuple, Optional, Union, Any | |
from landmark_data import ALL_LANDMARKS, get_all_landmark_prompts | |
from landmark_activities import LANDMARK_ACTIVITIES | |
class CLIPZeroShotClassifier: | |
""" | |
使用CLIP模型進行零樣本分類,專注於識別世界知名地標。 | |
作為YOLO檢測的補充,處理標準對象檢測無法識別的地標建築。 | |
""" | |
def __init__(self, model_name: str = "ViT-B/16", device: Optional[str] = None):
    """
    Initialize the CLIP zero-shot landmark classifier.

    Loads the CLIP model, then the landmark metadata and prompts, pre-computes
    the prompt text features, and sets up thresholds and the result cache.

    Args:
        model_name: Name of the CLIP model to load (default "ViT-B/16").
        device: Device to run on. When None, "cuda" is used if
            ``torch.cuda.is_available()`` and "cpu" otherwise.

    Raises:
        Exception: Whatever ``clip.load`` raised is logged and re-raised.
    """
    # Pick the execution device.
    if device is None:
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
    else:
        self.device = device
    print(f"Initializing CLIP Zero-Shot Landmark Classifier ({model_name}) on {self.device}")

    try:
        self.model, self.preprocess = clip.load(model_name, device=self.device)
        print("Successfully loaded CLIP model")
    except Exception as e:
        print(f"Error loading CLIP model: {e}")
        raise

    # Tunables and cache are initialised before the landmark data is loaded
    # so the instance is fully usable even when the fallback branch below
    # runs (previously these attributes were missing in that case).
    self.batch_size = 16  # default batch size
    # Multipliers applied to the base threshold per detection type.
    self.confidence_threshold_multipliers = {
        "close_up": 0.9,    # close-up views
        "partial": 0.6,     # partially visible landmarks
        "distant": 0.5,     # distant views
        "full_image": 0.7   # whole-image search
    }
    # Per-landmark-type thresholds; callers divide these by 0.5 to obtain
    # a multiplier for the adjusted threshold.
    self.landmark_type_thresholds = {
        "tower": 0.5,
        "skyscraper": 0.4,
        "building": 0.55,
        "monument": 0.5,
        "natural": 0.6
    }
    self.results_cache = {}     # keyed by tuples containing the image hash
    self.cache_max_size = 100   # maximum number of cached entries

    # Load landmark data and pre-compute the text side of the comparison.
    try:
        self.landmark_data = ALL_LANDMARKS
        self.landmark_prompts = get_all_landmark_prompts()
        print(f"Loaded {len(self.landmark_prompts)} landmark prompts for classification")
        self.landmark_text_features = self._precompute_text_features(self.landmark_prompts)
        # Landmark id -> position, for quick lookups.
        self.landmark_id_to_index = {
            landmark_id: i for i, landmark_id in enumerate(ALL_LANDMARKS.keys())
        }
    except ImportError:
        print("Warning: landmark_data.py not found. Landmark classification will be limited")
        self.landmark_data = {}
        self.landmark_prompts = []
        self.landmark_text_features = None
        self.landmark_id_to_index = {}
def _get_image_hash(self, image): | |
""" | |
為圖像生成簡單的 hash 值用於快取 | |
Args: | |
image: PIL Image 或 numpy 數組 | |
Returns: | |
str: 圖像的 hash 值 | |
""" | |
if isinstance(image, np.ndarray): | |
# 對於 numpy 數組,降採樣並計算簡單 hash | |
small_img = image[::10, ::10] if image.ndim == 3 else image | |
return hash(small_img.tobytes()) | |
else: | |
# 對於 PIL 圖像,調整大小後轉換為 bytes | |
small_img = image.resize((32, 32)) | |
return hash(small_img.tobytes()) | |
def _manage_cache(self): | |
""" | |
管理結果快取大小 | |
""" | |
if len(self.results_cache) > self.cache_max_size: | |
oldest_key = next(iter(self.results_cache)) | |
del self.results_cache[oldest_key] | |
def set_batch_size(self, batch_size: int): | |
""" | |
設置批處理大小 | |
Args: | |
batch_size: 新的批處理大小 | |
""" | |
self.batch_size = max(1, batch_size) | |
print(f"Batch size set to {self.batch_size}") | |
def adjust_confidence_threshold(self, detection_type: str, multiplier: float): | |
""" | |
調整特定檢測類型的置信度閾值乘數 | |
Args: | |
detection_type: 檢測類型 ('close_up', 'partial', 'distant', 'full_image') | |
multiplier: 置信度閾值乘數 | |
""" | |
if detection_type in self.confidence_threshold_multipliers: | |
self.confidence_threshold_multipliers[detection_type] = max(0.1, min(1.5, multiplier)) | |
print(f"Adjusted confidence threshold multiplier for {detection_type} to {multiplier}") | |
else: | |
print(f"Unknown detection type: {detection_type}") | |
def _precompute_text_features(self, text_prompts: List[str]) -> torch.Tensor: | |
""" | |
預計算文本提示的CLIP特徵,提高批處理效率 | |
Args: | |
text_prompts: 文本提示列表 | |
Returns: | |
torch.Tensor: 預計算的文本特徵 | |
""" | |
if not text_prompts: | |
return None | |
with torch.no_grad(): | |
# Process in batches to avoid CUDA memory issues | |
batch_size = 128 # Adjust based on GPU memory | |
features_list = [] | |
for i in range(0, len(text_prompts), batch_size): | |
batch_prompts = text_prompts[i:i+batch_size] | |
text_tokens = clip.tokenize(batch_prompts).to(self.device) | |
batch_features = self.model.encode_text(text_tokens) | |
batch_features = batch_features / batch_features.norm(dim=-1, keepdim=True) | |
features_list.append(batch_features) | |
# Concatenate all batches | |
if len(features_list) > 1: | |
text_features = torch.cat(features_list, dim=0) | |
else: | |
text_features = features_list[0] | |
return text_features | |
def _perform_pyramid_analysis(self,
                              image: Union[Image.Image, np.ndarray],
                              levels: int = 4,
                              base_threshold: float = 0.25,
                              aspect_ratios: Optional[List[float]] = None) -> Dict[str, Any]:
    """
    Run the landmark classifier on several rescaled versions of the image.

    Each pyramid level shrinks the image by a further 20% and every level is
    tried with each aspect ratio. The best prompt match of each variant is
    kept when its score reaches ``base_threshold``.

    Args:
        image: Input image (PIL image or numpy array).
        levels: Number of pyramid levels.
        base_threshold: Minimum softmax score for a variant to be recorded.
        aspect_ratios: Aspect-ratio factors to try; defaults to
            ``[1.0, 0.75, 1.5]`` when None.

    Returns:
        Dict with ``is_landmark`` (bool), ``results`` (matches sorted by
        descending confidence) and ``best_result`` (top match or None).

    Raises:
        ValueError: If ``image`` is neither a PIL image nor a numpy array.
    """
    if aspect_ratios is None:
        aspect_ratios = [1.0, 0.75, 1.5]

    # Ensure image is PIL format
    if not isinstance(image, Image.Image):
        if isinstance(image, np.ndarray):
            image = Image.fromarray(image)
        else:
            raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")

    pyramid_results = []

    # Without pre-computed text features there is nothing to compare against.
    if self.landmark_text_features is None:
        return {"is_landmark": False, "results": pyramid_results, "best_result": None}

    width, height = image.size
    # NOTE(review): indexing landmark ids by prompt index assumes one prompt
    # per landmark, in landmark_data order - confirm against
    # get_all_landmark_prompts().
    landmark_ids = list(self.landmark_data.keys())

    for level in range(levels):
        scale_factor = 1.0 - (level * 0.2)
        if scale_factor <= 0:
            # Deeper levels would have no pixels left to analyse.
            break

        for aspect_ratio in aspect_ratios:
            if aspect_ratio != 1.0:
                # Change the aspect ratio while keeping the area roughly constant.
                new_width = int(width * scale_factor * (1/aspect_ratio)**0.5)
                new_height = int(height * scale_factor * aspect_ratio**0.5)
            else:
                new_width = int(width * scale_factor)
                new_height = int(height * scale_factor)
            # Never ask PIL for a zero-sized image.
            new_width = max(1, new_width)
            new_height = max(1, new_height)

            scaled_image = image.resize((new_width, new_height), Image.LANCZOS)
            image_input = self.preprocess(scaled_image).unsqueeze(0).to(self.device)

            with torch.no_grad():
                image_features = self.model.encode_image(image_input)
                image_features = image_features / image_features.norm(dim=-1, keepdim=True)
                similarity = (100.0 * image_features @ self.landmark_text_features.T).softmax(dim=-1)

            # .cpu() is a no-op on CPU tensors and is required for every
            # accelerator, not only when the device string is exactly "cuda".
            scores = similarity[0].cpu().numpy()

            best_idx = int(scores.argmax())
            best_score = float(scores[best_idx])
            if best_score < base_threshold:
                continue

            landmark_id = landmark_ids[best_idx]
            landmark_info = self.landmark_data[landmark_id]
            pyramid_results.append({
                "landmark_id": landmark_id,
                "landmark_name": landmark_info["name"],
                "confidence": best_score,
                "scale_factor": scale_factor,
                "aspect_ratio": aspect_ratio,
                "location": landmark_info["location"]
            })

    # Highest confidence first.
    pyramid_results.sort(key=lambda x: x["confidence"], reverse=True)
    return {
        "is_landmark": len(pyramid_results) > 0,
        "results": pyramid_results,
        "best_result": pyramid_results[0] if pyramid_results else None
    }
def _enhance_features(self, image: Union[Image.Image, np.ndarray]) -> Image.Image:
    """
    Stretch the luminance contrast of an image.

    The image is converted to LAB, its L channel is rescaled between the
    2nd and 98th percentile, and the result is converted back to RGB.
    The input image is returned unchanged when it has fewer than three
    array dimensions, when scikit-image is unavailable, when the luminance
    is flat, or when any processing step fails.

    Args:
        image: Input image (PIL image or numpy array).

    Returns:
        PIL.Image: Enhanced image, or the (PIL) input on any fallback path.

    Raises:
        ValueError: If ``image`` is neither a PIL image nor a numpy array.
    """
    # Ensure image is PIL format
    if not isinstance(image, Image.Image):
        if isinstance(image, np.ndarray):
            image = Image.fromarray(image)
        else:
            raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")

    img_array = np.array(image)
    # Skip processing for grayscale images
    if img_array.ndim < 3:
        return image

    # The import lives inside a try so a missing scikit-image degrades
    # gracefully instead of raising out of this method.
    try:
        from skimage import color, exposure
    except ImportError:
        print("Warning: skimage not available for feature enhancement")
        return image

    try:
        # Drop the alpha channel, if any, and move to LAB.
        lab = color.rgb2lab(img_array[:, :, :3] / 255.0)
        l_channel = lab[:, :, 0]

        p2, p98 = np.percentile(l_channel, (2, 98))
        if p98 <= p2:
            # Flat luminance: there is no contrast to stretch.
            return image

        # LAB lightness spans 0..100. The default out_range for float input
        # would squash it to 0..1 and produce a near-black image.
        lab[:, :, 0] = exposure.rescale_intensity(
            l_channel, in_range=(p2, p98), out_range=(0.0, 100.0)
        )

        enhanced_img = np.clip(color.lab2rgb(lab) * 255.0, 0, 255).astype(np.uint8)
        return Image.fromarray(enhanced_img)
    except Exception as e:
        # Enhancement is best-effort; fall back to the original image.
        print(f"Error in feature enhancement: {e}")
        return image
def _determine_landmark_type(self, landmark_id): | |
""" | |
自動判斷地標類型,基於地標數據和命名 | |
Returns: | |
str: 地標類型,用於調整閾值 | |
""" | |
if not landmark_id: | |
return "building" # 預設類型 | |
# 獲取地標詳細數據 | |
landmark_data = self.landmark_data if hasattr(self, 'landmark_data') else {} | |
landmark_info = landmark_data.get(landmark_id, {}) | |
# 獲取地標相關文本 | |
landmark_id_lower = landmark_id.lower() | |
landmark_name = landmark_info.get("name", "").lower() | |
landmark_location = landmark_info.get("location", "").lower() | |
landmark_aliases = [alias.lower() for alias in landmark_info.get("aliases", [])] | |
# 合併所有文本數據用於特徵判斷 | |
combined_text = " ".join([landmark_id_lower, landmark_name] + landmark_aliases) | |
# 地標類型的特色特徵 | |
type_features = { | |
"skyscraper": ["skyscraper", "tall", "tower", "高樓", "摩天", "大厦", "タワー"], | |
"tower": ["tower", "bell", "clock", "塔", "鐘樓", "タワー", "campanile"], | |
"monument": ["monument", "memorial", "statue", "紀念", "雕像", "像", "memorial"], | |
"natural": ["mountain", "lake", "canyon", "falls", "beach", "山", "湖", "峽谷", "瀑布", "海灘"], | |
"temple": ["temple", "shrine", "寺", "神社", "廟"], | |
"palace": ["palace", "castle", "宮", "城", "皇宮", "宫殿"], | |
"distinctive": ["unique", "leaning", "slanted", "傾斜", "斜", "獨特", "傾く"] | |
} | |
# 檢查是否位於亞洲地區 | |
asian_regions = ["china", "japan", "korea", "taiwan", "singapore", "vietnam", "thailand", | |
"hong kong", "中國", "日本", "韓國", "台灣", "新加坡", "越南", "泰國", "香港"] | |
is_asian = any(region in landmark_location for region in asian_regions) | |
# 判斷地標類型 | |
best_type = None | |
max_matches = 0 | |
for type_name, features in type_features.items(): | |
# 計算特徵詞匹配數量 | |
matches = sum(1 for feature in features if feature in combined_text) | |
if matches > max_matches: | |
max_matches = matches | |
best_type = type_name | |
# 處理亞洲地區特例 | |
if is_asian and best_type == "tower": | |
best_type = "skyscraper" # 亞洲地區的塔型建築閾值較低 | |
# 特例處理:檢測傾斜建築 | |
if any(term in combined_text for term in ["leaning", "slanted", "tilt", "inclined", "斜", "傾斜"]): | |
return "distinctive" # 傾斜建築需要特殊處理 | |
return best_type if best_type and max_matches > 0 else "building" # 預設為一般建築 | |
def classify_image_region(self,
                          image: Union[Image.Image, np.ndarray],
                          box: List[float],
                          threshold: float = 0.25,
                          detection_type: str = "close_up") -> Dict[str, Any]:
    """
    Classify one rectangular region of an image against the landmark prompts.

    The region is cropped, its viewpoint is analysed to pick the detection
    type, scales and aspect ratios, and every scale/aspect-ratio variant of
    the crop is scored with CLIP. The best-scoring prompt is reported, with
    ``is_landmark`` decided by a threshold adjusted for detection type and
    landmark type.

    Args:
        image: Full image (PIL image or numpy array).
        box: Bounding box ``[x1, y1, x2, y2]`` in pixel coordinates.
        threshold: Base confidence threshold.
        detection_type: 'close_up', 'partial', 'distant', or 'auto' to derive
            it from the fraction of the image the region covers.

    Returns:
        Dict with at least ``landmark_id``, ``landmark_name``, ``confidence``
        and ``is_landmark``. When a prompt matched it also carries location,
        scale, aspect ratio, viewpoint, landmark type and applied threshold.

    Raises:
        ValueError: If ``image`` is neither a PIL image nor a numpy array.
    """
    # Ensure the image is in PIL format.
    if not isinstance(image, Image.Image):
        if isinstance(image, np.ndarray):
            image = Image.fromarray(image)
        else:
            raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")

    no_match = {
        "landmark_id": None,
        "landmark_name": None,
        "confidence": 0.0,
        "is_landmark": False
    }

    # Without pre-computed text features there is nothing to compare against.
    if self.landmark_text_features is None:
        return dict(no_match)

    # Cache lookup. A copy is handed out because callers annotate the result
    # in place, which must not leak into the cache.
    region_key = (self._get_image_hash(image), tuple(box), detection_type)
    cached = self.results_cache.get(region_key)
    if cached is not None:
        return dict(cached)

    x1, y1, x2, y2 = map(int, box)
    if x2 <= x1 or y2 <= y1:
        # Empty or inverted box: there are no pixels to classify.
        return dict(no_match)

    cropped_image = image.crop((x1, y1, x2, y2))
    # The enhanced crop is used for viewpoint analysis only; classification
    # below runs on the plain crop.
    enhanced_image = self._enhance_features(cropped_image)

    viewpoint_info = self._analyze_viewpoint(enhanced_image)
    dominant_viewpoint = viewpoint_info["dominant_viewpoint"]

    # Fraction of the full image covered by the region.
    image_width, image_height = image.size
    region_area_ratio = ((x2 - x1) * (y2 - y1)) / (image_width * image_height)

    if detection_type == "auto":
        if region_area_ratio > 0.5:
            detection_type = "close_up"
        elif region_area_ratio > 0.2:
            detection_type = "partial"
        else:
            detection_type = "distant"

    # The analysed viewpoint overrides the requested detection type.
    if dominant_viewpoint == "close_up":
        detection_type = "close_up"
    elif dominant_viewpoint == "distant":
        detection_type = "distant"
    elif dominant_viewpoint == "angled_view":
        detection_type = "partial"  # angled views may show only part of the landmark

    base_multiplier = self.confidence_threshold_multipliers.get(detection_type, 1.0)
    adjusted_threshold = threshold * base_multiplier

    # Scales to try: a single one by default, a range for partial/distant
    # regions, and a wider range for angled or low-angle viewpoints.
    scales = [1.0]
    if detection_type in ["partial", "distant"]:
        scales = [0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3]
    if dominant_viewpoint in ["angled_view", "low_angle"]:
        scales = [0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4]

    # Aspect ratios to try; more variety for angled / unusual structures.
    aspect_ratios = [1.0, 0.8, 1.2]
    if dominant_viewpoint in ["angled_view", "unique_feature"]:
        aspect_ratios = [0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.5]

    best_result = dict(no_match)

    # Loop invariants.
    current_width, current_height = cropped_image.size
    # NOTE(review): indexing landmark ids by prompt index assumes one prompt
    # per landmark, in landmark_data order - confirm against
    # get_all_landmark_prompts().
    landmark_ids = list(self.landmark_data.keys())

    for scale in scales:
        for aspect_ratio in aspect_ratios:
            if aspect_ratio != 1.0:
                # Change the aspect ratio while keeping the area roughly constant.
                new_width = int(current_width * scale * (1/aspect_ratio)**0.5)
                new_height = int(current_height * scale * aspect_ratio**0.5)
            else:
                new_width = int(current_width * scale)
                new_height = int(current_height * scale)
            # Keep both dimensions at one pixel or more.
            new_width = max(1, new_width)
            new_height = max(1, new_height)

            try:
                scaled_image = cropped_image.resize((new_width, new_height), Image.LANCZOS)
            except Exception as e:
                print(f"Failed to resize image to {new_width}x{new_height}: {e}")
                continue

            try:
                image_input = self.preprocess(scaled_image).unsqueeze(0).to(self.device)
            except Exception as e:
                print(f"Failed to preprocess image: {e}")
                continue

            try:
                with torch.no_grad():
                    image_features = self.model.encode_image(image_input)
                    image_features = image_features / image_features.norm(dim=-1, keepdim=True)
                    similarity = (100.0 * image_features @ self.landmark_text_features.T).softmax(dim=-1)

                # .cpu() is a no-op on CPU tensors and is required for every
                # accelerator, not only when the device string is "cuda".
                scores = similarity[0].cpu().numpy()
                best_idx = int(scores.argmax())
                best_score = float(scores[best_idx])

                # Keep this variant only if it beats everything seen so far.
                if best_score > best_result["confidence"]:
                    landmark_id = landmark_ids[best_idx]
                    landmark_info = self.landmark_data[landmark_id]
                    best_result = {
                        "landmark_id": landmark_id,
                        "landmark_name": landmark_info["name"],
                        "location": landmark_info["location"],
                        "confidence": best_score,
                        "is_landmark": best_score >= adjusted_threshold,
                        "scale_used": scale,
                        "aspect_ratio_used": aspect_ratio,
                        "viewpoint": dominant_viewpoint
                    }
                    # Copy optional metadata when the landmark provides it.
                    for key in ["year_built", "architectural_style", "significance"]:
                        if key in landmark_info:
                            best_result[key] = landmark_info[key]
            except Exception as e:
                print(f"Error in calculating similarity: {e}")
                continue

    # Apply the landmark-type specific threshold once a candidate exists.
    if best_result["landmark_id"]:
        landmark_type = self._determine_landmark_type(best_result["landmark_id"])
        if landmark_type == "distinctive":
            # Distinctive structures (e.g. leaning towers): threshold lowered by 25%.
            type_multiplier = 0.75
        else:
            # NOTE(review): types missing from landmark_type_thresholds fall
            # back to 1.0 / 0.5, i.e. a doubled threshold - confirm intent.
            type_multiplier = self.landmark_type_thresholds.get(landmark_type, 1.0) / 0.5

        final_threshold = adjusted_threshold * type_multiplier
        best_result["is_landmark"] = best_result["confidence"] >= final_threshold
        best_result["landmark_type"] = landmark_type
        best_result["threshold_applied"] = final_threshold

    # Cache a private copy so later edits to the returned dict stay local.
    self.results_cache[region_key] = dict(best_result)
    self._manage_cache()
    return best_result
def classify_batch_regions(self,
                           image: Union[Image.Image, np.ndarray],
                           boxes: List[List[float]],
                           threshold: float = 0.28) -> List[Dict[str, Any]]:
    """
    Classify several regions of one image, encoding them in batches.

    Regions are cropped, preprocessed and encoded in chunks of
    ``self.batch_size`` so a large number of boxes does not have to fit in
    device memory at once.

    Args:
        image: Full image (PIL image or numpy array).
        boxes: Bounding boxes, each ``[x1, y1, x2, y2]``.
        threshold: Minimum softmax score for a region to count as a landmark.

    Returns:
        One result dict per box, in input order, each with ``landmark_id``,
        ``landmark_name``, ``confidence``, ``is_landmark`` and ``box``
        (plus ``location`` for recognised landmarks).

    Raises:
        ValueError: If ``image`` is neither a PIL image nor a numpy array.
    """
    # Without text features no region can be recognised; return negative
    # results that have the same shape as regular ones.
    if self.landmark_text_features is None:
        return [
            {
                "landmark_id": None,
                "landmark_name": None,
                "confidence": 0.0,
                "is_landmark": False,
                "box": box
            }
            for box in (boxes or [])
        ]

    # Ensure the image is in PIL format.
    if not isinstance(image, Image.Image):
        if isinstance(image, np.ndarray):
            image = Image.fromarray(image)
        else:
            raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")

    # Nothing to process.
    if not boxes:
        return []

    chunk_size = max(1, int(self.batch_size))
    # NOTE(review): indexing landmark ids by prompt index assumes one prompt
    # per landmark, in landmark_data order - confirm against
    # get_all_landmark_prompts().
    landmark_ids = list(self.landmark_data.keys())
    results = []

    for start in range(0, len(boxes), chunk_size):
        chunk_boxes = boxes[start:start + chunk_size]

        # Crop and preprocess every region of this chunk.
        chunk_inputs = []
        for box in chunk_boxes:
            x1, y1, x2, y2 = map(int, box)
            cropped_image = image.crop((x1, y1, x2, y2))
            chunk_inputs.append(self.preprocess(cropped_image).unsqueeze(0))
        batch_tensor = torch.cat(chunk_inputs).to(self.device)

        with torch.no_grad():
            image_features = self.model.encode_image(batch_tensor)
            image_features = image_features / image_features.norm(dim=-1, keepdim=True)
            # Softmax is taken per row, so chunking does not change scores.
            similarity = (100.0 * image_features @ self.landmark_text_features.T).softmax(dim=-1)

        # .cpu() is a no-op on CPU tensors and is required for every
        # accelerator, not only when the device string is exactly "cuda".
        similarity = similarity.cpu().numpy()

        for box, sim in zip(chunk_boxes, similarity):
            best_idx = int(sim.argmax())
            best_score = float(sim[best_idx])
            if best_score >= threshold:
                landmark_id = landmark_ids[best_idx]
                landmark_info = self.landmark_data[landmark_id]
                results.append({
                    "landmark_id": landmark_id,
                    "landmark_name": landmark_info["name"],
                    "location": landmark_info["location"],
                    "confidence": best_score,
                    "is_landmark": True,
                    "box": box
                })
            else:
                results.append({
                    "landmark_id": None,
                    "landmark_name": None,
                    "confidence": best_score,
                    "is_landmark": False,
                    "box": box
                })
    return results
def search_entire_image(self,
                        image: Union[Image.Image, np.ndarray],
                        threshold: float = 0.35,
                        detailed_analysis: bool = False) -> Dict[str, Any]:
    """
    Check whether the image as a whole shows a known landmark.

    The full image is scored against all landmark prompts. The best match is
    accepted when it passes both the "full_image"-adjusted threshold and the
    landmark-type specific threshold. The three best candidates are always
    reported in ``top_landmarks``.

    Args:
        image: Input image (PIL image or numpy array).
        threshold: Base confidence threshold.
        detailed_analysis: When True and a landmark was found, five
            sub-regions (center, left, right, top, bottom) are classified
            too and positive ones are returned in ``region_analyses``.

    Returns:
        Dict with ``landmark_id``, ``landmark_name``, ``confidence``,
        ``is_landmark`` and ``top_landmarks``; accepted landmarks also carry
        ``location``, ``landmark_type`` and any optional metadata.

    Raises:
        ValueError: If ``image`` is neither a PIL image nor a numpy array.
    """
    # Ensure the image is in PIL format.
    if not isinstance(image, Image.Image):
        if isinstance(image, np.ndarray):
            image = Image.fromarray(image)
        else:
            raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")

    # Cache lookup.
    image_key = (self._get_image_hash(image), "entire_image", detailed_analysis)
    if image_key in self.results_cache:
        return self.results_cache[image_key]

    # Without pre-computed text features there is nothing to compare against.
    if self.landmark_text_features is None:
        return {
            "landmark_id": None,
            "landmark_name": None,
            "confidence": 0.0,
            "is_landmark": False,
            "top_landmarks": []
        }

    adjusted_threshold = threshold * self.confidence_threshold_multipliers.get("full_image", 1.0)

    image_input = self.preprocess(image).unsqueeze(0).to(self.device)
    with torch.no_grad():
        image_features = self.model.encode_image(image_input)
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        similarity = (100.0 * image_features @ self.landmark_text_features.T).softmax(dim=-1)

    # .cpu() is a no-op on CPU tensors and is required for every accelerator,
    # not only when the device string is exactly "cuda".
    similarity = similarity[0].cpu().numpy()

    # NOTE(review): indexing landmark ids by prompt index assumes one prompt
    # per landmark, in landmark_data order - confirm against
    # get_all_landmark_prompts().
    landmark_ids = list(self.landmark_data.keys())
    optional_keys = ("year_built", "architectural_style", "significance")

    best_idx = int(similarity.argmax())
    best_score = float(similarity[best_idx])

    # Three best candidates, highest score first.
    top_landmarks = []
    for idx in similarity.argsort()[-3:][::-1]:
        landmark_id = landmark_ids[idx]
        landmark_info = self.landmark_data[landmark_id]
        landmark_result = {
            "landmark_id": landmark_id,
            "landmark_name": landmark_info["name"],
            "location": landmark_info["location"],
            "confidence": float(similarity[idx])
        }
        for key in optional_keys:
            if key in landmark_info:
                landmark_result[key] = landmark_info[key]
        top_landmarks.append(landmark_result)

    # Start from a well-formed negative result so every rejection path
    # (either threshold) returns the same shape instead of an empty dict.
    result = {
        "landmark_id": None,
        "landmark_name": None,
        "confidence": best_score,
        "is_landmark": False,
        "top_landmarks": top_landmarks
    }

    if best_score >= adjusted_threshold:
        landmark_id = landmark_ids[best_idx]
        landmark_info = self.landmark_data[landmark_id]

        # Landmark-type specific threshold, mirroring classify_image_region:
        # distinctive structures get a 25% lower threshold.
        landmark_type = self._determine_landmark_type(landmark_id)
        if landmark_type == "distinctive":
            type_multiplier = 0.75
        else:
            # NOTE(review): types missing from landmark_type_thresholds fall
            # back to 1.0 / 0.5, i.e. a doubled threshold - confirm intent.
            type_multiplier = self.landmark_type_thresholds.get(landmark_type, 1.0) / 0.5
        final_threshold = adjusted_threshold * type_multiplier

        if best_score >= final_threshold:
            result = {
                "landmark_id": landmark_id,
                "landmark_name": landmark_info["name"],
                "location": landmark_info["location"],
                "confidence": best_score,
                "is_landmark": True,
                "landmark_type": landmark_type,
                "top_landmarks": top_landmarks
            }
            for key in optional_keys:
                if key in landmark_info:
                    result[key] = landmark_info[key]

    # Optional second pass over fixed sub-regions of a recognised landmark.
    if detailed_analysis and result.get("is_landmark", False):
        width, height = image.size
        named_regions = [
            ("center", [width * 0.25, height * 0.25, width * 0.75, height * 0.75]),
            ("left", [0, 0, width * 0.5, height]),
            ("right", [width * 0.5, 0, width, height]),
            ("top", [0, 0, width, height * 0.5]),
            ("bottom", [0, height * 0.5, width, height])
        ]
        region_results = []
        for region_name, box in named_regions:
            region_result = self.classify_image_region(
                image,
                box,
                threshold=threshold * 0.9,
                detection_type="partial"
            )
            if region_result["is_landmark"]:
                # Annotate a copy: the returned dict may be shared with the
                # region cache.
                region_result = dict(region_result)
                region_result["region_name"] = region_name
                region_results.append(region_result)

        if region_results:
            result["region_analyses"] = region_results

    # Cache the result.
    self.results_cache[image_key] = result
    self._manage_cache()
    return result
def enhanced_landmark_detection(self,
                                image: Union[Image.Image, np.ndarray],
                                threshold: float = 0.3) -> Dict[str, Any]:
    """
    Detect landmarks by combining pyramid, grid and full-image analysis.

    Detections from the three sources are grouped per landmark id and each
    group receives an aggregate confidence: the highest individual
    confidence plus small bonuses for agreement between sources and for
    repeated detections, capped at 1.0.

    Args:
        image: Input image (PIL image or numpy array).
        threshold: Base confidence threshold, adapted to the viewpoint.

    Returns:
        Dict with ``is_landmark_scene``, ``detected_landmarks`` (groups sorted
        by descending confidence), ``viewpoint_info`` and ``primary_landmark``.

    Raises:
        ValueError: If ``image`` is neither a PIL image nor a numpy array.
    """
    if not isinstance(image, Image.Image):
        if not isinstance(image, np.ndarray):
            raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
        image = Image.fromarray(image)

    # Step 1: the viewpoint decides both the threshold and the grid density.
    viewpoint_info = self._analyze_viewpoint(image)
    viewpoint = viewpoint_info["dominant_viewpoint"]
    if viewpoint == "distant":
        # Distant views: lower threshold, coarser grid.
        adjusted_threshold, grid_size = threshold * 0.7, 3
    elif viewpoint == "close_up":
        # Close-ups: higher threshold, finer grid.
        adjusted_threshold, grid_size = threshold * 1.1, 5
    else:
        adjusted_threshold, grid_size = threshold, 4

    # Step 2: multi-scale pyramid over the whole image.
    pyramid_results = self._perform_pyramid_analysis(image, levels=3, base_threshold=adjusted_threshold)

    # Step 3: classify every cell of a grid_size x grid_size grid.
    width, height = image.size
    grid_results = []
    for row in range(grid_size):
        for col in range(grid_size):
            cell_box = [
                width * (col/grid_size),
                height * (row/grid_size),
                width * ((col+1)/grid_size),
                height * ((row+1)/grid_size)
            ]
            cell_result = self.classify_image_region(
                image,
                cell_box,
                threshold=adjusted_threshold,
                detection_type="auto"
            )
            if not cell_result["is_landmark"]:
                continue
            cell_result["grid_position"] = (row, col)
            grid_results.append(cell_result)

    # Step 4: collect the detections of all sources in one list.
    all_detections = []
    if pyramid_results["is_landmark"] and pyramid_results["best_result"]:
        pyramid_best = pyramid_results["best_result"]
        all_detections.append({
            "source": "pyramid",
            "landmark_id": pyramid_best["landmark_id"],
            "landmark_name": pyramid_best["landmark_name"],
            "confidence": pyramid_best["confidence"],
            "scale_factor": pyramid_best.get("scale_factor", 1.0)
        })

    all_detections.extend(
        {
            "source": "grid",
            "landmark_id": hit["landmark_id"],
            "landmark_name": hit["landmark_name"],
            "confidence": hit["confidence"],
            "grid_position": hit.get("grid_position", (0, 0))
        }
        for hit in grid_results
    )

    full_image_result = self.search_entire_image(image, threshold=adjusted_threshold)
    if full_image_result and full_image_result.get("is_landmark", False):
        all_detections.append({
            "source": "full_image",
            "landmark_id": full_image_result["landmark_id"],
            "landmark_name": full_image_result["landmark_name"],
            "confidence": full_image_result["confidence"]
        })

    # Group the detections per landmark; the name comes from the first one.
    landmark_groups = {}
    for detection in all_detections:
        group = landmark_groups.setdefault(detection["landmark_id"], {
            "landmark_id": detection["landmark_id"],
            "landmark_name": detection["landmark_name"],
            "detections": [],
            "sources": set()
        })
        group["detections"].append(detection)
        group["sources"].add(detection["source"])

    # Aggregate confidence per landmark.
    for group in landmark_groups.values():
        detection_count = len(group["detections"])
        source_count = len(group["sources"])
        strongest = max(entry["confidence"] for entry in group["detections"])
        # Agreement between sources: +0.05 per extra source, at most 0.15.
        source_bonus = min(0.15, (source_count - 1) * 0.05)
        # Repeated detections: +0.02 per extra detection, at most 0.1.
        consistency_bonus = min(0.1, (detection_count - 1) * 0.02)
        group["confidence"] = min(1.0, strongest + source_bonus + consistency_bonus)
        group["detection_count"] = detection_count
        group["source_count"] = source_count

    # Most confident landmark first.
    sorted_landmarks = list(landmark_groups.values())
    sorted_landmarks.sort(key=lambda group: group["confidence"], reverse=True)

    primary_landmark = sorted_landmarks[0] if sorted_landmarks else None
    return {
        "is_landmark_scene": len(sorted_landmarks) > 0,
        "detected_landmarks": sorted_landmarks,
        "viewpoint_info": viewpoint_info,
        "primary_landmark": primary_landmark
    }
def _analyze_architectural_features(self, image): | |
""" | |
Analyzes the architectural features of a structure in the image without hardcoding specific landmarks. | |
Args: | |
image: Input image | |
Returns: | |
Dict: Architectural feature analysis results | |
""" | |
# Define universal architectural feature prompts that apply to all types of landmarks | |
architecture_prompts = { | |
"tall_structure": "a tall vertical structure standing alone", | |
"tiered_building": "a building with multiple stacked tiers or segments", | |
"historical_structure": "a building with historical architectural elements", | |
"modern_design": "a modern structure with contemporary architectural design", | |
"segmented_exterior": "a structure with visible segmented or sectioned exterior", | |
"viewing_platform": "a tall structure with observation area at the top", | |
"time_display": "a structure with timepiece features", | |
"glass_facade": "a building with prominent glass exterior surfaces", | |
"memorial_structure": "a monument or memorial structure", | |
"ancient_construction": "ancient constructed elements or archaeological features", | |
"natural_landmark": "a natural geographic formation or landmark", | |
"slanted_design": "a structure with non-vertical or leaning profile" | |
} | |
# Calculate similarity scores against universal architectural patterns | |
context_scores = self.calculate_similarity_scores(image, architecture_prompts) | |
# Determine most relevant architectural features | |
top_features = sorted(context_scores.items(), key=lambda x: x[1], reverse=True)[:3] | |
# Calculate feature confidence | |
context_confidence = sum(score for _, score in top_features) / 3 | |
# Determine primary architectural category based on top features | |
architectural_categories = { | |
"tower": ["tall_structure", "viewing_platform", "time_display"], | |
"skyscraper": ["tall_structure", "modern_design", "glass_facade"], | |
"historical": ["historical_structure", "ancient_construction", "memorial_structure"], | |
"natural": ["natural_landmark"], | |
"distinctive": ["tiered_building", "segmented_exterior", "slanted_design"] | |
} | |
# Score each category based on the top features | |
category_scores = {} | |
for category, features in architectural_categories.items(): | |
category_score = 0 | |
for feature, score in context_scores.items(): | |
if feature in features: | |
category_score += score | |
category_scores[category] = category_score | |
primary_category = max(category_scores.items(), key=lambda x: x[1])[0] | |
return { | |
"architectural_features": top_features, | |
"context_confidence": context_confidence, | |
"primary_category": primary_category, | |
"category_scores": category_scores | |
} | |
def intelligent_landmark_search(self, | |
image: Union[Image.Image, np.ndarray], | |
yolo_boxes: Optional[List[List[float]]] = None, | |
base_threshold: float = 0.25) -> Dict[str, Any]: | |
""" | |
對圖像進行智能地標搜索,綜合整張圖像分析和區域分析 | |
Args: | |
image: 原始圖像 | |
yolo_boxes: YOLO檢測到的邊界框 (可選) | |
base_threshold: 基礎置信度閾值 | |
Returns: | |
Dict: 包含所有檢測結果的綜合分析 | |
""" | |
# 確保圖像是PIL格式 | |
if not isinstance(image, Image.Image): | |
if isinstance(image, np.ndarray): | |
image = Image.fromarray(image) | |
else: | |
raise ValueError("Unsupported image format. Expected PIL Image or numpy array.") | |
# No YOLO 框時,可以稍微降低閾值以提高召回率 | |
actual_threshold = base_threshold * 0.85 if yolo_boxes is None or len(yolo_boxes) == 0 else base_threshold | |
# 首先對整張圖像進行分析 | |
try: | |
full_image_result = self.search_entire_image( | |
image, | |
threshold=actual_threshold, | |
detailed_analysis=True # 確保詳細分析開啟 | |
) | |
# No YOLO 框,則進行多尺度分析以提高檢測機會 | |
if (yolo_boxes is None or len(yolo_boxes) == 0) and (not full_image_result or not full_image_result.get("is_landmark", False)): | |
print("No YOLO boxes provided, attempting multi-scale pyramid analysis") | |
try: | |
if hasattr(self, '_perform_pyramid_analysis'): | |
pyramid_results = self._perform_pyramid_analysis( | |
image, | |
levels=4, # | |
base_threshold=actual_threshold, | |
aspect_ratios=[1.0, 0.75, 1.5, 0.5, 2.0] | |
) | |
if pyramid_results and pyramid_results.get("is_landmark", False) and pyramid_results.get("best_result", {}).get("confidence", 0) > actual_threshold: | |
# 使用金字塔分析結果增強或替代全圖結果 | |
if not full_image_result or not full_image_result.get("is_landmark", False): | |
full_image_result = { | |
"is_landmark": True, | |
"landmark_id": pyramid_results["best_result"]["landmark_id"], | |
"landmark_name": pyramid_results["best_result"]["landmark_name"], | |
"confidence": pyramid_results["best_result"]["confidence"], | |
"location": pyramid_results["best_result"].get("location", "Unknown Location") | |
} | |
print(f"Pyramid analysis detected landmark: {pyramid_results['best_result']['landmark_name']} with confidence {pyramid_results['best_result']['confidence']:.3f}") | |
else: | |
print("Pyramid analysis not available, skipping multi-scale detection") | |
except Exception as e: | |
print(f"Error in pyramid analysis: {e}") | |
except Exception as e: | |
print(f"Error in search_entire_image: {e}") | |
import traceback | |
traceback.print_exc() | |
full_image_result = None | |
# 初始化結果字典 | |
result = { | |
"full_image_analysis": full_image_result if full_image_result else {}, | |
"is_landmark_scene": False, # 默認值 | |
"detected_landmarks": [] | |
} | |
# 上下文感知比較,處理接近的排名結果 | |
if full_image_result and "top_landmarks" in full_image_result and len(full_image_result["top_landmarks"]) >= 2: | |
top_landmarks = full_image_result["top_landmarks"] | |
# 檢查前兩個結果是否非常接近(信心度差異小於 0.1) | |
if len(top_landmarks) >= 2 and abs(top_landmarks[0]["confidence"] - top_landmarks[1]["confidence"]) < 0.1: | |
# 對於接近的結果,使用通用建築特徵分析進行區分 | |
try: | |
# 分析建築特徵 | |
if hasattr(self, '_analyze_architectural_features'): | |
architectural_analysis = self._analyze_architectural_features(image) | |
top_features = architectural_analysis.get("architectural_features", []) | |
primary_category = architectural_analysis.get("primary_category", "") | |
# 根據建築特徵調整地標置信度 | |
for i, landmark in enumerate(top_landmarks[:2]): | |
if i >= len(top_landmarks): | |
continue | |
landmark_id = landmark.get("landmark_id", "").lower() | |
confidence_boost = 0 | |
# 使用主要建築類別來調整置信度,使用通用條件而非特定地標名稱 | |
if primary_category == "tower" and any(term in landmark_id for term in ["tower", "spire", "needle"]): | |
confidence_boost += 0.05 | |
elif primary_category == "skyscraper" and any(term in landmark_id for term in ["building", "skyscraper", "tall"]): | |
confidence_boost += 0.05 | |
elif primary_category == "historical" and any(term in landmark_id for term in ["monument", "castle", "palace", "temple"]): | |
confidence_boost += 0.05 | |
elif primary_category == "distinctive" and any(term in landmark_id for term in ["unusual", "unique", "special", "famous"]): | |
confidence_boost += 0.05 | |
# 根據特定特徵進一步微調,使用通用特徵描述而非特定地標 | |
for feature, score in top_features: | |
if feature == "time_display" and "clock" in landmark_id: | |
confidence_boost += 0.03 | |
elif feature == "segmented_exterior" and "segmented" in landmark_id: | |
confidence_boost += 0.03 | |
elif feature == "slanted_design" and "leaning" in landmark_id: | |
confidence_boost += 0.03 | |
# 應用信心度調整 | |
if confidence_boost > 0 and i < len(top_landmarks): | |
top_landmarks[i]["confidence"] += confidence_boost | |
print(f"Boosted {landmark['landmark_name']} confidence by {confidence_boost:.2f} based on architectural features ({primary_category})") | |
# 重新排序 | |
top_landmarks.sort(key=lambda x: x["confidence"], reverse=True) | |
full_image_result["top_landmarks"] = top_landmarks | |
if top_landmarks: | |
full_image_result["landmark_id"] = top_landmarks[0]["landmark_id"] | |
full_image_result["landmark_name"] = top_landmarks[0]["landmark_name"] | |
full_image_result["confidence"] = top_landmarks[0]["confidence"] | |
full_image_result["location"] = top_landmarks[0].get("location", "Unknown Location") | |
except Exception as e: | |
print(f"Error in architectural feature analysis: {e}") | |
import traceback | |
traceback.print_exc() | |
if full_image_result and full_image_result.get("is_landmark", False): | |
result["is_landmark_scene"] = True | |
landmark_id = full_image_result.get("landmark_id", "unknown") | |
# extract landmark info | |
landmark_specific_info = self._extract_landmark_specific_info(landmark_id) | |
landmark_info = { | |
"landmark_id": landmark_id, | |
"landmark_name": full_image_result.get("landmark_name", "Unknown Landmark"), | |
"confidence": full_image_result.get("confidence", 0.0), | |
"location": full_image_result.get("location", "Unknown Location"), | |
"region_type": "full_image", | |
"box": [0, 0, getattr(image, 'width', 0), getattr(image, 'height', 0)] | |
} | |
# 整合地標特定info,確保正確的名稱被使用 | |
landmark_info.update(landmark_specific_info) | |
# 如果特定信息中有更準確的地標名稱,使用它 | |
if landmark_specific_info.get("landmark_name"): | |
landmark_info["landmark_name"] = landmark_specific_info["landmark_name"] | |
result["detected_landmarks"].append(landmark_info) | |
# 確保地標特定活動被正確設置為主要結果 | |
if landmark_specific_info.get("has_specific_activities", False): | |
result["primary_landmark_activities"] = landmark_specific_info.get("landmark_specific_activities", []) | |
print(f"Set primary landmark activities: {len(result['primary_landmark_activities'])} activities for {landmark_info['landmark_name']}") | |
# 如果提供了YOLO邊界框,分析這些區域 | |
if yolo_boxes and len(yolo_boxes) > 0: | |
for box in yolo_boxes: | |
try: | |
if hasattr(self, 'classify_image_region'): | |
box_result = self.classify_image_region( | |
image, | |
box, | |
threshold=base_threshold, | |
detection_type="auto" | |
) | |
# 如果檢測到地標 | |
if box_result and box_result.get("is_landmark", False): | |
# 檢查是否與已檢測的地標重複 | |
is_duplicate = False | |
for existing in result["detected_landmarks"]: | |
if existing.get("landmark_id") == box_result.get("landmark_id"): | |
# 如果新的置信度更高,則更新 | |
if box_result.get("confidence", 0) > existing.get("confidence", 0): | |
existing.update({ | |
"confidence": box_result.get("confidence", 0), | |
"region_type": "yolo_box", | |
"box": box | |
}) | |
is_duplicate = True | |
break | |
# 如果不是重複的,添加到列表 | |
if not is_duplicate: | |
result["detected_landmarks"].append({ | |
"landmark_id": box_result.get("landmark_id", "unknown"), | |
"landmark_name": box_result.get("landmark_name", "Unknown Landmark"), | |
"confidence": box_result.get("confidence", 0.0), | |
"location": box_result.get("location", "Unknown Location"), | |
"region_type": "yolo_box", | |
"box": box | |
}) | |
except Exception as e: | |
print(f"Error in analyzing YOLO box: {e}") | |
continue | |
# 最後,執行額外的網格搜索以捕獲可能被遺漏的地標 | |
# 但只有在尚未發現地標或僅發現低置信度地標時 | |
should_do_grid_search = ( | |
len(result["detected_landmarks"]) == 0 or | |
max([landmark.get("confidence", 0) for landmark in result["detected_landmarks"]], default=0) < 0.5 | |
) | |
if should_do_grid_search and hasattr(self, 'classify_image_region'): | |
try: | |
# 創建5x5網格 | |
width, height = getattr(image, 'size', (getattr(image, 'width', 0), getattr(image, 'height', 0))) | |
if not isinstance(width, (int, float)) or width <= 0: | |
width = getattr(image, 'width', 0) | |
if not isinstance(height, (int, float)) or height <= 0: | |
height = getattr(image, 'height', 0) | |
if width > 0 and height > 0: | |
grid_boxes = [] | |
for i in range(5): | |
for j in range(5): | |
grid_boxes.append([ | |
width * (j/5), height * (i/5), | |
width * ((j+1)/5), height * ((i+1)/5) | |
]) | |
# 分析每個網格區域 | |
for box in grid_boxes: | |
try: | |
grid_result = self.classify_image_region( | |
image, | |
box, | |
threshold=base_threshold * 0.9, # 稍微降低網格搜索閾值 | |
detection_type="partial" | |
) | |
# 如果檢測到地標 | |
if grid_result and grid_result.get("is_landmark", False): | |
# 檢查是否與已檢測的地標重複 | |
is_duplicate = False | |
for existing in result["detected_landmarks"]: | |
if existing.get("landmark_id") == grid_result.get("landmark_id"): | |
is_duplicate = True | |
break | |
# 如果不是重複的,添加到列表 | |
if not is_duplicate: | |
result["detected_landmarks"].append({ | |
"landmark_id": grid_result.get("landmark_id", "unknown"), | |
"landmark_name": grid_result.get("landmark_name", "Unknown Landmark"), | |
"confidence": grid_result.get("confidence", 0.0), | |
"location": grid_result.get("location", "Unknown Location"), | |
"region_type": "grid", | |
"box": box | |
}) | |
except Exception as e: | |
print(f"Error in analyzing grid region: {e}") | |
continue | |
except Exception as e: | |
print(f"Error in grid search: {e}") | |
import traceback | |
traceback.print_exc() | |
# 按置信度排序檢測結果 | |
result["detected_landmarks"].sort(key=lambda x: x.get("confidence", 0), reverse=True) | |
# 更新整體場景類型判斷 | |
if len(result["detected_landmarks"]) > 0: | |
result["is_landmark_scene"] = True | |
result["primary_landmark"] = result["detected_landmarks"][0] | |
# 添加 clip_analysis_on_full_image 結果,以便給 LLM 提供更多上下文 | |
if full_image_result and "clip_analysis" in full_image_result: | |
result["clip_analysis_on_full_image"] = full_image_result["clip_analysis"] | |
return result | |
def _extract_landmark_specific_info(self, landmark_id: str) -> Dict[str, Any]:
    """
    Collect the stored metadata and activity suggestions for one landmark.

    Args:
        landmark_id: Landmark identifier. An empty value or "unknown"
            short-circuits without any lookup.

    Returns:
        Dict: Always contains "has_specific_activities". Depending on what
        the landmark record holds, it may also contain "landmark_name",
        "feature_templates" (at most the first five prompts),
        "primary_template" (the first prompt), "aliases", "location",
        "year_built", "architectural_style", "significance", "description"
        and "landmark_specific_activities".
    """
    specific_info: Dict[str, Any] = {"has_specific_activities": False}
    if not landmark_id or landmark_id == "unknown":
        return specific_info

    # Prefer the table stored on the instance; fall back to the module-level
    # ALL_LANDMARKS table when the instance has none or lacks this id.
    landmark_data_source = None
    instance_data = getattr(self, "landmark_data", None)
    if instance_data and landmark_id in instance_data:
        landmark_data_source = instance_data[landmark_id]
        print(f"Using landmark data from class attribute for {landmark_id}")
    else:
        # Best-effort lookup: a failure here must not abort the analysis.
        try:
            if landmark_id in ALL_LANDMARKS:
                landmark_data_source = ALL_LANDMARKS[landmark_id]
                print(f"Using landmark data from ALL_LANDMARKS for {landmark_id}")
        except Exception as e:
            print(f"Error accessing ALL_LANDMARKS: {e}")

    if landmark_data_source:
        # The canonical name from the record overrides whatever the caller has.
        if "name" in landmark_data_source:
            specific_info["landmark_name"] = landmark_data_source["name"]

        # Prompts double as feature templates. An empty prompt list is
        # skipped entirely so that indexing the first prompt cannot fail.
        if "prompts" in landmark_data_source:
            prompts = landmark_data_source["prompts"]
            if prompts:
                specific_info["feature_templates"] = prompts[:5]
                specific_info["primary_template"] = prompts[0]

        # Copy the remaining optional fields through unchanged.
        for key in ("aliases", "location", "year_built",
                    "architectural_style", "significance", "description"):
            if key in landmark_data_source:
                specific_info[key] = landmark_data_source[key]

    # Activity suggestions are optional; any failure leaves the flag False.
    try:
        if landmark_id in LANDMARK_ACTIVITIES:
            activities = LANDMARK_ACTIVITIES[landmark_id]
            specific_info["landmark_specific_activities"] = activities
            specific_info["has_specific_activities"] = True
            print(f"Found {len(activities)} specific activities for landmark {landmark_id}")
        else:
            print(f"No specific activities found for landmark {landmark_id} in LANDMARK_ACTIVITIES")
    except Exception as e:
        print(f"Error loading landmark activities for {landmark_id}: {e}")
        specific_info["has_specific_activities"] = False

    return specific_info
def _analyze_viewpoint(self, image: Union[Image.Image, np.ndarray]) -> Dict[str, Any]:
    """
    Score an image against a fixed set of camera-viewpoint prompts.

    Args:
        image: Input image, passed unchanged to calculate_similarity_scores.

    Returns:
        Dict: "viewpoint_scores" maps each viewpoint name to its score,
        "dominant_viewpoint" is the name with the highest score (the first
        one in prompt order on a tie), and "confidence" is that score.
    """
    viewpoint_prompts = {
        "aerial_view": "an aerial view from above looking down",
        "street_level": "a street level view looking up at a tall structure",
        "eye_level": "an eye-level horizontal view of a landmark",
        "distant": "a distant view of a landmark on the horizon",
        "close_up": "a close-up detailed view of architectural features",
        "interior": "an interior view inside a structure",
    }

    viewpoint_scores = self.calculate_similarity_scores(image, viewpoint_prompts)

    # max() over the keys returns the first best-scoring name in dict order.
    dominant_viewpoint = max(viewpoint_scores, key=viewpoint_scores.get)

    return {
        "viewpoint_scores": viewpoint_scores,
        "dominant_viewpoint": dominant_viewpoint,
        "confidence": viewpoint_scores[dominant_viewpoint],
    }
def calculate_similarity_scores(self, image: Union[Image.Image, np.ndarray],
                                prompts: Dict[str, str]) -> Dict[str, float]:
    """
    Score an image against a set of named text prompts with the CLIP model.

    The image and every prompt are encoded and L2-normalised. The scaled
    image-text similarities are then passed through a softmax across the
    prompts.

    Args:
        image: PIL image, or a numpy array (converted with Image.fromarray).
        prompts: Mapping {name: prompt text}.

    Returns:
        Dict[str, float]: One score per prompt name, in the mapping's
        iteration order. Empty when no prompts are given.

    Raises:
        ValueError: If image is neither a PIL image nor a numpy array.
    """
    # Make sure the image is in PIL format.
    if not isinstance(image, Image.Image):
        if not isinstance(image, np.ndarray):
            raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
        image = Image.fromarray(image)

    # Nothing to compare against: skip the model calls altogether.
    if not prompts:
        return {}

    image_input = self.preprocess(image).unsqueeze(0).to(self.device)
    prompt_tokens = clip.tokenize(list(prompts.values())).to(self.device)

    with torch.no_grad():
        image_features = self.model.encode_image(image_input)
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)

        prompt_features = self.model.encode_text(prompt_tokens)
        prompt_features = prompt_features / prompt_features.norm(dim=-1, keepdim=True)

        similarity = (100.0 * image_features @ prompt_features.T).softmax(dim=-1)

    # Always move to host memory first: .cpu() is a no-op for CPU tensors,
    # whereas .numpy() on a tensor living on any accelerator ("cuda:0",
    # "mps", a torch.device object, ...) raises.
    probabilities = similarity.cpu().numpy()[0]

    return {name: float(score) for name, score in zip(prompts, probabilities)}