# VisionScout / clip_zero_shot_classifier.py
import torch
import clip
from PIL import Image
import numpy as np
from typing import List, Dict, Tuple, Optional, Union, Any
from landmark_data import ALL_LANDMARKS, get_all_landmark_prompts
from landmark_activities import LANDMARK_ACTIVITIES
class CLIPZeroShotClassifier:
"""
使用CLIP模型進行零樣本分類,專注於識別世界知名地標。
作為YOLO檢測的補充,處理標準對象檢測無法識別的地標建築。
"""
def __init__(self, model_name: str = "ViT-B/16", device: str = None):
"""
初始化CLIP零樣本分類器
Args:
model_name: CLIP模型名稱,默認為"ViT-B/16"
device: 運行設備,None則自動選擇
"""
        # Select the compute device
if device is None:
self.device = "cuda" if torch.cuda.is_available() else "cpu"
else:
self.device = device
print(f"Initializing CLIP Zero-Shot Landmark Classifier ({model_name}) on {self.device}")
try:
self.model, self.preprocess = clip.load(model_name, device=self.device)
print(f"Successfully loaded CLIP model")
except Exception as e:
print(f"Error loading CLIP model: {e}")
raise
        # Load landmark data
try:
self.landmark_data = ALL_LANDMARKS
self.landmark_prompts = get_all_landmark_prompts()
print(f"Loaded {len(self.landmark_prompts)} landmark prompts for classification")
            # Precompute text features for landmark prompts
self.landmark_text_features = self._precompute_text_features(self.landmark_prompts)
            # Map landmark IDs to indices for fast lookup
self.landmark_id_to_index = {landmark_id: i for i, landmark_id in enumerate(ALL_LANDMARKS.keys())}
            # Initialize batch processing parameters
            self.batch_size = 16  # Default batch size
            self.confidence_threshold_multipliers = {
                "close_up": 0.9,   # Standard threshold for close-up views
                "partial": 0.6,    # Relaxed threshold for partially visible landmarks
                "distant": 0.5,    # Most relaxed threshold for distant views
                "full_image": 0.7  # Threshold for whole-image search
            }
            self.landmark_type_thresholds = {
                "tower": 0.5,       # Baseline threshold for tower structures
                "skyscraper": 0.4,  # Relaxed threshold for skyscrapers
                "building": 0.55,   # Slightly raised threshold for generic buildings
                "monument": 0.5,    # Baseline threshold for monuments
                "natural": 0.6      # Raised threshold for natural landmarks
            }
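            # These per-type values are normalized against a 0.5 baseline downstream
            # (final_threshold = adjusted_threshold * value / 0.5), so values below 0.5
            # relax the landmark test and values above 0.5 tighten it.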
            # Initialize the results cache
            self.results_cache = {}  # Keyed by image hash
            self.cache_max_size = 100  # Maximum number of cached entries
        except Exception as e:
            # landmark_data is imported at module load time, so an ImportError cannot occur
            # here; catch any initialization failure instead and degrade gracefully.
            print(f"Warning: failed to initialize landmark data ({e}). Landmark classification will be limited")
            self.landmark_data = {}
            self.landmark_prompts = []
            self.landmark_text_features = None
            self.landmark_id_to_index = {}
            self.batch_size = 16
            self.confidence_threshold_multipliers = {}
            self.landmark_type_thresholds = {}
            self.results_cache = {}
            self.cache_max_size = 100
def _get_image_hash(self, image):
"""
為圖像生成簡單的 hash 值用於快取
Args:
image: PIL Image 或 numpy 數組
Returns:
str: 圖像的 hash 值
"""
        if isinstance(image, np.ndarray):
            # For numpy arrays, downsample and hash the raw bytes
            small_img = image[::10, ::10] if image.ndim == 3 else image
            return hash(small_img.tobytes())
        else:
            # For PIL images, shrink before hashing the raw bytes
            small_img = image.resize((32, 32))
            return hash(small_img.tobytes())
def _manage_cache(self):
"""
管理結果快取大小
"""
if len(self.results_cache) > self.cache_max_size:
oldest_key = next(iter(self.results_cache))
del self.results_cache[oldest_key]
def set_batch_size(self, batch_size: int):
"""
設置批處理大小
Args:
batch_size: 新的批處理大小
"""
self.batch_size = max(1, batch_size)
print(f"Batch size set to {self.batch_size}")
def adjust_confidence_threshold(self, detection_type: str, multiplier: float):
"""
調整特定檢測類型的置信度閾值乘數
Args:
detection_type: 檢測類型 ('close_up', 'partial', 'distant', 'full_image')
multiplier: 置信度閾值乘數
"""
if detection_type in self.confidence_threshold_multipliers:
self.confidence_threshold_multipliers[detection_type] = max(0.1, min(1.5, multiplier))
print(f"Adjusted confidence threshold multiplier for {detection_type} to {multiplier}")
else:
print(f"Unknown detection type: {detection_type}")
    def _precompute_text_features(self, text_prompts: List[str]) -> Optional[torch.Tensor]:
        """
        Precompute CLIP features for text prompts to speed up batch processing.
        Args:
            text_prompts: List of text prompts
        Returns:
            Optional[torch.Tensor]: Precomputed text features, or None if no prompts are given
        """
        if not text_prompts:
            return None
with torch.no_grad():
# Process in batches to avoid CUDA memory issues
batch_size = 128 # Adjust based on GPU memory
features_list = []
for i in range(0, len(text_prompts), batch_size):
batch_prompts = text_prompts[i:i+batch_size]
text_tokens = clip.tokenize(batch_prompts).to(self.device)
batch_features = self.model.encode_text(text_tokens)
batch_features = batch_features / batch_features.norm(dim=-1, keepdim=True)
features_list.append(batch_features)
# Concatenate all batches
if len(features_list) > 1:
text_features = torch.cat(features_list, dim=0)
else:
text_features = features_list[0]
return text_features
    def _perform_pyramid_analysis(self,
                                  image: Union[Image.Image, np.ndarray],
                                  levels: int = 4,
                                  base_threshold: float = 0.25,
                                  aspect_ratios: Optional[List[float]] = None) -> Dict[str, Any]:
"""
Performs multi-scale pyramid analysis on the image to improve landmark detection.
Args:
image: Input image
levels: Number of pyramid levels
base_threshold: Base confidence threshold
aspect_ratios: Different aspect ratios to try (for tall buildings vs wide landscapes)
Returns:
Dict: Results of pyramid analysis
"""
        if aspect_ratios is None:
            aspect_ratios = [1.0, 0.75, 1.5]  # Default ratios; avoids a mutable default argument
        # Ensure image is PIL format
        if not isinstance(image, Image.Image):
            if isinstance(image, np.ndarray):
                image = Image.fromarray(image)
            else:
                raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
        if self.landmark_text_features is None:
            return {"is_landmark": False, "results": [], "best_result": None}
width, height = image.size
pyramid_results = []
        # Process each scale and aspect-ratio combination
        for level in range(levels):
            # Compute the scale factor for this pyramid level
            scale_factor = 1.0 - (level * 0.2)
for aspect_ratio in aspect_ratios:
                # Compute new dimensions, keeping the area approximately constant
                if aspect_ratio != 1.0:
                    # Adjust the aspect ratio while approximately preserving area
                    new_width = int(width * scale_factor * (1/aspect_ratio)**0.5)
                    new_height = int(height * scale_factor * aspect_ratio**0.5)
                else:
                    new_width = int(width * scale_factor)
                    new_height = int(height * scale_factor)
                # Resize the image
                scaled_image = image.resize((new_width, new_height), Image.LANCZOS)
                # Preprocess the image
image_input = self.preprocess(scaled_image).unsqueeze(0).to(self.device)
                # Extract image features
                with torch.no_grad():
                    image_features = self.model.encode_image(image_input)
                    image_features = image_features / image_features.norm(dim=-1, keepdim=True)
                    # Compute similarity against the precomputed landmark text features
                    similarity = (100.0 * image_features @ self.landmark_text_features.T).softmax(dim=-1)
                    similarity = similarity.cpu().numpy()[0]
                    # Find the best match
                    best_idx = similarity.argmax().item()
                    best_score = similarity[best_idx]
if best_score >= base_threshold:
landmark_id = list(self.landmark_data.keys())[best_idx]
landmark_info = self.landmark_data[landmark_id]
pyramid_results.append({
"landmark_id": landmark_id,
"landmark_name": landmark_info["name"],
"confidence": float(best_score),
"scale_factor": scale_factor,
"aspect_ratio": aspect_ratio,
"location": landmark_info["location"]
})
        # Sort by confidence
        pyramid_results.sort(key=lambda x: x["confidence"], reverse=True)
return {
"is_landmark": len(pyramid_results) > 0,
"results": pyramid_results,
"best_result": pyramid_results[0] if pyramid_results else None
}
def _enhance_features(self, image: Union[Image.Image, np.ndarray]) -> Image.Image:
"""
Enhances image features to improve landmark detection.
Args:
image: Input image
Returns:
PIL.Image: Enhanced image
"""
# Ensure image is PIL format
if not isinstance(image, Image.Image):
if isinstance(image, np.ndarray):
image = Image.fromarray(image)
else:
raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
# Convert to numpy for processing
img_array = np.array(image)
# Skip processing for grayscale images
if len(img_array.shape) < 3:
return image
        # Apply adaptive contrast enhancement in LAB color space
        try:
            # Import inside the try block so the ImportError handler below catches a missing skimage
            from skimage import color, exposure
            # Convert to LAB color space
            if img_array.shape[2] == 4:  # Handle RGBA by dropping the alpha channel
                img_array = img_array[:, :, :3]
            lab = color.rgb2lab(img_array[:, :, :3] / 255.0)
l_channel = lab[:,:,0]
            # Stretch the contrast of the L (lightness) channel between its 2nd and 98th percentiles.
            # The L channel spans 0-100, so out_range must be given explicitly; the float default
            # would rescale into skimage's float dtype range and darken the image.
            p2, p98 = np.percentile(l_channel, (2, 98))
            l_channel_enhanced = exposure.rescale_intensity(l_channel, in_range=(p2, p98), out_range=(0, 100))
            # Replace the L channel and convert back to RGB
            lab[:, :, 0] = l_channel_enhanced
            enhanced_img = np.clip(color.lab2rgb(lab), 0.0, 1.0) * 255.0
enhanced_img = enhanced_img.astype(np.uint8)
return Image.fromarray(enhanced_img)
except ImportError:
print("Warning: skimage not available for feature enhancement")
return image
except Exception as e:
print(f"Error in feature enhancement: {e}")
return image
def _determine_landmark_type(self, landmark_id):
"""
自動判斷地標類型,基於地標數據和命名
Returns:
str: 地標類型,用於調整閾值
"""
if not landmark_id:
return "building" # 預設類型
        # Fetch detailed landmark data
        landmark_data = self.landmark_data if hasattr(self, 'landmark_data') else {}
        landmark_info = landmark_data.get(landmark_id, {})
        # Collect landmark-related text
        landmark_id_lower = landmark_id.lower()
        landmark_name = landmark_info.get("name", "").lower()
        landmark_location = landmark_info.get("location", "").lower()
        landmark_aliases = [alias.lower() for alias in landmark_info.get("aliases", [])]
        # Combine all text data for feature matching
        combined_text = " ".join([landmark_id_lower, landmark_name] + landmark_aliases)
        # Characteristic keywords per landmark type (non-English terms retained as matching data)
type_features = {
"skyscraper": ["skyscraper", "tall", "tower", "高樓", "摩天", "大厦", "タワー"],
"tower": ["tower", "bell", "clock", "塔", "鐘樓", "タワー", "campanile"],
"monument": ["monument", "memorial", "statue", "紀念", "雕像", "像", "memorial"],
"natural": ["mountain", "lake", "canyon", "falls", "beach", "山", "湖", "峽谷", "瀑布", "海灘"],
"temple": ["temple", "shrine", "寺", "神社", "廟"],
"palace": ["palace", "castle", "宮", "城", "皇宮", "宫殿"],
"distinctive": ["unique", "leaning", "slanted", "傾斜", "斜", "獨特", "傾く"]
}
        # Check whether the landmark is located in an Asian region
        asian_regions = ["china", "japan", "korea", "taiwan", "singapore", "vietnam", "thailand",
                         "hong kong", "中國", "日本", "韓國", "台灣", "新加坡", "越南", "泰國", "香港"]
        is_asian = any(region in landmark_location for region in asian_regions)
        # Determine the landmark type by keyword matches
        best_type = None
        max_matches = 0
for type_name, features in type_features.items():
            # Count matching keywords
            matches = sum(1 for feature in features if feature in combined_text)
            if matches > max_matches:
                max_matches = matches
                best_type = type_name
        # Regional special case: Asian tower-style buildings use the lower skyscraper threshold
        if is_asian and best_type == "tower":
            best_type = "skyscraper"
        # Special case: leaning or tilted structures need dedicated handling
        if any(term in combined_text for term in ["leaning", "slanted", "tilt", "inclined", "斜", "傾斜"]):
            return "distinctive"
        return best_type if best_type and max_matches > 0 else "building"  # Default to generic building
def classify_image_region(self,
image: Union[Image.Image, np.ndarray],
box: List[float],
threshold: float = 0.25,
detection_type: str = "close_up") -> Dict[str, Any]:
"""
對圖像的特定區域進行地標分類,具有增強的多尺度和部分識別能力
Args:
image: 原始圖像 (PIL Image 或 numpy數組)
box: 邊界框 [x1, y1, x2, y2]
threshold: 基礎分類置信度閾值
detection_type: 檢測類型,影響置信度調整
Returns:
Dict: 地標分類結果
"""
# 確保圖像是PIL格式
if not isinstance(image, Image.Image):
if isinstance(image, np.ndarray):
image = Image.fromarray(image)
else:
raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
        # Key the cache by image hash, box, and detection type
        region_key = (self._get_image_hash(image), tuple(box), detection_type)
        if region_key in self.results_cache:
            return self.results_cache[region_key]
        if self.landmark_text_features is None:
            return {"landmark_id": None, "landmark_name": None, "confidence": 0.0, "is_landmark": False}
        # Crop the region
x1, y1, x2, y2 = map(int, box)
cropped_image = image.crop((x1, y1, x2, y2))
enhanced_image = self._enhance_features(cropped_image)
        # Analyze viewpoint information
viewpoint_info = self._analyze_viewpoint(enhanced_image)
dominant_viewpoint = viewpoint_info["dominant_viewpoint"]
        # Compute region geometry
region_width = x2 - x1
region_height = y2 - y1
image_width, image_height = image.size
        # Estimate the likely detection type from the region's share of the image
        region_area_ratio = (region_width * region_height) / (image_width * image_height)
        if detection_type == "auto":
if region_area_ratio > 0.5:
detection_type = "close_up"
elif region_area_ratio > 0.2:
detection_type = "partial"
else:
detection_type = "distant"
        # Adjust the detection type based on viewpoint
        if dominant_viewpoint == "close_up" and detection_type != "close_up":
            detection_type = "close_up"
        elif dominant_viewpoint == "distant" and detection_type != "distant":
            detection_type = "distant"
        elif dominant_viewpoint == "angled_view":
            detection_type = "partial"  # An angled view may indicate partial visibility
        # Adjust the confidence threshold
        base_multiplier = self.confidence_threshold_multipliers.get(detection_type, 1.0)
        adjusted_threshold = threshold * base_multiplier
        # Widen the scale range and aspect ratios for multi-scale processing, improving support for tilted structures
        scales = [1.0]  # Default scale
        # Choose scales based on the detection type
        if detection_type in ["partial", "distant"]:
            scales = [0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3]  # Standard range
        # Widen the range further for special viewpoints
        if dominant_viewpoint in ["angled_view", "low_angle"]:
            scales = [0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4]  # Wider range
        # Prepare aspect ratios covering both horizontal and vertical landmarks
        aspect_ratios = [1.0, 0.8, 1.2]  # Standard aspect ratios
        # Add more aspect ratios for potentially tilted structures
        if dominant_viewpoint in ["angled_view", "unique_feature"]:
            aspect_ratios = [0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.5]  # More varied aspect ratios
best_result = {
"landmark_id": None,
"landmark_name": None,
"confidence": 0.0,
"is_landmark": False
}
        # Multi-scale and aspect-ratio analysis
        for scale in scales:
            for aspect_ratio in aspect_ratios:
                # Rescale the cropped region
                current_width, current_height = cropped_image.size
                # Compute new dimensions, preserving area while adjusting aspect ratio
if aspect_ratio != 1.0:
new_width = int(current_width * scale * (1/aspect_ratio)**0.5)
new_height = int(current_height * scale * aspect_ratio**0.5)
else:
new_width = int(current_width * scale)
new_height = int(current_height * scale)
                # Ensure dimensions are at least 1 pixel
                new_width = max(1, new_width)
                new_height = max(1, new_height)
                # Resize the image
try:
scaled_image = cropped_image.resize((new_width, new_height), Image.LANCZOS)
except Exception as e:
print(f"Failed to resize image to {new_width}x{new_height}: {e}")
continue
                # Preprocess the cropped region
try:
image_input = self.preprocess(scaled_image).unsqueeze(0).to(self.device)
except Exception as e:
print(f"Failed to preprocess image: {e}")
continue
                # Extract image features
with torch.no_grad():
try:
image_features = self.model.encode_image(image_input)
image_features = image_features / image_features.norm(dim=-1, keepdim=True)
                        # Compute similarity against landmark prompts
                        similarity = (100.0 * image_features @ self.landmark_text_features.T).softmax(dim=-1)
                        similarity = similarity.cpu().numpy()[0]
                        # Find the best match
                        best_idx = similarity.argmax().item()
                        best_score = similarity[best_idx]
                        # Update when this scale/aspect combination scores better
if best_score > best_result["confidence"]:
landmark_id = list(self.landmark_data.keys())[best_idx]
landmark_info = self.landmark_data[landmark_id]
best_result = {
"landmark_id": landmark_id,
"landmark_name": landmark_info["name"],
"location": landmark_info["location"],
"confidence": float(best_score),
"is_landmark": best_score >= adjusted_threshold,
"scale_used": scale,
"aspect_ratio_used": aspect_ratio,
"viewpoint": dominant_viewpoint
}
                            # Add any extra available information
for key in ["year_built", "architectural_style", "significance"]:
if key in landmark_info:
best_result[key] = landmark_info[key]
except Exception as e:
print(f"Error in calculating similarity: {e}")
continue
        # Apply landmark-type threshold adjustment only when a landmark ID was identified
        if best_result["landmark_id"]:
            landmark_type = self._determine_landmark_type(best_result["landmark_id"])
            # Check for special structure types such as leaning towers
            if landmark_type == "distinctive":
                # Distinctive structures get a 25% lower threshold
                type_multiplier = 0.75
            else:
                # Normalize the per-type threshold against the 0.5 baseline; the 0.5 default
                # yields a neutral multiplier for types without a configured threshold
                type_multiplier = self.landmark_type_thresholds.get(landmark_type, 0.5) / 0.5
            # Re-evaluate the landmark decision with the adjusted threshold
            final_threshold = adjusted_threshold * type_multiplier
            best_result["is_landmark"] = best_result["confidence"] >= final_threshold
            best_result["landmark_type"] = landmark_type  # Record the landmark type
            best_result["threshold_applied"] = final_threshold  # Record the threshold applied
        # Cache the result
self.results_cache[region_key] = best_result
self._manage_cache()
return best_result
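    # Example usage of classify_image_region (hypothetical image and box values):
    #   result = classifier.classify_image_region(img, [120, 40, 480, 630], detection_type="auto")
    #   if result["is_landmark"]:
    #       print(result["landmark_name"], result["confidence"])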
def classify_batch_regions(self,
image: Union[Image.Image, np.ndarray],
boxes: List[List[float]],
threshold: float = 0.28) -> List[Dict[str, Any]]:
"""
批量處理多個圖像區域,提高效率
Args:
image: 原始圖像
boxes: 邊界框列表
threshold: 置信度閾值
Returns:
List[Dict]: 分類結果列表
"""
if not self.landmark_text_features is not None:
return [{"is_landmark": False, "confidence": 0.0} for _ in boxes]
        # Ensure image is PIL format
if not isinstance(image, Image.Image):
if isinstance(image, np.ndarray):
image = Image.fromarray(image)
else:
raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
        # Nothing to process without boxes
if not boxes:
return []
        # Crop and preprocess all regions
cropped_inputs = []
for box in boxes:
x1, y1, x2, y2 = map(int, box)
cropped_image = image.crop((x1, y1, x2, y2))
processed_image = self.preprocess(cropped_image).unsqueeze(0)
cropped_inputs.append(processed_image)
        # Batch process
        batch_tensor = torch.cat(cropped_inputs).to(self.device)
        # Batch encoding
        with torch.no_grad():
            image_features = self.model.encode_image(batch_tensor)
            image_features = image_features / image_features.norm(dim=-1, keepdim=True)
            # Compute similarity
            similarity = (100.0 * image_features @ self.landmark_text_features.T).softmax(dim=-1)
            similarity = similarity.cpu().numpy()
        # Build a result for each region
results = []
for i, sim in enumerate(similarity):
best_idx = sim.argmax().item()
best_score = sim[best_idx]
if best_score >= threshold:
landmark_id = list(self.landmark_data.keys())[best_idx]
landmark_info = self.landmark_data[landmark_id]
results.append({
"landmark_id": landmark_id,
"landmark_name": landmark_info["name"],
"location": landmark_info["location"],
"confidence": float(best_score),
"is_landmark": True,
"box": boxes[i]
})
else:
results.append({
"landmark_id": None,
"landmark_name": None,
"confidence": float(best_score),
"is_landmark": False,
"box": boxes[i]
})
return results
def search_entire_image(self,
image: Union[Image.Image, np.ndarray],
threshold: float = 0.35,
detailed_analysis: bool = False) -> Dict[str, Any]:
"""
檢查整張圖像是否包含地標,具有增強的分析能力
Args:
image: 原始圖像
threshold: 置信度閾值
detailed_analysis: 是否進行詳細分析,包括多區域檢測
Returns:
Dict: 地標分類結果
"""
# 確保圖像是PIL格式
if not isinstance(image, Image.Image):
if isinstance(image, np.ndarray):
image = Image.fromarray(image)
else:
raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
        # Check the cache
        image_key = (self._get_image_hash(image), "entire_image", detailed_analysis)
        if image_key in self.results_cache:
            return self.results_cache[image_key]
        if self.landmark_text_features is None:
            return {"landmark_id": None, "landmark_name": None, "confidence": 0.0, "is_landmark": False}
        # Adjust the threshold for whole-image search
        adjusted_threshold = threshold * self.confidence_threshold_multipliers.get("full_image", 1.0)
        # Preprocess the image
        image_input = self.preprocess(image).unsqueeze(0).to(self.device)
        # Extract image features
with torch.no_grad():
image_features = self.model.encode_image(image_input)
image_features = image_features / image_features.norm(dim=-1, keepdim=True)
            # Compute similarity against landmark prompts
            similarity = (100.0 * image_features @ self.landmark_text_features.T).softmax(dim=-1)
            similarity = similarity.cpu().numpy()[0]
        # Find the best match
        best_idx = similarity.argmax().item()
        best_score = similarity[best_idx]
        # Collect the top-3 landmarks
top_indices = similarity.argsort()[-3:][::-1]
top_landmarks = []
for idx in top_indices:
score = similarity[idx]
landmark_id = list(self.landmark_data.keys())[idx]
landmark_info = self.landmark_data[landmark_id]
landmark_result = {
"landmark_id": landmark_id,
"landmark_name": landmark_info["name"],
"location": landmark_info["location"],
"confidence": float(score)
}
            # Add any extra available information
if "year_built" in landmark_info:
landmark_result["year_built"] = landmark_info["year_built"]
if "architectural_style" in landmark_info:
landmark_result["architectural_style"] = landmark_info["architectural_style"]
if "significance" in landmark_info:
landmark_result["significance"] = landmark_info["significance"]
top_landmarks.append(landmark_result)
        # Build the main result
        result = {}
if best_score >= adjusted_threshold:
landmark_id = list(self.landmark_data.keys())[best_idx]
landmark_info = self.landmark_data[landmark_id]
            # Apply the landmark-type specific threshold; the 0.5 default yields a neutral multiplier
            landmark_type = self._determine_landmark_type(landmark_id)
            type_multiplier = self.landmark_type_thresholds.get(landmark_type, 0.5) / 0.5
            final_threshold = adjusted_threshold * type_multiplier
if best_score >= final_threshold:
result = {
"landmark_id": landmark_id,
"landmark_name": landmark_info["name"],
"location": landmark_info["location"],
"confidence": float(best_score),
"is_landmark": True,
"landmark_type": landmark_type,
"top_landmarks": top_landmarks
}
                # Add any extra available information
if "year_built" in landmark_info:
result["year_built"] = landmark_info["year_built"]
if "architectural_style" in landmark_info:
result["architectural_style"] = landmark_info["architectural_style"]
if "significance" in landmark_info:
result["significance"] = landmark_info["significance"]
else:
result = {
"landmark_id": None,
"landmark_name": None,
"confidence": float(best_score),
"is_landmark": False,
"top_landmarks": top_landmarks
}
        # If detailed analysis was requested and a landmark was found, analyze sub-regions
        if detailed_analysis and result.get("is_landmark", False):
            # Define regions for deeper analysis
            width, height = image.size
            regions = [
                # Center region
                [width * 0.25, height * 0.25, width * 0.75, height * 0.75],
                # Left half
                [0, 0, width * 0.5, height],
                # Right half
                [width * 0.5, 0, width, height],
                # Top half
                [0, 0, width, height * 0.5],
                # Bottom half
                [0, height * 0.5, width, height]
            ]
region_results = []
for i, box in enumerate(regions):
region_result = self.classify_image_region(
image,
box,
threshold=threshold * 0.9,
detection_type="partial"
)
if region_result["is_landmark"]:
region_result["region_name"] = ["center", "left", "right", "top", "bottom"][i]
region_results.append(region_result)
            # Attach the region analysis results
            if region_results:
                result["region_analyses"] = region_results
        # Cache the result
self.results_cache[image_key] = result
self._manage_cache()
return result
def enhanced_landmark_detection(self,
image: Union[Image.Image, np.ndarray],
threshold: float = 0.3) -> Dict[str, Any]:
"""
Enhanced landmark detection using multiple analysis techniques.
Args:
image: Input image
threshold: Base confidence threshold
Returns:
Dict: Comprehensive landmark detection results
"""
# Ensure image is PIL format
if not isinstance(image, Image.Image):
if isinstance(image, np.ndarray):
image = Image.fromarray(image)
else:
raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
# Phase 1: Analyze viewpoint to adjust detection parameters
viewpoint_info = self._analyze_viewpoint(image)
viewpoint = viewpoint_info["dominant_viewpoint"]
# Adjust threshold based on viewpoint
if viewpoint == "distant":
adjusted_threshold = threshold * 0.7 # Lower threshold for distant views
elif viewpoint == "close_up":
adjusted_threshold = threshold * 1.1 # Higher threshold for close-ups
else:
adjusted_threshold = threshold
# Phase 2: Perform multi-scale pyramid analysis
pyramid_results = self._perform_pyramid_analysis(image, levels=3, base_threshold=adjusted_threshold)
# Phase 3: Perform grid-based region analysis
grid_results = []
width, height = image.size
# Create adaptive grid based on viewpoint
if viewpoint == "distant":
grid_size = 3 # Coarser grid for distant views
elif viewpoint == "close_up":
grid_size = 5 # Finer grid for close-ups
else:
grid_size = 4 # Default grid size
# Generate grid regions
for i in range(grid_size):
for j in range(grid_size):
box = [
width * (j/grid_size),
height * (i/grid_size),
width * ((j+1)/grid_size),
height * ((i+1)/grid_size)
]
# Apply feature enhancement
region_result = self.classify_image_region(
image,
box,
threshold=adjusted_threshold,
detection_type="auto"
)
if region_result["is_landmark"]:
region_result["grid_position"] = (i, j)
grid_results.append(region_result)
# Phase 4: Cross-validate and combine results
all_detections = []
# Add pyramid results
if pyramid_results["is_landmark"] and pyramid_results["best_result"]:
all_detections.append({
"source": "pyramid",
"landmark_id": pyramid_results["best_result"]["landmark_id"],
"landmark_name": pyramid_results["best_result"]["landmark_name"],
"confidence": pyramid_results["best_result"]["confidence"],
"scale_factor": pyramid_results["best_result"].get("scale_factor", 1.0)
})
# Add grid results
for result in grid_results:
all_detections.append({
"source": "grid",
"landmark_id": result["landmark_id"],
"landmark_name": result["landmark_name"],
"confidence": result["confidence"],
"grid_position": result.get("grid_position", (0, 0))
})
# Search entire image
full_image_result = self.search_entire_image(image, threshold=adjusted_threshold)
if full_image_result and full_image_result.get("is_landmark", False):
all_detections.append({
"source": "full_image",
"landmark_id": full_image_result["landmark_id"],
"landmark_name": full_image_result["landmark_name"],
"confidence": full_image_result["confidence"]
})
# Group by landmark_id and calculate aggregate confidence
landmark_groups = {}
for detection in all_detections:
landmark_id = detection["landmark_id"]
if landmark_id not in landmark_groups:
landmark_groups[landmark_id] = {
"landmark_id": landmark_id,
"landmark_name": detection["landmark_name"],
"detections": [],
"sources": set()
}
landmark_groups[landmark_id]["detections"].append(detection)
landmark_groups[landmark_id]["sources"].add(detection["source"])
# Calculate aggregate confidence for each landmark
for landmark_id, group in landmark_groups.items():
detections = group["detections"]
# Base confidence is the maximum confidence from any source
max_confidence = max(d["confidence"] for d in detections)
# Bonus for detection from multiple sources
source_count = len(group["sources"])
source_bonus = min(0.15, (source_count - 1) * 0.05) # Up to 15% bonus
# Consistency bonus for multiple detections of the same landmark
detection_count = len(detections)
consistency_bonus = min(0.1, (detection_count - 1) * 0.02) # Up to 10% bonus
# Calculate final confidence
aggregate_confidence = min(1.0, max_confidence + source_bonus + consistency_bonus)
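            # e.g. max_confidence 0.60 seen from 3 sources across 4 detections:
            # 0.60 + min(0.15, 2*0.05) + min(0.1, 3*0.02) = 0.60 + 0.10 + 0.06 = 0.76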
group["confidence"] = aggregate_confidence
group["detection_count"] = detection_count
group["source_count"] = source_count
# Sort landmarks by confidence
sorted_landmarks = sorted(
landmark_groups.values(),
key=lambda x: x["confidence"],
reverse=True
)
return {
"is_landmark_scene": len(sorted_landmarks) > 0,
"detected_landmarks": sorted_landmarks,
"viewpoint_info": viewpoint_info,
"primary_landmark": sorted_landmarks[0] if sorted_landmarks else None
}
def _analyze_architectural_features(self, image):
"""
Analyzes the architectural features of a structure in the image without hardcoding specific landmarks.
Args:
image: Input image
Returns:
Dict: Architectural feature analysis results
"""
# Define universal architectural feature prompts that apply to all types of landmarks
architecture_prompts = {
"tall_structure": "a tall vertical structure standing alone",
"tiered_building": "a building with multiple stacked tiers or segments",
"historical_structure": "a building with historical architectural elements",
"modern_design": "a modern structure with contemporary architectural design",
"segmented_exterior": "a structure with visible segmented or sectioned exterior",
"viewing_platform": "a tall structure with observation area at the top",
"time_display": "a structure with timepiece features",
"glass_facade": "a building with prominent glass exterior surfaces",
"memorial_structure": "a monument or memorial structure",
"ancient_construction": "ancient constructed elements or archaeological features",
"natural_landmark": "a natural geographic formation or landmark",
"slanted_design": "a structure with non-vertical or leaning profile"
}
# Calculate similarity scores against universal architectural patterns
context_scores = self.calculate_similarity_scores(image, architecture_prompts)
# Determine most relevant architectural features
top_features = sorted(context_scores.items(), key=lambda x: x[1], reverse=True)[:3]
# Calculate feature confidence
context_confidence = sum(score for _, score in top_features) / 3
# Determine primary architectural category based on top features
architectural_categories = {
"tower": ["tall_structure", "viewing_platform", "time_display"],
"skyscraper": ["tall_structure", "modern_design", "glass_facade"],
"historical": ["historical_structure", "ancient_construction", "memorial_structure"],
"natural": ["natural_landmark"],
"distinctive": ["tiered_building", "segmented_exterior", "slanted_design"]
}
# Score each category based on the top features
category_scores = {}
for category, features in architectural_categories.items():
category_score = 0
for feature, score in context_scores.items():
if feature in features:
category_score += score
category_scores[category] = category_score
primary_category = max(category_scores.items(), key=lambda x: x[1])[0]
return {
"architectural_features": top_features,
"context_confidence": context_confidence,
"primary_category": primary_category,
"category_scores": category_scores
}
def intelligent_landmark_search(self,
image: Union[Image.Image, np.ndarray],
yolo_boxes: Optional[List[List[float]]] = None,
base_threshold: float = 0.25) -> Dict[str, Any]:
"""
對圖像進行智能地標搜索,綜合整張圖像分析和區域分析
Args:
image: 原始圖像
yolo_boxes: YOLO檢測到的邊界框 (可選)
base_threshold: 基礎置信度閾值
Returns:
Dict: 包含所有檢測結果的綜合分析
"""
# 確保圖像是PIL格式
if not isinstance(image, Image.Image):
if isinstance(image, np.ndarray):
image = Image.fromarray(image)
else:
raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
        # Without YOLO boxes, slightly lower the threshold to improve recall
        actual_threshold = base_threshold * 0.85 if yolo_boxes is None or len(yolo_boxes) == 0 else base_threshold
        # First analyze the entire image
try:
full_image_result = self.search_entire_image(
image,
threshold=actual_threshold,
                detailed_analysis=True  # Ensure detailed analysis is enabled
)
            # With no YOLO boxes and no whole-image hit, try multi-scale pyramid analysis
if (yolo_boxes is None or len(yolo_boxes) == 0) and (not full_image_result or not full_image_result.get("is_landmark", False)):
print("No YOLO boxes provided, attempting multi-scale pyramid analysis")
try:
if hasattr(self, '_perform_pyramid_analysis'):
pyramid_results = self._perform_pyramid_analysis(
image,
                        levels=4,
base_threshold=actual_threshold,
aspect_ratios=[1.0, 0.75, 1.5, 0.5, 2.0]
)
if pyramid_results and pyramid_results.get("is_landmark", False) and pyramid_results.get("best_result", {}).get("confidence", 0) > actual_threshold:
                        # Use the pyramid result to enhance or replace the whole-image result
                        if not full_image_result or not full_image_result.get("is_landmark", False):
full_image_result = {
"is_landmark": True,
"landmark_id": pyramid_results["best_result"]["landmark_id"],
"landmark_name": pyramid_results["best_result"]["landmark_name"],
"confidence": pyramid_results["best_result"]["confidence"],
"location": pyramid_results["best_result"].get("location", "Unknown Location")
}
print(f"Pyramid analysis detected landmark: {pyramid_results['best_result']['landmark_name']} with confidence {pyramid_results['best_result']['confidence']:.3f}")
else:
print("Pyramid analysis not available, skipping multi-scale detection")
except Exception as e:
print(f"Error in pyramid analysis: {e}")
except Exception as e:
print(f"Error in search_entire_image: {e}")
import traceback
traceback.print_exc()
full_image_result = None
        # Initialize the result dictionary
        result = {
            "full_image_analysis": full_image_result if full_image_result else {},
            "is_landmark_scene": False,  # Default value
            "detected_landmarks": []
        }
        # Context-aware comparison to break near-ties in the ranking
        if full_image_result and "top_landmarks" in full_image_result and len(full_image_result["top_landmarks"]) >= 2:
            top_landmarks = full_image_result["top_landmarks"]
            # Check whether the top two results are very close (confidence gap under 0.1)
            if len(top_landmarks) >= 2 and abs(top_landmarks[0]["confidence"] - top_landmarks[1]["confidence"]) < 0.1:
                # For near-ties, disambiguate using generic architectural feature analysis
                try:
                    # Analyze architectural features
                    if hasattr(self, '_analyze_architectural_features'):
                        architectural_analysis = self._analyze_architectural_features(image)
                        top_features = architectural_analysis.get("architectural_features", [])
                        primary_category = architectural_analysis.get("primary_category", "")
                        # Adjust landmark confidence based on architectural features
                        for i, landmark in enumerate(top_landmarks[:2]):
                            landmark_id = landmark.get("landmark_id", "").lower()
                            confidence_boost = 0
                            # Boost by primary category, using generic conditions rather than specific landmark names
if primary_category == "tower" and any(term in landmark_id for term in ["tower", "spire", "needle"]):
confidence_boost += 0.05
elif primary_category == "skyscraper" and any(term in landmark_id for term in ["building", "skyscraper", "tall"]):
confidence_boost += 0.05
elif primary_category == "historical" and any(term in landmark_id for term in ["monument", "castle", "palace", "temple"]):
confidence_boost += 0.05
elif primary_category == "distinctive" and any(term in landmark_id for term in ["unusual", "unique", "special", "famous"]):
confidence_boost += 0.05
                            # Fine-tune further using generic feature descriptions rather than specific landmarks
for feature, score in top_features:
if feature == "time_display" and "clock" in landmark_id:
confidence_boost += 0.03
elif feature == "segmented_exterior" and "segmented" in landmark_id:
confidence_boost += 0.03
elif feature == "slanted_design" and "leaning" in landmark_id:
confidence_boost += 0.03
                            # Apply the confidence adjustment
                            if confidence_boost > 0:
                                top_landmarks[i]["confidence"] += confidence_boost
                                print(f"Boosted {landmark['landmark_name']} confidence by {confidence_boost:.2f} based on architectural features ({primary_category})")
                        # Re-rank by adjusted confidence
top_landmarks.sort(key=lambda x: x["confidence"], reverse=True)
full_image_result["top_landmarks"] = top_landmarks
if top_landmarks:
full_image_result["landmark_id"] = top_landmarks[0]["landmark_id"]
full_image_result["landmark_name"] = top_landmarks[0]["landmark_name"]
full_image_result["confidence"] = top_landmarks[0]["confidence"]
full_image_result["location"] = top_landmarks[0].get("location", "Unknown Location")
except Exception as e:
print(f"Error in architectural feature analysis: {e}")
import traceback
traceback.print_exc()
if full_image_result and full_image_result.get("is_landmark", False):
result["is_landmark_scene"] = True
landmark_id = full_image_result.get("landmark_id", "unknown")
            # Extract landmark-specific info
landmark_specific_info = self._extract_landmark_specific_info(landmark_id)
landmark_info = {
"landmark_id": landmark_id,
"landmark_name": full_image_result.get("landmark_name", "Unknown Landmark"),
"confidence": full_image_result.get("confidence", 0.0),
"location": full_image_result.get("location", "Unknown Location"),
"region_type": "full_image",
"box": [0, 0, getattr(image, 'width', 0), getattr(image, 'height', 0)]
}
            # Merge landmark-specific info
            landmark_info.update(landmark_specific_info)
            # Prefer the more accurate landmark name when the specific info provides one
            if landmark_specific_info.get("landmark_name"):
                landmark_info["landmark_name"] = landmark_specific_info["landmark_name"]
result["detected_landmarks"].append(landmark_info)
            # Ensure landmark-specific activities are promoted to the primary result
            if landmark_specific_info.get("has_specific_activities", False):
result["primary_landmark_activities"] = landmark_specific_info.get("landmark_specific_activities", [])
print(f"Set primary landmark activities: {len(result['primary_landmark_activities'])} activities for {landmark_info['landmark_name']}")
        # If YOLO bounding boxes were provided, analyze those regions
        if yolo_boxes and len(yolo_boxes) > 0:
            for box in yolo_boxes:
                try:
if hasattr(self, 'classify_image_region'):
box_result = self.classify_image_region(
image,
box,
threshold=base_threshold,
detection_type="auto"
)
                    # If a landmark was detected in this box
                    if box_result and box_result.get("is_landmark", False):
                        # Check for duplicates among already-detected landmarks
                        is_duplicate = False
                        for existing in result["detected_landmarks"]:
                            if existing.get("landmark_id") == box_result.get("landmark_id"):
                                # Update if the new confidence is higher
                                if box_result.get("confidence", 0) > existing.get("confidence", 0):
                                    existing.update({
                                        "confidence": box_result.get("confidence", 0),
                                        "region_type": "yolo_box",
                                        "box": box
                                    })
                                is_duplicate = True
                                break
                        # Add to the list if not a duplicate
                        if not is_duplicate:
result["detected_landmarks"].append({
"landmark_id": box_result.get("landmark_id", "unknown"),
"landmark_name": box_result.get("landmark_name", "Unknown Landmark"),
"confidence": box_result.get("confidence", 0.0),
"location": box_result.get("location", "Unknown Location"),
"region_type": "yolo_box",
"box": box
})
except Exception as e:
print(f"Error in analyzing YOLO box: {e}")
continue
        # Finally, run an extra grid search to catch landmarks that may have been missed,
        # but only when no landmark was found yet or only low-confidence ones were
should_do_grid_search = (
len(result["detected_landmarks"]) == 0 or
max([landmark.get("confidence", 0) for landmark in result["detected_landmarks"]], default=0) < 0.5
)
if should_do_grid_search and hasattr(self, 'classify_image_region'):
            try:
                # Build a 5x5 grid; the image is guaranteed to be a PIL Image at this point
                width, height = image.size
                if width > 0 and height > 0:
grid_boxes = []
for i in range(5):
for j in range(5):
grid_boxes.append([
width * (j/5), height * (i/5),
width * ((j+1)/5), height * ((i+1)/5)
])
                    # Analyze each grid region
for box in grid_boxes:
try:
grid_result = self.classify_image_region(
image,
box,
                                threshold=base_threshold * 0.9,  # Slightly lower threshold for grid search
detection_type="partial"
)
                            # If a landmark was detected in this grid cell
                            if grid_result and grid_result.get("is_landmark", False):
                                # Check for duplicates among already-detected landmarks
                                is_duplicate = False
                                for existing in result["detected_landmarks"]:
                                    if existing.get("landmark_id") == grid_result.get("landmark_id"):
                                        is_duplicate = True
                                        break
                                # Add to the list if not a duplicate
                                if not is_duplicate:
result["detected_landmarks"].append({
"landmark_id": grid_result.get("landmark_id", "unknown"),
"landmark_name": grid_result.get("landmark_name", "Unknown Landmark"),
"confidence": grid_result.get("confidence", 0.0),
"location": grid_result.get("location", "Unknown Location"),
"region_type": "grid",
"box": box
})
except Exception as e:
print(f"Error in analyzing grid region: {e}")
continue
except Exception as e:
print(f"Error in grid search: {e}")
import traceback
traceback.print_exc()
        # Sort detections by confidence
        result["detected_landmarks"].sort(key=lambda x: x.get("confidence", 0), reverse=True)
        # Update the overall scene-type judgment
        if len(result["detected_landmarks"]) > 0:
            result["is_landmark_scene"] = True
            result["primary_landmark"] = result["detected_landmarks"][0]
        # Attach the whole-image CLIP analysis to give the LLM more context
if full_image_result and "clip_analysis" in full_image_result:
result["clip_analysis_on_full_image"] = full_image_result["clip_analysis"]
return result
def _extract_landmark_specific_info(self, landmark_id: str) -> Dict[str, Any]:
"""
提取特定地標的詳細信息,包括特色模板和活動建議
Args:
landmark_id: 地標ID
Returns:
Dict: 地標特定信息
"""
if not landmark_id or landmark_id == "unknown":
return {"has_specific_activities": False}
specific_info = {"has_specific_activities": False}
# 從 ALL_LANDMARKS 或 self.landmark_data 中提取基本信息
landmark_data_source = None
        # Prefer the class attribute as the data source
if hasattr(self, 'landmark_data') and self.landmark_data and landmark_id in self.landmark_data:
landmark_data_source = self.landmark_data[landmark_id]
print(f"Using landmark data from class attribute for {landmark_id}")
else:
            try:
                if landmark_id in ALL_LANDMARKS:
                    landmark_data_source = ALL_LANDMARKS[landmark_id]
                    print(f"Using landmark data from ALL_LANDMARKS for {landmark_id}")
            except Exception as e:
                # ALL_LANDMARKS is imported at module load, so only lookup errors can surface here
                print(f"Error accessing ALL_LANDMARKS: {e}")
        # Process the landmark's basic data
        if landmark_data_source:
            # Extract the canonical landmark name
            if "name" in landmark_data_source:
                specific_info["landmark_name"] = landmark_data_source["name"]
            # Use the available prompts as feature templates
            if "prompts" in landmark_data_source:
                specific_info["feature_templates"] = landmark_data_source["prompts"][:5]
                specific_info["primary_template"] = landmark_data_source["prompts"][0]
            # Extract aliases
            if "aliases" in landmark_data_source:
                specific_info["aliases"] = landmark_data_source["aliases"]
            # Extract location information
            if "location" in landmark_data_source:
                specific_info["location"] = landmark_data_source["location"]
            # Extract other relevant information
            for key in ["year_built", "architectural_style", "significance", "description"]:
                if key in landmark_data_source:
                    specific_info[key] = landmark_data_source[key]
        # Pull activity suggestions from LANDMARK_ACTIVITIES (imported at module load)
        try:
            if landmark_id in LANDMARK_ACTIVITIES:
                activities = LANDMARK_ACTIVITIES[landmark_id]
                specific_info["landmark_specific_activities"] = activities
                specific_info["has_specific_activities"] = True
                print(f"Found {len(activities)} specific activities for landmark {landmark_id}")
            else:
                print(f"No specific activities found for landmark {landmark_id} in LANDMARK_ACTIVITIES")
                specific_info["has_specific_activities"] = False
except Exception as e:
print(f"Error loading landmark activities for {landmark_id}: {e}")
specific_info["has_specific_activities"] = False
return specific_info
def _analyze_viewpoint(self, image: Union[Image.Image, np.ndarray]) -> Dict[str, float]:
"""
Analyzes the image viewpoint to adjust detection parameters.
Args:
image: Input image
Returns:
Dict: Viewpoint analysis results
"""
viewpoint_prompts = {
"aerial_view": "an aerial view from above looking down",
"street_level": "a street level view looking up at a tall structure",
"eye_level": "an eye-level horizontal view of a landmark",
"distant": "a distant view of a landmark on the horizon",
"close_up": "a close-up detailed view of architectural features",
"interior": "an interior view inside a structure"
}
# Calculate similarity scores
viewpoint_scores = self.calculate_similarity_scores(image, viewpoint_prompts)
# Find dominant viewpoint
dominant_viewpoint = max(viewpoint_scores.items(), key=lambda x: x[1])
return {
"viewpoint_scores": viewpoint_scores,
"dominant_viewpoint": dominant_viewpoint[0],
"confidence": dominant_viewpoint[1]
}
def calculate_similarity_scores(self, image: Union[Image.Image, np.ndarray],
prompts: Dict[str, str]) -> Dict[str, float]:
"""
計算圖像與一組特定提示之間的相似度分數
Args:
image: 輸入圖像
prompts: 提示詞字典 {名稱: 提示文本}
Returns:
Dict[str, float]: 每個提示的相似度分數
"""
# 確保圖像是PIL格式
if not isinstance(image, Image.Image):
if isinstance(image, np.ndarray):
image = Image.fromarray(image)
else:
raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
        # Preprocess the image
        image_input = self.preprocess(image).unsqueeze(0).to(self.device)
        # Extract image features
with torch.no_grad():
image_features = self.model.encode_image(image_input)
image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        # Compute similarity with each prompt
scores = {}
prompt_texts = list(prompts.values())
prompt_tokens = clip.tokenize(prompt_texts).to(self.device)
with torch.no_grad():
prompt_features = self.model.encode_text(prompt_tokens)
prompt_features = prompt_features / prompt_features.norm(dim=-1, keepdim=True)
            # Softmax over the prompt set gives a relative score per prompt
            similarity = (100.0 * image_features @ prompt_features.T).softmax(dim=-1)
            similarity = similarity.cpu().numpy()[0]
        # Populate the result dictionary
        for i, (name, _) in enumerate(prompts.items()):
            scores[name] = float(similarity[i])
return scores
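

if __name__ == "__main__":
    # Minimal usage sketch, not part of the classifier itself. It assumes the OpenAI
    # `clip` package and the landmark data modules are installed, and that a local
    # image file "example.jpg" exists; the filename and thresholds here are hypothetical.
    classifier = CLIPZeroShotClassifier(model_name="ViT-B/16")
    img = Image.open("example.jpg").convert("RGB")
    # Whole-image landmark search with detailed region analysis
    full_result = classifier.search_entire_image(img, threshold=0.35, detailed_analysis=True)
    if full_result.get("is_landmark", False):
        print(f"Detected {full_result['landmark_name']} ({full_result['confidence']:.3f})")
    # Combined search; YOLO boxes are optional and omitted here
    combined = classifier.intelligent_landmark_search(img, yolo_boxes=None, base_threshold=0.25)
    print(f"Landmark scene: {combined['is_landmark_scene']}, detections: {len(combined['detected_landmarks'])}")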