import torch import torch.nn as nn import torchvision.transforms as transforms import numpy as np from PIL import Image from typing import Dict, List, Tuple, Optional, Any import logging class Places365Model: """ Places365 scene classification model wrapper for scene understanding integration. Provides scene classification and scene attribute prediction capabilities. """ def __init__(self, model_name: str = 'resnet50_places365', device: Optional[str] = None): """ Initialize Places365 model with configurable architecture and device. Args: model_name: Model architecture name (默認 resnet50) device: Target device for inference (auto-detected if None) """ self.logger = logging.getLogger(self.__class__.__name__) # Device configuration with fallback logic if device is None: self.device = "cuda" if torch.cuda.is_available() else "cpu" else: self.device = device self.model_name = model_name self.model = None self.scene_classes = [] self.scene_attributes = [] # Model configuration mapping self.model_configs = { 'resnet18_places365': { 'arch': 'resnet18', 'num_classes': 365, 'url': 'http://places2.csail.mit.edu/models_places365/resnet18_places365.pth.tar' }, 'resnet50_places365': { 'arch': 'resnet50', 'num_classes': 365, 'url': 'http://places2.csail.mit.edu/models_places365/resnet50_places365.pth.tar' }, 'densenet161_places365': { 'arch': 'densenet161', 'num_classes': 365, 'url': 'http://places2.csail.mit.edu/models_places365/densenet161_places365.pth.tar' } } self._load_model() self._load_class_names() self._setup_scene_mapping() def _load_model(self): """載入與初始化 Places365 model""" try: if self.model_name not in self.model_configs: raise ValueError(f"Unsupported model name: {self.model_name}") config = self.model_configs[self.model_name] # Import model architecture if config['arch'].startswith('resnet'): import torchvision.models as models if config['arch'] == 'resnet18': self.model = models.resnet18(num_classes=config['num_classes']) elif config['arch'] == 'resnet50': self.model = models.resnet50(num_classes=config['num_classes']) elif config['arch'] == 'densenet161': import torchvision.models as models self.model = models.densenet161(num_classes=config['num_classes']) # Load pretrained weights checkpoint = torch.hub.load_state_dict_from_url( config['url'], map_location=self.device, progress=True ) # Handle different checkpoint formats if 'state_dict' in checkpoint: state_dict = checkpoint['state_dict'] # Remove 'module.' prefix if present state_dict = {k.replace('module.', ''): v for k, v in state_dict.items()} else: state_dict = checkpoint self.model.load_state_dict(state_dict) self.model.to(self.device) self.model.eval() self.logger.info(f"Places365 model {self.model_name} loaded successfully on {self.device}") except Exception as e: self.logger.error(f"Error loading Places365 model: {str(e)}") raise def _load_class_names(self): """Load Places365 class names and scene attributes.""" try: # Load scene class names (365 categories) import urllib.request class_url = 'https://raw.githubusercontent.com/csailvision/places365/master/categories_places365.txt' class_file = urllib.request.urlopen(class_url) self.scene_classes = [] for line in class_file: class_name = line.decode('utf-8').strip().split(' ')[0][3:] # Remove /x/ prefix self.scene_classes.append(class_name) # Load scene attributes (optional, for enhanced description) attr_url = 'https://raw.githubusercontent.com/csailvision/places365/master/labels_sunattribute.txt' try: attr_file = urllib.request.urlopen(attr_url) self.scene_attributes = [] for line in attr_file: attr_name = line.decode('utf-8').strip() self.scene_attributes.append(attr_name) except: self.logger.warning("Scene attributes not loaded, continuing with basic classification") self.scene_attributes = [] self.logger.info(f"Loaded {len(self.scene_classes)} scene classes and {len(self.scene_attributes)} attributes") except Exception as e: self.logger.error(f"Error loading class names: {str(e)}") # Fallback to basic class names if download fails self.scene_classes = [f"scene_class_{i}" for i in range(365)] self.scene_attributes = [] def _setup_scene_mapping(self): """Setup mapping from Places365 classes to common scene types.""" # 建立Places365類別到通用場景類型的映射關係 self.scene_type_mapping = { # Indoor scenes 'living_room': 'living_room', 'bedroom': 'bedroom', 'kitchen': 'kitchen', 'dining_room': 'dining_area', 'bathroom': 'bathroom', 'office': 'office_workspace', 'conference_room': 'office_workspace', 'classroom': 'educational_setting', 'library': 'library', 'restaurant': 'restaurant', 'cafe': 'cafe', 'bar': 'bar', 'hotel_room': 'hotel_room', 'hospital_room': 'medical_facility', 'gym': 'gym', 'supermarket': 'retail_store', 'clothing_store': 'retail_store', # Outdoor urban scenes 'street': 'city_street', 'crosswalk': 'intersection', 'parking_lot': 'parking_lot', 'gas_station': 'gas_station', 'bus_station': 'bus_stop', 'train_station': 'train_station', 'airport_terminal': 'airport', 'subway_station': 'subway_station', 'bridge': 'bridge', 'highway': 'highway', 'downtown': 'commercial_district', 'shopping_mall': 'shopping_mall', # Natural outdoor scenes 'park': 'park_area', 'beach': 'beach', 'forest': 'forest', 'mountain': 'mountain', 'lake': 'lake', 'river': 'river', 'ocean': 'ocean', 'desert': 'desert', 'field': 'field', 'garden': 'garden', # Landmark and tourist areas 'castle': 'historical_monument', 'palace': 'historical_monument', 'temple': 'temple', 'church': 'church', 'mosque': 'mosque', 'museum': 'museum', 'art_gallery': 'art_gallery', 'tower': 'tourist_landmark', 'monument': 'historical_monument', # Sports and entertainment 'stadium': 'stadium', 'basketball_court': 'sports_field', 'tennis_court': 'sports_field', 'swimming_pool': 'swimming_pool', 'playground': 'playground', 'amusement_park': 'amusement_park', 'theater': 'theater', 'concert_hall': 'concert_hall', # Transportation 'airplane_cabin': 'airplane_cabin', 'train_interior': 'train_interior', 'car_interior': 'car_interior', # Construction and industrial 'construction_site': 'construction_site', 'factory': 'factory', 'warehouse': 'warehouse' } # Indoor/outdoor classification helper self.indoor_classes = { 'living_room', 'bedroom', 'kitchen', 'dining_room', 'bathroom', 'office', 'conference_room', 'classroom', 'library', 'restaurant', 'cafe', 'bar', 'hotel_room', 'hospital_room', 'gym', 'supermarket', 'clothing_store', 'airplane_cabin', 'train_interior', 'car_interior', 'theater', 'concert_hall', 'museum', 'art_gallery', 'shopping_mall' } self.outdoor_classes = { 'street', 'crosswalk', 'parking_lot', 'gas_station', 'bus_station', 'train_station', 'airport_terminal', 'bridge', 'highway', 'downtown', 'park', 'beach', 'forest', 'mountain', 'lake', 'river', 'ocean', 'desert', 'field', 'garden', 'stadium', 'basketball_court', 'tennis_court', 'swimming_pool', 'playground', 'amusement_park', 'construction_site', 'factory', 'warehouse', 'castle', 'palace', 'temple', 'church', 'mosque', 'tower', 'monument' } def preprocess(self, image_pil: Image.Image) -> torch.Tensor: """ Preprocess PIL image for Places365 model inference. Args: image_pil: Input PIL image Returns: torch.Tensor: Preprocessed image tensor """ # Places365 standard preprocessing transform = transforms.Compose([ transforms.Resize((256, 256)), transforms.CenterCrop(224), transforms.ToTensor(), transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) ]) # Convert to RGB if needed if image_pil.mode != 'RGB': image_pil = image_pil.convert('RGB') # Apply preprocessing input_tensor = transform(image_pil).unsqueeze(0) return input_tensor.to(self.device) def predict(self, image_pil: Image.Image) -> Dict[str, Any]: """ Predict scene classification and attributes for input image. Args: image_pil: Input PIL image Returns: Dict containing scene predictions and confidence scores """ try: # Preprocess image input_tensor = self.preprocess(image_pil) # Model inference with torch.no_grad(): outputs = self.model(input_tensor) probabilities = torch.nn.functional.softmax(outputs, dim=1) # 返回最有可能的項目 top_k = min(10, len(self.scene_classes)) # Configurable top-k top_probs, top_indices = torch.topk(probabilities, top_k, dim=1) # Extract results top_probs = top_probs.cpu().numpy()[0] top_indices = top_indices.cpu().numpy()[0] # Build prediction results predictions = [] for i in range(top_k): class_idx = top_indices[i] confidence = float(top_probs[i]) scene_class = self.scene_classes[class_idx] predictions.append({ 'class_name': scene_class, 'class_index': class_idx, 'confidence': confidence }) # Get primary prediction primary_prediction = predictions[0] primary_class = primary_prediction['class_name'] # 確認是 indoor/outdoor is_indoor = self._classify_indoor_outdoor(primary_class) # Map to common scene type mapped_scene_type = self._map_places365_to_scene_types(primary_class) # Determine scene attributes (basic inference based on class) scene_attributes = self._infer_scene_attributes(primary_class) result = { 'scene_label': primary_class, 'mapped_scene_type': mapped_scene_type, 'confidence': primary_prediction['confidence'], 'is_indoor': is_indoor, 'attributes': scene_attributes, 'top_predictions': predictions, 'all_probabilities': probabilities.cpu().numpy()[0].tolist() } return result except Exception as e: self.logger.error(f"Error in Places365 prediction: {str(e)}") return { 'scene_label': 'unknown', 'mapped_scene_type': 'unknown', 'confidence': 0.0, 'is_indoor': None, 'attributes': [], 'top_predictions': [], 'error': str(e) } def _classify_indoor_outdoor(self, scene_class: str) -> Optional[bool]: """ Classify if scene is indoor or outdoor based on Places365 class. Args: scene_class: Places365 scene class name Returns: bool or None: True for indoor, False for outdoor, None if uncertain """ if scene_class in self.indoor_classes: return True elif scene_class in self.outdoor_classes: return False else: # For ambiguous classes, use heuristics indoor_keywords = ['room', 'office', 'store', 'shop', 'hall', 'interior', 'indoor'] outdoor_keywords = ['street', 'road', 'park', 'field', 'beach', 'mountain', 'outdoor'] scene_lower = scene_class.lower() if any(keyword in scene_lower for keyword in indoor_keywords): return True elif any(keyword in scene_lower for keyword in outdoor_keywords): return False else: return None def _map_places365_to_scene_types(self, places365_class: str) -> str: """ Map Places365 class to common scene type used by the system. Args: places365_class: Places365 scene class name Returns: str: Mapped scene type """ # Direct mapping lookup if places365_class in self.scene_type_mapping: return self.scene_type_mapping[places365_class] # Fuzzy matching for similar classes places365_lower = places365_class.lower() # Indoor fuzzy matching if any(keyword in places365_lower for keyword in ['living', 'bedroom', 'kitchen']): return 'general_indoor_space' elif any(keyword in places365_lower for keyword in ['office', 'conference', 'meeting']): return 'office_workspace' elif any(keyword in places365_lower for keyword in ['dining', 'restaurant', 'cafe']): return 'dining_area' elif any(keyword in places365_lower for keyword in ['store', 'shop', 'market']): return 'retail_store' elif any(keyword in places365_lower for keyword in ['school', 'class', 'library']): return 'educational_setting' # Outdoor fuzzy matching elif any(keyword in places365_lower for keyword in ['street', 'road', 'crosswalk']): return 'city_street' elif any(keyword in places365_lower for keyword in ['park', 'garden', 'plaza']): return 'park_area' elif any(keyword in places365_lower for keyword in ['beach', 'ocean', 'lake']): return 'beach' elif any(keyword in places365_lower for keyword in ['mountain', 'forest', 'desert']): return 'natural_outdoor_area' elif any(keyword in places365_lower for keyword in ['parking', 'garage']): return 'parking_lot' elif any(keyword in places365_lower for keyword in ['station', 'terminal', 'airport']): return 'transportation_hub' # Landmark fuzzy matching elif any(keyword in places365_lower for keyword in ['castle', 'palace', 'monument', 'temple']): return 'historical_monument' elif any(keyword in places365_lower for keyword in ['tower', 'landmark']): return 'tourist_landmark' elif any(keyword in places365_lower for keyword in ['museum', 'gallery']): return 'cultural_venue' # Default fallback based on indoor/outdoor is_indoor = self._classify_indoor_outdoor(places365_class) if is_indoor is True: return 'general_indoor_space' elif is_indoor is False: return 'generic_street_view' else: return 'unknown' def _infer_scene_attributes(self, scene_class: str) -> List[str]: """ Infer basic scene attributes from Places365 class. Args: scene_class: Places365 scene class name Returns: List[str]: Inferred scene attributes """ attributes = [] scene_lower = scene_class.lower() # Lighting attributes if any(keyword in scene_lower for keyword in ['outdoor', 'street', 'park', 'beach']): attributes.append('natural_lighting') elif any(keyword in scene_lower for keyword in ['indoor', 'room', 'office']): attributes.append('artificial_lighting') # Functional attributes if any(keyword in scene_lower for keyword in ['commercial', 'store', 'shop', 'restaurant']): attributes.append('commercial') elif any(keyword in scene_lower for keyword in ['residential', 'home', 'living', 'bedroom']): attributes.append('residential') elif any(keyword in scene_lower for keyword in ['office', 'conference', 'meeting']): attributes.append('workplace') elif any(keyword in scene_lower for keyword in ['recreation', 'park', 'playground', 'stadium']): attributes.append('recreational') elif any(keyword in scene_lower for keyword in ['educational', 'school', 'library', 'classroom']): attributes.append('educational') # Spatial attributes if any(keyword in scene_lower for keyword in ['open', 'field', 'plaza', 'stadium']): attributes.append('open_space') elif any(keyword in scene_lower for keyword in ['enclosed', 'room', 'interior']): attributes.append('enclosed_space') return attributes def get_scene_probabilities(self, image_pil: Image.Image) -> Dict[str, float]: """ Get probability distribution over all scene classes. Args: image_pil: Input PIL image Returns: Dict mapping scene class names to probabilities """ try: input_tensor = self.preprocess(image_pil) with torch.no_grad(): outputs = self.model(input_tensor) probabilities = torch.nn.functional.softmax(outputs, dim=1) probs = probabilities.cpu().numpy()[0] return { self.scene_classes[i]: float(probs[i]) for i in range(len(self.scene_classes)) } except Exception as e: self.logger.error(f"Error getting scene probabilities: {str(e)}") return {}