from pathlib import Path
import json
import sys
import os
import logging
import traceback
from typing import Dict, List, Any, Optional, Union, Tuple
from concurrent.futures import ThreadPoolExecutor, as_completed
import time

# Third-party imports with robust error handling
try:
    from PIL import Image, ExifTags
    HAS_PIL = True
except ImportError:
    HAS_PIL = False
    logging.warning("PIL not installed - image processing disabled")

try:
    import gradio as gr
    HAS_GRADIO = True
except ImportError:
    HAS_GRADIO = False
    logging.warning("Gradio not installed - UI disabled")

try:
    from datasets import Dataset
    HAS_DATASETS = True
except ImportError:
    HAS_DATASETS = False
    logging.warning("Datasets library not installed - HF upload disabled")

# Advanced logging configuration
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s:%(lineno)d - %(message)s",
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler("geo_extractor.log")
    ]
)
logger = logging.getLogger("geo_metadata_extractor")


# Configurable settings with environment variable overrides and validation
class Config:
    """Configuration container with validation and defaults"""
    DEFAULT_IMAGE_DIR = Path(os.environ.get("IMAGE_DIR", "./images"))
    DEFAULT_OUTPUT_FILE = Path(os.environ.get("OUTPUT_METADATA_FILE", "./metadata.jsonl"))
    HF_USERNAME = os.environ.get("HF_USERNAME", "latterworks")
    DATASET_NAME = os.environ.get("DATASET_NAME", "geo-metadata")
    MAX_WORKERS = int(os.environ.get("MAX_WORKERS", "4"))
    BATCH_SIZE = int(os.environ.get("BATCH_SIZE", "100"))

    # Image formats with EXIF support prioritized first
    SUPPORTED_EXTENSIONS = {
        # Primary formats with good EXIF support
        '.jpg', '.jpeg', '.tiff', '.tif',
        # Secondary formats with limited metadata support
        '.png', '.heic', '.bmp', '.webp'
    }

    @classmethod
    def validate(cls) -> List[str]:
        """Validate configuration settings and return warnings"""
        warnings = []
        if cls.MAX_WORKERS < 1:
            cls.MAX_WORKERS = 1
            warnings.append(f"Invalid MAX_WORKERS value, reset to {cls.MAX_WORKERS}")
        if cls.BATCH_SIZE < 10:
            cls.BATCH_SIZE = 10
            warnings.append(f"BATCH_SIZE too small, reset to {cls.BATCH_SIZE}")
        return warnings


# Run config validation at import time
config_warnings = Config.validate()
for warning in config_warnings:
    logger.warning(warning)


class GeoMetadataExtractor:
    """Core metadata extraction logic with advanced error handling"""

    @staticmethod
    def convert_to_degrees(value: Union[tuple, list]) -> Optional[float]:
        """
        Convert GPS coordinates (degrees, minutes, seconds) to decimal degrees

        Args:
            value: Tuple of degrees, minutes, seconds

        Returns:
            Decimal degrees as float, or None if conversion fails
        """
        try:
            if not isinstance(value, (tuple, list)) or len(value) != 3:
                raise ValueError(f"GPS value must be a tuple of 3 elements, got {type(value)}")
            d, m, s = value
            degrees = float(d) + (float(m) / 60.0) + (float(s) / 3600.0)
            # Validate range
            if not -180 <= degrees <= 180:
                raise ValueError(f"GPS degrees out of valid range: {degrees}")
            return degrees
        except (TypeError, ValueError, ZeroDivisionError) as e:
            logger.error(f"Failed to convert GPS coordinates: {e}")
            return None

    @staticmethod
    def extract_gps_info(gps_info: Dict[int, Any]) -> Optional[Dict[str, Any]]:
        """
        Extract and format GPS metadata from EXIF

        Args:
            gps_info: Dictionary of GPS EXIF tags

        Returns:
            Formatted GPS data including decimal latitude/longitude
        """
        if not isinstance(gps_info, dict):
            logger.warning("GPS info is not a dictionary, skipping")
            return None
        gps_data = {}
        try:
            # Extract tag data
            for key, val in gps_info.items():
                tag_name = ExifTags.GPSTAGS.get(key, f"unknown_gps_tag_{key}")
                gps_data[tag_name] = val

            # Process coordinates if available
            if 'GPSLatitude' in gps_data and 'GPSLongitude' in gps_data:
                lat = GeoMetadataExtractor.convert_to_degrees(gps_data['GPSLatitude'])
                lon = GeoMetadataExtractor.convert_to_degrees(gps_data['GPSLongitude'])
                if lat is None or lon is None:
                    logger.error("Failed to convert latitude/longitude, skipping GPS data")
                    return None

                # Apply hemispheric references
                lat_ref = gps_data.get('GPSLatitudeRef', 'N')
                lon_ref = gps_data.get('GPSLongitudeRef', 'E')
                if lat_ref not in {'N', 'S'} or lon_ref not in {'E', 'W'}:
                    logger.warning(f"Invalid GPS reference values: lat_ref={lat_ref}, lon_ref={lon_ref}")
                else:
                    if lat_ref == 'S':
                        lat = -lat
                    if lon_ref == 'W':
                        lon = -lon

                # Add calculated decimal coordinates
                gps_data['Latitude'] = round(lat, 6)  # 6 decimal places ≈ 10cm precision
                gps_data['Longitude'] = round(lon, 6)

            # Add additional derived fields
            if 'GPSAltitude' in gps_data:
                try:
                    altitude = gps_data['GPSAltitude']
                    if hasattr(altitude, 'numerator') and hasattr(altitude, 'denominator'):
                        gps_data['AltitudeMeters'] = float(altitude.numerator) / float(altitude.denominator)
                except Exception as e:
                    logger.warning(f"Failed to process altitude: {e}")

            return gps_data
        except Exception as e:
            stack_trace = traceback.format_exc()
            logger.error(f"GPS extraction error: {e}\n{stack_trace}")
            return None

    @staticmethod
    def make_serializable(value: Any) -> Any:
        """
        Recursively convert non-serializable types to JSON-compatible values

        Args:
            value: Any value to convert

        Returns:
            JSON-serializable representation of value
        """
        try:
            # Handle rational numbers (fractions)
            if hasattr(value, 'numerator') and hasattr(value, 'denominator'):
                if value.denominator == 0:
                    return "undefined (division by zero)"
                return float(value.numerator) / float(value.denominator)
            # Handle nested structures
            elif isinstance(value, (tuple, list)):
                return [GeoMetadataExtractor.make_serializable(item) for item in value]
            elif isinstance(value, dict):
                return {str(k): GeoMetadataExtractor.make_serializable(v) for k, v in value.items()}
            # Handle binary data
            elif isinstance(value, bytes):
                return value.decode('utf-8', errors='replace')
            # Test if directly serializable
            json.dumps(value)
            return value
        except Exception as e:
            logger.warning(f"Value serialization failed, converting to string: {e}")
            return str(value)

    @staticmethod
    def get_image_metadata(image_path: Path) -> Dict[str, Any]:
        """
        Extract comprehensive metadata from an image file

        Args:
            image_path: Path to image file

        Returns:
            Dictionary of extracted metadata
        """
        # Core metadata with absolute file path
        metadata = {
            "file_name": str(image_path.absolute()),
            "extraction_time": time.strftime("%Y-%m-%d %H:%M:%S")
        }
        try:
            # Process file system metadata first (always available)
            stat_info = image_path.stat()
            metadata.update({
                "file_size": stat_info.st_size,
                "file_extension": image_path.suffix.lower(),
                "last_modified": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(stat_info.st_mtime)),
                "creation_time": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(stat_info.st_ctime))
            })

            # Exit early if PIL not available
            if not HAS_PIL:
                metadata["error"] = "PIL library not available"
                return metadata

            # Extract image and EXIF data
            with Image.open(image_path) as image:
                # Basic image properties
                metadata.update({
                    "format": image.format or "unknown",
                    "size": list(image.size),
                    "width": image.width,
                    "height": image.height,
                    "mode": image.mode or "unknown",
                    "aspect_ratio": round(image.width / image.height, 3) if image.height > 0 else None
                })
                # Extract EXIF data if available
                exif_data = None
                try:
                    # Different methods depending on image format
                    if hasattr(image, '_getexif'):
                        exif_data = image._getexif()
                    elif hasattr(image, 'getexif'):
                        exif_data = image.getexif()
                    # Some formats like PNG store metadata differently
                    if not exif_data and image.format == 'PNG' and 'exif' in image.info:
                        exif_data = image.info.get('exif')
                        metadata["exif_source"] = "PNG info block"
                except AttributeError:
                    metadata["exif_error"] = "No EXIF extraction method available"
                except Exception as e:
                    metadata["exif_error"] = f"EXIF extraction failed: {str(e)}"

                # Process EXIF data if found
                if exif_data and isinstance(exif_data, dict):
                    for tag_id, value in exif_data.items():
                        # Handle GPS data specially
                        if tag_id in ExifTags.TAGS and ExifTags.TAGS[tag_id] == "GPSInfo":
                            gps_info = GeoMetadataExtractor.extract_gps_info(value)
                            if gps_info:
                                metadata["gps_info"] = GeoMetadataExtractor.make_serializable(gps_info)
                        else:
                            # Get tag name or use numeric ID with tag_ prefix
                            tag_name = ExifTags.TAGS.get(tag_id, f"tag_{tag_id}").lower()
                            metadata[tag_name] = GeoMetadataExtractor.make_serializable(value)

                # Add camera model and date taken for convenience if available
                if 'model' in metadata:
                    metadata["camera_model"] = metadata['model']
                if 'datetimeoriginal' in metadata:
                    metadata["date_taken"] = metadata['datetimeoriginal']

            return metadata
        except Exception as e:
            # Capture full stack trace for debugging
            stack_trace = traceback.format_exc()
            logger.error(f"Image {image_path} processing failed: {e}\n{stack_trace}")
            # Return partial metadata with error information
            metadata["error"] = str(e)
            metadata["error_trace"] = stack_trace
            return metadata


class MetadataProcessor:
    """Handles batch processing and file operations"""

    @staticmethod
    def process_images(input_path: Union[str, Path]) -> List[Dict[str, Any]]:
        """
        Process image files to extract metadata

        Args:
            input_path: Path to image file or directory

        Returns:
            List of metadata dictionaries for all processed images
        """
        metadata_list = []
        input_path = Path(input_path)
        start_time = time.time()

        # Handle single file case
        if input_path.is_file() and input_path.suffix.lower() in Config.SUPPORTED_EXTENSIONS:
            logger.info(f"Processing single image: {input_path}")
            metadata = GeoMetadataExtractor.get_image_metadata(input_path)
            if metadata:
                metadata_list.append(metadata)
        # Handle directory case
        elif input_path.is_dir():
            logger.info(f"Processing directory: {input_path}")
            # Collect all image files first
            image_paths = [
                path for path in input_path.rglob("*")
                if path.is_file() and path.suffix.lower() in Config.SUPPORTED_EXTENSIONS
            ]
            total_images = len(image_paths)
            logger.info(f"Found {total_images} images to process")

            # Process in parallel with progress tracking
            if total_images > 0:
                processed = 0
                with ThreadPoolExecutor(max_workers=Config.MAX_WORKERS) as executor:
                    # Submit all tasks
                    future_to_path = {
                        executor.submit(GeoMetadataExtractor.get_image_metadata, path): path
                        for path in image_paths
                    }
                    # Process as they complete
                    for future in as_completed(future_to_path):
                        path = future_to_path[future]
                        try:
                            metadata = future.result()
                            if metadata:
                                metadata_list.append(metadata)
                            # Update progress
                            processed += 1
                            if processed % 10 == 0 or processed == total_images:
                                elapsed = time.time() - start_time
                                rate = processed / elapsed if elapsed > 0 else 0
                                logger.info(
                                    f"Processed {processed}/{total_images} images "
                                    f"({processed/total_images*100:.1f}%) - {rate:.2f} images/sec"
                                )
                        except Exception as e:
                            logger.error(f"Error processing {path}: {e}")
            else:
                logger.warning(f"No images found in directory: {input_path}")
        else:
            logger.error(f"Invalid input: {input_path} is not a file or directory")
            return [{"error": f"Invalid input: {input_path} is not a file or directory"}]

        # Summarize results
        elapsed = time.time() - start_time
        images_per_second = len(metadata_list) / elapsed if elapsed > 0 else 0
        logger.info(
            f"Completed processing {len(metadata_list)} images in {elapsed:.2f} seconds "
            f"({images_per_second:.2f} images/sec)"
        )
        return metadata_list

    @staticmethod
    def save_metadata_to_jsonl(metadata_list: List[Dict[str, Any]], output_file: Path) -> bool:
        """
        Save metadata to JSONL format with error handling

        Args:
            metadata_list: List of metadata dictionaries
            output_file: Path to output file

        Returns:
            True if save was successful, False otherwise
        """
        try:
            # Create directory if needed
            output_file.parent.mkdir(parents=True, exist_ok=True)
            # Write to file
            with output_file.open('w', encoding='utf-8') as f:
                for entry in metadata_list:
                    f.write(json.dumps(entry, ensure_ascii=False) + '\n')
            logger.info(f"Successfully saved {len(metadata_list)} entries to {output_file}")
            return True
        except Exception as e:
            stack_trace = traceback.format_exc()
            logger.error(f"Failed to save metadata: {e}\n{stack_trace}")
            return False

    @staticmethod
    def upload_to_huggingface(metadata_file: Path, username: str, dataset_name: str) -> str:
        """
        Upload metadata to Hugging Face as a dataset

        Args:
            metadata_file: Path to JSONL file
            username: Hugging Face username
            dataset_name: Dataset name to create/update

        Returns:
            Status message
        """
        if not HAS_DATASETS:
            return "Hugging Face datasets library not installed"
        try:
            # Read metadata
            metadata_list = []
            with metadata_file.open('r', encoding='utf-8') as f:
                for line in f:
                    metadata_list.append(json.loads(line))
            if not metadata_list:
                return "No metadata to upload"

            # Create dataset
            logger.info(f"Creating dataset with {len(metadata_list)} entries")
            dataset = Dataset.from_dict({
                "images": [entry.get("file_name", "unknown") for entry in metadata_list],
                "metadata": metadata_list
            })

            # Push to Hub
            dataset_path = f"{username}/{dataset_name}"
            logger.info(f"Pushing dataset to {dataset_path}")
            dataset.push_to_hub(dataset_path, private=False)
            return f"Successfully uploaded to {dataset_path} with {len(metadata_list)} entries"
        except Exception as e:
            stack_trace = traceback.format_exc()
            logger.error(f"Upload failed: {e}\n{stack_trace}")
            return f"Upload failed: {str(e)}"


class GradioInterface:
    """Gradio UI interface"""

    @staticmethod
    def create_interface():
        """
        Create the Gradio interface

        Returns:
            Gradio interface object
        """
        if not HAS_GRADIO:
            logger.error("Gradio not installed, cannot create interface")
            return None

        def process_input(image_file, dir_path: str, username: str, dataset_name: str) -> str:
            """
            Process inputs from Gradio UI

            Args:
                image_file: Uploaded file object or None
                dir_path: Directory path string
                username: Hugging Face username
                dataset_name: Dataset name

            Returns:
                Results as formatted text
            """
            output_lines = []
            metadata_list = []

            # Handle single image upload
            if image_file:
                image_path = Path(image_file.name)
                output_lines.append(f"## Processing Single Image: {image_path.name}")
                single_metadata = MetadataProcessor.process_images(image_path)
                metadata_list.extend(single_metadata)
                # Format first entry for display
                if single_metadata:
                    output_lines.append("### Image Metadata:")
                    output_lines.append("```json")
                    output_lines.append(json.dumps(single_metadata[0], indent=2))
                    output_lines.append("```")

            # Handle directory processing
            if dir_path:
                dir_path = Path(dir_path)
                if dir_path.is_dir():
output_lines.append(f"## Processing Directory: {dir_path}") dir_metadata = MetadataProcessor.process_images(dir_path) # Add to full list metadata_list.extend(dir_metadata) # Summarize results output_lines.append(f"### Directory Results:") output_lines.append(f"- Processed {len(dir_metadata)} images") # Location data summary location_count = sum(1 for entry in dir_metadata if entry.get("gps_info") is not None) output_lines.append(f"- Found location data in {location_count} images ({location_count/len(dir_metadata)*100:.1f}% if len(dir_metadata) > 0 else 0}%)") # Show a few examples if available if dir_metadata: output_lines.append("\n### Sample Entry:") output_lines.append("```json") output_lines.append(json.dumps(dir_metadata[0], indent=2)) output_lines.append("```") else: output_lines.append(f"⚠️ Error: {dir_path} is not a directory") # Save and upload if we have metadata if metadata_list: temp_output_file = Path("temp_metadata.jsonl") output_lines.append(f"\n## Saving and Uploading") if MetadataProcessor.save_metadata_to_jsonl(metadata_list, temp_output_file): output_lines.append(f"✅ Saved metadata to {temp_output_file}") # Upload to Hugging Face upload_result = MetadataProcessor.upload_to_huggingface( temp_output_file, username, dataset_name ) output_lines.append(f"📤 {upload_result}") else: output_lines.append("❌ Failed to save metadata") return "\n".join(output_lines) if output_lines else "Please upload an image or provide a directory path" # Create the interface demo = gr.Interface( fn=process_input, inputs=[ gr.File(label="Upload Image", file_types=list(Config.SUPPORTED_EXTENSIONS)), gr.Textbox(label="Image Directory", placeholder=str(Config.DEFAULT_IMAGE_DIR), value=str(Config.DEFAULT_IMAGE_DIR)), gr.Textbox(label="Hugging Face Username", value=Config.HF_USERNAME), gr.Textbox(label="Dataset Name", value=Config.DATASET_NAME) ], outputs=gr.Markdown(label="Results"), title="Enhanced Geo-Metadata Extractor", description=( "Upload an image or process a directory to extract location metadata and other EXIF data. " "Results can be automatically uploaded to Hugging Face Datasets." ), allow_flagging="never", examples=[ [None, "sample_images", Config.HF_USERNAME, "sample-geo-metadata"] ] ) return demo def main(): """Main entry point""" logger.info("Starting Geo-Metadata Extractor") # Check dependencies if not HAS_PIL: logger.error("PIL is required for image processing. Please install: pip install pillow") sys.exit(1) # Create and launch the UI if running directly if HAS_GRADIO: logger.info("Creating Gradio interface") demo = GradioInterface.create_interface() if demo: logger.info("Launching Gradio interface") demo.launch(server_name="0.0.0.0", server_port=7860) else: logger.error("Failed to create Gradio interface") else: logger.warning("Gradio not installed, running in CLI mode") # Process default directory as fallback if Config.DEFAULT_IMAGE_DIR.exists(): logger.info(f"Processing default directory: {Config.DEFAULT_IMAGE_DIR}") metadata = MetadataProcessor.process_images(Config.DEFAULT_IMAGE_DIR) if metadata: logger.info(f"Saving {len(metadata)} entries to {Config.DEFAULT_OUTPUT_FILE}") MetadataProcessor.save_metadata_to_jsonl(metadata, Config.DEFAULT_OUTPUT_FILE) logger.info(f"Metadata saved to {Config.DEFAULT_OUTPUT_FILE}") else: logger.error(f"Default directory not found: {Config.DEFAULT_IMAGE_DIR}") if __name__ == "__main__": main()