Spaces:

latterworks
/

geo-metadata-extractor-gradio

Runtime error

File size: 24,750 Bytes

from pathlib import Path
import json
import sys
import os
import logging
import traceback
from typing import Dict, List, Any, Optional, Union, Tuple
from concurrent.futures import ThreadPoolExecutor, as_completed
import time

# Advanced logging configuration
# NOTE: this runs BEFORE the optional imports below. A module-level
# logging.warning() call on an unconfigured root logger implicitly calls
# logging.basicConfig() with defaults, which would turn the explicit call
# here into a no-op and drop the custom format and the log file handler.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s:%(lineno)d - %(message)s",
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler("geo_extractor.log")
    ]
)
logger = logging.getLogger("geo_metadata_extractor")

# Third-party imports with robust error handling: each optional dependency
# sets a HAS_* flag that the rest of the module checks before using it
try:
    from PIL import Image, ExifTags
    HAS_PIL = True
except ImportError:
    HAS_PIL = False
    logger.warning("PIL not installed - image processing disabled")

try:
    import gradio as gr
    HAS_GRADIO = True
except ImportError:
    HAS_GRADIO = False
    logger.warning("Gradio not installed - UI disabled")

try:
    from datasets import Dataset
    HAS_DATASETS = True
except ImportError:
    HAS_DATASETS = False
    logger.warning("Datasets library not installed - HF upload disabled")

# Configurable settings with environment variable overrides and validation
def _env_int(name: str, default: int) -> int:
    """
    Read an integer setting from the environment

    Args:
        name: Environment variable name
        default: Value used when the variable is unset or not a valid integer

    Returns:
        Parsed integer, or default if the variable is missing or malformed
    """
    raw = os.environ.get(name)
    if raw is None:
        return default
    try:
        return int(raw)
    except ValueError:
        # A malformed override must not crash the module at import time
        logging.getLogger("geo_metadata_extractor").warning(
            f"Invalid integer for {name}: {raw!r}, using default {default}"
        )
        return default


class Config:
    """Configuration container with validation and defaults"""

    DEFAULT_IMAGE_DIR = Path(os.environ.get("IMAGE_DIR", "./images"))
    DEFAULT_OUTPUT_FILE = Path(os.environ.get("OUTPUT_METADATA_FILE", "./metadata.jsonl"))
    HF_USERNAME = os.environ.get("HF_USERNAME", "latterworks")
    DATASET_NAME = os.environ.get("DATASET_NAME", "geo-metadata")
    MAX_WORKERS = _env_int("MAX_WORKERS", 4)
    BATCH_SIZE = _env_int("BATCH_SIZE", 100)

    # Image formats with EXIF support prioritized first
    SUPPORTED_EXTENSIONS = {
        # Primary formats with good EXIF support
        '.jpg', '.jpeg', '.tiff', '.tif',
        # Secondary formats with limited metadata support
        '.png', '.heic', '.bmp', '.webp'
    }

    @classmethod
    def validate(cls) -> List[str]:
        """
        Validate configuration settings and return warnings

        Out-of-range values are clamped in place on the class
        (MAX_WORKERS to at least 1, BATCH_SIZE to at least 10).

        Returns:
            One human-readable message per setting that was corrected
        """
        warnings = []

        if cls.MAX_WORKERS < 1:
            cls.MAX_WORKERS = 1
            warnings.append(f"Invalid MAX_WORKERS value, reset to {cls.MAX_WORKERS}")

        if cls.BATCH_SIZE < 10:
            cls.BATCH_SIZE = 10
            warnings.append(f"BATCH_SIZE too small, reset to {cls.BATCH_SIZE}")

        return warnings

# Run config validation at import time
config_warnings = Config.validate()
for warning in config_warnings:
    logger.warning(warning)

class GeoMetadataExtractor:
    """Core metadata extraction logic with advanced error handling"""

    # Tag IDs of the pointers to the nested Exif and GPS IFDs
    _EXIF_IFD_TAG = 0x8769
    _GPS_IFD_TAG = 0x8825

    @staticmethod
    def convert_to_degrees(value: Union[tuple, list]) -> Optional[float]:
        """
        Convert GPS coordinates (degrees, minutes, seconds) to decimal degrees

        Args:
            value: Tuple of degrees, minutes, seconds

        Returns:
            Decimal degrees as float, or None if conversion fails
        """
        try:
            if not isinstance(value, (tuple, list)) or len(value) != 3:
                raise ValueError(f"GPS value must be a tuple of 3 elements, got {type(value)}")

            d, m, s = value
            degrees = float(d) + (float(m) / 60.0) + (float(s) / 3600.0)

            # Validate range (a NaN result also fails this comparison)
            if not -180 <= degrees <= 180:
                raise ValueError(f"GPS degrees out of valid range: {degrees}")

            return degrees
        except (TypeError, ValueError, ZeroDivisionError) as e:
            logger.error(f"Failed to convert GPS coordinates: {e}")
            return None

    @staticmethod
    def extract_gps_info(gps_info: Dict[int, Any]) -> Optional[Dict[str, Any]]:
        """
        Extract and format GPS metadata from EXIF

        Args:
            gps_info: Dictionary of GPS EXIF tags

        Returns:
            GPS tags keyed by name, plus decimal 'Latitude'/'Longitude' (and
            'AltitudeMeters' when available); None if the input is not a
            dictionary or the coordinates cannot be converted
        """
        if not isinstance(gps_info, dict):
            logger.warning("GPS info is not a dictionary, skipping")
            return None

        gps_data = {}
        try:
            # Extract tag data
            for key, val in gps_info.items():
                tag_name = ExifTags.GPSTAGS.get(key, f"unknown_gps_tag_{key}")
                gps_data[tag_name] = val

            # Process coordinates if available
            if 'GPSLatitude' in gps_data and 'GPSLongitude' in gps_data:
                lat = GeoMetadataExtractor.convert_to_degrees(gps_data['GPSLatitude'])
                lon = GeoMetadataExtractor.convert_to_degrees(gps_data['GPSLongitude'])

                if lat is None or lon is None:
                    logger.error("Failed to convert latitude/longitude, skipping GPS data")
                    return None

                # Apply hemispheric references
                lat_ref = gps_data.get('GPSLatitudeRef', 'N')
                lon_ref = gps_data.get('GPSLongitudeRef', 'E')

                if lat_ref not in {'N', 'S'} or lon_ref not in {'E', 'W'}:
                    logger.warning(f"Invalid GPS reference values: lat_ref={lat_ref}, lon_ref={lon_ref}")
                else:
                    if lat_ref == 'S':
                        lat = -lat
                    if lon_ref == 'W':
                        lon = -lon

                # Add calculated decimal coordinates
                gps_data['Latitude'] = round(lat, 6)  # 6 decimal places ≈ 10cm precision
                gps_data['Longitude'] = round(lon, 6)

                # Add additional derived fields
                if 'GPSAltitude' in gps_data:
                    try:
                        altitude = gps_data['GPSAltitude']
                        if hasattr(altitude, 'numerator') and hasattr(altitude, 'denominator'):
                            meters = float(altitude.numerator) / float(altitude.denominator)
                            # GPSAltitude is stored as an absolute value; a
                            # GPSAltitudeRef of 1 marks "below sea level"
                            if gps_data.get('GPSAltitudeRef') in (1, b'\x01'):
                                meters = -meters
                            gps_data['AltitudeMeters'] = meters
                    except Exception as e:
                        logger.warning(f"Failed to process altitude: {e}")

            return gps_data
        except Exception as e:
            stack_trace = traceback.format_exc()
            logger.error(f"GPS extraction error: {e}\n{stack_trace}")
            return None

    @staticmethod
    def make_serializable(value: Any) -> Any:
        """
        Recursively convert non-serializable types to JSON-compatible values

        Args:
            value: Any value to convert

        Returns:
            JSON-serializable representation of value; values that cannot be
            represented (including NaN/Infinity) are returned as strings
        """
        try:
            # bool/int also expose numerator/denominator; return them unchanged
            # so integer tags are not silently turned into floats
            if isinstance(value, (bool, int)):
                return value

            # Handle rational numbers (fractions)
            if hasattr(value, 'numerator') and hasattr(value, 'denominator'):
                if value.denominator == 0:
                    return "undefined (division by zero)"
                return float(value.numerator) / float(value.denominator)

            # Handle nested structures
            elif isinstance(value, (tuple, list)):
                return [GeoMetadataExtractor.make_serializable(item) for item in value]

            elif isinstance(value, dict):
                return {str(k): GeoMetadataExtractor.make_serializable(v) for k, v in value.items()}

            # Handle binary data
            elif isinstance(value, bytes):
                return value.decode('utf-8', errors='replace')

            # Test if directly serializable; allow_nan=False rejects NaN and
            # Infinity, which json.dumps would otherwise emit as invalid JSON
            json.dumps(value, allow_nan=False)
            return value

        except Exception as e:
            logger.warning(f"Value serialization failed, converting to string: {e}")
            return str(value)

    @staticmethod
    def _read_exif_tags(image: Any) -> Dict[int, Any]:
        """
        Collect EXIF tags from an opened image as a plain dictionary

        Args:
            image: Opened PIL image

        Returns:
            Mapping of numeric tag ID to value, with the GPSInfo entry (when
            resolvable) as a nested dictionary; empty if no EXIF data is found
        """
        # Prefer the legacy private accessor where the format plugin offers it;
        # it is expected to return a plain dict (or None)
        legacy_getter = getattr(image, '_getexif', None)
        if callable(legacy_getter):
            merged = legacy_getter()
            if merged:
                return dict(merged)

        getter = getattr(image, 'getexif', None)
        if not callable(getter):
            return {}
        exif = getter()
        if not exif:
            return {}

        # getexif() returns a mapping object rather than a dict, and keeps the
        # Exif/GPS sub-directories behind pointer tags that must be resolved
        tags = dict(exif)
        if hasattr(exif, 'get_ifd'):
            # Tags such as DateTimeOriginal live in the nested Exif IFD
            tags.update(exif.get_ifd(GeoMetadataExtractor._EXIF_IFD_TAG))
            if GeoMetadataExtractor._GPS_IFD_TAG in tags:
                gps_ifd = exif.get_ifd(GeoMetadataExtractor._GPS_IFD_TAG)
                if gps_ifd:
                    tags[GeoMetadataExtractor._GPS_IFD_TAG] = dict(gps_ifd)
                else:
                    # Drop a pointer that leads to no usable GPS directory
                    tags.pop(GeoMetadataExtractor._GPS_IFD_TAG, None)
        return tags

    @staticmethod
    def get_image_metadata(image_path: Path) -> Dict[str, Any]:
        """
        Extract comprehensive metadata from an image file

        Never raises: on failure the partial metadata collected so far is
        returned with "error" and "error_trace" entries added.

        Args:
            image_path: Path to image file

        Returns:
            Dictionary of extracted metadata
        """
        # Core metadata with absolute file path
        metadata = {
            "file_name": str(image_path.absolute()),
            "extraction_time": time.strftime("%Y-%m-%d %H:%M:%S")
        }

        try:
            # Process file system metadata first (always available)
            stat_info = image_path.stat()
            metadata.update({
                "file_size": stat_info.st_size,
                "file_extension": image_path.suffix.lower(),
                "last_modified": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(stat_info.st_mtime)),
                # NOTE: st_ctime is platform dependent (on Unix it is the last
                # metadata change, not the creation time)
                "creation_time": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(stat_info.st_ctime))
            })

            # Exit early if PIL not available
            if not HAS_PIL:
                metadata["error"] = "PIL library not available"
                return metadata

            # Extract image and EXIF data
            with Image.open(image_path) as image:
                # Basic image properties
                metadata.update({
                    "format": image.format or "unknown",
                    "size": list(image.size),
                    "width": image.width,
                    "height": image.height,
                    "mode": image.mode or "unknown",
                    "aspect_ratio": round(image.width / image.height, 3) if image.height > 0 else None
                })

                # Extract EXIF data if available
                exif_data: Dict[int, Any] = {}
                try:
                    exif_data = GeoMetadataExtractor._read_exif_tags(image)

                    # Some formats like PNG store metadata differently
                    if not exif_data and image.format == 'PNG' and 'exif' in image.info:
                        metadata["exif_source"] = "PNG info block"
                except AttributeError:
                    metadata["exif_error"] = "No EXIF extraction method available"
                except Exception as e:
                    metadata["exif_error"] = f"EXIF extraction failed: {str(e)}"

                # Process EXIF data if found
                for tag_id, value in exif_data.items():
                    # Handle GPS data specially
                    if ExifTags.TAGS.get(tag_id) == "GPSInfo":
                        gps_info = GeoMetadataExtractor.extract_gps_info(value)
                        if gps_info:
                            metadata["gps_info"] = GeoMetadataExtractor.make_serializable(gps_info)
                    else:
                        # Get tag name or use numeric ID with tag_ prefix
                        tag_name = ExifTags.TAGS.get(tag_id, f"tag_{tag_id}").lower()
                        metadata[tag_name] = GeoMetadataExtractor.make_serializable(value)

                # Add camera model and date taken for convenience if available
                if 'model' in metadata:
                    metadata["camera_model"] = metadata['model']
                if 'datetimeoriginal' in metadata:
                    metadata["date_taken"] = metadata['datetimeoriginal']

            return metadata
        except Exception as e:
            # Capture full stack trace for debugging
            stack_trace = traceback.format_exc()
            logger.error(f"Image {image_path} processing failed: {e}\n{stack_trace}")

            # Return partial metadata with error information
            metadata["error"] = str(e)
            metadata["error_trace"] = stack_trace
            return metadata

class MetadataProcessor:
    """Handles batch processing and file operations"""

    @staticmethod
    def process_images(input_path: Union[str, Path]) -> List[Dict[str, Any]]:
        """
        Process image files to extract metadata

        Args:
            input_path: Path to image file or directory

        Returns:
            List of metadata dictionaries for all processed images (directory
            results are ordered by sorted path). For invalid input, a
            single-element list holding an {"error": ...} dictionary.
        """
        metadata_list = []
        input_path = Path(input_path)
        start_time = time.time()

        # Handle single file case
        if input_path.is_file():
            if input_path.suffix.lower() not in Config.SUPPORTED_EXTENSIONS:
                # Report the real cause instead of "not a file or directory"
                message = f"Invalid input: {input_path} has an unsupported file extension"
                logger.error(message)
                return [{"error": message}]

            logger.info(f"Processing single image: {input_path}")
            metadata = GeoMetadataExtractor.get_image_metadata(input_path)
            if metadata:
                metadata_list.append(metadata)

        # Handle directory case
        elif input_path.is_dir():
            logger.info(f"Processing directory: {input_path}")

            # Collect all image files first; sorted so output order is stable
            image_paths = sorted(
                path for path in input_path.rglob("*")
                if path.is_file() and path.suffix.lower() in Config.SUPPORTED_EXTENSIONS
            )

            total_images = len(image_paths)
            logger.info(f"Found {total_images} images to process")

            # Process in parallel with progress tracking
            if total_images > 0:
                processed = 0
                results_by_path = {}
                with ThreadPoolExecutor(max_workers=Config.MAX_WORKERS) as executor:
                    # Submit all tasks
                    future_to_path = {
                        executor.submit(GeoMetadataExtractor.get_image_metadata, path): path
                        for path in image_paths
                    }

                    # Process as they complete
                    for future in as_completed(future_to_path):
                        path = future_to_path[future]
                        try:
                            metadata = future.result()
                            if metadata:
                                results_by_path[path] = metadata
                        except Exception as e:
                            logger.error(f"Error processing {path}: {e}")

                        # Update progress for failed tasks too, so the final
                        # "N/N" line is always reached
                        processed += 1
                        if processed % 10 == 0 or processed == total_images:
                            elapsed = time.time() - start_time
                            rate = processed / elapsed if elapsed > 0 else 0
                            logger.info(f"Processed {processed}/{total_images} images ({processed/total_images*100:.1f}%) - {rate:.2f} images/sec")

                # Completion order is arbitrary; emit results in path order
                metadata_list.extend(
                    results_by_path[path] for path in image_paths if path in results_by_path
                )
            else:
                logger.warning(f"No images found in directory: {input_path}")
        else:
            logger.error(f"Invalid input: {input_path} is not a file or directory")
            return [{"error": f"Invalid input: {input_path} is not a file or directory"}]

        # Summarize results
        elapsed = time.time() - start_time
        images_per_second = len(metadata_list) / elapsed if elapsed > 0 else 0
        logger.info(f"Completed processing {len(metadata_list)} images in {elapsed:.2f} seconds ({images_per_second:.2f} images/sec)")

        return metadata_list

    @staticmethod
    def save_metadata_to_jsonl(metadata_list: List[Dict[str, Any]], output_file: Path) -> bool:
        """
        Save metadata to JSONL format with error handling

        Args:
            metadata_list: List of metadata dictionaries
            output_file: Path to output file

        Returns:
            True if save was successful, False otherwise
        """
        try:
            output_file = Path(output_file)

            # Serialize everything before touching the file so that one bad
            # entry cannot leave a truncated output file behind
            lines = [json.dumps(entry, ensure_ascii=False) + '\n' for entry in metadata_list]

            # Create directory if needed
            output_file.parent.mkdir(parents=True, exist_ok=True)

            # Write to file
            with output_file.open('w', encoding='utf-8') as f:
                f.writelines(lines)

            logger.info(f"Successfully saved {len(metadata_list)} entries to {output_file}")
            return True

        except Exception as e:
            stack_trace = traceback.format_exc()
            logger.error(f"Failed to save metadata: {e}\n{stack_trace}")
            return False

    @staticmethod
    def upload_to_huggingface(metadata_file: Path, username: str, dataset_name: str) -> str:
        """
        Upload metadata to Hugging Face as a dataset

        Args:
            metadata_file: Path to JSONL file
            username: Hugging Face username
            dataset_name: Dataset name to create/update

        Returns:
            Status message (success or failure description); never raises
        """
        if not HAS_DATASETS:
            return "Hugging Face datasets library not installed"

        try:
            # Read metadata, skipping blank lines that json.loads would reject
            with Path(metadata_file).open('r', encoding='utf-8') as f:
                metadata_list = [json.loads(line) for line in f if line.strip()]

            if not metadata_list:
                return "No metadata to upload"

            # Create dataset
            logger.info(f"Creating dataset with {len(metadata_list)} entries")
            dataset = Dataset.from_dict({
                "images": [entry.get("file_name", "unknown") for entry in metadata_list],
                "metadata": metadata_list
            })

            # Push to Hub
            # NOTE(review): private=False requests a public dataset; entries hold
            # absolute file paths and GPS coordinates - confirm this is intended
            dataset_path = f"{username}/{dataset_name}"
            logger.info(f"Pushing dataset to {dataset_path}")
            dataset.push_to_hub(dataset_path, private=False)

            return f"Successfully uploaded to {dataset_path} with {len(metadata_list)} entries"

        except Exception as e:
            stack_trace = traceback.format_exc()
            logger.error(f"Upload failed: {e}\n{stack_trace}")
            return f"Upload failed: {str(e)}"

class GradioInterface:
    """Gradio UI interface"""

    @staticmethod
    def create_interface():
        """
        Create the Gradio interface

        Returns:
            Gradio interface object, or None if Gradio is not installed
        """
        if not HAS_GRADIO:
            logger.error("Gradio not installed, cannot create interface")
            return None

        def process_input(image_file, dir_path: str, username: str, dataset_name: str) -> str:
            """
            Process inputs from Gradio UI

            Args:
                image_file: Uploaded file (path string or object with .name) or None
                dir_path: Directory path string
                username: Hugging Face username
                dataset_name: Dataset name

            Returns:
                Results as formatted text
            """
            output_lines = []
            metadata_list = []

            # Handle single image upload
            if image_file:
                # Depending on the Gradio version/configuration the upload is
                # either a plain path string or a file-like object with .name
                image_path = Path(getattr(image_file, "name", image_file))
                output_lines.append(f"## Processing Single Image: {image_path.name}")

                single_metadata = MetadataProcessor.process_images(image_path)
                metadata_list.extend(single_metadata)

                # Format first entry for display
                if single_metadata:
                    output_lines.append("### Image Metadata:")
                    output_lines.append("```json")
                    output_lines.append(json.dumps(single_metadata[0], indent=2))
                    output_lines.append("```")

            # Handle directory processing
            if dir_path:
                dir_path = Path(dir_path)
                if dir_path.is_dir():
                    output_lines.append(f"## Processing Directory: {dir_path}")
                    dir_metadata = MetadataProcessor.process_images(dir_path)

                    # Add to full list
                    metadata_list.extend(dir_metadata)

                    # Summarize results
                    total = len(dir_metadata)
                    output_lines.append("### Directory Results:")
                    output_lines.append(f"- Processed {total} images")

                    # Location data summary (guard against an empty directory)
                    location_count = sum(1 for entry in dir_metadata if entry.get("gps_info") is not None)
                    location_pct = (location_count / total * 100) if total else 0.0
                    output_lines.append(f"- Found location data in {location_count} images ({location_pct:.1f}%)")

                    # Show a few examples if available
                    if dir_metadata:
                        output_lines.append("\n### Sample Entry:")
                        output_lines.append("```json")
                        output_lines.append(json.dumps(dir_metadata[0], indent=2))
                        output_lines.append("```")
                else:
                    output_lines.append(f"⚠️ Error: {dir_path} is not a directory")

            # Save and upload if we have metadata
            if metadata_list:
                temp_output_file = Path("temp_metadata.jsonl")
                output_lines.append("\n## Saving and Uploading")

                if MetadataProcessor.save_metadata_to_jsonl(metadata_list, temp_output_file):
                    output_lines.append(f"✅ Saved metadata to {temp_output_file}")

                    # Upload to Hugging Face
                    upload_result = MetadataProcessor.upload_to_huggingface(
                        temp_output_file, username, dataset_name
                    )
                    output_lines.append(f"📤 {upload_result}")
                else:
                    output_lines.append("❌ Failed to save metadata")

            return "\n".join(output_lines) if output_lines else "Please upload an image or provide a directory path"

        # Create the interface
        demo = gr.Interface(
            fn=process_input,
            inputs=[
                # sorted() gives the unordered extension set a stable order
                gr.File(label="Upload Image", file_types=sorted(Config.SUPPORTED_EXTENSIONS)),
                gr.Textbox(label="Image Directory", placeholder=str(Config.DEFAULT_IMAGE_DIR), value=str(Config.DEFAULT_IMAGE_DIR)),
                gr.Textbox(label="Hugging Face Username", value=Config.HF_USERNAME),
                gr.Textbox(label="Dataset Name", value=Config.DATASET_NAME)
            ],
            outputs=gr.Markdown(label="Results"),
            title="Enhanced Geo-Metadata Extractor",
            description=(
                "Upload an image or process a directory to extract location metadata and other EXIF data. "
                "Results can be automatically uploaded to Hugging Face Datasets."
            ),
            allow_flagging="never",
            examples=[
                [None, "sample_images", Config.HF_USERNAME, "sample-geo-metadata"]
            ]
        )

        return demo

def main():
    """
    Main entry point

    Launches the Gradio UI when Gradio is installed; otherwise processes
    Config.DEFAULT_IMAGE_DIR and writes the result to Config.DEFAULT_OUTPUT_FILE.
    Exits with status 1 if PIL is missing.
    """
    logger.info("Starting Geo-Metadata Extractor")

    # Check dependencies
    if not HAS_PIL:
        logger.error("PIL is required for image processing. Please install: pip install pillow")
        sys.exit(1)

    # Create and launch the UI if running directly
    if HAS_GRADIO:
        logger.info("Creating Gradio interface")
        demo = GradioInterface.create_interface()
        if demo:
            logger.info("Launching Gradio interface")
            demo.launch(server_name="0.0.0.0", server_port=7860)
        else:
            logger.error("Failed to create Gradio interface")
    else:
        logger.warning("Gradio not installed, running in CLI mode")

        # Process default directory as fallback
        if Config.DEFAULT_IMAGE_DIR.exists():
            logger.info(f"Processing default directory: {Config.DEFAULT_IMAGE_DIR}")
            metadata = MetadataProcessor.process_images(Config.DEFAULT_IMAGE_DIR)

            if metadata:
                logger.info(f"Saving {len(metadata)} entries to {Config.DEFAULT_OUTPUT_FILE}")
                # Only report success when the save actually succeeded
                if MetadataProcessor.save_metadata_to_jsonl(metadata, Config.DEFAULT_OUTPUT_FILE):
                    logger.info(f"Metadata saved to {Config.DEFAULT_OUTPUT_FILE}")
                else:
                    logger.error(f"Failed to save metadata to {Config.DEFAULT_OUTPUT_FILE}")
        else:
            logger.error(f"Default directory not found: {Config.DEFAULT_IMAGE_DIR}")

if __name__ == "__main__":
    main()