from pathlib import Path
import json
import sys
import os
import logging
import traceback
from typing import Dict, List, Any, Optional, Union
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
# Advanced logging configuration (set up before the optional imports below so
# their import-failure warnings reach the configured handlers)
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s:%(lineno)d - %(message)s",
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler("geo_extractor.log")
    ]
)
logger = logging.getLogger("geo_metadata_extractor")

# Third-party imports with robust error handling
try:
    from PIL import Image, ExifTags
    HAS_PIL = True
except ImportError:
    HAS_PIL = False
    logging.warning("PIL not installed - image processing disabled")

try:
    import gradio as gr
    HAS_GRADIO = True
except ImportError:
    HAS_GRADIO = False
    logging.warning("Gradio not installed - UI disabled")

try:
    from datasets import Dataset
    HAS_DATASETS = True
except ImportError:
    HAS_DATASETS = False
    logging.warning("Datasets library not installed - HF upload disabled")
# Configurable settings with environment variable overrides and validation
class Config:
    """Configuration container with validation and defaults"""
    DEFAULT_IMAGE_DIR = Path(os.environ.get("IMAGE_DIR", "./images"))
    DEFAULT_OUTPUT_FILE = Path(os.environ.get("OUTPUT_METADATA_FILE", "./metadata.jsonl"))
    HF_USERNAME = os.environ.get("HF_USERNAME", "latterworks")
    DATASET_NAME = os.environ.get("DATASET_NAME", "geo-metadata")
    MAX_WORKERS = int(os.environ.get("MAX_WORKERS", "4"))
    BATCH_SIZE = int(os.environ.get("BATCH_SIZE", "100"))

    # Image formats with EXIF support prioritized first
    SUPPORTED_EXTENSIONS = {
        # Primary formats with good EXIF support
        '.jpg', '.jpeg', '.tiff', '.tif',
        # Secondary formats with limited metadata support
        '.png', '.heic', '.bmp', '.webp'
    }

    @classmethod
    def validate(cls) -> List[str]:
        """Validate configuration settings and return warnings"""
        warnings = []
        if cls.MAX_WORKERS < 1:
            cls.MAX_WORKERS = 1
            warnings.append(f"Invalid MAX_WORKERS value, reset to {cls.MAX_WORKERS}")
        if cls.BATCH_SIZE < 10:
            cls.BATCH_SIZE = 10
            warnings.append(f"BATCH_SIZE too small, reset to {cls.BATCH_SIZE}")
        return warnings

# Run config validation at import time
config_warnings = Config.validate()
for warning in config_warnings:
    logger.warning(warning)
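
# Settings can be overridden per run through environment variables, e.g.
# (the shell invocation is illustrative; the entry-point filename depends on
# how this script is deployed):
#   MAX_WORKERS=8 BATCH_SIZE=250 IMAGE_DIR=/data/photos python app.py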
class GeoMetadataExtractor:
    """Core metadata extraction logic with advanced error handling"""

    @staticmethod
    def convert_to_degrees(value: Union[tuple, list]) -> Optional[float]:
        """
        Convert GPS coordinates (degrees, minutes, seconds) to decimal degrees

        Args:
            value: Tuple of degrees, minutes, seconds

        Returns:
            Decimal degrees as float, or None if conversion fails
        """
        try:
            if not isinstance(value, (tuple, list)) or len(value) != 3:
                raise ValueError(f"GPS value must be a tuple of 3 elements, got {type(value)}")
            d, m, s = value
            degrees = float(d) + (float(m) / 60.0) + (float(s) / 3600.0)
            # Validate range (checked against the longitude bound of ±180,
            # since this helper converts both latitude and longitude)
            if not -180 <= degrees <= 180:
                raise ValueError(f"GPS degrees out of valid range: {degrees}")
            return degrees
        except (TypeError, ValueError, ZeroDivisionError) as e:
            logger.error(f"Failed to convert GPS coordinates: {e}")
            return None
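
    # Worked example of the DMS-to-decimal math (coordinate is hypothetical):
    #   convert_to_degrees((40, 26, 46.8)) == 40 + 26/60 + 46.8/3600 ≈ 40.446333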
    @staticmethod
    def extract_gps_info(gps_info: Dict[int, Any]) -> Optional[Dict[str, Any]]:
        """
        Extract and format GPS metadata from EXIF

        Args:
            gps_info: Dictionary of GPS EXIF tags

        Returns:
            Formatted GPS data including decimal latitude/longitude
        """
        if not isinstance(gps_info, dict):
            logger.warning("GPS info is not a dictionary, skipping")
            return None
        gps_data = {}
        try:
            # Extract tag data
            for key, val in gps_info.items():
                tag_name = ExifTags.GPSTAGS.get(key, f"unknown_gps_tag_{key}")
                gps_data[tag_name] = val
            # Process coordinates if available
            if 'GPSLatitude' in gps_data and 'GPSLongitude' in gps_data:
                lat = GeoMetadataExtractor.convert_to_degrees(gps_data['GPSLatitude'])
                lon = GeoMetadataExtractor.convert_to_degrees(gps_data['GPSLongitude'])
                if lat is None or lon is None:
                    logger.error("Failed to convert latitude/longitude, skipping GPS data")
                    return None
                # Apply hemispheric references
                lat_ref = gps_data.get('GPSLatitudeRef', 'N')
                lon_ref = gps_data.get('GPSLongitudeRef', 'E')
                if lat_ref not in {'N', 'S'} or lon_ref not in {'E', 'W'}:
                    logger.warning(f"Invalid GPS reference values: lat_ref={lat_ref}, lon_ref={lon_ref}")
                else:
                    if lat_ref == 'S':
                        lat = -lat
                    if lon_ref == 'W':
                        lon = -lon
                # Add calculated decimal coordinates
                gps_data['Latitude'] = round(lat, 6)  # 6 decimal places ≈ 10cm precision
                gps_data['Longitude'] = round(lon, 6)
            # Add additional derived fields
            if 'GPSAltitude' in gps_data:
                try:
                    altitude = gps_data['GPSAltitude']
                    if hasattr(altitude, 'numerator') and hasattr(altitude, 'denominator'):
                        gps_data['AltitudeMeters'] = float(altitude.numerator) / float(altitude.denominator)
                except Exception as e:
                    logger.warning(f"Failed to process altitude: {e}")
            return gps_data
        except Exception as e:
            stack_trace = traceback.format_exc()
            logger.error(f"GPS extraction error: {e}\n{stack_trace}")
            return None
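
    # Example shape of a raw GPSInfo mapping and the derived fields (values are
    # hypothetical): {1: 'N', 2: (40, 26, 46.8), 3: 'W', 4: (79, 58, 56.0)}
    # yields the named GPS tags plus Latitude=40.446333, Longitude=-79.982222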
    @staticmethod
    def make_serializable(value: Any) -> Any:
        """
        Recursively convert non-serializable types to JSON-compatible values

        Args:
            value: Any value to convert

        Returns:
            JSON-serializable representation of value
        """
        try:
            # Handle rational numbers (fractions)
            if hasattr(value, 'numerator') and hasattr(value, 'denominator'):
                if value.denominator == 0:
                    return "undefined (division by zero)"
                return float(value.numerator) / float(value.denominator)
            # Handle nested structures
            elif isinstance(value, (tuple, list)):
                return [GeoMetadataExtractor.make_serializable(item) for item in value]
            elif isinstance(value, dict):
                return {str(k): GeoMetadataExtractor.make_serializable(v) for k, v in value.items()}
            # Handle binary data
            elif isinstance(value, bytes):
                return value.decode('utf-8', errors='replace')
            # Test if directly serializable
            json.dumps(value)
            return value
        except Exception as e:
            logger.warning(f"Value serialization failed, converting to string: {e}")
            return str(value)
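
    # For instance, PIL's EXIF rationals expose numerator/denominator, so an
    # exposure time stored as a 1/250 rational serializes to 0.004, and raw
    # bytes decode to (possibly lossy) UTF-8 strings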
    @staticmethod
    def get_image_metadata(image_path: Path) -> Dict[str, Any]:
        """
        Extract comprehensive metadata from an image file

        Args:
            image_path: Path to image file

        Returns:
            Dictionary of extracted metadata
        """
        # Core metadata with absolute file path
        metadata = {
            "file_name": str(image_path.absolute()),
            "extraction_time": time.strftime("%Y-%m-%d %H:%M:%S")
        }
        try:
            # Process file system metadata first (always available)
            stat_info = image_path.stat()
            metadata.update({
                "file_size": stat_info.st_size,
                "file_extension": image_path.suffix.lower(),
                "last_modified": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(stat_info.st_mtime)),
                # Note: st_ctime is inode-change time on Unix, creation time on Windows
                "creation_time": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(stat_info.st_ctime))
            })
            # Exit early if PIL not available
            if not HAS_PIL:
                metadata["error"] = "PIL library not available"
                return metadata
            # Extract image and EXIF data
            with Image.open(image_path) as image:
                # Basic image properties
                metadata.update({
                    "format": image.format or "unknown",
                    "size": list(image.size),
                    "width": image.width,
                    "height": image.height,
                    "mode": image.mode or "unknown",
                    "aspect_ratio": round(image.width / image.height, 3) if image.height > 0 else None
                })
                # Extract EXIF data if available
                exif_data = None
                try:
                    # Different methods depending on Pillow version and format
                    if hasattr(image, '_getexif'):
                        exif_data = image._getexif()
                    elif hasattr(image, 'getexif'):
                        # getexif() returns an Image.Exif mapping; normalize to dict
                        exif_data = dict(image.getexif())
                    # Some formats like PNG store metadata differently
                    if not exif_data and image.format == 'PNG' and 'exif' in image.info:
                        exif_data = image.info.get('exif')
                        metadata["exif_source"] = "PNG info block"
                except AttributeError:
                    metadata["exif_error"] = "No EXIF extraction method available"
                except Exception as e:
                    metadata["exif_error"] = f"EXIF extraction failed: {str(e)}"
                # Process EXIF data if found
                if exif_data and isinstance(exif_data, dict):
                    for tag_id, value in exif_data.items():
                        # Handle GPS data specially
                        if tag_id in ExifTags.TAGS and ExifTags.TAGS[tag_id] == "GPSInfo":
                            gps_info = GeoMetadataExtractor.extract_gps_info(value)
                            if gps_info:
                                metadata["gps_info"] = GeoMetadataExtractor.make_serializable(gps_info)
                        else:
                            # Get tag name or use numeric ID with tag_ prefix
                            tag_name = ExifTags.TAGS.get(tag_id, f"tag_{tag_id}").lower()
                            metadata[tag_name] = GeoMetadataExtractor.make_serializable(value)
            # Add camera model and date taken for convenience if available
            if 'model' in metadata:
                metadata["camera_model"] = metadata['model']
            if 'datetimeoriginal' in metadata:
                metadata["date_taken"] = metadata['datetimeoriginal']
            return metadata
        except Exception as e:
            # Capture full stack trace for debugging
            stack_trace = traceback.format_exc()
            logger.error(f"Image {image_path} processing failed: {e}\n{stack_trace}")
            # Return partial metadata with error information
            metadata["error"] = str(e)
            metadata["error_trace"] = stack_trace
            return metadata
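
    # Programmatic use without the UI (the path is hypothetical):
    #   meta = GeoMetadataExtractor.get_image_metadata(Path("photo.jpg"))
    #   print(meta.get("gps_info", {}).get("Latitude"))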
class MetadataProcessor:
    """Handles batch processing and file operations"""

    @staticmethod
    def process_images(input_path: Union[str, Path]) -> List[Dict[str, Any]]:
        """
        Process image files to extract metadata

        Args:
            input_path: Path to image file or directory

        Returns:
            List of metadata dictionaries for all processed images
        """
        metadata_list = []
        input_path = Path(input_path)
        start_time = time.time()
        # Handle single file case
        if input_path.is_file() and input_path.suffix.lower() in Config.SUPPORTED_EXTENSIONS:
            logger.info(f"Processing single image: {input_path}")
            metadata = GeoMetadataExtractor.get_image_metadata(input_path)
            if metadata:
                metadata_list.append(metadata)
        # Handle directory case
        elif input_path.is_dir():
            logger.info(f"Processing directory: {input_path}")
            # Collect all image files first
            image_paths = [
                path for path in input_path.rglob("*")
                if path.is_file() and path.suffix.lower() in Config.SUPPORTED_EXTENSIONS
            ]
            total_images = len(image_paths)
            logger.info(f"Found {total_images} images to process")
            # Process in parallel with progress tracking
            if total_images > 0:
                processed = 0
                with ThreadPoolExecutor(max_workers=Config.MAX_WORKERS) as executor:
                    # Submit all tasks
                    future_to_path = {
                        executor.submit(GeoMetadataExtractor.get_image_metadata, path): path
                        for path in image_paths
                    }
                    # Process as they complete
                    for future in as_completed(future_to_path):
                        path = future_to_path[future]
                        try:
                            metadata = future.result()
                            if metadata:
                                metadata_list.append(metadata)
                            # Update progress
                            processed += 1
                            if processed % 10 == 0 or processed == total_images:
                                elapsed = time.time() - start_time
                                rate = processed / elapsed if elapsed > 0 else 0
                                logger.info(f"Processed {processed}/{total_images} images ({processed/total_images*100:.1f}%) - {rate:.2f} images/sec")
                        except Exception as e:
                            logger.error(f"Error processing {path}: {e}")
            else:
                logger.warning(f"No images found in directory: {input_path}")
        else:
            logger.error(f"Invalid input: {input_path} is not a file or directory")
            return [{"error": f"Invalid input: {input_path} is not a file or directory"}]
        # Summarize results
        elapsed = time.time() - start_time
        images_per_second = len(metadata_list) / elapsed if elapsed > 0 else 0
        logger.info(f"Completed processing {len(metadata_list)} images in {elapsed:.2f} seconds ({images_per_second:.2f} images/sec)")
        return metadata_list
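
    # A thread pool fits here because extraction is dominated by file I/O.
    # A directory call such as process_images("/data/photos") (hypothetical
    # path) walks the tree recursively and returns one dict per image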
    @staticmethod
    def save_metadata_to_jsonl(metadata_list: List[Dict[str, Any]], output_file: Path) -> bool:
        """
        Save metadata to JSONL format with error handling

        Args:
            metadata_list: List of metadata dictionaries
            output_file: Path to output file

        Returns:
            True if save was successful, False otherwise
        """
        try:
            # Create directory if needed
            output_file.parent.mkdir(parents=True, exist_ok=True)
            # Write to file
            with output_file.open('w', encoding='utf-8') as f:
                for entry in metadata_list:
                    f.write(json.dumps(entry, ensure_ascii=False) + '\n')
            logger.info(f"Successfully saved {len(metadata_list)} entries to {output_file}")
            return True
        except Exception as e:
            stack_trace = traceback.format_exc()
            logger.error(f"Failed to save metadata: {e}\n{stack_trace}")
            return False
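
    # Each output line is a standalone JSON object, so the file can be read
    # back with a one-liner (file name hypothetical):
    #   entries = [json.loads(line) for line in Path("metadata.jsonl").open(encoding="utf-8")]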
    @staticmethod
    def upload_to_huggingface(metadata_file: Path, username: str, dataset_name: str) -> str:
        """
        Upload metadata to Hugging Face as a dataset

        Args:
            metadata_file: Path to JSONL file
            username: Hugging Face username
            dataset_name: Dataset name to create/update

        Returns:
            Status message
        """
        if not HAS_DATASETS:
            return "Hugging Face datasets library not installed"
        try:
            # Read metadata
            metadata_list = []
            with metadata_file.open('r', encoding='utf-8') as f:
                for line in f:
                    metadata_list.append(json.loads(line))
            if not metadata_list:
                return "No metadata to upload"
            # Create dataset
            logger.info(f"Creating dataset with {len(metadata_list)} entries")
            dataset = Dataset.from_dict({
                "images": [entry.get("file_name", "unknown") for entry in metadata_list],
                "metadata": metadata_list
            })
            # Push to Hub
            dataset_path = f"{username}/{dataset_name}"
            logger.info(f"Pushing dataset to {dataset_path}")
            dataset.push_to_hub(dataset_path, private=False)
            return f"Successfully uploaded to {dataset_path} with {len(metadata_list)} entries"
        except Exception as e:
            stack_trace = traceback.format_exc()
            logger.error(f"Upload failed: {e}\n{stack_trace}")
            return f"Upload failed: {str(e)}"
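
    # Note: push_to_hub() needs Hugging Face credentials, e.g. a token stored
    # via `huggingface-cli login` or the HF_TOKEN environment variable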
class GradioInterface:
    """Gradio UI interface"""

    @staticmethod
    def create_interface():
        """
        Create the Gradio interface

        Returns:
            Gradio interface object
        """
        if not HAS_GRADIO:
            logger.error("Gradio not installed, cannot create interface")
            return None
        def process_input(image_file, dir_path: str, username: str, dataset_name: str) -> str:
            """
            Process inputs from Gradio UI

            Args:
                image_file: Uploaded file object or None
                dir_path: Directory path string
                username: Hugging Face username
                dataset_name: Dataset name

            Returns:
                Results as formatted text
            """
            output_lines = []
            metadata_list = []
            # Handle single image upload
            if image_file:
                image_path = Path(image_file.name)
                output_lines.append(f"## Processing Single Image: {image_path.name}")
                single_metadata = MetadataProcessor.process_images(image_path)
                metadata_list.extend(single_metadata)
                # Format first entry for display
                if single_metadata:
                    output_lines.append("### Image Metadata:")
                    output_lines.append("```json")
                    output_lines.append(json.dumps(single_metadata[0], indent=2))
                    output_lines.append("```")
            # Handle directory processing
            if dir_path:
                dir_path = Path(dir_path)
                if dir_path.is_dir():
                    output_lines.append(f"## Processing Directory: {dir_path}")
                    dir_metadata = MetadataProcessor.process_images(dir_path)
                    # Add to full list
                    metadata_list.extend(dir_metadata)
                    # Summarize results
                    output_lines.append("### Directory Results:")
                    output_lines.append(f"- Processed {len(dir_metadata)} images")
                    # Location data summary
                    location_count = sum(1 for entry in dir_metadata if entry.get("gps_info") is not None)
                    location_pct = (location_count / len(dir_metadata) * 100) if dir_metadata else 0
                    output_lines.append(f"- Found location data in {location_count} images ({location_pct:.1f}%)")
                    # Show a few examples if available
                    if dir_metadata:
                        output_lines.append("\n### Sample Entry:")
                        output_lines.append("```json")
                        output_lines.append(json.dumps(dir_metadata[0], indent=2))
                        output_lines.append("```")
                else:
                    output_lines.append(f"⚠️ Error: {dir_path} is not a directory")
            # Save and upload if we have metadata
            if metadata_list:
                temp_output_file = Path("temp_metadata.jsonl")
                output_lines.append("\n## Saving and Uploading")
                if MetadataProcessor.save_metadata_to_jsonl(metadata_list, temp_output_file):
                    output_lines.append(f"✅ Saved metadata to {temp_output_file}")
                    # Upload to Hugging Face
                    upload_result = MetadataProcessor.upload_to_huggingface(
                        temp_output_file, username, dataset_name
                    )
                    output_lines.append(f"📤 {upload_result}")
                else:
                    output_lines.append("❌ Failed to save metadata")
            return "\n".join(output_lines) if output_lines else "Please upload an image or provide a directory path"
        # Create the interface
        demo = gr.Interface(
            fn=process_input,
            inputs=[
                gr.File(label="Upload Image", file_types=list(Config.SUPPORTED_EXTENSIONS)),
                gr.Textbox(label="Image Directory", placeholder=str(Config.DEFAULT_IMAGE_DIR), value=str(Config.DEFAULT_IMAGE_DIR)),
                gr.Textbox(label="Hugging Face Username", value=Config.HF_USERNAME),
                gr.Textbox(label="Dataset Name", value=Config.DATASET_NAME)
            ],
            outputs=gr.Markdown(label="Results"),
            title="Enhanced Geo-Metadata Extractor",
            description=(
                "Upload an image or process a directory to extract location metadata and other EXIF data. "
                "Results can be automatically uploaded to Hugging Face Datasets."
            ),
            allow_flagging="never",
            examples=[
                [None, "sample_images", Config.HF_USERNAME, "sample-geo-metadata"]
            ]
        )
        return demo
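
    # For a quick local check outside main(), the returned interface can be
    # launched directly via Gradio's standard launch():
    #   demo = GradioInterface.create_interface()
    #   if demo:
    #       demo.launch()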
def main():
    """Main entry point"""
    logger.info("Starting Geo-Metadata Extractor")
    # Check dependencies
    if not HAS_PIL:
        logger.error("PIL is required for image processing. Please install: pip install pillow")
        sys.exit(1)
    # Create and launch the UI if running directly
    if HAS_GRADIO:
        logger.info("Creating Gradio interface")
        demo = GradioInterface.create_interface()
        if demo:
            logger.info("Launching Gradio interface")
            demo.launch(server_name="0.0.0.0", server_port=7860)
        else:
            logger.error("Failed to create Gradio interface")
    else:
        logger.warning("Gradio not installed, running in CLI mode")
        # Process default directory as fallback
        if Config.DEFAULT_IMAGE_DIR.exists():
            logger.info(f"Processing default directory: {Config.DEFAULT_IMAGE_DIR}")
            metadata = MetadataProcessor.process_images(Config.DEFAULT_IMAGE_DIR)
            if metadata:
                logger.info(f"Saving {len(metadata)} entries to {Config.DEFAULT_OUTPUT_FILE}")
                MetadataProcessor.save_metadata_to_jsonl(metadata, Config.DEFAULT_OUTPUT_FILE)
                logger.info(f"Metadata saved to {Config.DEFAULT_OUTPUT_FILE}")
        else:
            logger.error(f"Default directory not found: {Config.DEFAULT_IMAGE_DIR}")

if __name__ == "__main__":
    main()