import os
import json
import time
import logging
# Import the handlers submodule explicitly
from logging import handlers
import threading
import sys
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor

from datasets import Dataset
from huggingface_hub import HfApi, create_repo, CommitOperationAdd
from PIL import Image, ExifTags
import gradio as gr

# ----------------- CONFIGURATION -----------------
HF_USERNAME = os.getenv("HF_USERNAME", "latterworks")
DATASET_NAME = os.getenv("DATASET_NAME", "geo-metadata")
HF_TOKEN = os.getenv("HF_TOKEN")
CHECK_INTERVAL = int(os.getenv("CHECK_INTERVAL", "3600"))  # Check every hour
MAX_BATCH_SIZE = int(os.getenv("MAX_BATCH_SIZE", "20"))
MAX_LOG_SIZE_MB = int(os.getenv("MAX_LOG_SIZE_MB", "10"))
SUPPORTED_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.heic', '.tiff', '.bmp', '.webp'}

# Logging Setup: console + size-rotated file so a long-running process
# cannot fill the disk with logs.
os.makedirs("logs", exist_ok=True)
log_handler = handlers.RotatingFileHandler(
    "logs/uploader.log",
    maxBytes=MAX_LOG_SIZE_MB * 1024 * 1024,
    backupCount=5,
)
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[logging.StreamHandler(), log_handler],
)
logger = logging.getLogger(__name__)

# Global State: in-process counters, updated by process_images().
STATS = {"uploads": 0, "total_files": 0, "files_with_gps": 0, "startup_time": int(time.time())}

# Initialize HF API once
api = HfApi(token=HF_TOKEN)


# ----------------- UTILITIES -----------------
def repository_exists(repo_id, repo_type="dataset"):
    """Return True if the Hugging Face repo `repo_id` exists, else False."""
    try:
        api.repo_info(repo_id=repo_id, repo_type=repo_type)
        return True
    except Exception:
        # repo_info raises on 404 as well as on auth/network errors;
        # in every case we treat the repo as unavailable.
        return False


def ensure_dataset_exists():
    """Ensure the target dataset repository exists, creating it if needed.

    Returns:
        bool: True when the repo exists (or was just created), False when
        creation failed.
    """
    repo_id = f"{HF_USERNAME}/{DATASET_NAME}"
    if repository_exists(repo_id):
        return True
    try:
        logger.info("Creating dataset repository: %s", repo_id)
        create_repo(repo_id=repo_id, repo_type="dataset", private=False, token=HF_TOKEN)
        # Seed an images/ folder so later image commits have a stable prefix.
        api.upload_file(
            path_or_fileobj=b"",
            path_in_repo="images/.gitkeep",
            repo_id=repo_id,
            repo_type="dataset",
            commit_message="Initialize images folder",
        )
        return True
    except Exception:
        # Previously any creation error propagated, so the "Dataset creation
        # failed." guard in process_images() could never fire. Log and report.
        logger.exception("Failed to create dataset repository %s", repo_id)
        return False


def format_duration(seconds):
    """Convert a duration in seconds to a human-readable string."""
    d = seconds // 86400
    h = (seconds % 86400) // 3600
    m = (seconds % 3600) // 60
    s = seconds % 60
    return f"{d}d {h}h {m}m {s}s" if d else f"{h}h {m}m {s}s" if h else f"{m}m {s}s"


def convert_to_degrees(value):
    """Convert an EXIF GPS (degrees, minutes, seconds) triple to decimal degrees.

    Each component may be a PIL IFDRational (has numerator/denominator) or a
    plain number. Returns None when the value cannot be converted.
    """
    try:
        d, m, s = [
            float(x.numerator) / float(x.denominator) if hasattr(x, 'numerator') else float(x)
            for x in value
        ]
        return d + (m / 60.0) + (s / 3600.0)
    except Exception:
        return None


def extract_gps_info(gps_info):
    """Decode a raw EXIF GPSInfo dict into named tags plus decimal coordinates.

    Adds 'Latitude'/'Longitude' keys (signed decimal degrees, 6 dp) when both
    coordinates are present and convertible. Returns None for non-dict input
    or on unexpected errors.
    """
    if not isinstance(gps_info, dict):
        return None
    try:
        gps_data = {ExifTags.GPSTAGS.get(k, f"gps_{k}"): v for k, v in gps_info.items()}
        if 'GPSLatitude' in gps_data and 'GPSLongitude' in gps_data:
            lat = convert_to_degrees(gps_data['GPSLatitude'])
            lon = convert_to_degrees(gps_data['GPSLongitude'])
            # Explicit None checks: 0.0 (equator / prime meridian) is falsy
            # but a perfectly valid coordinate.
            if lat is not None and lon is not None:
                if gps_data.get('GPSLatitudeRef', 'N') == 'S':
                    lat = -lat
                if gps_data.get('GPSLongitudeRef', 'E') == 'W':
                    lon = -lon
                gps_data.update({'Latitude': round(lat, 6), 'Longitude': round(lon, 6)})
        return gps_data
    except Exception:
        logger.debug("Failed to extract GPS info", exc_info=True)
        return None


def get_image_metadata(image_path):
    """Extract basic file info and EXIF metadata (including GPS) from an image.

    Returns a dict of metadata, or None when the file cannot be read as an
    image.
    """
    file_path = Path(image_path)
    metadata = {"file_name": str(file_path.absolute()), "file_extension": file_path.suffix.lower()}
    try:
        with Image.open(image_path) as img:
            metadata.update({"format": img.format, "size": list(img.size), "mode": img.mode})
            # NOTE: _getexif() is Pillow-private but returns a flat tag->value
            # dict with GPSInfo as a nested dict, which the code below relies
            # on; the public getexif() has a different shape.
            exif_data = img._getexif()
            if exif_data:
                metadata.update(
                    {ExifTags.TAGS.get(k, f"tag_{k}").lower(): v for k, v in exif_data.items()}
                )
                if 'gpsinfo' in metadata:
                    metadata["gps_info"] = extract_gps_info(metadata.pop('gpsinfo'))
        metadata["file_size"] = os.path.getsize(image_path)
        metadata["timestamp"] = int(time.time())
        return metadata
    except Exception:
        logger.warning("Skipping unreadable image: %s", image_path, exc_info=True)
        return None


# ----------------- UPLOADING -----------------
def upload_metadata(metadata_list):
    """Push a batch of metadata dicts to the Hugging Face dataset repo.

    Returns a short status string.
    """
    if not metadata_list:
        return "No metadata to upload"
    repo_id = f"{HF_USERNAME}/{DATASET_NAME}"
    dataset = Dataset.from_dict({"metadata": metadata_list})
    dataset.push_to_hub(repo_id, commit_message=f"Add {len(metadata_list)} image metadata entries")
    return "Upload successful"


def upload_images(image_paths):
    """Upload image files to the repo's images/ folder in a single commit."""
    repo_id = f"{HF_USERNAME}/{DATASET_NAME}"
    operations = []
    for image_path in image_paths:
        try:
            with open(image_path, "rb") as f:
                operations.append(
                    CommitOperationAdd(
                        path_in_repo=f"images/{Path(image_path).name}",
                        path_or_fileobj=f.read(),
                    )
                )
        except Exception:
            logger.warning("Could not read %s for upload; skipping", image_path)
            continue
    if operations:
        # One commit per batch keeps the repo history compact.
        api.create_commit(
            repo_id=repo_id,
            repo_type="dataset",
            operations=operations,
            commit_message="Batch upload images",
        )


# ----------------- PROCESSING -----------------
def process_images(image_files):
    """Extract metadata from uploaded files, then push metadata and images.

    Args:
        image_files: objects exposing a `.name` attribute that holds a full
            file path (e.g. Gradio file objects).

    Returns:
        str: status message for the UI.
    """
    if not ensure_dataset_exists():
        return "Dataset creation failed."
    metadata_list = []
    image_paths = []
    with ThreadPoolExecutor(max_workers=MAX_BATCH_SIZE) as executor:
        results = executor.map(get_image_metadata, [file.name for file in image_files])
        for result, file in zip(results, image_files):
            if result:
                metadata_list.append(result)
                image_paths.append(file.name)
    if not metadata_list:
        return "No valid images processed."
    upload_metadata(metadata_list)
    upload_images(image_paths)
    # Keep the in-process counters in sync with work actually done.
    STATS["uploads"] += 1
    STATS["total_files"] += len(metadata_list)
    STATS["files_with_gps"] += sum(1 for m in metadata_list if m.get("gps_info"))
    return f"Processed {len(metadata_list)} images, uploaded metadata & images."
# ----------------- GRADIO UI -----------------
demo = gr.Interface(
    fn=process_images,
    inputs=gr.Files(label="Upload Images"),
    outputs=gr.Textbox(label="Status Report"),
    title="Geo-Metadata Uploader",
    description=f"Upload images for automatic metadata extraction and upload to Hugging Face ({HF_USERNAME}/{DATASET_NAME}).",
    allow_flagging="never",
)


# ----------------- AUTO-SCHEDULING -----------------
def schedule_directory_scan():
    """Scan WATCH_DIRECTORY for supported images, process them, and re-arm.

    Re-schedules itself with a daemon threading.Timer every CHECK_INTERVAL
    seconds so a pending timer never blocks interpreter shutdown.
    """
    from types import SimpleNamespace  # local: only needed on the scan path

    watch_dir = os.getenv("WATCH_DIRECTORY")
    if watch_dir and os.path.isdir(watch_dir):
        extensions = tuple(SUPPORTED_EXTENSIONS)
        # process_images() reads each item's `.name` attribute and expects a
        # full file path there (Gradio file objects behave that way). A bare
        # Path exposes only the basename via `.name`, which would make files
        # resolve relative to the CWD — wrap the absolute path instead.
        image_files = [
            SimpleNamespace(name=str(Path(watch_dir) / entry))
            for entry in os.listdir(watch_dir)
            if entry.lower().endswith(extensions)
        ]
        if image_files:
            process_images(image_files)
    timer = threading.Timer(CHECK_INTERVAL, schedule_directory_scan)
    timer.daemon = True  # don't keep the process alive just for the next scan
    timer.start()


if __name__ == "__main__":
    logger.info("Starting uploader for %s/%s...", HF_USERNAME, DATASET_NAME)
    ensure_dataset_exists()
    if os.getenv("WATCH_DIRECTORY"):
        threading.Thread(target=schedule_directory_scan, daemon=True).start()
    demo.launch(server_name="0.0.0.0", server_port=7860)