import os
import time
import logging
# Import the handlers submodule explicitly
from logging import handlers
import threading
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
from datasets import Dataset
from huggingface_hub import HfApi, create_repo, CommitOperationAdd
from PIL import Image, ExifTags
import gradio as gr
# ----------------- CONFIGURATION -----------------
HF_USERNAME = os.getenv("HF_USERNAME", "latterworks")
DATASET_NAME = os.getenv("DATASET_NAME", "geo-metadata")
HF_TOKEN = os.getenv("HF_TOKEN")
CHECK_INTERVAL = int(os.getenv("CHECK_INTERVAL", "3600"))  # Scan interval in seconds (default: hourly)
MAX_BATCH_SIZE = int(os.getenv("MAX_BATCH_SIZE", "20"))
MAX_LOG_SIZE_MB = int(os.getenv("MAX_LOG_SIZE_MB", "10"))
SUPPORTED_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.heic', '.tiff', '.bmp', '.webp'}
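# Note: Pillow alone cannot decode .heic files; if HEIC support is actually needed,
# a plugin such as pillow-heif would have to be installed and registered, e.g.:
#   from pillow_heif import register_heif_opener; register_heif_opener()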
# Logging Setup
os.makedirs("logs", exist_ok=True)
log_handler = handlers.RotatingFileHandler(
    "logs/uploader.log", maxBytes=MAX_LOG_SIZE_MB * 1024 * 1024, backupCount=5
)
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[logging.StreamHandler(), log_handler],
)
logger = logging.getLogger(__name__)
# Global State
STATS = {"uploads": 0, "total_files": 0, "files_with_gps": 0, "startup_time": int(time.time())}
# Initialize HF API once
api = HfApi(token=HF_TOKEN)
# ----------------- UTILITIES -----------------
def repository_exists(repo_id, repo_type="dataset"):
    """Check if a Hugging Face dataset repo exists."""
    try:
        api.repo_info(repo_id=repo_id, repo_type=repo_type)
        return True
    except Exception:
        return False
def ensure_dataset_exists():
    """Ensure the dataset repository exists, creating it if necessary."""
    repo_id = f"{HF_USERNAME}/{DATASET_NAME}"
    if repository_exists(repo_id):
        return True
    logger.info(f"Creating dataset repository: {repo_id}")
    try:
        create_repo(repo_id=repo_id, repo_type="dataset", private=False, token=HF_TOKEN)
        api.upload_file(path_or_fileobj=b"", path_in_repo="images/.gitkeep", repo_id=repo_id,
                        repo_type="dataset", commit_message="Initialize images folder")
        return True
    except Exception as e:
        # Without this guard, a failed creation would raise instead of returning
        # False, making the caller's error path unreachable.
        logger.error(f"Failed to create dataset repository: {e}")
        return False
def format_duration(seconds):
    """Convert seconds to a human-readable duration."""
    d, h = seconds // 86400, (seconds % 86400) // 3600
    m, s = (seconds % 3600) // 60, seconds % 60
    return f"{d}d {h}h {m}m {s}s" if d else f"{h}h {m}m {s}s" if h else f"{m}m {s}s"
def convert_to_degrees(value):
    """Convert an EXIF (degrees, minutes, seconds) triple to decimal degrees."""
    try:
        # EXIF rationals expose numerator/denominator; plain numbers are cast directly.
        d, m, s = [float(x.numerator) / float(x.denominator) if hasattr(x, 'numerator') else float(x)
                   for x in value]
        return d + (m / 60.0) + (s / 3600.0)
    except Exception:
        return None
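# For example, an EXIF triple of (40, 26, 46.8) converts to
# 40 + 26/60 + 46.8/3600 ≈ 40.4463 decimal degrees.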
def extract_gps_info(gps_info):
    """Extract and process GPS data from EXIF."""
    if not isinstance(gps_info, dict):
        return None
    try:
        gps_data = {ExifTags.GPSTAGS.get(k, f"gps_{k}"): v for k, v in gps_info.items()}
        if 'GPSLatitude' in gps_data and 'GPSLongitude' in gps_data:
            lat = convert_to_degrees(gps_data['GPSLatitude'])
            lon = convert_to_degrees(gps_data['GPSLongitude'])
            # Use explicit None checks: 0.0 (equator/prime meridian) is a valid coordinate
            # but would be falsy under a bare truthiness test.
            if lat is not None and lon is not None:
                # South latitudes and west longitudes are negative in decimal form.
                if gps_data.get('GPSLatitudeRef', 'N') == 'S':
                    lat = -lat
                if gps_data.get('GPSLongitudeRef', 'E') == 'W':
                    lon = -lon
                gps_data.update({'Latitude': round(lat, 6), 'Longitude': round(lon, 6)})
        return gps_data
    except Exception:
        return None
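# Example (hypothetical raw GPSInfo dict): {1: 'S', 2: (33, 52, 4.8), 3: 'E', 4: (151, 12, 25.2)}
# maps through ExifTags.GPSTAGS to GPSLatitudeRef/GPSLatitude/GPSLongitudeRef/GPSLongitude
# and yields Latitude ≈ -33.868 and Longitude ≈ 151.207 (Sydney).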
def get_image_metadata(image_path):
    """Extract metadata from an image file."""
    file_path = Path(image_path)
    metadata = {"file_name": str(file_path.absolute()), "file_extension": file_path.suffix.lower()}
    try:
        with Image.open(image_path) as img:
            metadata.update({"format": img.format, "size": list(img.size), "mode": img.mode})
            # _getexif() is a private Pillow API that only exists on some formats
            # (notably JPEG/TIFF); guard so formats without it (e.g. PNG) still succeed.
            exif_data = img._getexif() if hasattr(img, "_getexif") else None
            if exif_data:
                metadata.update({ExifTags.TAGS.get(k, f"tag_{k}").lower(): v for k, v in exif_data.items()})
                if 'gpsinfo' in metadata:
                    metadata["gps_info"] = extract_gps_info(metadata.pop('gpsinfo'))
        metadata["file_size"] = os.path.getsize(image_path)
        metadata["timestamp"] = int(time.time())
        return metadata
    except Exception:
        return None
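# Usage sketch (hypothetical path): get_image_metadata("photos/IMG_0001.jpg") returns a
# dict like {"file_name": "/abs/path/photos/IMG_0001.jpg", "file_extension": ".jpg",
# "format": "JPEG", "size": [4032, 3024], "mode": "RGB", ...} or None if the file
# cannot be opened.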
# ----------------- UPLOADING -----------------
def upload_metadata(metadata_list):
    """Upload metadata to the Hugging Face dataset repo."""
    if not metadata_list:
        return "No metadata to upload"
    repo_id = f"{HF_USERNAME}/{DATASET_NAME}"
    dataset = Dataset.from_dict({"metadata": metadata_list})
    dataset.push_to_hub(repo_id, commit_message=f"Add {len(metadata_list)} image metadata entries")
    return "Upload successful"
def upload_images(image_paths):
    """Upload image files to the dataset repo in a single commit."""
    repo_id = f"{HF_USERNAME}/{DATASET_NAME}"
    operations = []
    for image_path in image_paths:
        try:
            with open(image_path, "rb") as f:
                operations.append(CommitOperationAdd(
                    path_in_repo=f"images/{Path(image_path).name}",
                    path_or_fileobj=f.read(),
                ))
        except Exception:
            logger.warning(f"Skipping unreadable file: {image_path}")
            continue
    if operations:
        api.create_commit(repo_id=repo_id, repo_type="dataset", operations=operations,
                          commit_message="Batch upload images")
# ----------------- PROCESSING -----------------
def process_images(image_files):
    """Process images, extract metadata, and upload to Hugging Face."""
    if not ensure_dataset_exists():
        return "Dataset creation failed."
    # Accept both Gradio file objects (which expose a temp-file path via .name) and
    # plain string/Path inputs from the directory scanner. Note that Path.name is
    # only the basename, so Paths must be stringified, not passed through .name.
    paths = [str(f) if isinstance(f, (str, Path)) else f.name for f in image_files]
    metadata_list, image_paths = [], []
    with ThreadPoolExecutor(max_workers=MAX_BATCH_SIZE) as executor:
        for result, path in zip(executor.map(get_image_metadata, paths), paths):
            if result:
                metadata_list.append(result)
                image_paths.append(path)
    if metadata_list:
        upload_metadata(metadata_list)
        upload_images(image_paths)
        return f"Processed {len(metadata_list)} images, uploaded metadata & images."
    return "No valid images processed."
# ----------------- GRADIO UI -----------------
demo = gr.Interface(
    fn=process_images,
    inputs=gr.Files(label="Upload Images"),
    outputs=gr.Textbox(label="Status Report"),
    title="Geo-Metadata Uploader",
    description=f"Upload images for automatic metadata extraction and upload to Hugging Face ({HF_USERNAME}/{DATASET_NAME}).",
    allow_flagging="never"
)
# ----------------- AUTO-SCHEDULING -----------------
def schedule_directory_scan():
    """Periodically scan a directory for new images and process them."""
    watch_dir = os.getenv("WATCH_DIRECTORY")
    if watch_dir and os.path.isdir(watch_dir):
        # Pass full paths as strings; bare filenames would lose the directory component.
        image_files = [str(Path(watch_dir) / f) for f in os.listdir(watch_dir)
                       if f.lower().endswith(tuple(SUPPORTED_EXTENSIONS))]
        process_images(image_files)
    timer = threading.Timer(CHECK_INTERVAL, schedule_directory_scan)
    timer.daemon = True  # Don't block process exit on the pending timer
    timer.start()
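# Design note: threading.Timer spawns a fresh thread for every scan cycle; a single
# background loop with time.sleep(CHECK_INTERVAL) would be an equivalent, slightly
# lighter alternative.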
if __name__ == "__main__":
    logger.info(f"Starting uploader for {HF_USERNAME}/{DATASET_NAME}...")
    ensure_dataset_exists()
    if os.getenv("WATCH_DIRECTORY"):
        threading.Thread(target=schedule_directory_scan, daemon=True).start()
    demo.launch(server_name="0.0.0.0", server_port=7860)