import os
import json
import time
import logging
# Import the handlers submodule explicitly
from logging import handlers
import threading
import sys
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
from datasets import Dataset
from huggingface_hub import HfApi, create_repo, CommitOperationAdd
from PIL import Image, ExifTags
import gradio as gr
# ----------------- CONFIGURATION -----------------
HF_USERNAME = os.getenv("HF_USERNAME", "latterworks")
DATASET_NAME = os.getenv("DATASET_NAME", "geo-metadata")
HF_TOKEN = os.getenv("HF_TOKEN")
CHECK_INTERVAL = int(os.getenv("CHECK_INTERVAL", "3600")) # Check every hour
MAX_BATCH_SIZE = int(os.getenv("MAX_BATCH_SIZE", "20"))
MAX_LOG_SIZE_MB = int(os.getenv("MAX_LOG_SIZE_MB", "10"))
SUPPORTED_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.heic', '.tiff', '.bmp', '.webp'}
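# Illustrative local configuration (example values, not defaults baked into
# the Space; the filename is assumed):
#   export HF_TOKEN=hf_xxx WATCH_DIRECTORY=/data/incoming CHECK_INTERVAL=600
#   python app.py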
# Logging Setup
os.makedirs("logs", exist_ok=True)
log_handler = handlers.RotatingFileHandler("logs/uploader.log", maxBytes=MAX_LOG_SIZE_MB * 1024 * 1024, backupCount=5)
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s", handlers=[logging.StreamHandler(), log_handler])
logger = logging.getLogger(__name__)
# Global State
STATS = {"uploads": 0, "total_files": 0, "files_with_gps": 0, "startup_time": int(time.time())}  # currently informational only; never updated below
# Initialize HF API once
api = HfApi(token=HF_TOKEN)
# ----------------- UTILITIES -----------------
def repository_exists(repo_id, repo_type="dataset"):
    """Check if a Hugging Face dataset repo exists."""
    try:
        api.repo_info(repo_id=repo_id, repo_type=repo_type)
        return True
    except Exception:
        return False
def ensure_dataset_exists():
    """Ensure dataset repository exists or create it."""
    repo_id = f"{HF_USERNAME}/{DATASET_NAME}"
    if not repository_exists(repo_id):
        logger.info(f"Creating dataset repository: {repo_id}")
        create_repo(repo_id=repo_id, repo_type="dataset", private=False, token=HF_TOKEN)
        # Seed an images/ folder so later image commits have a stable prefix
        api.upload_file(path_or_fileobj=b"", path_in_repo="images/.gitkeep", repo_id=repo_id, repo_type="dataset", commit_message="Initialize images folder")
    return True
def format_duration(seconds):
    """Convert seconds to human-readable duration."""
    d, h, m, s = seconds // 86400, (seconds % 86400) // 3600, (seconds % 3600) // 60, seconds % 60
    return f"{d}d {h}h {m}m {s}s" if d else f"{h}h {m}m {s}s" if h else f"{m}m {s}s"
def convert_to_degrees(value):
    """Convert an EXIF (degrees, minutes, seconds) triplet to decimal degrees."""
    try:
        # Each component may be a rational (e.g. Pillow's IFDRational) or a plain number
        d, m, s = [float(x.numerator) / float(x.denominator) if hasattr(x, 'numerator') else float(x) for x in value]
        return d + (m / 60.0) + (s / 3600.0)
    except Exception:
        return None
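# Example: convert_to_degrees((37, 46, 29.7)) -> 37.774917 (approx.)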
def extract_gps_info(gps_info):
    """Extract and process GPS data from EXIF."""
    if not isinstance(gps_info, dict):
        return None
    try:
        gps_data = {ExifTags.GPSTAGS.get(k, f"gps_{k}"): v for k, v in gps_info.items()}
        if 'GPSLatitude' in gps_data and 'GPSLongitude' in gps_data:
            lat, lon = convert_to_degrees(gps_data['GPSLatitude']), convert_to_degrees(gps_data['GPSLongitude'])
            # Compare against None explicitly: 0.0 is a valid coordinate
            if lat is not None and lon is not None:
                if gps_data.get('GPSLatitudeRef', 'N') == 'S':
                    lat = -lat
                if gps_data.get('GPSLongitudeRef', 'E') == 'W':
                    lon = -lon
                gps_data.update({'Latitude': round(lat, 6), 'Longitude': round(lon, 6)})
        return gps_data
    except Exception:
        return None
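# For reference, a raw EXIF GPSInfo dict (before the GPSTAGS mapping above)
# typically looks like {1: 'N', 2: (37, 46, 29.7), 3: 'W', 4: (122, 25, 9.9)}
# with illustrative values; keys 1-4 are GPSLatitudeRef, GPSLatitude,
# GPSLongitudeRef, and GPSLongitude in the EXIF spec.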
def get_image_metadata(image_path):
    """Extract metadata from an image file."""
    file_path = Path(image_path)
    metadata = {"file_name": str(file_path.absolute()), "file_extension": file_path.suffix.lower()}
    try:
        with Image.open(image_path) as img:
            metadata.update({"format": img.format, "size": list(img.size), "mode": img.mode})
            # _getexif() only exists on JPEG/TIFF images; guard so other
            # supported formats still yield their basic metadata. Note HEIC
            # needs a plugin such as pillow-heif registered for Image.open
            # to succeed at all.
            exif_data = img._getexif() if hasattr(img, "_getexif") else None
            if exif_data:
                metadata.update({ExifTags.TAGS.get(k, f"tag_{k}").lower(): v for k, v in exif_data.items()})
                if 'gpsinfo' in metadata:
                    metadata["gps_info"] = extract_gps_info(metadata.pop('gpsinfo'))
        metadata["file_size"] = os.path.getsize(image_path)
        metadata["timestamp"] = int(time.time())
        return metadata
    except Exception:
        return None
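# Illustrative result for a geotagged JPEG (exact keys depend on the camera's EXIF):
#   {"file_name": "/data/img.jpg", "file_extension": ".jpg", "format": "JPEG",
#    "size": [4032, 3024], "mode": "RGB",
#    "gps_info": {"Latitude": 37.774917, "Longitude": -122.419417, ...},
#    "file_size": 2048576, "timestamp": 1700000000}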
# ----------------- UPLOADING -----------------
def upload_metadata(metadata_list):
    """Upload metadata to Hugging Face."""
    if not metadata_list:
        return "No metadata to upload"
    repo_id = f"{HF_USERNAME}/{DATASET_NAME}"
    dataset = Dataset.from_dict({"metadata": metadata_list})
    dataset.push_to_hub(repo_id, commit_message=f"Add {len(metadata_list)} image metadata entries")
    return "Upload successful"
def upload_images(image_paths):
    """Upload images to Hugging Face."""
    repo_id = f"{HF_USERNAME}/{DATASET_NAME}"
    operations = []
    for image_path in image_paths:
        try:
            with open(image_path, "rb") as f:
                operations.append(CommitOperationAdd(path_in_repo=f"images/{Path(image_path).name}", path_or_fileobj=f.read()))
        except Exception:
            continue
    if operations:
        api.create_commit(repo_id=repo_id, repo_type="dataset", operations=operations, commit_message="Batch upload images")
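# Design note: batching every file into a single create_commit() produces one
# Hub commit per upload round instead of one per image, which keeps the repo
# history clean and reduces pressure on API rate limits.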
# ----------------- PROCESSING -----------------
def process_images(image_files):
    """Process images, extract metadata, and upload to Hugging Face."""
    if not ensure_dataset_exists():
        return "Dataset creation failed."
    # Gradio passes tempfile-like objects exposing a .name path, while the
    # directory scanner passes Path objects (whose .name is just the basename),
    # so normalize both to full path strings first
    paths = [str(f) if isinstance(f, (str, Path)) else f.name for f in image_files]
    metadata_list = []
    image_paths = []
    with ThreadPoolExecutor(max_workers=MAX_BATCH_SIZE) as executor:
        for result, path in zip(executor.map(get_image_metadata, paths), paths):
            if result:
                metadata_list.append(result)
                image_paths.append(path)
    if metadata_list:
        upload_metadata(metadata_list)
        upload_images(image_paths)
        return f"Processed {len(metadata_list)} images, uploaded metadata & images."
    return "No valid images processed."
# ----------------- GRADIO UI -----------------
demo = gr.Interface(
    fn=process_images,
    inputs=gr.Files(label="Upload Images"),
    outputs=gr.Textbox(label="Status Report"),
    title="Geo-Metadata Uploader",
    description=f"Upload images for automatic metadata extraction and upload to Hugging Face ({HF_USERNAME}/{DATASET_NAME}).",
    allow_flagging="never"
)
# ----------------- AUTO-SCHEDULING -----------------
def schedule_directory_scan():
    """Periodically scan a directory for new images."""
    watch_dir = os.getenv("WATCH_DIRECTORY")
    if watch_dir and os.path.isdir(watch_dir):
        image_files = [Path(watch_dir) / f for f in os.listdir(watch_dir) if f.lower().endswith(tuple(SUPPORTED_EXTENSIONS))]
        process_images(image_files)
    timer = threading.Timer(CHECK_INTERVAL, schedule_directory_scan)
    timer.daemon = True  # don't let the recurring timer keep the process alive on shutdown
    timer.start()
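# Note: each pass re-processes every file in WATCH_DIRECTORY. Uploading only
# new images would require tracking already-seen filenames, e.g. in a module-
# level set or a small state file, and filtering the listing against it.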
if __name__ == "__main__":
    logger.info(f"Starting uploader for {HF_USERNAME}/{DATASET_NAME}...")
    ensure_dataset_exists()
    if os.getenv("WATCH_DIRECTORY"):
        threading.Thread(target=schedule_directory_scan, daemon=True).start()
    demo.launch(server_name="0.0.0.0", server_port=7860)