import os
import json
import time
import logging
# Import the handlers submodule explicitly
from logging import handlers
import threading
import sys
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
from datasets import Dataset
from huggingface_hub import HfApi, create_repo, CommitOperationAdd
from PIL import Image, ExifTags
import gradio as gr
# ----------------- CONFIGURATION -----------------
HF_USERNAME = os.getenv("HF_USERNAME", "latterworks")
DATASET_NAME = os.getenv("DATASET_NAME", "geo-metadata")
HF_TOKEN = os.getenv("HF_TOKEN")
CHECK_INTERVAL = int(os.getenv("CHECK_INTERVAL", "3600")) # Check every hour
MAX_BATCH_SIZE = int(os.getenv("MAX_BATCH_SIZE", "20"))
MAX_LOG_SIZE_MB = int(os.getenv("MAX_LOG_SIZE_MB", "10"))
SUPPORTED_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.heic', '.tiff', '.bmp', '.webp'}
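# Note: Pillow cannot decode .heic on its own, so HEIC uploads will fail at
# Image.open unless a decoder plugin is registered first. A minimal sketch,
# assuming the optional pillow-heif package is installed:
#   from pillow_heif import register_heif_opener
#   register_heif_opener()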
# Logging Setup
os.makedirs("logs", exist_ok=True)
log_handler = handlers.RotatingFileHandler("logs/uploader.log", maxBytes=MAX_LOG_SIZE_MB * 1024 * 1024, backupCount=5)
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s", handlers=[logging.StreamHandler(), log_handler])
logger = logging.getLogger(__name__)
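# Note: on Hugging Face Spaces the container filesystem is ephemeral, so these
# rotating log files are lost on restart; the StreamHandler output shown in the
# Space logs is the durable record unless persistent storage is enabled.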
# Global State
STATS = {"uploads": 0, "total_files": 0, "files_with_gps": 0, "startup_time": int(time.time())}
# Initialize HF API once
api = HfApi(token=HF_TOKEN)
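# HF_TOKEN must have write access for the create_repo/upload/commit calls below;
# without a token, pushes to the Hub will generally fail.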
# ----------------- UTILITIES -----------------
def repository_exists(repo_id, repo_type="dataset"):
    """Check if a Hugging Face dataset repo exists."""
    try:
        api.repo_info(repo_id=repo_id, repo_type=repo_type)
        return True
    except Exception:
        return False
def ensure_dataset_exists():
    """Ensure the dataset repository exists, creating it if necessary."""
    repo_id = f"{HF_USERNAME}/{DATASET_NAME}"
    try:
        if not repository_exists(repo_id):
            logger.info(f"Creating dataset repository: {repo_id}")
            create_repo(repo_id=repo_id, repo_type="dataset", private=False, token=HF_TOKEN)
            api.upload_file(path_or_fileobj=b"", path_in_repo="images/.gitkeep", repo_id=repo_id, repo_type="dataset", commit_message="Initialize images folder")
        return True
    except Exception as e:
        logger.error(f"Failed to create dataset repository {repo_id}: {e}")
        return False
def format_duration(seconds):
    """Convert seconds to a human-readable duration."""
    d, h, m, s = seconds // 86400, (seconds % 86400) // 3600, (seconds % 3600) // 60, seconds % 60
    return f"{d}d {h}h {m}m {s}s" if d else f"{h}h {m}m {s}s" if h else f"{m}m {s}s"
def convert_to_degrees(value):
    """Convert an EXIF GPS (degrees, minutes, seconds) tuple to decimal degrees.

    Example: (40, 26, 46.8) -> 40 + 26/60 + 46.8/3600 = 40.446333
    """
    try:
        # EXIF rationals may be IFDRational objects (numerator/denominator) or plain numbers
        d, m, s = [float(x.numerator) / float(x.denominator) if hasattr(x, 'numerator') else float(x) for x in value]
        return d + (m / 60.0) + (s / 3600.0)
    except Exception:
        return None
def extract_gps_info(gps_info):
    """Extract and process GPS data from EXIF."""
    if not isinstance(gps_info, dict):
        return None
    try:
        gps_data = {ExifTags.GPSTAGS.get(k, f"gps_{k}"): v for k, v in gps_info.items()}
        if 'GPSLatitude' in gps_data and 'GPSLongitude' in gps_data:
            lat, lon = convert_to_degrees(gps_data['GPSLatitude']), convert_to_degrees(gps_data['GPSLongitude'])
            # Compare against None explicitly: 0.0 is a valid latitude/longitude
            if lat is not None and lon is not None:
                if gps_data.get('GPSLatitudeRef', 'N') == 'S':
                    lat = -lat
                if gps_data.get('GPSLongitudeRef', 'E') == 'W':
                    lon = -lon
                gps_data.update({'Latitude': round(lat, 6), 'Longitude': round(lon, 6)})
        return gps_data
    except Exception:
        return None
def get_image_metadata(image_path):
    """Extract basic, EXIF, and GPS metadata from an image file."""
    file_path = Path(image_path)
    metadata = {"file_name": str(file_path.absolute()), "file_extension": file_path.suffix.lower()}
    try:
        with Image.open(image_path) as img:
            metadata.update({"format": img.format, "size": list(img.size), "mode": img.mode})
            # _getexif() only exists on JPEG/TIFF images; guard so PNG/BMP/WebP
            # files still yield their basic metadata instead of returning None
            exif_data = img._getexif() if hasattr(img, "_getexif") else None
            if exif_data:
                metadata.update({ExifTags.TAGS.get(k, f"tag_{k}").lower(): v for k, v in exif_data.items()})
                if 'gpsinfo' in metadata:
                    metadata["gps_info"] = extract_gps_info(metadata.pop('gpsinfo'))
        metadata["file_size"] = os.path.getsize(image_path)
        metadata["timestamp"] = int(time.time())
        return metadata
    except Exception as e:
        logger.warning(f"Failed to read metadata from {image_path}: {e}")
        return None
# ----------------- UPLOADING -----------------
def upload_metadata(metadata_list):
    """Upload extracted metadata to the Hugging Face dataset."""
    if not metadata_list:
        return "No metadata to upload"
    repo_id = f"{HF_USERNAME}/{DATASET_NAME}"
    # EXIF values can be IFDRational, bytes, or nested tuples that Arrow cannot
    # serialize; round-trip through JSON (stringifying anything non-serializable)
    safe_list = [json.loads(json.dumps(m, default=str)) for m in metadata_list]
    dataset = Dataset.from_dict({"metadata": safe_list})
    dataset.push_to_hub(repo_id, token=HF_TOKEN, commit_message=f"Add {len(metadata_list)} image metadata entries")
    return "Upload successful"
def upload_images(image_paths):
    """Batch-upload image files into the dataset's images/ folder."""
    repo_id = f"{HF_USERNAME}/{DATASET_NAME}"
    operations = []
    for image_path in image_paths:
        try:
            with open(image_path, "rb") as f:
                operations.append(CommitOperationAdd(path_in_repo=f"images/{Path(image_path).name}", path_or_fileobj=f.read()))
        except Exception as e:
            logger.warning(f"Skipping {image_path}: {e}")
            continue
    if operations:
        api.create_commit(repo_id=repo_id, repo_type="dataset", operations=operations, commit_message="Batch upload images")
# ----------------- PROCESSING -----------------
def process_images(image_files):
    """Process images, extract metadata, and upload everything to Hugging Face."""
    if not image_files:
        return "No files received."
    if not ensure_dataset_exists():
        return "Dataset creation failed."
    # Gradio hands us file wrappers exposing a .name path; the directory scanner
    # passes plain paths (Path.name would yield only the basename), so normalize
    # everything to full path strings first
    paths = [str(f) if isinstance(f, (str, Path)) else f.name for f in image_files]
    metadata_list, image_paths = [], []
    with ThreadPoolExecutor(max_workers=MAX_BATCH_SIZE) as executor:
        for result, path in zip(executor.map(get_image_metadata, paths), paths):
            if result:
                metadata_list.append(result)
                image_paths.append(path)
    if metadata_list:
        upload_metadata(metadata_list)
        upload_images(image_paths)
        STATS["uploads"] += 1
        STATS["total_files"] += len(metadata_list)
        STATS["files_with_gps"] += sum(1 for m in metadata_list if m.get("gps_info"))
        return f"Processed {len(metadata_list)} images, uploaded metadata & images."
    return "No valid images processed."
# ----------------- GRADIO UI -----------------
demo = gr.Interface(
    fn=process_images,
    inputs=gr.Files(label="Upload Images"),
    outputs=gr.Textbox(label="Status Report"),
    title="Geo-Metadata Uploader",
    description=f"Upload images for automatic metadata extraction and upload to Hugging Face ({HF_USERNAME}/{DATASET_NAME}).",
    allow_flagging="never"
)
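# Note: newer Gradio releases deprecate allow_flagging in favor of flagging_mode;
# if the constructor raises a TypeError after an upgrade, swap the keyword accordingly.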
# ----------------- AUTO-SCHEDULING -----------------
def schedule_directory_scan():
    """Periodically scan WATCH_DIRECTORY for images and process them."""
    watch_dir = os.getenv("WATCH_DIRECTORY")
    if watch_dir and os.path.isdir(watch_dir):
        image_files = [str(Path(watch_dir) / f) for f in os.listdir(watch_dir) if f.lower().endswith(tuple(SUPPORTED_EXTENSIONS))]
        if image_files:
            process_images(image_files)
    # Re-arm as a daemon timer so the scan loop never blocks interpreter shutdown
    timer = threading.Timer(CHECK_INTERVAL, schedule_directory_scan)
    timer.daemon = True
    timer.start()
if __name__ == "__main__":
    logger.info(f"Starting uploader for {HF_USERNAME}/{DATASET_NAME}...")
    ensure_dataset_exists()
    if os.getenv("WATCH_DIRECTORY"):
        threading.Thread(target=schedule_directory_scan, daemon=True).start()
    demo.launch(server_name="0.0.0.0", server_port=7860)