import os
import json
import time
import logging
# Import the handlers submodule explicitly
from logging import handlers
import threading
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
from datasets import Dataset
from huggingface_hub import HfApi, create_repo, CommitOperationAdd
from PIL import Image, ExifTags
import gradio as gr

# ----------------- CONFIGURATION -----------------
HF_USERNAME = os.getenv("HF_USERNAME", "latterworks")
DATASET_NAME = os.getenv("DATASET_NAME", "geo-metadata")
HF_TOKEN = os.getenv("HF_TOKEN")
CHECK_INTERVAL = int(os.getenv("CHECK_INTERVAL", "3600"))  # Check every hour
MAX_BATCH_SIZE = int(os.getenv("MAX_BATCH_SIZE", "20"))
MAX_LOG_SIZE_MB = int(os.getenv("MAX_LOG_SIZE_MB", "10"))
SUPPORTED_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.heic', '.tiff', '.bmp', '.webp'}
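
# Example (hypothetical values): every setting above can be overridden via the
# environment before launch, e.g.
#   HF_USERNAME=myuser DATASET_NAME=my-geo CHECK_INTERVAL=600 python app.py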

# Logging Setup
os.makedirs("logs", exist_ok=True)
log_handler = handlers.RotatingFileHandler("logs/uploader.log", maxBytes=MAX_LOG_SIZE_MB * 1024 * 1024, backupCount=5)
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s", handlers=[logging.StreamHandler(), log_handler])
logger = logging.getLogger(__name__)

# Global State
STATS = {"uploads": 0, "total_files": 0, "files_with_gps": 0, "startup_time": int(time.time())}

# Initialize HF API once
api = HfApi(token=HF_TOKEN)

# ----------------- UTILITIES -----------------
def repository_exists(repo_id, repo_type="dataset"):
    """Check if a Hugging Face dataset repo exists."""
    try:
        api.repo_info(repo_id=repo_id, repo_type=repo_type)
        return True
    except Exception:
        return False

def ensure_dataset_exists():
    """Ensure dataset repository exists or create it."""
    repo_id = f"{HF_USERNAME}/{DATASET_NAME}"
    if not repository_exists(repo_id):
        logger.info(f"Creating dataset repository: {repo_id}")
        create_repo(repo_id=repo_id, repo_type="dataset", private=False, token=HF_TOKEN)
        api.upload_file(path_or_fileobj=b"", path_in_repo="images/.gitkeep", repo_id=repo_id, repo_type="dataset", commit_message="Initialize images folder")
    return True

def format_duration(seconds):
    """Convert seconds to human-readable duration."""
    d, h, m, s = seconds // 86400, (seconds % 86400) // 3600, (seconds % 3600) // 60, seconds % 60
    return f"{d}d {h}h {m}m {s}s" if d else f"{h}h {m}m {s}s" if h else f"{m}m {s}s"
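
# Illustrative check: 90,061 seconds is one day, one hour, one minute, and one
# second past zero, so:
#   format_duration(90061)  ->  "1d 1h 1m 1s"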

def convert_to_degrees(value):
    """Convert GPS coordinates to decimal degrees."""
    try:
        d, m, s = [float(x.numerator) / float(x.denominator) if hasattr(x, 'numerator') else float(x) for x in value]
        return d + (m / 60.0) + (s / 3600.0)
    except Exception:
        return None
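
# Illustrative check with plain floats (EXIF rationals take the same path):
#   convert_to_degrees((37.0, 46.0, 29.52))  ->  ~37.774867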

def extract_gps_info(gps_info):
    """Extract and process GPS data from EXIF."""
    if not isinstance(gps_info, dict):
        return None
    try:
        gps_data = {ExifTags.GPSTAGS.get(k, f"gps_{k}"): v for k, v in gps_info.items()}
        if 'GPSLatitude' in gps_data and 'GPSLongitude' in gps_data:
            lat, lon = convert_to_degrees(gps_data['GPSLatitude']), convert_to_degrees(gps_data['GPSLongitude'])
            # Use "is not None" so valid 0.0 coordinates (equator/prime
            # meridian) aren't discarded as falsy values.
            if lat is not None and lon is not None:
                if gps_data.get('GPSLatitudeRef', 'N') == 'S':
                    lat = -lat
                if gps_data.get('GPSLongitudeRef', 'E') == 'W':
                    lon = -lon
                gps_data.update({'Latitude': round(lat, 6), 'Longitude': round(lon, 6)})
        return gps_data
    except Exception:
        return None
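
# Illustrative call (hypothetical EXIF GPS IFD; the integer keys are raw tag
# IDs that ExifTags.GPSTAGS maps to names):
#   extract_gps_info({1: 'N', 2: (37.0, 46.0, 29.52), 3: 'W', 4: (122.0, 25.0, 9.84)})
#   -> {..., 'Latitude': 37.774867, 'Longitude': -122.4194}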

def get_image_metadata(image_path):
    """Extract metadata from an image file."""
    file_path = Path(image_path)
    metadata = {"file_name": str(file_path.absolute()), "file_extension": file_path.suffix.lower()}
    try:
        with Image.open(image_path) as img:
            metadata.update({"format": img.format, "size": list(img.size), "mode": img.mode})
            # _getexif() only exists on some formats (e.g. JPEG); fall back to
            # None instead of letting an AttributeError discard all metadata.
            exif_data = img._getexif() if hasattr(img, "_getexif") else None
            if exif_data:
                metadata.update({ExifTags.TAGS.get(k, f"tag_{k}").lower(): v for k, v in exif_data.items()})
                if 'gpsinfo' in metadata:
                    metadata["gps_info"] = extract_gps_info(metadata.pop('gpsinfo'))
        metadata["file_size"] = os.path.getsize(image_path)
        metadata["timestamp"] = int(time.time())
        return metadata
    except Exception as e:
        logger.warning(f"Failed to extract metadata from {image_path}: {e}")
        return None
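
# Sketch of a typical return value (exact keys vary by camera and format):
#   {"file_name": "/photos/IMG_0001.jpg", "file_extension": ".jpg",
#    "format": "JPEG", "size": [4032, 3024], "mode": "RGB",
#    "gps_info": {...}, "file_size": 2483091, "timestamp": 1700000000}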

# ----------------- UPLOADING -----------------
def upload_metadata(metadata_list):
    """Upload metadata to Hugging Face."""
    if not metadata_list:
        return "No metadata to upload"
    repo_id = f"{HF_USERNAME}/{DATASET_NAME}"
    # EXIF values include types Arrow cannot infer (IFDRational, bytes), so
    # serialize each record to a JSON string before building the dataset.
    dataset = Dataset.from_dict({"metadata": [json.dumps(m, default=str) for m in metadata_list]})
    # Note: push_to_hub replaces the split's data files rather than appending.
    dataset.push_to_hub(repo_id, token=HF_TOKEN, commit_message=f"Add {len(metadata_list)} image metadata entries")
    return "Upload successful"

def upload_images(image_paths):
    """Upload images to Hugging Face."""
    repo_id = f"{HF_USERNAME}/{DATASET_NAME}"
    operations = []
    for image_path in image_paths:
        try:
            # Pass the local path so huggingface_hub can stream the file at
            # commit time instead of holding every image in memory at once.
            operations.append(CommitOperationAdd(path_in_repo=f"images/{Path(image_path).name}", path_or_fileobj=str(image_path)))
        except Exception as e:
            logger.warning(f"Skipping {image_path}: {e}")
            continue
    if operations:
        api.create_commit(repo_id=repo_id, repo_type="dataset", operations=operations, commit_message="Batch upload images")

# ----------------- PROCESSING -----------------
def process_images(image_files):
    """Process images, extract metadata, and upload to Hugging Face."""
    if not ensure_dataset_exists():
        return "Dataset creation failed."

    # Accept both Gradio upload objects (which expose a temp path via .name)
    # and plain str/Path entries from the directory scanner; Path.name alone
    # would strip the directory and break the open() call downstream.
    paths = [str(f) if isinstance(f, (str, Path)) else f.name for f in image_files]

    metadata_list = []
    image_paths = []
    with ThreadPoolExecutor(max_workers=MAX_BATCH_SIZE) as executor:
        results = executor.map(get_image_metadata, paths)
        for result, path in zip(results, paths):
            if result:
                metadata_list.append(result)
                image_paths.append(path)

    STATS["total_files"] += len(paths)
    STATS["files_with_gps"] += sum(1 for m in metadata_list if m.get("gps_info"))

    if metadata_list:
        upload_metadata(metadata_list)
        upload_images(image_paths)
        STATS["uploads"] += 1
        uptime = format_duration(int(time.time()) - STATS["startup_time"])
        return f"Processed {len(metadata_list)} images, uploaded metadata & images. Uptime: {uptime}."
    return "No valid images processed."

# ----------------- GRADIO UI -----------------
demo = gr.Interface(
    fn=process_images,
    inputs=gr.Files(label="Upload Images"),
    outputs=gr.Textbox(label="Status Report"),
    title="Geo-Metadata Uploader",
    description=f"Upload images for automatic metadata extraction and upload to Hugging Face ({HF_USERNAME}/{DATASET_NAME}).",
    allow_flagging="never"
)

# ----------------- AUTO-SCHEDULING -----------------
def schedule_directory_scan():
    """Periodically scan a directory for new images."""
    watch_dir = os.getenv("WATCH_DIRECTORY")
    if watch_dir and os.path.isdir(watch_dir):
        image_files = [Path(watch_dir) / f for f in os.listdir(watch_dir) if f.lower().endswith(tuple(SUPPORTED_EXTENSIONS))]
        if image_files:
            process_images(image_files)
    # Re-arm as a daemon timer so a pending scan can't block interpreter shutdown.
    timer = threading.Timer(CHECK_INTERVAL, schedule_directory_scan)
    timer.daemon = True
    timer.start()
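
# Example (hypothetical path): enable the background scanner by pointing it at
# a local photo folder before launch:
#   WATCH_DIRECTORY=/data/photos python app.py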

if __name__ == "__main__":
    logger.info(f"Starting uploader for {HF_USERNAME}/{DATASET_NAME}...")
    ensure_dataset_exists()
    if os.getenv("WATCH_DIRECTORY"):
        threading.Thread(target=schedule_directory_scan, daemon=True).start()
    demo.launch(server_name="0.0.0.0", server_port=7860)