Spaces:
Runtime error

Update app.py
app.py
CHANGED
@@ -1,556 +1,170 @@
-import gradio as gr
-from pathlib import Path
-from PIL import Image, ExifTags
-import json
 import os
-import logging
 import time
-
-from huggingface_hub import HfApi, create_repo, repository_exists, CommitOperationAdd
-from huggingface_hub.utils import tqdm
 import threading
 import sys
-
-# Setup logging
-logging.basicConfig(
-    level=logging.INFO,
-    format="%(asctime)s [%(levelname)s] %(message)s",
-    handlers=[
-        logging.StreamHandler(),
-    ]
-)
-logger = logging.getLogger(__name__)
-
-# Constants - edit these for your setup
-HF_TOKEN = os.environ.get("HF_TOKEN")
-HF_USERNAME = os.environ.get("HF_USERNAME", "latterworks")
-DATASET_NAME = os.environ.get("DATASET_NAME", "geo-metadata")
 SUPPORTED_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.heic', '.tiff', '.bmp', '.webp'}
-CHECK_INTERVAL = int(os.environ.get("CHECK_INTERVAL", "3600"))  # Check for files hourly by default
-
-# Global stats tracking
-STATS = {
-    "total_files": 0,
-    "files_with_gps": 0,
-    "uploads": 0,
-    "last_upload": 0,
-    "startup_time": int(time.time())
-}
-
-def ensure_dataset_exists():
-    """Create dataset repository if it doesn't exist"""
-    if not HF_TOKEN:
-        logger.error("HF_TOKEN not set. Cannot create or check dataset.")
-        return False
-
-    try:
-        api = HfApi(token=HF_TOKEN)
-        repo_id = f"{HF_USERNAME}/{DATASET_NAME}"
-
-        # Check if repo exists
-        if not repository_exists(repo_id, repo_type="dataset", token=HF_TOKEN):
-            logger.info(f"Creating dataset repository: {repo_id}")
-            create_repo(
-                repo_id=repo_id,
-                repo_type="dataset",
-                private=False,
-                token=HF_TOKEN
-            )
-
-            # Create initial README
-            readme_content = f"""# {DATASET_NAME}
-
-Automatically collected geo-metadata from images using the Geo-Metadata Extractor.
-
-## Statistics
-- Total files processed: 0
-- Files with GPS data: 0
-- Last updated: {time.strftime('%Y-%m-%d %H:%M:%S')}
-
-## Data Format
-Each entry contains:
-- Basic image metadata (size, format, mode)
-- EXIF data when available
-- GPS coordinates extracted from EXIF when available
-"""
-
-            # Upload README
-            api.upload_file(
-                path_or_fileobj=readme_content.encode(),
-                path_in_repo="README.md",
-                repo_id=repo_id,
-                repo_type="dataset",
-                token=HF_TOKEN,
-                commit_message="Initial commit with README"
-            )
-
-            # Create folder structure
-            for folder in ["batches", "images", "scripts"]:
-                api.upload_file(
-                    path_or_fileobj=b"",
-                    path_in_repo=f"{folder}/.gitkeep",
-                    repo_id=repo_id,
-                    repo_type="dataset",
-                    token=HF_TOKEN,
-                    commit_message=f"Create {folder} directory"
-                )
-
-            # Upload this script to the repository
-            try:
-                script_path = os.path.abspath(sys.argv[0])
-                if os.path.exists(script_path):
-                    with open(script_path, "rb") as f:
-                        script_content = f.read()
-
-                    api.upload_file(
-                        path_or_fileobj=script_content,
-                        path_in_repo="scripts/geo_metadata_extractor.py",
-                        repo_id=repo_id,
-                        repo_type="dataset",
-                        token=HF_TOKEN,
-                        commit_message="Upload metadata extractor script"
-                    )
-            except Exception as e:
-                logger.error(f"Failed to upload script: {e}")
-
-            logger.info(f"Dataset repository created: {repo_id}")
-        else:
-            logger.info(f"Dataset repository already exists: {repo_id}")
-
         return True
-    except Exception as e:
-        logger.error(f"Error ensuring dataset exists: {e}")
         return False
-
-def update_readme_stats():
-    """Update the dataset README with current statistics"""
-    try:
-        api = HfApi(token=HF_TOKEN)
-        repo_id = f"{HF_USERNAME}/{DATASET_NAME}"
-
-        # Create updated README content
-        readme_content = f"""# {DATASET_NAME}
-
-Automatically collected geo-metadata from images using the Geo-Metadata Extractor.
-
-## Statistics
-- Total files processed: {STATS["total_files"]}
-- Files with GPS data: {STATS["files_with_gps"]}
-- Upload batches: {STATS["uploads"]}
-- Last updated: {time.strftime('%Y-%m-%d %H:%M:%S')}
-- Uptime: {format_duration(int(time.time()) - STATS["startup_time"])}
-
-## Data Format
-Each entry contains:
-- Basic image metadata (size, format, mode)
-- EXIF data when available
-- GPS coordinates extracted from EXIF when available
-"""
-
-        # Upload updated README
-        api.upload_file(
-            path_or_fileobj=readme_content.encode(),
-            path_in_repo="README.md",
-            repo_id=repo_id,
-            repo_type="dataset",
-            token=HF_TOKEN,
-            commit_message="Update statistics"
-        )
-
-        logger.info("Updated README with current statistics")
-    except Exception as e:
-        logger.error(f"Error updating README: {e}")
-
 def format_duration(seconds):
-    """Convert seconds into a human-readable duration string"""
-    days, remainder = divmod(seconds, 86400)
-    hours, remainder = divmod(remainder, 3600)
-    minutes, seconds = divmod(remainder, 60)
-
-    parts = []
-    if days > 0:
-        parts.append(f"{days}d")
-    if hours > 0:
-        parts.append(f"{hours}h")
-    if minutes > 0:
-        parts.append(f"{minutes}m")
-    parts.append(f"{seconds}s")
-
-    return " ".join(parts)

 def convert_to_degrees(value):
-    """Convert GPS coordinates to decimal degrees"""
     try:
-        d, m, s = value
-        # Convert from rational numbers if needed
-        d = d.numerator / d.denominator if hasattr(d, 'numerator') else float(d)
-        m = m.numerator / m.denominator if hasattr(m, 'numerator') else float(m)
-        s = s.numerator / s.denominator if hasattr(s, 'numerator') else float(s)
-
-        degrees = d + (m / 60.0) + (s / 3600.0)
-        if not -180 <= degrees <= 180:
-            logger.warning(f"GPS out of bounds: {degrees}°")
-        return degrees
-    except Exception as e:
-        logger.error(f"GPS conversion failed: {e}, value={value}")
         return None

 def extract_gps_info(gps_info):
-    """Extract and process GPS data from EXIF"""
     if not isinstance(gps_info, dict):
         return None
-
-    gps_data = {}
     try:
-        for key, val in gps_info.items():
-            tag_name = ExifTags.GPSTAGS.get(key, f"gps_{key}")
-            gps_data[tag_name] = val
-
-        # Process lat/long if present
         if 'GPSLatitude' in gps_data and 'GPSLongitude' in gps_data:
-            lat = convert_to_degrees(gps_data['GPSLatitude'])
-            lon = convert_to_degrees(gps_data['GPSLongitude'])
-
-            lat_ref = gps_data.get('GPSLatitudeRef', 'N')
-            lon_ref = gps_data.get('GPSLongitudeRef', 'E')
-
-            # Flip signs based on hemisphere
-            if lat_ref == 'S':
-                lat = -lat
-            if lon_ref == 'W':
-                lon = -lon
-
-            # Store clean coords with proper precision
-            gps_data['Latitude'] = round(lat, 6)
-            gps_data['Longitude'] = round(lon, 6)
-
         return gps_data
-    except Exception as e:
-        logger.error(f"GPS extraction error: {e}")
-        return None
-
-def make_serializable(value):
-    """Make any value JSON serializable"""
-    try:
-        if hasattr(value, 'numerator') and hasattr(value, 'denominator'):
-            return float(value.numerator) / float(value.denominator)
-        elif isinstance(value, (tuple, list)):
-            return [make_serializable(item) for item in value]
-        elif isinstance(value, dict):
-            return {str(k): make_serializable(v) for k, v in value.items()}
-        elif isinstance(value, bytes):
-            return value.decode('utf-8', errors='replace')
-        json.dumps(value)  # Test if serializable
-        return value
     except Exception:
-        return str(value)

 def get_image_metadata(image_path):
-    """Extract metadata from an image file"""
     file_path = Path(image_path)
-    metadata = {
-        "file_name": str(file_path.absolute()),
-        "file_basename": file_path.name,
-        "image_path_in_repo": f"images/{file_path.name}"  # Path where image will be stored in repo
-    }
-
     try:
-        with Image.open(image_path) as image:
-            metadata.update({
-                "format": image.format,
-                "size": list(image.size),
-                "mode": image.mode
-            })
-
-            # Extract EXIF if available
-            exif_data = None
-            try:
-                exif_data = image._getexif()
-            except (AttributeError, Exception) as e:
-                metadata["exif_error"] = str(e)
-
-            if exif_data and isinstance(exif_data, dict):
-                for tag_id, value in exif_data.items():
-                    try:
-                        tag_name = ExifTags.TAGS.get(tag_id, f"tag_{tag_id}").lower()
-                        if tag_name == "gpsinfo":
-                            gps_info = extract_gps_info(value)
-                            if gps_info:
-                                metadata["gps_info"] = make_serializable(gps_info)
-                        else:
-                            metadata[tag_name] = make_serializable(value)
-                    except Exception as e:
-                        metadata[f"error_tag_{tag_id}"] = str(e)
-
-        # Add file details
         metadata["file_size"] = os.path.getsize(image_path)
-        metadata["file_extension"] = file_path.suffix.lower()
-        metadata["extraction_timestamp"] = int(time.time())
-
-        # Test serialization
-        json.dumps(metadata)
         return metadata
-    except Exception as e:
-        logger.error(f"Metadata extraction failed for {image_path}: {e}")
-        return {"file_name": str(file_path.absolute()), "error": str(e)}

-def process_images(image_files):
-    """Process images, extract metadata, and upload results to Hugging Face"""
-    if not image_files:
-        return "🚫 Upload some fucking images first! 📷", None
-
-    # Ensure dataset exists
-    if not ensure_dataset_exists():
-        return "❌ Failed to create or verify dataset repository. Check logs.", None
-
-    # Create temp directory for storing files if needed
-    os.makedirs("temp_uploads", exist_ok=True)
-
-    # Reset stats for this batch
-    batch_stats = {
-        "processed": 0,
-        "skipped": 0,
-        "errors": 0,
-        "with_gps": 0
-    }
-
-    metadata_list = []
-    filenames = []
-
-    # Process each image
-    for image_file in image_files:
-        if not image_file or not os.path.exists(image_file.name):
-            continue
-
-        file_ext = Path(image_file.name).suffix.lower()
-        if file_ext not in SUPPORTED_EXTENSIONS:
-            logger.info(f"Skipping unsupported file: {image_file.name}")
-            batch_stats["skipped"] += 1
-            continue
-
-        logger.info(f"Processing: {image_file.name}")
-        try:
-            metadata = get_image_metadata(image_file.name)
-            if metadata:
-                if "gps_info" in metadata:
-                    batch_stats["with_gps"] += 1
-                    STATS["files_with_gps"] += 1
-                metadata_list.append(metadata)
-                filenames.append(Path(image_file.name).name)
-                batch_stats["processed"] += 1
-            else:
-                batch_stats["errors"] += 1
-        except Exception as e:
-            logger.error(f"Failed on {image_file.name}: {e}")
-            batch_stats["errors"] += 1
-
-    # Exit if nothing processed
     if not metadata_list:
-        return
-
-    # [garbled in source: STATS["total_files"] is updated and metadata_list is
-    #  written to a timestamped JSONL file, setting the `timestamp` and
-    #  `output_file` values used below]
-
-    # Upload to HF
-    upload_status = "not uploaded (no token)"
-    if HF_TOKEN:
         try:
-            # Build a Dataset from the collected metadata
-            from datasets import Dataset
-            dataset = Dataset.from_dict({
-                "metadata": metadata_list
-            })
-
-            # Push to hub
-            dataset.push_to_hub(
-                f"{HF_USERNAME}/{DATASET_NAME}",
-                token=HF_TOKEN,
-                commit_message=f"Added metadata for {len(metadata_list)} images"
-            )
-
-            # Upload raw JSONL file
-            api = HfApi(token=HF_TOKEN)
-            api.upload_file(
-                path_or_fileobj=output_file,
-                path_in_repo=f"batches/metadata_{timestamp}.jsonl",
-                repo_id=f"{HF_USERNAME}/{DATASET_NAME}",
-                repo_type="dataset",
-                token=HF_TOKEN,
-                commit_message=f"Raw metadata batch {timestamp}"
-            )
-
-            # Upload the actual image files
-            logger.info(f"Uploading {len(image_files)} image files...")
-            operations = []
-
-            # Process images in batches to avoid memory issues with large datasets
-            MAX_BATCH_SIZE = 20  # Maximum images per commit
-            total_uploaded = 0
-
-            # Group image files into batches
-            image_batches = [image_files[i:i+MAX_BATCH_SIZE] for i in range(0, len(image_files), MAX_BATCH_SIZE)]
-
-            for batch_idx, img_batch in enumerate(image_batches):
-                operations = []
-
-                for img_file in tqdm(img_batch, desc=f"Preparing batch {batch_idx+1}/{len(image_batches)}"):
-                    try:
-                        file_path = img_file.name
-                        file_name = os.path.basename(file_path)
-                        target_path = f"images/{file_name}"
-
-                        # Add file to operations list
-                        with open(file_path, "rb") as f:
-                            content = f.read()
-                        operations.append(
-                            CommitOperationAdd(
-                                path_in_repo=target_path,
-                                path_or_fileobj=content
-                            )
-                        )
-                    except Exception as e:
-                        logger.error(f"Error preparing image {img_file.name} for upload: {e}")
-
-                # Commit this batch of images
-                if operations:
-                    try:
-                        logger.info(f"Committing batch {batch_idx+1}/{len(image_batches)} with {len(operations)} images...")
-                        api.create_commit(
-                            repo_id=f"{HF_USERNAME}/{DATASET_NAME}",
-                            repo_type="dataset",
-                            operations=operations,
-                            commit_message=f"Upload {len(operations)} images (batch {batch_idx+1}/{len(image_batches)}) from upload {timestamp}"
-                        )
-                        total_uploaded += len(operations)
-                        logger.info(f"Successfully uploaded batch {batch_idx+1} ({total_uploaded}/{len(image_files)} total)")
-                    except Exception as e:
-                        logger.error(f"Failed to upload image batch {batch_idx+1}: {e}")
-
-            logger.info(f"Image upload complete: {total_uploaded}/{len(image_files)} files uploaded")
-
-            # Update stats
-            STATS["uploads"] += 1
-            STATS["last_upload"] = timestamp
-            upload_status = "✅ success"
-
-            # Update README in background thread
-            threading.Thread(target=update_readme_stats).start()
-
-        except Exception as e:
-            logger.error(f"HF upload failed: {e}")
-            upload_status = f"❌ failed: {str(e)[:100]}..."
-
-    # Return stats with all info
-    result = (
-        f"🔥 BATCH STATS 🔥\n"
-        f"✓ Processed: {batch_stats['processed']} images\n"
-        f"🌍 With GPS: {batch_stats['with_gps']}\n"
-        f"🚫 Skipped: {batch_stats['skipped']}\n"
-        f"⚠️ Errors: {batch_stats['errors']}\n"
-        f"☁️ Upload: {upload_status}\n\n"
-        f"📊 TOTAL STATS 📊\n"
-        f"Total files: {STATS['total_files']}\n"
-        f"Files with GPS: {STATS['files_with_gps']}\n"
-        f"Upload batches: {STATS['uploads']}\n"
-        f"Uptime: {format_duration(int(time.time()) - STATS['startup_time'])}"
-    )
-
-    return result, output_file

-def process_directory(directory_path):  # [function name garbled in source]
-    """Scan a directory and process any images found"""
-    if not os.path.isdir(directory_path):
-        return
-
-    logger.info(f"Scanning directory: {directory_path}")
-    image_files = []
-
-    # Find all image files in directory
-    for root, _, files in os.walk(directory_path):
-        for file in files:
-            file_path = os.path.join(root, file)
-            if Path(file_path).suffix.lower() in SUPPORTED_EXTENSIONS:
-                image_files.append(file_path)
-
-    if not image_files:
-        logger.info(f"No image files found in {directory_path}")
-        return
-
-    logger.info(f"Found {len(image_files)} image files in {directory_path}")
-
-    # Create file-like objects for processing
-    class FileObject:
-        def __init__(self, path):
-            self.name = path
-
-    # [garbled in source: the collected paths are wrapped in FileObject
-    #  instances and passed to process_images]

 def schedule_directory_scan():
-    """Periodically scan a directory for new images"""
-    watch_dir = os.environ.get("WATCH_DIRECTORY")
-
     if watch_dir and os.path.isdir(watch_dir):
-        logger.info(f"Running scheduled scan of {watch_dir}")
-        process_directory(watch_dir)  # [call garbled in source]
-
-    # Schedule next check
     threading.Timer(CHECK_INTERVAL, schedule_directory_scan).start()

-# Create the UI
-demo = gr.Interface(
-    fn=process_images,
-    inputs=gr.Files(label="DROP IMAGES HERE 📸", file_types=["image"], file_count="multiple"),
-    outputs=[
-        gr.Textbox(label="Status Report", lines=10),
-        gr.File(label="Download Metadata JSONL")
-    ],
-    title="🌍 Geo-Metadata Extractor 🔥",
-    description=(
-        f"Upload images to extract all metadata including GPS coordinates. "
-        f"Supported formats: {', '.join(sorted(ext[1:] for ext in SUPPORTED_EXTENSIONS))}. "
-        f"Data automatically uploads to {HF_USERNAME}/{DATASET_NAME} on Hugging Face."
-    ),
-    allow_flagging="never",
-    theme="huggingface"
-)
-
-# Launch app and start background processes
 if __name__ == "__main__":
     ensure_dataset_exists()
-
-    # Start directory watcher if configured
-    if os.environ.get("WATCH_DIRECTORY"):
         threading.Thread(target=schedule_directory_scan).start()
-
-    # Log startup info
-    logger.info(f"=== Application Startup at {time.strftime('%Y-%m-%d %H:%M:%S')} ===")
-    logger.info(f"Dataset: {HF_USERNAME}/{DATASET_NAME}")
-    logger.info(f"Token available: {bool(HF_TOKEN)}")
-
-    # Launch Gradio app
-    demo.launch(server_name="0.0.0.0", server_port=7860)

 import os
+import json
 import time
+import logging
+import logging.handlers  # RotatingFileHandler lives in logging.handlers, so the submodule needs an explicit import
 import threading
 import sys
+from pathlib import Path
+from concurrent.futures import ThreadPoolExecutor
+from datasets import Dataset
+from huggingface_hub import HfApi, create_repo, CommitOperationAdd
+from PIL import Image, ExifTags
+import gradio as gr

+# ----------------- CONFIGURATION -----------------
+HF_USERNAME = os.getenv("HF_USERNAME", "latterworks")
+DATASET_NAME = os.getenv("DATASET_NAME", "geo-metadata")
+HF_TOKEN = os.getenv("HF_TOKEN")
+CHECK_INTERVAL = int(os.getenv("CHECK_INTERVAL", "3600"))  # Check every hour
+MAX_BATCH_SIZE = int(os.getenv("MAX_BATCH_SIZE", "20"))
+MAX_LOG_SIZE_MB = int(os.getenv("MAX_LOG_SIZE_MB", "10"))
 SUPPORTED_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.heic', '.tiff', '.bmp', '.webp'}

+# Logging Setup
+os.makedirs("logs", exist_ok=True)
+log_handler = logging.handlers.RotatingFileHandler("logs/uploader.log", maxBytes=MAX_LOG_SIZE_MB * 1024 * 1024, backupCount=5)
+logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s", handlers=[logging.StreamHandler(), log_handler])
+logger = logging.getLogger(__name__)

+# Global State
+STATS = {"uploads": 0, "total_files": 0, "files_with_gps": 0, "startup_time": int(time.time())}

+# Initialize HF API once
+api = HfApi(token=HF_TOKEN)

+# ----------------- UTILITIES -----------------
+def repository_exists(repo_id, repo_type="dataset"):
+    """Check if a Hugging Face dataset repo exists."""
+    try:
+        api.repo_info(repo_id=repo_id, repo_type=repo_type)
         return True
+    except Exception:
         return False
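Note: recent releases of huggingface_hub also expose this check directly on the client as HfApi.repo_exists; if the installed version ships it, the helper above could be reduced to a one-liner (a sketch, not verified against the version pinned in this Space):

    exists = api.repo_exists(repo_id, repo_type="dataset")  # assumes a huggingface_hub version that provides repo_exists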

+def ensure_dataset_exists():
+    """Ensure dataset repository exists or create it."""
+    repo_id = f"{HF_USERNAME}/{DATASET_NAME}"
+    if not repository_exists(repo_id):
+        logger.info(f"Creating dataset repository: {repo_id}")
+        create_repo(repo_id=repo_id, repo_type="dataset", private=False, token=HF_TOKEN)
+        api.upload_file(path_or_fileobj=b"", path_in_repo="images/.gitkeep", repo_id=repo_id, repo_type="dataset", commit_message="Initialize images folder")
+    return True

 def format_duration(seconds):
+    """Convert seconds to human-readable duration."""
+    d, h, m, s = seconds // 86400, (seconds % 86400) // 3600, (seconds % 3600) // 60, seconds % 60
+    return f"{d}d {h}h {m}m {s}s" if d else f"{h}h {m}m {s}s" if h else f"{m}m {s}s"

 def convert_to_degrees(value):
+    """Convert GPS coordinates to decimal degrees."""
     try:
+        d, m, s = [float(x.numerator) / float(x.denominator) if hasattr(x, 'numerator') else float(x) for x in value]
+        return d + (m / 60.0) + (s / 3600.0)
+    except Exception:
         return None
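As a worked example, EXIF encodes 40° 26′ 46.302″ as a (degrees, minutes, seconds) triple, and the helper reduces it to 40 + 26/60 + 46.302/3600:

    print(convert_to_degrees((40, 26, 46.302)))  # ≈ 40.446195 decimal degrees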

 def extract_gps_info(gps_info):
+    """Extract and process GPS data from EXIF."""
     if not isinstance(gps_info, dict):
         return None
     try:
+        gps_data = {ExifTags.GPSTAGS.get(k, f"gps_{k}"): v for k, v in gps_info.items()}
         if 'GPSLatitude' in gps_data and 'GPSLongitude' in gps_data:
+            lat, lon = convert_to_degrees(gps_data['GPSLatitude']), convert_to_degrees(gps_data['GPSLongitude'])
+            if lat is not None and lon is not None:  # 0.0 is a valid coordinate, so test against None rather than truthiness
+                if gps_data.get('GPSLatitudeRef', 'N') == 'S':
+                    lat = -lat
+                if gps_data.get('GPSLongitudeRef', 'E') == 'W':
+                    lon = -lon
+                gps_data.update({'Latitude': round(lat, 6), 'Longitude': round(lon, 6)})
         return gps_data
     except Exception:
+        return None
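A quick sanity check of the hemisphere handling, using the raw numeric GPS IFD tag ids that ExifTags.GPSTAGS maps to names (1/2 are the latitude reference and value, 3/4 the longitude; the coordinates are illustrative):

    tags = {1: 'S', 2: (33, 52, 4.8), 3: 'E', 4: (151, 12, 36.0)}
    print(extract_gps_info(tags))  # includes Latitude ≈ -33.868, Longitude ≈ 151.21 (the 'S' reference flips the sign)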

 def get_image_metadata(image_path):
+    """Extract metadata from an image file."""
     file_path = Path(image_path)
+    metadata = {"file_name": str(file_path.absolute()), "file_extension": file_path.suffix.lower()}
     try:
+        with Image.open(image_path) as img:
+            metadata.update({"format": img.format, "size": list(img.size), "mode": img.mode})
+            # Not every format exposes _getexif (e.g. PNG); without this guard the AttributeError
+            # would hit the bare except below and discard the metadata already collected
+            exif_data = img._getexif() if hasattr(img, "_getexif") else None
+            if exif_data:
+                metadata.update({ExifTags.TAGS.get(k, f"tag_{k}").lower(): v for k, v in exif_data.items()})
+                if 'gpsinfo' in metadata:
+                    metadata["gps_info"] = extract_gps_info(metadata.pop('gpsinfo'))
         metadata["file_size"] = os.path.getsize(image_path)
+        metadata["timestamp"] = int(time.time())
         return metadata
+    except Exception:
+        return None
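For a typical geotagged JPEG the function returns a flat dict along these lines (illustrative values, not real output):

    {"file_name": "/data/IMG_0001.jpg", "file_extension": ".jpg",
     "format": "JPEG", "size": [4032, 3024], "mode": "RGB",
     "gps_info": {"Latitude": -33.868013, "Longitude": 151.21, ...},
     "file_size": 2483091, "timestamp": 1709000000}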

+# ----------------- UPLOADING -----------------
+def upload_metadata(metadata_list):
+    """Upload metadata to Hugging Face."""
     if not metadata_list:
+        return "No metadata to upload"
+    repo_id = f"{HF_USERNAME}/{DATASET_NAME}"
+    dataset = Dataset.from_dict({"metadata": metadata_list})
+    dataset.push_to_hub(repo_id, token=HF_TOKEN, commit_message=f"Add {len(metadata_list)} image metadata entries")  # pass the token explicitly; a Space has no cached login
+    return "Upload successful"
+
+def upload_images(image_paths):
+    """Upload images to Hugging Face."""
+    repo_id = f"{HF_USERNAME}/{DATASET_NAME}"
+    operations = []
+    for image_path in image_paths:
         try:
+            with open(image_path, "rb") as f:
+                operations.append(CommitOperationAdd(path_in_repo=f"images/{Path(image_path).name}", path_or_fileobj=f.read()))
+        except Exception:
+            continue
+    if operations:
+        api.create_commit(repo_id=repo_id, repo_type="dataset", operations=operations, commit_message="Batch upload images")
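Once pushed, the collected metadata can be read back with the datasets library (a sketch assuming the default HF_USERNAME/DATASET_NAME values above):

    from datasets import load_dataset
    ds = load_dataset("latterworks/geo-metadata", split="train")  # push_to_hub writes a "train" split by default
    print(ds[0]["metadata"])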

+# ----------------- PROCESSING -----------------
+def process_images(image_files):
+    """Process images, extract metadata, and upload to Hugging Face."""
+    if not ensure_dataset_exists():
+        return "Dataset creation failed."

+    metadata_list = []
+    image_paths = []
+    # Accept both Gradio file objects (which expose .name) and plain path strings from the directory scanner
+    paths = [getattr(f, "name", f) for f in image_files]
+    with ThreadPoolExecutor(max_workers=MAX_BATCH_SIZE) as executor:
+        results = executor.map(get_image_metadata, paths)
+        for result, path in zip(results, paths):
+            if result:
+                metadata_list.append(result)
+                image_paths.append(path)
+
+    if metadata_list:
+        upload_metadata(metadata_list)
+        upload_images(image_paths)
+        return f"Processed {len(metadata_list)} images, uploaded metadata & images."
+    return "No valid images processed."

+# ----------------- GRADIO UI -----------------
+demo = gr.Interface(
+    fn=process_images,
+    inputs=gr.Files(label="Upload Images"),
+    outputs=gr.Textbox(label="Status Report"),
+    title="Geo-Metadata Uploader",
+    description=f"Upload images for automatic metadata extraction and upload to Hugging Face ({HF_USERNAME}/{DATASET_NAME}).",
+    allow_flagging="never"
+)

+# ----------------- AUTO-SCHEDULING -----------------
 def schedule_directory_scan():
+    """Periodically scan a directory for new images."""
+    watch_dir = os.getenv("WATCH_DIRECTORY")
     if watch_dir and os.path.isdir(watch_dir):
+        # Pass plain path strings: a Path object's .name is only the basename, which
+        # process_images would then fail to open
+        image_files = [str(Path(watch_dir) / f) for f in os.listdir(watch_dir) if f.lower().endswith(tuple(SUPPORTED_EXTENSIONS))]
+        process_images(image_files)
     threading.Timer(CHECK_INTERVAL, schedule_directory_scan).start()

 if __name__ == "__main__":
+    logger.info(f"Starting uploader for {HF_USERNAME}/{DATASET_NAME}...")
     ensure_dataset_exists()
+    if os.getenv("WATCH_DIRECTORY"):
         threading.Thread(target=schedule_directory_scan).start()
+    demo.launch(server_name="0.0.0.0", server_port=7860)
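For a local run outside Spaces, the same entry point applies (hypothetical token value; the port matches demo.launch above):

    # HF_TOKEN=hf_xxx HF_USERNAME=latterworks DATASET_NAME=geo-metadata python app.py
    # The Gradio UI is then served on http://0.0.0.0:7860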