latterworks committed on
Commit c35918c · verified · 1 Parent(s): ba846ff

Update app.py

Files changed (1)
  1. app.py +124 -510
app.py CHANGED
@@ -1,556 +1,170 @@
1
- import gradio as gr
2
- from pathlib import Path
3
- from PIL import Image, ExifTags
4
- import json
5
  import os
6
- import logging
7
  import time
8
- from datasets import Dataset
9
- from huggingface_hub import HfApi, create_repo, repository_exists, CommitOperationAdd
10
- from huggingface_hub.utils import tqdm
11
  import threading
12
  import sys
13
 
14
- # Setup logging with timestamp
15
- logging.basicConfig(
16
- level=logging.INFO,
17
- format="%(asctime)s [%(levelname)s] %(message)s",
18
- handlers=[
19
- logging.StreamHandler(),
20
- logging.FileHandler("metadata_uploader.log")
21
- ]
22
- )
23
- logger = logging.getLogger(__name__)
24
-
25
- # Constants - edit these for your setup
26
- HF_TOKEN = os.environ.get("HF_TOKEN")
27
- HF_USERNAME = os.environ.get("HF_USERNAME", "latterworks")
28
- DATASET_NAME = os.environ.get("DATASET_NAME", "geo-metadata")
29
  SUPPORTED_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.heic', '.tiff', '.bmp', '.webp'}
30
- CHECK_INTERVAL = int(os.environ.get("CHECK_INTERVAL", "3600")) # Check for files hourly by default
31
 
32
- # Global state
33
- STATS = {
34
- "uploads": 0,
35
- "total_files": 0,
36
- "files_with_gps": 0,
37
- "last_upload": 0,
38
- "startup_time": int(time.time())
39
- }
40
-
41
- def ensure_dataset_exists():
42
- """Create dataset repository if it doesn't exist"""
43
- if not HF_TOKEN:
44
- logger.error("HF_TOKEN not set. Cannot create or check dataset.")
45
- return False
46
-
47
- try:
48
- api = HfApi(token=HF_TOKEN)
49
- repo_id = f"{HF_USERNAME}/{DATASET_NAME}"
50
-
51
- # Check if repo exists
52
- if not repository_exists(repo_id, repo_type="dataset", token=HF_TOKEN):
53
- logger.info(f"Creating dataset repository: {repo_id}")
54
- create_repo(
55
- repo_id=repo_id,
56
- repo_type="dataset",
57
- private=False,
58
- token=HF_TOKEN
59
- )
60
-
61
- # Create initial README
62
- readme_content = f"""# {DATASET_NAME}
63
 
64
- Automatically collected geo-metadata from images using the Geo-Metadata Extractor.
 
65
 
66
- ## Statistics
67
- - Total files processed: 0
68
- - Files with GPS data: 0
69
- - Last updated: {time.strftime('%Y-%m-%d %H:%M:%S')}
70
 
71
- ## Data Format
72
- Each entry contains:
73
- - Basic image metadata (size, format, mode)
74
- - EXIF data when available
75
- - GPS coordinates extracted from EXIF when available
76
- """
77
-
78
- # Upload README
79
- api.upload_file(
80
- path_or_fileobj=readme_content.encode(),
81
- path_in_repo="README.md",
82
- repo_id=repo_id,
83
- repo_type="dataset",
84
- token=HF_TOKEN,
85
- commit_message="Initial commit with README"
86
- )
87
-
88
- # Create folder structure
89
- for folder in ["batches", "images", "scripts"]:
90
- api.upload_file(
91
- path_or_fileobj=b"",
92
- path_in_repo=f"{folder}/.gitkeep",
93
- repo_id=repo_id,
94
- repo_type="dataset",
95
- token=HF_TOKEN,
96
- commit_message=f"Create {folder} directory"
97
- )
98
-
99
- # Upload this script to the repository
100
- try:
101
- script_path = os.path.abspath(sys.argv[0])
102
- if os.path.exists(script_path):
103
- with open(script_path, "rb") as f:
104
- script_content = f.read()
105
-
106
- api.upload_file(
107
- path_or_fileobj=script_content,
108
- path_in_repo="scripts/geo_metadata_extractor.py",
109
- repo_id=repo_id,
110
- repo_type="dataset",
111
- token=HF_TOKEN,
112
- commit_message="Upload metadata extractor script"
113
- )
114
- except Exception as e:
115
- logger.error(f"Failed to upload script: {e}")
116
-
117
- logger.info(f"Dataset repository created: {repo_id}")
118
- else:
119
- logger.info(f"Dataset repository already exists: {repo_id}")
120
-
121
  return True
122
- except Exception as e:
123
- logger.error(f"Error ensuring dataset exists: {e}")
124
  return False
125
 
126
- def update_readme_stats():
127
- """Update README with current statistics"""
128
- if not HF_TOKEN:
129
- return
130
-
131
- try:
132
- api = HfApi(token=HF_TOKEN)
133
- repo_id = f"{HF_USERNAME}/{DATASET_NAME}"
134
-
135
- # Create updated README content
136
- readme_content = f"""# {DATASET_NAME}
137
-
138
- Automatically collected geo-metadata from images using the Geo-Metadata Extractor.
139
-
140
- ## Statistics
141
- - Total files processed: {STATS["total_files"]}
142
- - Files with GPS data: {STATS["files_with_gps"]}
143
- - Upload batches: {STATS["uploads"]}
144
- - Last updated: {time.strftime('%Y-%m-%d %H:%M:%S')}
145
- - Uptime: {format_duration(int(time.time()) - STATS["startup_time"])}
146
-
147
- ## Data Format
148
- Each entry contains:
149
- - Basic image metadata (size, format, mode)
150
- - EXIF data when available
151
- - GPS coordinates extracted from EXIF when available
152
- """
153
-
154
- # Upload updated README
155
- api.upload_file(
156
- path_or_fileobj=readme_content.encode(),
157
- path_in_repo="README.md",
158
- repo_id=repo_id,
159
- repo_type="dataset",
160
- token=HF_TOKEN,
161
- commit_message="Update statistics"
162
- )
163
-
164
- logger.info("Updated README with current statistics")
165
- except Exception as e:
166
- logger.error(f"Error updating README: {e}")
167
 
168
  def format_duration(seconds):
169
- """Format seconds into readable duration"""
170
- days, remainder = divmod(seconds, 86400)
171
- hours, remainder = divmod(remainder, 3600)
172
- minutes, seconds = divmod(remainder, 60)
173
-
174
- parts = []
175
- if days > 0:
176
- parts.append(f"{days}d")
177
- if hours > 0:
178
- parts.append(f"{hours}h")
179
- if minutes > 0:
180
- parts.append(f"{minutes}m")
181
- parts.append(f"{seconds}s")
182
-
183
- return " ".join(parts)
184
 
185
  def convert_to_degrees(value):
186
- """Convert GPS coordinates to decimal degrees"""
187
  try:
188
- if not isinstance(value, (tuple, list)) or len(value) != 3:
189
- raise ValueError(f"GPS needs 3 values, got {type(value)}")
190
-
191
- d, m, s = value
192
- # Convert from rational numbers if needed
193
- d = d.numerator / d.denominator if hasattr(d, 'numerator') else float(d)
194
- m = m.numerator / m.denominator if hasattr(m, 'numerator') else float(m)
195
- s = s.numerator / s.denominator if hasattr(s, 'numerator') else float(s)
196
-
197
- degrees = d + (m / 60.0) + (s / 3600.0)
198
- if not -180 <= degrees <= 180:
199
- logger.warning(f"GPS out of bounds: {degrees}°")
200
- return degrees
201
- except Exception as e:
202
- logger.error(f"GPS conversion failed: {e}, value={value}")
203
  return None
204
 
205
  def extract_gps_info(gps_info):
206
- """Extract and process GPS data from EXIF"""
207
  if not isinstance(gps_info, dict):
208
  return None
209
-
210
- gps_data = {}
211
  try:
212
- # Extract tags
213
- for key, val in gps_info.items():
214
- tag_name = ExifTags.GPSTAGS.get(key, f"gps_{key}")
215
- gps_data[tag_name] = val
216
-
217
- # Process lat/long if present
218
  if 'GPSLatitude' in gps_data and 'GPSLongitude' in gps_data:
219
- lat = convert_to_degrees(gps_data['GPSLatitude'])
220
- lon = convert_to_degrees(gps_data['GPSLongitude'])
221
-
222
- if lat is None or lon is None:
223
- return None
224
-
225
- # Apply N/S/E/W reference
226
- lat_ref = gps_data.get('GPSLatitudeRef', 'N')
227
- lon_ref = gps_data.get('GPSLongitudeRef', 'E')
228
-
229
- # Flip signs based on hemisphere
230
- if lat_ref == 'S':
231
- lat = -lat
232
- if lon_ref == 'W':
233
- lon = -lon
234
-
235
- # Store clean coords with proper precision
236
- gps_data['Latitude'] = round(lat, 6)
237
- gps_data['Longitude'] = round(lon, 6)
238
-
239
  return gps_data
240
- except Exception as e:
241
- logger.error(f"GPS extraction error: {e}")
242
- return None
243
-
244
- def make_serializable(value):
245
- """Make any value JSON serializable"""
246
- try:
247
- if hasattr(value, 'numerator') and hasattr(value, 'denominator'):
248
- return float(value.numerator) / float(value.denominator)
249
- elif isinstance(value, (tuple, list)):
250
- return [make_serializable(item) for item in value]
251
- elif isinstance(value, dict):
252
- return {str(k): make_serializable(v) for k, v in value.items()}
253
- elif isinstance(value, bytes):
254
- return value.decode('utf-8', errors='replace')
255
- json.dumps(value) # Test if serializable
256
- return value
257
  except Exception:
258
- return str(value)
259
 
260
  def get_image_metadata(image_path):
261
- """Extract all metadata from an image file"""
262
  file_path = Path(image_path)
263
- metadata = {
264
- "file_name": str(file_path.absolute()),
265
- "file_basename": file_path.name,
266
- "image_path_in_repo": f"images/{file_path.name}" # Path where image will be stored in repo
267
- }
268
-
269
  try:
270
- with Image.open(image_path) as image:
271
- metadata.update({
272
- "format": image.format or "unknown",
273
- "size": list(image.size),
274
- "mode": image.mode or "unknown"
275
- })
276
-
277
- # Extract EXIF if available
278
- exif_data = None
279
- try:
280
- exif_data = image._getexif()
281
- except (AttributeError, Exception) as e:
282
- metadata["exif_error"] = str(e)
283
-
284
- if exif_data and isinstance(exif_data, dict):
285
- for tag_id, value in exif_data.items():
286
- try:
287
- tag_name = ExifTags.TAGS.get(tag_id, f"tag_{tag_id}").lower()
288
- if tag_name == "gpsinfo":
289
- gps_info = extract_gps_info(value)
290
- if gps_info:
291
- metadata["gps_info"] = make_serializable(gps_info)
292
- else:
293
- metadata[tag_name] = make_serializable(value)
294
- except Exception as e:
295
- metadata[f"error_tag_{tag_id}"] = str(e)
296
-
297
- # Add file details
298
  metadata["file_size"] = os.path.getsize(image_path)
299
- metadata["file_extension"] = file_path.suffix.lower()
300
- metadata["extraction_timestamp"] = int(time.time())
301
-
302
- # Test serialization
303
- json.dumps(metadata)
304
  return metadata
305
- except Exception as e:
306
- logger.error(f"Error processing {image_path}: {e}")
307
- return {"file_name": str(file_path.absolute()), "error": str(e)}
308
 
309
- def process_images(image_files):
310
- """Process images and upload metadata to Hugging Face"""
311
- if not image_files:
312
- return "🚫 Upload some fucking images first! 📷", None
313
-
314
- # Ensure dataset exists
315
- if not ensure_dataset_exists():
316
- return "❌ Failed to create or verify dataset repository. Check logs.", None
317
-
318
- # Create temp directory for storing files if needed
319
- os.makedirs("temp_uploads", exist_ok=True)
320
-
321
- # Reset stats for this batch
322
- batch_stats = {
323
- "processed": 0,
324
- "skipped": 0,
325
- "errors": 0,
326
- "with_gps": 0
327
- }
328
-
329
- metadata_list = []
330
- filenames = []
331
-
332
- # Process each image
333
- for image_file in image_files:
334
- if not image_file or not os.path.exists(image_file.name):
335
- continue
336
-
337
- file_ext = Path(image_file.name).suffix.lower()
338
- if file_ext not in SUPPORTED_EXTENSIONS:
339
- logger.info(f"Skipping unsupported file: {image_file.name}")
340
- batch_stats["skipped"] += 1
341
- continue
342
-
343
- logger.info(f"Processing: {image_file.name}")
344
- try:
345
- metadata = get_image_metadata(image_file.name)
346
- if metadata:
347
- if "gps_info" in metadata:
348
- batch_stats["with_gps"] += 1
349
- STATS["files_with_gps"] += 1
350
- metadata_list.append(metadata)
351
- filenames.append(Path(image_file.name).name)
352
- batch_stats["processed"] += 1
353
- else:
354
- batch_stats["errors"] += 1
355
- except Exception as e:
356
- logger.error(f"Failed on {image_file.name}: {e}")
357
- batch_stats["errors"] += 1
358
-
359
- # Exit if nothing processed
360
  if not metadata_list:
361
- return f"No valid images. Skipped: {batch_stats['skipped']}, Errors: {batch_stats['errors']}", None
362
-
363
- # Generate unique filename
364
- timestamp = int(time.time())
365
- STATS["total_files"] += batch_stats["processed"]
366
- output_file = f"metadata_{timestamp}.jsonl"
367
-
368
- # Save locally
369
- with open(output_file, 'w', encoding='utf-8') as f:
370
- for entry in metadata_list:
371
- f.write(json.dumps(entry, ensure_ascii=False) + '\n')
372
-
373
- # Upload to HF
374
- upload_status = "not uploaded (no token)"
375
- if HF_TOKEN:
376
  try:
377
- logger.info(f"Uploading to {HF_USERNAME}/{DATASET_NAME}...")
378
-
379
- # Create dataset object with both filenames and full metadata
380
- dataset = Dataset.from_dict({
381
- "filename": filenames,
382
- "image_path": [f"images/{f}" for f in filenames], # Path to actual image in repo
383
- "metadata": metadata_list
384
- })
385
-
386
- # Push to hub
387
- dataset.push_to_hub(
388
- f"{HF_USERNAME}/{DATASET_NAME}",
389
- token=HF_TOKEN,
390
- commit_message=f"Added metadata for {len(metadata_list)} images"
391
- )
392
-
393
- # Upload raw JSONL file
394
- api = HfApi(token=HF_TOKEN)
395
- api.upload_file(
396
- path_or_fileobj=output_file,
397
- path_in_repo=f"batches/metadata_{timestamp}.jsonl",
398
- repo_id=f"{HF_USERNAME}/{DATASET_NAME}",
399
- repo_type="dataset",
400
- token=HF_TOKEN,
401
- commit_message=f"Raw metadata batch {timestamp}"
402
- )
403
-
404
- # Upload the actual image files
405
- logger.info(f"Uploading {len(image_files)} image files...")
406
- operations = []
407
-
408
- # Process images in batches to avoid memory issues with large datasets
409
- MAX_BATCH_SIZE = 20 # Maximum images per commit
410
- total_uploaded = 0
411
-
412
- # Group image files into batches
413
- image_batches = [image_files[i:i+MAX_BATCH_SIZE] for i in range(0, len(image_files), MAX_BATCH_SIZE)]
414
-
415
- for batch_idx, img_batch in enumerate(image_batches):
416
- operations = []
417
-
418
- for img_file in tqdm(img_batch, desc=f"Preparing batch {batch_idx+1}/{len(image_batches)}"):
419
- try:
420
- file_path = img_file.name
421
- file_name = os.path.basename(file_path)
422
- target_path = f"images/{file_name}"
423
-
424
- # Add file to operations list
425
- with open(file_path, "rb") as f:
426
- content = f.read()
427
- operations.append(
428
- CommitOperationAdd(
429
- path_in_repo=target_path,
430
- path_or_fileobj=content
431
- )
432
- )
433
- except Exception as e:
434
- logger.error(f"Error preparing image {img_file.name} for upload: {e}")
435
-
436
- # Commit this batch of images
437
- if operations:
438
- try:
439
- logger.info(f"Committing batch {batch_idx+1}/{len(image_batches)} with {len(operations)} images...")
440
- api.create_commit(
441
- repo_id=f"{HF_USERNAME}/{DATASET_NAME}",
442
- repo_type="dataset",
443
- operations=operations,
444
- commit_message=f"Upload {len(operations)} images (batch {batch_idx+1}/{len(image_batches)}) from upload {timestamp}"
445
- )
446
- total_uploaded += len(operations)
447
- logger.info(f"Successfully uploaded batch {batch_idx+1} ({total_uploaded}/{len(image_files)} total)")
448
- except Exception as e:
449
- logger.error(f"Failed to upload image batch {batch_idx+1}: {e}")
450
-
451
- logger.info(f"Image upload complete: {total_uploaded}/{len(image_files)} files uploaded")
452
-
453
- # Update stats
454
- STATS["uploads"] += 1
455
- STATS["last_upload"] = timestamp
456
- upload_status = "✅ success"
457
-
458
- # Update README in background thread
459
- threading.Thread(target=update_readme_stats).start()
460
-
461
- except Exception as e:
462
- logger.error(f"HF upload failed: {e}")
463
- upload_status = f"❌ failed: {str(e)[:100]}..."
464
-
465
- # Return stats with all info
466
- result = (
467
- f"🔥 BATCH STATS 🔥\n"
468
- f"✓ Processed: {batch_stats['processed']} images\n"
469
- f"🌍 With GPS: {batch_stats['with_gps']}\n"
470
- f"🚫 Skipped: {batch_stats['skipped']}\n"
471
- f"⚠️ Errors: {batch_stats['errors']}\n"
472
- f"☁️ Upload: {upload_status}\n\n"
473
- f"📊 TOTAL STATS 📊\n"
474
- f"Total files: {STATS['total_files']}\n"
475
- f"Files with GPS: {STATS['files_with_gps']}\n"
476
- f"Upload batches: {STATS['uploads']}\n"
477
- f"Uptime: {format_duration(int(time.time()) - STATS['startup_time'])}"
478
- )
479
-
480
- return result, output_file
481
 
482
- def scan_and_process_directory(directory_path):
483
- """Scan directory for images and process them"""
484
- if not os.path.isdir(directory_path):
485
- logger.error(f"Not a directory: {directory_path}")
486
- return
487
-
488
- logger.info(f"Scanning directory: {directory_path}")
489
- image_files = []
490
-
491
- # Find all image files in directory
492
- for root, _, files in os.walk(directory_path):
493
- for file in files:
494
- file_path = os.path.join(root, file)
495
- if Path(file_path).suffix.lower() in SUPPORTED_EXTENSIONS:
496
- image_files.append(file_path)
497
-
498
- if not image_files:
499
- logger.info(f"No image files found in {directory_path}")
500
- return
501
-
502
- logger.info(f"Found {len(image_files)} image files in {directory_path}")
503
-
504
- # Create file-like objects for processing
505
- class FileObject:
506
- def __init__(self, path):
507
- self.name = path
508
 
509
- process_images([FileObject(path) for path in image_files])
510
 
 
511
  def schedule_directory_scan():
512
- """Check for new files in directory periodically"""
513
- watch_dir = os.environ.get("WATCH_DIRECTORY")
514
-
515
  if watch_dir and os.path.isdir(watch_dir):
516
- logger.info(f"Scheduled scan of directory: {watch_dir}")
517
- scan_and_process_directory(watch_dir)
518
-
519
- # Schedule next check
520
  threading.Timer(CHECK_INTERVAL, schedule_directory_scan).start()
521
 
522
- # Create the UI
523
- demo = gr.Interface(
524
- fn=process_images,
525
- inputs=gr.Files(label="DROP IMAGES HERE 📸", file_types=["image"], file_count="multiple"),
526
- outputs=[
527
- gr.Textbox(label="Status Report", lines=10),
528
- gr.File(label="Download Metadata JSONL")
529
- ],
530
- title="🌍 Geo-Metadata Extractor 🔥",
531
- description=(
532
- f"Upload images to extract all metadata including GPS coordinates. "
533
- f"Supported formats: {', '.join(sorted(ext[1:] for ext in SUPPORTED_EXTENSIONS))}. "
534
- f"Data automatically uploads to {HF_USERNAME}/{DATASET_NAME} on Hugging Face."
535
- ),
536
- allow_flagging="never",
537
- theme="huggingface"
538
- )
539
-
540
- # Launch app and start background processes
541
  if __name__ == "__main__":
542
- # Ensure dataset exists on startup
543
  ensure_dataset_exists()
544
-
545
- # Start directory watcher if configured
546
- if os.environ.get("WATCH_DIRECTORY"):
547
  threading.Thread(target=schedule_directory_scan).start()
548
- logger.info(f"Starting directory watcher for {os.environ.get('WATCH_DIRECTORY')}")
549
-
550
- # Log startup info
551
- logger.info(f"=== Application Startup at {time.strftime('%Y-%m-%d %H:%M:%S')} ===")
552
- logger.info(f"Dataset: {HF_USERNAME}/{DATASET_NAME}")
553
- logger.info(f"Token available: {bool(HF_TOKEN)}")
554
-
555
- # Launch Gradio app
556
- demo.launch(server_name="0.0.0.0", server_port=7860)
1
  import os
2
+ import json
3
  import time
4
+ import logging
+ import logging.handlers  # RotatingFileHandler lives in the logging.handlers submodule and is not imported by "import logging" alone
 
 
5
  import threading
6
  import sys
7
+ from pathlib import Path
8
+ from concurrent.futures import ThreadPoolExecutor
9
+ from datasets import Dataset
10
+ from huggingface_hub import HfApi, create_repo, CommitOperationAdd
11
+ from PIL import Image, ExifTags
12
+ import gradio as gr
13
 
14
+ # ----------------- CONFIGURATION -----------------
15
+ HF_USERNAME = os.getenv("HF_USERNAME", "latterworks")
16
+ DATASET_NAME = os.getenv("DATASET_NAME", "geo-metadata")
17
+ HF_TOKEN = os.getenv("HF_TOKEN")
18
+ CHECK_INTERVAL = int(os.getenv("CHECK_INTERVAL", "3600")) # Check every hour
19
+ MAX_BATCH_SIZE = int(os.getenv("MAX_BATCH_SIZE", "20"))
20
+ MAX_LOG_SIZE_MB = int(os.getenv("MAX_LOG_SIZE_MB", "10"))
21
  SUPPORTED_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.heic', '.tiff', '.bmp', '.webp'}
 
22
 
23
+ # Logging Setup
24
+ os.makedirs("logs", exist_ok=True)
25
+ log_handler = logging.handlers.RotatingFileHandler("logs/uploader.log", maxBytes=MAX_LOG_SIZE_MB * 1024 * 1024, backupCount=5)
26
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s", handlers=[logging.StreamHandler(), log_handler])
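+ # Logs rotate once logs/uploader.log reaches MAX_LOG_SIZE_MB, keeping up to 5 backups, alongside console output.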
27
+ logger = logging.getLogger(__name__)
28
 
29
+ # Global State
30
+ STATS = {"uploads": 0, "total_files": 0, "files_with_gps": 0, "startup_time": int(time.time())}
31
 
32
+ # Initialize HF API once
33
+ api = HfApi(token=HF_TOKEN)
 
 
34
 
35
+ # ----------------- UTILITIES -----------------
36
+ def repository_exists(repo_id, repo_type="dataset"):
37
+ """Check if a Hugging Face dataset repo exists."""
38
+ try:
39
+ api.repo_info(repo_id=repo_id, repo_type=repo_type)
40
  return True
41
+ except Exception:
 
42
  return False
43
 
44
+ def ensure_dataset_exists():
45
+ """Ensure dataset repository exists or create it."""
46
+ repo_id = f"{HF_USERNAME}/{DATASET_NAME}"
47
+ if not repository_exists(repo_id):
48
+ logger.info(f"Creating dataset repository: {repo_id}")
49
+ create_repo(repo_id=repo_id, repo_type="dataset", private=False, token=HF_TOKEN)
50
+ api.upload_file(path_or_fileobj=b"", path_in_repo="images/.gitkeep", repo_id=repo_id, repo_type="dataset", commit_message="Initialize images folder")
51
+ return True
52
 
53
  def format_duration(seconds):
54
+ """Convert seconds to human-readable duration."""
55
+ d, h, m, s = seconds // 86400, (seconds % 86400) // 3600, (seconds % 3600) // 60, seconds % 60
56
+ return f"{d}d {h}h {m}m {s}s" if d else f"{h}h {m}m {s}s" if h else f"{m}m {s}s"
 
 
 
 
 
 
 
 
 
 
 
 
57
 
58
  def convert_to_degrees(value):
59
+ """Convert GPS coordinates to decimal degrees."""
60
  try:
61
+ d, m, s = [float(x.numerator) / float(x.denominator) if hasattr(x, 'numerator') else float(x) for x in value]
62
+ return d + (m / 60.0) + (s / 3600.0)
63
+ except Exception:
64
  return None
65
 
66
  def extract_gps_info(gps_info):
67
+ """Extract and process GPS data from EXIF."""
68
  if not isinstance(gps_info, dict):
69
  return None
 
 
70
  try:
71
+ gps_data = {ExifTags.GPSTAGS.get(k, f"gps_{k}"): v for k, v in gps_info.items()}
72
  if 'GPSLatitude' in gps_data and 'GPSLongitude' in gps_data:
73
+ lat, lon = convert_to_degrees(gps_data['GPSLatitude']), convert_to_degrees(gps_data['GPSLongitude'])
74
+ if lat is not None and lon is not None:  # 0.0 is a valid coordinate, so test for None explicitly
75
+ if gps_data.get('GPSLatitudeRef', 'N') == 'S':
76
+ lat = -lat
77
+ if gps_data.get('GPSLongitudeRef', 'E') == 'W':
78
+ lon = -lon
79
+ gps_data.update({'Latitude': round(lat, 6), 'Longitude': round(lon, 6)})
80
  return gps_data
81
  except Exception:
82
+ return None
83
 
84
  def get_image_metadata(image_path):
85
+ """Extract metadata from an image file."""
86
  file_path = Path(image_path)
87
+ metadata = {"file_name": str(file_path.absolute()), "file_extension": file_path.suffix.lower()}
 
 
 
 
 
88
  try:
89
+ with Image.open(image_path) as img:
90
+ metadata.update({"format": img.format, "size": list(img.size), "mode": img.mode})
91
+ exif_data = img._getexif() if hasattr(img, "_getexif") else None  # not every format exposes EXIF
92
+ if exif_data:
93
+ metadata.update({ExifTags.TAGS.get(k, f"tag_{k}").lower(): v for k, v in exif_data.items()})
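+ # Note: raw EXIF values can include rationals/bytes that are not JSON- or Arrow-friendly; they may need conversion before push_to_hub.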
94
+ if 'gpsinfo' in metadata:
95
+ metadata["gps_info"] = extract_gps_info(metadata.pop('gpsinfo'))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  metadata["file_size"] = os.path.getsize(image_path)
97
+ metadata["timestamp"] = int(time.time())
 
 
 
 
98
  return metadata
99
+ except Exception:
100
+ return None
 
101
 
102
+ # ----------------- UPLOADING -----------------
103
+ def upload_metadata(metadata_list):
104
+ """Upload metadata to Hugging Face."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  if not metadata_list:
106
+ return "No metadata to upload"
107
+ repo_id = f"{HF_USERNAME}/{DATASET_NAME}"
108
+ dataset = Dataset.from_dict({"metadata": metadata_list})
109
+ dataset.push_to_hub(repo_id, commit_message=f"Add {len(metadata_list)} image metadata entries")
110
+ return "Upload successful"
111
+
112
+ def upload_images(image_paths):
113
+ """Upload images to Hugging Face."""
114
+ repo_id = f"{HF_USERNAME}/{DATASET_NAME}"
115
+ operations = []
116
+ for image_path in image_paths:
117
  try:
118
+ with open(image_path, "rb") as f:
119
+ operations.append(CommitOperationAdd(path_in_repo=f"images/{Path(image_path).name}", path_or_fileobj=f.read()))
120
+ except Exception:
121
+ continue
122
+ if operations:
123
+ api.create_commit(repo_id=repo_id, repo_type="dataset", operations=operations, commit_message="Batch upload images")
124
 
125
+ # ----------------- PROCESSING -----------------
126
+ def process_images(image_files):
127
+ """Process images, extract metadata, and upload to Hugging Face."""
128
+ if not ensure_dataset_exists():
129
+ return "Dataset creation failed."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
 
131
+ metadata_list = []
132
+ image_paths = []
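+ # Extract metadata concurrently; MAX_BATCH_SIZE bounds the number of worker threads.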
133
+ with ThreadPoolExecutor(max_workers=MAX_BATCH_SIZE) as executor:
134
+ paths = [getattr(f, "name", f) for f in image_files]  # Gradio uploads expose .name (a full path); the directory watcher passes plain path strings
+ results = executor.map(get_image_metadata, paths)
135
+ for result, path in zip(results, paths):
136
+ if result:
137
+ metadata_list.append(result)
138
+ image_paths.append(path)
139
+
140
+ if metadata_list:
141
+ upload_metadata(metadata_list)
142
+ upload_images(image_paths)
143
+ return f"Processed {len(metadata_list)} images, uploaded metadata & images."
144
+ return "No valid images processed."
145
+
146
+ # ----------------- GRADIO UI -----------------
147
+ demo = gr.Interface(
148
+ fn=process_images,
149
+ inputs=gr.Files(label="Upload Images"),
150
+ outputs=gr.Textbox(label="Status Report"),
151
+ title="Geo-Metadata Uploader",
152
+ description=f"Upload images for automatic metadata extraction and upload to Hugging Face ({HF_USERNAME}/{DATASET_NAME}).",
153
+ allow_flagging="never"
154
+ )
155
 
156
+ # ----------------- AUTO-SCHEDULING -----------------
157
  def schedule_directory_scan():
158
+ """Periodically scan a directory for new images."""
159
+ watch_dir = os.getenv("WATCH_DIRECTORY")
 
160
  if watch_dir and os.path.isdir(watch_dir):
161
+ image_files = [str(Path(watch_dir) / f) for f in os.listdir(watch_dir) if f.lower().endswith(tuple(SUPPORTED_EXTENSIONS))]  # pass full path strings so process_images can open them
162
+ process_images(image_files)
 
 
163
  threading.Timer(CHECK_INTERVAL, schedule_directory_scan).start()
164
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
  if __name__ == "__main__":
166
+ logger.info(f"Starting uploader for {HF_USERNAME}/{DATASET_NAME}...")
167
  ensure_dataset_exists()
168
+ if os.getenv("WATCH_DIRECTORY"):
 
 
169
  threading.Thread(target=schedule_directory_scan).start()
170
+ demo.launch(server_name="0.0.0.0", server_port=7860)