latterworks committed
Commit 2328870 Β· verified Β· 1 Parent(s): b9f8627

Update app.py

Files changed (1):
app.py +289 -15
app.py CHANGED
@@ -6,7 +6,10 @@ import os
 import logging
 import time
 from datasets import Dataset
-from huggingface_hub import HfApi
+from huggingface_hub import HfApi, create_repo, repo_exists, CommitOperationAdd
+from huggingface_hub.utils import tqdm
+import threading
+import sys
 
 # Setup logging with timestamp
 logging.basicConfig(
@@ -19,21 +22,168 @@ logging.basicConfig(
 )
 logger = logging.getLogger(__name__)
 
-# Constants - put your shit here
+# Constants - edit these for your setup
 HF_TOKEN = os.environ.get("HF_TOKEN")
-HF_USERNAME = "latterworks" # Your username
-DATASET_NAME = "geo-metadata" # Your dataset name
+HF_USERNAME = os.environ.get("HF_USERNAME", "latterworks")
+DATASET_NAME = os.environ.get("DATASET_NAME", "geo-metadata")
 SUPPORTED_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.heic', '.tiff', '.bmp', '.webp'}
+CHECK_INTERVAL = int(os.environ.get("CHECK_INTERVAL", "3600")) # Check for files hourly by default
 
-# Status tracking
+# Global state
 STATS = {
     "uploads": 0,
     "total_files": 0,
-    "files_with_gps": 0
+    "files_with_gps": 0,
+    "last_upload": 0,
+    "startup_time": int(time.time())
 }
 
+def ensure_dataset_exists():
+    """Create dataset repository if it doesn't exist"""
+    if not HF_TOKEN:
+        logger.error("HF_TOKEN not set. Cannot create or check dataset.")
+        return False
+
+    try:
+        api = HfApi(token=HF_TOKEN)
+        repo_id = f"{HF_USERNAME}/{DATASET_NAME}"
+
+        # Check if repo exists
+        if not repo_exists(repo_id, repo_type="dataset", token=HF_TOKEN):
+            logger.info(f"Creating dataset repository: {repo_id}")
+            create_repo(
+                repo_id=repo_id,
+                repo_type="dataset",
+                private=False,
+                token=HF_TOKEN
+            )
+
+            # Create initial README
+            readme_content = f"""# {DATASET_NAME}
+
+Automatically collected geo-metadata from images using the Geo-Metadata Extractor.
+
+## Statistics
+- Total files processed: 0
+- Files with GPS data: 0
+- Last updated: {time.strftime('%Y-%m-%d %H:%M:%S')}
+
+## Data Format
+Each entry contains:
+- Basic image metadata (size, format, mode)
+- EXIF data when available
+- GPS coordinates extracted from EXIF when available
+"""
+
+            # Upload README
+            api.upload_file(
+                path_or_fileobj=readme_content.encode(),
+                path_in_repo="README.md",
+                repo_id=repo_id,
+                repo_type="dataset",
+                token=HF_TOKEN,
+                commit_message="Initial commit with README"
+            )
+
+            # Create folder structure
+            for folder in ["batches", "images", "scripts"]:
+                api.upload_file(
+                    path_or_fileobj=b"",
+                    path_in_repo=f"{folder}/.gitkeep",
+                    repo_id=repo_id,
+                    repo_type="dataset",
+                    token=HF_TOKEN,
+                    commit_message=f"Create {folder} directory"
+                )
+
+            # Upload this script to the repository
+            try:
+                script_path = os.path.abspath(sys.argv[0])
+                if os.path.exists(script_path):
+                    with open(script_path, "rb") as f:
+                        script_content = f.read()
+
+                    api.upload_file(
+                        path_or_fileobj=script_content,
+                        path_in_repo="scripts/geo_metadata_extractor.py",
+                        repo_id=repo_id,
+                        repo_type="dataset",
+                        token=HF_TOKEN,
+                        commit_message="Upload metadata extractor script"
+                    )
+            except Exception as e:
+                logger.error(f"Failed to upload script: {e}")
+
+            logger.info(f"Dataset repository created: {repo_id}")
+        else:
+            logger.info(f"Dataset repository already exists: {repo_id}")
+
+        return True
+    except Exception as e:
+        logger.error(f"Error ensuring dataset exists: {e}")
+        return False
+
+def update_readme_stats():
+    """Update README with current statistics"""
+    if not HF_TOKEN:
+        return
+
+    try:
+        api = HfApi(token=HF_TOKEN)
+        repo_id = f"{HF_USERNAME}/{DATASET_NAME}"
+
+        # Create updated README content
+        readme_content = f"""# {DATASET_NAME}
+
+Automatically collected geo-metadata from images using the Geo-Metadata Extractor.
+
+## Statistics
+- Total files processed: {STATS["total_files"]}
+- Files with GPS data: {STATS["files_with_gps"]}
+- Upload batches: {STATS["uploads"]}
+- Last updated: {time.strftime('%Y-%m-%d %H:%M:%S')}
+- Uptime: {format_duration(int(time.time()) - STATS["startup_time"])}
+
+## Data Format
+Each entry contains:
+- Basic image metadata (size, format, mode)
+- EXIF data when available
+- GPS coordinates extracted from EXIF when available
+"""
+
+        # Upload updated README
+        api.upload_file(
+            path_or_fileobj=readme_content.encode(),
+            path_in_repo="README.md",
+            repo_id=repo_id,
+            repo_type="dataset",
+            token=HF_TOKEN,
+            commit_message="Update statistics"
+        )
+
+        logger.info("Updated README with current statistics")
+    except Exception as e:
+        logger.error(f"Error updating README: {e}")
+
+def format_duration(seconds):
+    """Format seconds into readable duration"""
+    days, remainder = divmod(seconds, 86400)
+    hours, remainder = divmod(remainder, 3600)
+    minutes, seconds = divmod(remainder, 60)
+
+    parts = []
+    if days > 0:
+        parts.append(f"{days}d")
+    if hours > 0:
+        parts.append(f"{hours}h")
+    if minutes > 0:
+        parts.append(f"{minutes}m")
+    parts.append(f"{seconds}s")
+
+    return " ".join(parts)
+
 def convert_to_degrees(value):
-    """Convert GPS coordinates to decimal degrees - handles all the edge cases"""
+    """Convert GPS coordinates to decimal degrees"""
     try:
         if not isinstance(value, (tuple, list)) or len(value) != 3:
             raise ValueError(f"GPS needs 3 values, got {type(value)}")
@@ -109,7 +259,13 @@ def make_serializable(value):
 
 def get_image_metadata(image_path):
     """Extract all metadata from an image file"""
-    metadata = {"file_name": str(Path(image_path).absolute())}
+    file_path = Path(image_path)
+    metadata = {
+        "file_name": str(file_path.absolute()),
+        "file_basename": file_path.name,
+        "image_path_in_repo": f"images/{file_path.name}" # Path where image will be stored in repo
+    }
+
     try:
         with Image.open(image_path) as image:
             metadata.update({
@@ -140,7 +296,7 @@ def get_image_metadata(image_path):
 
         # Add file details
         metadata["file_size"] = os.path.getsize(image_path)
-        metadata["file_extension"] = Path(image_path).suffix.lower()
+        metadata["file_extension"] = file_path.suffix.lower()
         metadata["extraction_timestamp"] = int(time.time())
 
         # Test serialization
@@ -148,12 +304,19 @@ def get_image_metadata(image_path):
         return metadata
     except Exception as e:
         logger.error(f"Error processing {image_path}: {e}")
-        return {"file_name": str(Path(image_path).absolute()), "error": str(e)}
+        return {"file_name": str(file_path.absolute()), "error": str(e)}
 
 def process_images(image_files):
     """Process images and upload metadata to Hugging Face"""
     if not image_files:
         return "🚫 Upload some fucking images first! πŸ“·", None
+
+    # Ensure dataset exists
+    if not ensure_dataset_exists():
+        return "❌ Failed to create or verify dataset repository. Check logs.", None
+
+    # Create temp directory for storing files if needed
+    os.makedirs("temp_uploads", exist_ok=True)
 
     # Reset stats for this batch
     batch_stats = {
@@ -216,6 +379,7 @@ def process_images(image_files):
     # Create dataset object with both filenames and full metadata
     dataset = Dataset.from_dict({
         "filename": filenames,
+        "image_path": [f"images/{f}" for f in filenames], # Path to actual image in repo
        "metadata": metadata_list
    })
 
@@ -227,7 +391,7 @@ def process_images(image_files):
         )
 
         # Upload raw JSONL file
-        api = HfApi()
+        api = HfApi(token=HF_TOKEN)
         api.upload_file(
             path_or_fileobj=output_file,
             path_in_repo=f"batches/metadata_{timestamp}.jsonl",
@@ -237,8 +401,63 @@ def process_images(image_files):
             commit_message=f"Raw metadata batch {timestamp}"
         )
 
+        # Upload the actual image files
+        logger.info(f"Uploading {len(image_files)} image files...")
+        operations = []
+
+        # Process images in batches to avoid memory issues with large datasets
+        MAX_BATCH_SIZE = 20 # Maximum images per commit
+        total_uploaded = 0
+
+        # Group image files into batches
+        image_batches = [image_files[i:i+MAX_BATCH_SIZE] for i in range(0, len(image_files), MAX_BATCH_SIZE)]
+
+        for batch_idx, img_batch in enumerate(image_batches):
+            operations = []
+
+            for img_file in tqdm(img_batch, desc=f"Preparing batch {batch_idx+1}/{len(image_batches)}"):
+                try:
+                    file_path = img_file.name
+                    file_name = os.path.basename(file_path)
+                    target_path = f"images/{file_name}"
+
+                    # Add file to operations list
+                    with open(file_path, "rb") as f:
+                        content = f.read()
+                    operations.append(
+                        CommitOperationAdd(
+                            path_in_repo=target_path,
+                            path_or_fileobj=content
+                        )
+                    )
+                except Exception as e:
+                    logger.error(f"Error preparing image {img_file.name} for upload: {e}")
+
+            # Commit this batch of images
+            if operations:
+                try:
+                    logger.info(f"Committing batch {batch_idx+1}/{len(image_batches)} with {len(operations)} images...")
+                    api.create_commit(
+                        repo_id=f"{HF_USERNAME}/{DATASET_NAME}",
+                        repo_type="dataset",
+                        operations=operations,
+                        commit_message=f"Upload {len(operations)} images (batch {batch_idx+1}/{len(image_batches)}) from upload {timestamp}"
+                    )
+                    total_uploaded += len(operations)
+                    logger.info(f"Successfully uploaded batch {batch_idx+1} ({total_uploaded}/{len(image_files)} total)")
+                except Exception as e:
+                    logger.error(f"Failed to upload image batch {batch_idx+1}: {e}")
+
+        logger.info(f"Image upload complete: {total_uploaded}/{len(image_files)} files uploaded")
+
+        # Update stats
         STATS["uploads"] += 1
+        STATS["last_upload"] = timestamp
         upload_status = "βœ… success"
+
+        # Update README in background thread
+        threading.Thread(target=update_readme_stats).start()
+
     except Exception as e:
         logger.error(f"HF upload failed: {e}")
         upload_status = f"❌ failed: {str(e)[:100]}..."
@@ -254,12 +473,53 @@ def process_images(image_files):
         f"πŸ“Š TOTAL STATS πŸ“Š\n"
         f"Total files: {STATS['total_files']}\n"
         f"Files with GPS: {STATS['files_with_gps']}\n"
-        f"Upload batches: {STATS['uploads']}"
+        f"Upload batches: {STATS['uploads']}\n"
+        f"Uptime: {format_duration(int(time.time()) - STATS['startup_time'])}"
     )
 
     return result, output_file
 
-# Create the UI that actually fucking works
+def scan_and_process_directory(directory_path):
+    """Scan directory for images and process them"""
+    if not os.path.isdir(directory_path):
+        logger.error(f"Not a directory: {directory_path}")
+        return
+
+    logger.info(f"Scanning directory: {directory_path}")
+    image_files = []
+
+    # Find all image files in directory
+    for root, _, files in os.walk(directory_path):
+        for file in files:
+            file_path = os.path.join(root, file)
+            if Path(file_path).suffix.lower() in SUPPORTED_EXTENSIONS:
+                image_files.append(file_path)
+
+    if not image_files:
+        logger.info(f"No image files found in {directory_path}")
+        return
+
+    logger.info(f"Found {len(image_files)} image files in {directory_path}")
+
+    # Create file-like objects for processing
+    class FileObject:
+        def __init__(self, path):
+            self.name = path
+
+    process_images([FileObject(path) for path in image_files])
+
+def schedule_directory_scan():
+    """Check for new files in directory periodically"""
+    watch_dir = os.environ.get("WATCH_DIRECTORY")
+
+    if watch_dir and os.path.isdir(watch_dir):
+        logger.info(f"Scheduled scan of directory: {watch_dir}")
+        scan_and_process_directory(watch_dir)
+
+    # Schedule next check
+    threading.Timer(CHECK_INTERVAL, schedule_directory_scan).start()
+
+# Create the UI
 demo = gr.Interface(
     fn=process_images,
     inputs=gr.Files(label="DROP IMAGES HERE πŸ“Έ", file_types=["image"], file_count="multiple"),
@@ -277,6 +537,20 @@ demo = gr.Interface(
     theme="huggingface"
 )
 
-# Only launch when run directly
+# Launch app and start background processes
 if __name__ == "__main__":
-    demo.launch(server_name="0.0.0.0", server_port=7860)
+    # Ensure dataset exists on startup
+    ensure_dataset_exists()
+
+    # Start directory watcher if configured
+    if os.environ.get("WATCH_DIRECTORY"):
+        threading.Thread(target=schedule_directory_scan).start()
+        logger.info(f"Starting directory watcher for {os.environ.get('WATCH_DIRECTORY')}")
+
+    # Log startup info
+    logger.info(f"=== Application Startup at {time.strftime('%Y-%m-%d %H:%M:%S')} ===")
+    logger.info(f"Dataset: {HF_USERNAME}/{DATASET_NAME}")
+    logger.info(f"Token available: {bool(HF_TOKEN)}")
+
+    # Launch Gradio app
+    demo.launch(server_name="0.0.0.0", server_port=7860)
 
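The upload path in this commit groups images into one commit per 20 files with CommitOperationAdd and create_commit, rather than calling upload_file once per image. A minimal standalone sketch of the same pattern, assuming a hypothetical repo id and plain file paths (this is not code from the commit):

from pathlib import Path
from huggingface_hub import CommitOperationAdd, HfApi

REPO_ID = "your-username/your-dataset"  # hypothetical target repo
MAX_BATCH_SIZE = 20  # images per commit, mirroring app.py

def upload_images_in_batches(api, paths):
    """Commit files in fixed-size chunks so no single commit grows unbounded."""
    batches = [paths[i:i + MAX_BATCH_SIZE] for i in range(0, len(paths), MAX_BATCH_SIZE)]
    uploaded = 0
    for idx, batch in enumerate(batches):
        operations = [
            CommitOperationAdd(path_in_repo=f"images/{Path(p).name}", path_or_fileobj=p)
            for p in batch
        ]
        api.create_commit(
            repo_id=REPO_ID,
            repo_type="dataset",
            operations=operations,
            commit_message=f"Upload {len(operations)} images (batch {idx + 1}/{len(batches)})",
        )
        uploaded += len(operations)
    return uploaded

# Usage with hypothetical paths:
# api = HfApi(token="hf_...")  # your token
# upload_images_in_batches(api, ["photos/a.jpg", "photos/b.jpg"])

Passing a path (or file object) to CommitOperationAdd lets huggingface_hub read the file at commit time, which avoids holding every image's bytes in memory the way the read-then-append version in the commit does.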
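Only the validation lines of convert_to_degrees appear in this diff; the rest of the function presumably applies the standard degrees/minutes/seconds arithmetic. For reference, a self-contained sketch of that conversion, assuming plain numeric DMS values (real EXIF often stores rationals instead):

def dms_to_decimal(value):
    """Convert a (degrees, minutes, seconds) triple to decimal degrees."""
    if not isinstance(value, (tuple, list)) or len(value) != 3:
        raise ValueError(f"GPS needs 3 values, got {type(value)}")
    degrees, minutes, seconds = (float(v) for v in value)
    return degrees + minutes / 60 + seconds / 3600

# 40deg 44' 54.36" -> 40.7484; the sign for S/W hemispheres would come from the GPS ref tag.
assert round(dms_to_decimal((40, 44, 54.36)), 4) == 40.7484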
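All runtime settings in the new version come from environment variables: HF_TOKEN, HF_USERNAME, DATASET_NAME, WATCH_DIRECTORY, and CHECK_INTERVAL. A sketch of launching the app with the watcher enabled, using illustrative values only:

import os
import subprocess

env = {
    **os.environ,  # HF_TOKEN must already be set here for uploads to succeed
    "HF_USERNAME": "your-username",     # falls back to "latterworks" if unset
    "DATASET_NAME": "geo-metadata",
    "WATCH_DIRECTORY": "/data/photos",  # hypothetical path; enables the periodic scan
    "CHECK_INTERVAL": "3600",           # seconds between scans
}
subprocess.run(["python", "app.py"], env=env, check=True)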
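Each run writes batches/metadata_<timestamp>.jsonl into the dataset repo, so a stored batch should be loadable back with huggingface_hub plus datasets. A sketch, where the filename's timestamp is hypothetical:

from datasets import load_dataset
from huggingface_hub import hf_hub_download

# Download one metadata batch from the dataset repo, then parse it as JSONL.
path = hf_hub_download(
    repo_id="latterworks/geo-metadata",
    filename="batches/metadata_1700000000.jsonl",  # hypothetical timestamp
    repo_type="dataset",
)
ds = load_dataset("json", data_files=path, split="train")
print(ds[0]["file_name"])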