latterworks committed
Commit 997cbe9 · verified · 1 Parent(s): 4d7f662

Update app.py

Files changed (1):
  1. app.py +586 -225

app.py CHANGED
@@ -1,243 +1,604 @@
  from pathlib import Path
- from PIL import Image, ExifTags
  import json
  import sys
  import os
- import gradio as gr
  import logging
- from datasets import Dataset
- from typing import Dict, List, Any, Optional
  import traceback

- # Logging setup
  logging.basicConfig(
      level=logging.INFO,
-     format="%(asctime)s [%(levelname)s] %(message)s",
-     handlers=[logging.StreamHandler(sys.stdout)]
  )
- logger = logging.getLogger(__name__)
- 
- # Config with defaults (editable via UI or env vars)
- DEFAULT_IMAGE_DIR = Path(os.environ.get("IMAGE_DIR", "./images"))
- DEFAULT_OUTPUT_FILE = Path(os.environ.get("OUTPUT_METADATA_FILE", "./metadata.jsonl"))
- HF_USERNAME = os.environ.get("HF_USERNAME", "latterworks")
- DATASET_NAME = os.environ.get("DATASET_NAME", "geo-metadata")
- 
- SUPPORTED_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.heic', '.tiff', '.bmp', '.webp'}
- 
- # Convert GPS coordinates to decimal degrees
- def convert_to_degrees(value: tuple) -> Optional[float]:
-     try:
-         if not isinstance(value, (tuple, list)) or len(value) != 3:
-             raise ValueError("GPS value must be a tuple of 3 elements")
-         d, m, s = value
-         degrees = float(d) + (float(m) / 60.0) + (float(s) / 3600.0)
-         if not -180 <= degrees <= 180:
-             raise ValueError("GPS degrees out of valid range")
-         return degrees
-     except (TypeError, ValueError) as e:
-         logger.error(f"Failed to convert GPS coordinates: {e}")
-         return None
- 
- # Extract and format GPS metadata
- def extract_gps_info(gps_info: Dict[int, Any]) -> Optional[Dict[str, Any]]:
-     if not isinstance(gps_info, dict):
-         logger.warning("GPSInfo ain’t a dict, skipping")
-         return None
- 
-     gps_data = {}
-     try:
-         for key, val in gps_info.items():
-             tag_name = ExifTags.GPSTAGS.get(key, f"unknown_gps_tag_{key}")
-             gps_data[tag_name] = val
- 
-         if 'GPSLatitude' in gps_data and 'GPSLongitude' in gps_data:
-             lat = convert_to_degrees(gps_data['GPSLatitude'])
-             lon = convert_to_degrees(gps_data['GPSLongitude'])
-             if lat is None or lon is None:
-                 logger.error("Failed to convert lat/lon, skipping GPS")
-                 return None
- 
-             lat_ref = gps_data.get('GPSLatitudeRef', 'N')
-             lon_ref = gps_data.get('GPSLongitudeRef', 'E')
-             if lat_ref not in {'N', 'S'} or lon_ref not in {'E', 'W'}:
-                 logger.warning(f"Bad GPS ref: {lat_ref}, {lon_ref}")
-             else:
-                 if lat_ref == 'S':
-                     lat = -lat
-                 if lon_ref == 'W':
-                     lon = -lon
- 
-             gps_data['Latitude'] = lat
-             gps_data['Longitude'] = lon
- 
-         return gps_data
-     except Exception as e:
-         logger.error(f"GPS extraction crashed: {traceback.format_exc()}")
-         return None
- 
- # Make stuff JSON-serializable
- def make_serializable(value: Any) -> Any:
-     try:
-         if hasattr(value, 'numerator') and hasattr(value, 'denominator'):
-             return float(value.numerator) / float(value.denominator)
-         elif isinstance(value, (tuple, list)):
-             return [make_serializable(item) for item in value]
-         elif isinstance(value, dict):
-             return {str(k): make_serializable(v) for k, v in value.items()}
-         elif isinstance(value, bytes):
-             return value.decode('utf-8', errors='replace')
-         json.dumps(value)
-         return value
-     except Exception as e:
-         logger.warning(f"Serialization failed, stringin’ it: {e}")
-         return str(value)
- 
- # Extract metadata from one image
- def get_image_metadata(image_path: Path) -> Dict[str, Any]:
-     metadata = {"file_name": str(image_path.absolute())}
-     try:
-         with Image.open(image_path) as image:
              metadata.update({
-                 "format": image.format or "unknown",
-                 "size": list(image.size),
-                 "mode": image.mode or "unknown"
              })

-             exif_data = None
-             try:
-                 exif_data = image._getexif()
-             except AttributeError:
-                 metadata["exif_error"] = "No EXIF data"
-             except Exception as e:
-                 metadata["exif_error"] = f"EXIF crashed: {str(e)}"
- 
-             if exif_data and isinstance(exif_data, dict):
-                 for tag_id, value in exif_data.items():
-                     tag_name = ExifTags.TAGS.get(tag_id, f"tag_{tag_id}").lower()
-                     if tag_name == "gpsinfo":
-                         gps_info = extract_gps_info(value)
-                         if gps_info:
-                             metadata["gps_info"] = make_serializable(gps_info)
-                     else:
-                         metadata[tag_name] = make_serializable(value)
- 
-         metadata["file_size"] = image_path.stat().st_size
-         metadata["file_extension"] = image_path.suffix.lower()
-         return metadata
-     except Exception as e:
-         logger.error(f"Image {image_path} crashed: {traceback.format_exc()}")
-         return {"file_name": str(image_path.absolute()), "error": str(e)}
- 
- # Process images (single file or directory)
- def process_images(input_data: str | Path) -> List[Dict[str, Any]]:
-     metadata_list = []
-     input_path = Path(input_data)
- 
-     if input_path.is_file() and input_path.suffix.lower() in SUPPORTED_EXTENSIONS:
-         logger.info(f"Processing single image: {input_path}")
-         metadata = get_image_metadata(input_path)
-         if metadata:
-             metadata_list.append(metadata)
-     elif input_path.is_dir():
-         logger.info(f"Processing directory: {input_path}")
-         for image_path in input_path.rglob("*"):
-             if image_path.is_file() and image_path.suffix.lower() in SUPPORTED_EXTENSIONS:
-                 logger.info(f"Processing: {image_path}")
-                 metadata = get_image_metadata(image_path)
-                 if metadata:
-                     metadata_list.append(metadata)
-     else:
-         logger.error(f"Invalid input: {input_data}")
-         return [{"error": f"Invalid input: {input_data}"}]
- 
-     return metadata_list
- 
- # Save to JSONL
- def save_metadata_to_jsonl(metadata_list: List[Dict[str, Any]], output_file: Path) -> bool:
-     try:
-         output_file.parent.mkdir(parents=True, exist_ok=True)
-         with output_file.open('w', encoding='utf-8') as f:
-             for entry in metadata_list:
-                 f.write(json.dumps(entry, ensure_ascii=False) + '\n')
-         logger.info(f"Saved {len(metadata_list)} entries to {output_file}")
-         return True
-     except Exception as e:
-         logger.error(f"Save crashed: {traceback.format_exc()}")
-         return False
- 
- # Upload to Hugging Face
- def upload_to_huggingface(metadata_file: Path, username: str, dataset_name: str) -> str:
-     try:
          metadata_list = []
-         with metadata_file.open('r', encoding='utf-8') as f:
-             for line in f:
-                 metadata_list.append(json.loads(line))
- 
-         if not metadata_list:
-             return "No metadata to upload, fam!"
- 
-         dataset = Dataset.from_dict({
-             "images": [entry.get("file_name") for entry in metadata_list],
-             "metadata": metadata_list
-         })
-         dataset.push_to_hub(f"{username}/{dataset_name}", private=False)
-         return f"Uploaded to {username}/{dataset_name} with {len(metadata_list)} entries!"
-     except Exception as e:
-         logger.error(f"Upload crashed: {traceback.format_exc()}")
-         return f"Upload failed: {str(e)}"
- 
- # Gradio processing function
- def gradio_process(image_file, dir_path: str, username: str, dataset_name: str) -> str:
-     output = []
-     metadata_list = []
- 
-     # Process single image if uploaded
-     if image_file:
-         image_path = Path(image_file.name) # Gradio gives temp file path
-         metadata_list = process_images(image_path)
-         output.append("Single Image Metadata:")
-         for entry in metadata_list:
-             output.append(json.dumps(entry, indent=2))
- 
-     # Process directory if provided
-     if dir_path:
-         dir_path = Path(dir_path)
-         if dir_path.is_dir():
-             metadata_list.extend(process_images(dir_path))
-             output.append("Directory Metadata:")
-             for entry in metadata_list[-len(process_images(dir_path)):]:
-                 output.append(json.dumps(entry, indent=2))
          else:
-             output.append(f"Error: {dir_path} ain’t a directory, fam!")
- 
-     # Save and upload if we got metadata
-     if metadata_list:
-         temp_output_file = Path("temp_metadata.jsonl")
-         if save_metadata_to_jsonl(metadata_list, temp_output_file):
-             output.append(f"Saved metadata to {temp_output_file}")
-             upload_result = upload_to_huggingface(temp_output_file, username, dataset_name)
-             output.append(upload_result)
          else:
-             output.append("Save failed, dawg!")
- 
-     return "\n\n".join(output) if output else "Drop an image or dir, fam!"
- 
- # Gradio interface
- demo = gr.Interface(
-     fn=gradio_process,
-     inputs=[
-         gr.File(label="Upload Image", file_types=list(SUPPORTED_EXTENSIONS)),
-         gr.Textbox(label="Image Directory", placeholder=str(DEFAULT_IMAGE_DIR), value=str(DEFAULT_IMAGE_DIR)),
-         gr.Textbox(label="Hugging Face Username", value=HF_USERNAME),
-         gr.Textbox(label="Dataset Name", value=DATASET_NAME)
-     ],
-     outputs=gr.Textbox(label="Metadata Output"),
-     title="Geo-Metadata Extractor",
-     description="Upload an image or point to a directory to extract metadata and push to Hugging Face, Bay Area style!",
-     allow_flagging="never"
- )

  if __name__ == "__main__":
-     logger.info("Firin’ up the Gradio geo-metadata extractor...")
-     demo.launch(server_name="0.0.0.0", server_port=7860)
 
  from pathlib import Path
  import json
  import sys
  import os
  import logging
  import traceback
+ from typing import Dict, List, Any, Optional, Union, Tuple
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ import time

+ # Third-party imports with robust error handling
+ try:
+     from PIL import Image, ExifTags
+     HAS_PIL = True
+ except ImportError:
+     HAS_PIL = False
+     logging.warning("PIL not installed - image processing disabled")
+ 
+ try:
+     import gradio as gr
+     HAS_GRADIO = True
+ except ImportError:
+     HAS_GRADIO = False
+     logging.warning("Gradio not installed - UI disabled")
+ 
+ try:
+     from datasets import Dataset
+     HAS_DATASETS = True
+ except ImportError:
+     HAS_DATASETS = False
+     logging.warning("Datasets library not installed - HF upload disabled")
+ 
+ # Advanced logging configuration
  logging.basicConfig(
      level=logging.INFO,
+     format="%(asctime)s [%(levelname)s] %(name)s:%(lineno)d - %(message)s",
+     handlers=[
+         logging.StreamHandler(sys.stdout),
+         logging.FileHandler("geo_extractor.log")
+     ]
  )
+ logger = logging.getLogger("geo_metadata_extractor")
+ 
+ # Configurable settings with environment variable overrides and validation
+ class Config:
+     """Configuration container with validation and defaults"""
+ 
+     DEFAULT_IMAGE_DIR = Path(os.environ.get("IMAGE_DIR", "./images"))
+     DEFAULT_OUTPUT_FILE = Path(os.environ.get("OUTPUT_METADATA_FILE", "./metadata.jsonl"))
+     HF_USERNAME = os.environ.get("HF_USERNAME", "latterworks")
+     DATASET_NAME = os.environ.get("DATASET_NAME", "geo-metadata")
+     MAX_WORKERS = int(os.environ.get("MAX_WORKERS", "4"))
+     BATCH_SIZE = int(os.environ.get("BATCH_SIZE", "100"))
+ 
+     # Image formats with EXIF support prioritized first
+     SUPPORTED_EXTENSIONS = {
+         # Primary formats with good EXIF support
+         '.jpg', '.jpeg', '.tiff', '.tif',
+         # Secondary formats with limited metadata support
+         '.png', '.heic', '.bmp', '.webp'
+     }
+ 
+     @classmethod
+     def validate(cls) -> List[str]:
+         """Validate configuration settings and return warnings"""
+         warnings = []
+ 
+         if cls.MAX_WORKERS < 1:
+             cls.MAX_WORKERS = 1
+             warnings.append(f"Invalid MAX_WORKERS value, reset to {cls.MAX_WORKERS}")
+ 
+         if cls.BATCH_SIZE < 10:
+             cls.BATCH_SIZE = 10
+             warnings.append(f"BATCH_SIZE too small, reset to {cls.BATCH_SIZE}")
+ 
+         return warnings
+ 
+ # Run config validation at import time
+ config_warnings = Config.validate()
+ for warning in config_warnings:
+     logger.warning(warning)
+ 
+ class GeoMetadataExtractor:
+     """Core metadata extraction logic with advanced error handling"""
+ 
+     @staticmethod
+     def convert_to_degrees(value: Union[tuple, list]) -> Optional[float]:
+         """
+         Convert GPS coordinates (degrees, minutes, seconds) to decimal degrees
+ 
+         Args:
+             value: Tuple of degrees, minutes, seconds
+ 
+         Returns:
+             Decimal degrees as float, or None if conversion fails
+         """
+         try:
+             if not isinstance(value, (tuple, list)) or len(value) != 3:
+                 raise ValueError(f"GPS value must be a tuple of 3 elements, got {type(value)}")
+ 
+             d, m, s = value
+             degrees = float(d) + (float(m) / 60.0) + (float(s) / 3600.0)
+ 
+             # Validate range
+             if not -180 <= degrees <= 180:
+                 raise ValueError(f"GPS degrees out of valid range: {degrees}")
+ 
+             return degrees
+         except (TypeError, ValueError, ZeroDivisionError) as e:
+             logger.error(f"Failed to convert GPS coordinates: {e}")
+             return None
+ 
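+     # Worked example of the DMS-to-decimal conversion above (hypothetical
+     # coordinates, added here for reference only):
+     #   convert_to_degrees((37, 46, 29.64))
+     #   = 37 + 46/60 + 29.64/3600 ≈ 37.7749 decimal degrees
+ 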
+     @staticmethod
+     def extract_gps_info(gps_info: Dict[int, Any]) -> Optional[Dict[str, Any]]:
+         """
+         Extract and format GPS metadata from EXIF
+ 
+         Args:
+             gps_info: Dictionary of GPS EXIF tags
+ 
+         Returns:
+             Formatted GPS data including decimal latitude/longitude
+         """
+         if not isinstance(gps_info, dict):
+             logger.warning("GPS info is not a dictionary, skipping")
+             return None
+ 
+         gps_data = {}
+         try:
+             # Extract tag data
+             for key, val in gps_info.items():
+                 tag_name = ExifTags.GPSTAGS.get(key, f"unknown_gps_tag_{key}")
+                 gps_data[tag_name] = val
+ 
+             # Process coordinates if available
+             if 'GPSLatitude' in gps_data and 'GPSLongitude' in gps_data:
+                 lat = GeoMetadataExtractor.convert_to_degrees(gps_data['GPSLatitude'])
+                 lon = GeoMetadataExtractor.convert_to_degrees(gps_data['GPSLongitude'])
+ 
+                 if lat is None or lon is None:
+                     logger.error("Failed to convert latitude/longitude, skipping GPS data")
+                     return None
+ 
+                 # Apply hemispheric references
+                 lat_ref = gps_data.get('GPSLatitudeRef', 'N')
+                 lon_ref = gps_data.get('GPSLongitudeRef', 'E')
+ 
+                 if lat_ref not in {'N', 'S'} or lon_ref not in {'E', 'W'}:
+                     logger.warning(f"Invalid GPS reference values: lat_ref={lat_ref}, lon_ref={lon_ref}")
+                 else:
+                     if lat_ref == 'S':
+                         lat = -lat
+                     if lon_ref == 'W':
+                         lon = -lon
+ 
+                 # Add calculated decimal coordinates
+                 gps_data['Latitude'] = round(lat, 6) # 6 decimal places ≈ 10cm precision
+                 gps_data['Longitude'] = round(lon, 6)
+ 
+             # Add additional derived fields
+             if 'GPSAltitude' in gps_data:
+                 try:
+                     altitude = gps_data['GPSAltitude']
+                     if hasattr(altitude, 'numerator') and hasattr(altitude, 'denominator'):
+                         gps_data['AltitudeMeters'] = float(altitude.numerator) / float(altitude.denominator)
+                 except Exception as e:
+                     logger.warning(f"Failed to process altitude: {e}")
+ 
+             return gps_data
+         except Exception as e:
+             stack_trace = traceback.format_exc()
+             logger.error(f"GPS extraction error: {e}\n{stack_trace}")
+             return None
+ 
+     @staticmethod
+     def make_serializable(value: Any) -> Any:
+         """
+         Recursively convert non-serializable types to JSON-compatible values
+ 
+         Args:
+             value: Any value to convert
+ 
+         Returns:
+             JSON-serializable representation of value
+         """
+         try:
+             # Handle rational numbers (fractions)
+             if hasattr(value, 'numerator') and hasattr(value, 'denominator'):
+                 if value.denominator == 0:
+                     return "undefined (division by zero)"
+                 return float(value.numerator) / float(value.denominator)
+ 
+             # Handle nested structures
+             elif isinstance(value, (tuple, list)):
+                 return [GeoMetadataExtractor.make_serializable(item) for item in value]
+ 
+             elif isinstance(value, dict):
+                 return {str(k): GeoMetadataExtractor.make_serializable(v) for k, v in value.items()}
+ 
+             # Handle binary data
+             elif isinstance(value, bytes):
+                 return value.decode('utf-8', errors='replace')
+ 
+             # Test if directly serializable
+             json.dumps(value)
+             return value
+ 
+         except Exception as e:
+             logger.warning(f"Value serialization failed, converting to string: {e}")
+             return str(value)
+ 
+     @staticmethod
+     def get_image_metadata(image_path: Path) -> Dict[str, Any]:
+         """
+         Extract comprehensive metadata from an image file
+ 
+         Args:
+             image_path: Path to image file
+ 
+         Returns:
+             Dictionary of extracted metadata
+         """
+         # Core metadata with absolute file path
+         metadata = {
+             "file_name": str(image_path.absolute()),
+             "extraction_time": time.strftime("%Y-%m-%d %H:%M:%S")
+         }
+ 
+         try:
+             # Process file system metadata first (always available)
+             stat_info = image_path.stat()
              metadata.update({
+                 "file_size": stat_info.st_size,
+                 "file_extension": image_path.suffix.lower(),
+                 "last_modified": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(stat_info.st_mtime)),
+                 "creation_time": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(stat_info.st_ctime))
              })
+ 
+             # Exit early if PIL not available
+             if not HAS_PIL:
+                 metadata["error"] = "PIL library not available"
+                 return metadata
+ 
+             # Extract image and EXIF data
+             with Image.open(image_path) as image:
+                 # Basic image properties
+                 metadata.update({
+                     "format": image.format or "unknown",
+                     "size": list(image.size),
+                     "width": image.width,
+                     "height": image.height,
+                     "mode": image.mode or "unknown",
+                     "aspect_ratio": round(image.width / image.height, 3) if image.height > 0 else None
+                 })

+                 # Extract EXIF data if available
+                 exif_data = None
+                 try:
+                     # Different methods depending on image format
+                     if hasattr(image, '_getexif'):
+                         exif_data = image._getexif()
+                     elif hasattr(image, 'getexif'):
+                         exif_data = image.getexif()
+ 
+                     # Some formats like PNG store metadata differently
+                     if not exif_data and image.format == 'PNG' and 'exif' in image.info:
+                         exif_data = image.info.get('exif')
+                         metadata["exif_source"] = "PNG info block"
+                 except AttributeError:
+                     metadata["exif_error"] = "No EXIF extraction method available"
+                 except Exception as e:
+                     metadata["exif_error"] = f"EXIF extraction failed: {str(e)}"
+ 
+                 # Process EXIF data if found
+                 if exif_data and isinstance(exif_data, dict):
+                     for tag_id, value in exif_data.items():
+                         # Handle GPS data specially
+                         if tag_id in ExifTags.TAGS and ExifTags.TAGS[tag_id] == "GPSInfo":
+                             gps_info = GeoMetadataExtractor.extract_gps_info(value)
+                             if gps_info:
+                                 metadata["gps_info"] = GeoMetadataExtractor.make_serializable(gps_info)
+                         else:
+                             # Get tag name or use numeric ID with tag_ prefix
+                             tag_name = ExifTags.TAGS.get(tag_id, f"tag_{tag_id}").lower()
+                             metadata[tag_name] = GeoMetadataExtractor.make_serializable(value)
+ 
+                 # Add camera model and date taken for convenience if available
+                 if 'model' in metadata:
+                     metadata["camera_model"] = metadata['model']
+                 if 'datetimeoriginal' in metadata:
+                     metadata["date_taken"] = metadata['datetimeoriginal']
+ 
+             return metadata
+         except Exception as e:
+             # Capture full stack trace for debugging
+             stack_trace = traceback.format_exc()
+             logger.error(f"Image {image_path} processing failed: {e}\n{stack_trace}")
+ 
+             # Return partial metadata with error information
+             metadata["error"] = str(e)
+             metadata["error_trace"] = stack_trace
+             return metadata
+ 
+ class MetadataProcessor:
+     """Handles batch processing and file operations"""
+ 
+     @staticmethod
+     def process_images(input_path: Union[str, Path]) -> List[Dict[str, Any]]:
+         """
+         Process image files to extract metadata
+ 
+         Args:
+             input_path: Path to image file or directory
+ 
+         Returns:
+             List of metadata dictionaries for all processed images
+         """
          metadata_list = []
+         input_path = Path(input_path)
+         start_time = time.time()
+ 
+         # Handle single file case
+         if input_path.is_file() and input_path.suffix.lower() in Config.SUPPORTED_EXTENSIONS:
+             logger.info(f"Processing single image: {input_path}")
+             metadata = GeoMetadataExtractor.get_image_metadata(input_path)
+             if metadata:
+                 metadata_list.append(metadata)
+ 
+         # Handle directory case
+         elif input_path.is_dir():
+             logger.info(f"Processing directory: {input_path}")
+ 
+             # Collect all image files first
+             image_paths = [
+                 path for path in input_path.rglob("*")
+                 if path.is_file() and path.suffix.lower() in Config.SUPPORTED_EXTENSIONS
+             ]
+ 
+             total_images = len(image_paths)
+             logger.info(f"Found {total_images} images to process")
+ 
+             # Process in parallel with progress tracking
+             if total_images > 0:
+                 processed = 0
+                 with ThreadPoolExecutor(max_workers=Config.MAX_WORKERS) as executor:
+                     # Submit all tasks
+                     future_to_path = {
+                         executor.submit(GeoMetadataExtractor.get_image_metadata, path): path
+                         for path in image_paths
+                     }
+ 
+                     # Process as they complete
+                     for future in as_completed(future_to_path):
+                         path = future_to_path[future]
+                         try:
+                             metadata = future.result()
+                             if metadata:
+                                 metadata_list.append(metadata)
+ 
+                             # Update progress
+                             processed += 1
+                             if processed % 10 == 0 or processed == total_images:
+                                 elapsed = time.time() - start_time
+                                 rate = processed / elapsed if elapsed > 0 else 0
+                                 logger.info(f"Processed {processed}/{total_images} images ({processed/total_images*100:.1f}%) - {rate:.2f} images/sec")
+ 
+                         except Exception as e:
+                             logger.error(f"Error processing {path}: {e}")
+             else:
+                 logger.warning(f"No images found in directory: {input_path}")
          else:
+             logger.error(f"Invalid input: {input_path} is not a file or directory")
+             return [{"error": f"Invalid input: {input_path} is not a file or directory"}]
+ 
+         # Summarize results
+         elapsed = time.time() - start_time
+         images_per_second = len(metadata_list) / elapsed if elapsed > 0 else 0
+         logger.info(f"Completed processing {len(metadata_list)} images in {elapsed:.2f} seconds ({images_per_second:.2f} images/sec)")
+ 
+         return metadata_list
+ 
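+     # Usage sketch (hypothetical path, for reference only):
+     #   MetadataProcessor.process_images("./images") walks the directory tree,
+     #   fans extraction out over Config.MAX_WORKERS threads, and returns one
+     #   metadata dict per image.
+ 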
+     @staticmethod
+     def save_metadata_to_jsonl(metadata_list: List[Dict[str, Any]], output_file: Path) -> bool:
+         """
+         Save metadata to JSONL format with error handling
+ 
+         Args:
+             metadata_list: List of metadata dictionaries
+             output_file: Path to output file
+ 
+         Returns:
+             True if save was successful, False otherwise
+         """
+         try:
+             # Create directory if needed
+             output_file.parent.mkdir(parents=True, exist_ok=True)
+ 
+             # Write to file
+             with output_file.open('w', encoding='utf-8') as f:
+                 for entry in metadata_list:
+                     f.write(json.dumps(entry, ensure_ascii=False) + '\n')
+ 
+             logger.info(f"Successfully saved {len(metadata_list)} entries to {output_file}")
+             return True
+ 
+         except Exception as e:
+             stack_trace = traceback.format_exc()
+             logger.error(f"Failed to save metadata: {e}\n{stack_trace}")
+             return False
+ 
+     @staticmethod
+     def upload_to_huggingface(metadata_file: Path, username: str, dataset_name: str) -> str:
+         """
+         Upload metadata to Hugging Face as a dataset
+ 
+         Args:
+             metadata_file: Path to JSONL file
+             username: Hugging Face username
+             dataset_name: Dataset name to create/update
+ 
+         Returns:
+             Status message
+         """
+         if not HAS_DATASETS:
+             return "Hugging Face datasets library not installed"
+ 
+         try:
+             # Read metadata
+             metadata_list = []
+             with metadata_file.open('r', encoding='utf-8') as f:
+                 for line in f:
+                     metadata_list.append(json.loads(line))
+ 
+             if not metadata_list:
+                 return "No metadata to upload"
+ 
+             # Create dataset
+             logger.info(f"Creating dataset with {len(metadata_list)} entries")
+             dataset = Dataset.from_dict({
+                 "images": [entry.get("file_name", "unknown") for entry in metadata_list],
+                 "metadata": metadata_list
+             })
+ 
+             # Push to Hub
+             dataset_path = f"{username}/{dataset_name}"
+             logger.info(f"Pushing dataset to {dataset_path}")
+             dataset.push_to_hub(dataset_path, private=False)
+ 
+             return f"Successfully uploaded to {dataset_path} with {len(metadata_list)} entries"
+ 
+         except Exception as e:
+             stack_trace = traceback.format_exc()
+             logger.error(f"Upload failed: {e}\n{stack_trace}")
+             return f"Upload failed: {str(e)}"
+ 
+ class GradioInterface:
+     """Gradio UI interface"""
+ 
+     @staticmethod
+     def create_interface():
+         """
+         Create the Gradio interface
+ 
+         Returns:
+             Gradio interface object
+         """
+         if not HAS_GRADIO:
+             logger.error("Gradio not installed, cannot create interface")
+             return None
+ 
+         def process_input(image_file, dir_path: str, username: str, dataset_name: str) -> str:
+             """
+             Process inputs from Gradio UI
+ 
+             Args:
+                 image_file: Uploaded file object or None
+                 dir_path: Directory path string
+                 username: Hugging Face username
+                 dataset_name: Dataset name
+ 
+             Returns:
+                 Results as formatted text
+             """
+             output_lines = []
+             metadata_list = []
+ 
+             # Handle single image upload
+             if image_file:
+                 image_path = Path(image_file.name)
+                 output_lines.append(f"## Processing Single Image: {image_path.name}")
+ 
+                 single_metadata = MetadataProcessor.process_images(image_path)
+                 metadata_list.extend(single_metadata)
+ 
+                 # Format first entry for display
+                 if single_metadata:
+                     output_lines.append("### Image Metadata:")
+                     output_lines.append("```json")
+                     output_lines.append(json.dumps(single_metadata[0], indent=2))
+                     output_lines.append("```")
+ 
+             # Handle directory processing
+             if dir_path:
+                 dir_path = Path(dir_path)
+                 if dir_path.is_dir():
+                     output_lines.append(f"## Processing Directory: {dir_path}")
+                     dir_metadata = MetadataProcessor.process_images(dir_path)
+ 
+                     # Add to full list
+                     metadata_list.extend(dir_metadata)
+ 
+                     # Summarize results
+                     output_lines.append(f"### Directory Results:")
+                     output_lines.append(f"- Processed {len(dir_metadata)} images")
+ 
+                     # Location data summary
+                     location_count = sum(1 for entry in dir_metadata if entry.get("gps_info") is not None)
+ output_lines.append(f"- Found location data in {location_count} images ({location_count/len(dir_metadata)*100:.1f}% if len(dir_metadata) > 0 else 0}%)")
+ 
+                     # Show a few examples if available
+                     if dir_metadata:
+                         output_lines.append("\n### Sample Entry:")
+                         output_lines.append("```json")
+                         output_lines.append(json.dumps(dir_metadata[0], indent=2))
+                         output_lines.append("```")
+                 else:
+                     output_lines.append(f"⚠️ Error: {dir_path} is not a directory")
+ 
+             # Save and upload if we have metadata
+             if metadata_list:
+                 temp_output_file = Path("temp_metadata.jsonl")
+                 output_lines.append(f"\n## Saving and Uploading")
+ 
+                 if MetadataProcessor.save_metadata_to_jsonl(metadata_list, temp_output_file):
+                     output_lines.append(f"✅ Saved metadata to {temp_output_file}")
+ 
+                     # Upload to Hugging Face
+                     upload_result = MetadataProcessor.upload_to_huggingface(
+                         temp_output_file, username, dataset_name
+                     )
+                     output_lines.append(f"📤 {upload_result}")
+                 else:
+                     output_lines.append("❌ Failed to save metadata")
+ 
+             return "\n".join(output_lines) if output_lines else "Please upload an image or provide a directory path"
+ 
+         # Create the interface
+         demo = gr.Interface(
+             fn=process_input,
+             inputs=[
+                 gr.File(label="Upload Image", file_types=list(Config.SUPPORTED_EXTENSIONS)),
+                 gr.Textbox(label="Image Directory", placeholder=str(Config.DEFAULT_IMAGE_DIR), value=str(Config.DEFAULT_IMAGE_DIR)),
+                 gr.Textbox(label="Hugging Face Username", value=Config.HF_USERNAME),
+                 gr.Textbox(label="Dataset Name", value=Config.DATASET_NAME)
+             ],
+             outputs=gr.Markdown(label="Results"),
+             title="Enhanced Geo-Metadata Extractor",
+             description=(
+                 "Upload an image or process a directory to extract location metadata and other EXIF data. "
+                 "Results can be automatically uploaded to Hugging Face Datasets."
+             ),
+             allow_flagging="never",
+             examples=[
+                 [None, "sample_images", Config.HF_USERNAME, "sample-geo-metadata"]
+             ]
+         )
+ 
+         return demo
+ 
+ def main():
+     """Main entry point"""
+     logger.info("Starting Geo-Metadata Extractor")
+ 
+     # Check dependencies
+     if not HAS_PIL:
+         logger.error("PIL is required for image processing. Please install: pip install pillow")
+         sys.exit(1)
+ 
+     # Create and launch the UI if running directly
+     if HAS_GRADIO:
+         logger.info("Creating Gradio interface")
+         demo = GradioInterface.create_interface()
+         if demo:
+             logger.info("Launching Gradio interface")
+             demo.launch(server_name="0.0.0.0", server_port=7860)
          else:
+             logger.error("Failed to create Gradio interface")
+     else:
+         logger.warning("Gradio not installed, running in CLI mode")
+ 
+         # Process default directory as fallback
+         if Config.DEFAULT_IMAGE_DIR.exists():
+             logger.info(f"Processing default directory: {Config.DEFAULT_IMAGE_DIR}")
+             metadata = MetadataProcessor.process_images(Config.DEFAULT_IMAGE_DIR)
+ 
+             if metadata:
+                 logger.info(f"Saving {len(metadata)} entries to {Config.DEFAULT_OUTPUT_FILE}")
+                 MetadataProcessor.save_metadata_to_jsonl(metadata, Config.DEFAULT_OUTPUT_FILE)
+                 logger.info(f"Metadata saved to {Config.DEFAULT_OUTPUT_FILE}")
+         else:
+             logger.error(f"Default directory not found: {Config.DEFAULT_IMAGE_DIR}")

  if __name__ == "__main__":
+     main()
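
The headless path introduced in main() can also be driven directly. A minimal sketch, assuming the module is importable as app from the working directory and that an ./images folder exists (both are assumptions, not guaranteed by this commit):

    from pathlib import Path
    from app import Config, MetadataProcessor  # assumed import path

    # Hypothetical headless run: extract metadata and write JSONL without the UI.
    entries = MetadataProcessor.process_images(Path("./images"))
    if entries:
        MetadataProcessor.save_metadata_to_jsonl(entries, Config.DEFAULT_OUTPUT_FILE)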