Spaces: Runtime error

Update app.py

app.py CHANGED
@@ -1,243 +1,604 @@
- Previous version (243 lines) removed. The recoverable fragments show a flat script: the same imports done unconditionally, a simpler logging setup (format "%(asctime)s [%(levelname)s] %(message)s"), a top-level upload_to_huggingface(metadata_file: Path, username: str, dataset_name: str) function, and a gr.Interface (description "Upload an image or point to a directory to extract metadata and push to Hugging Face, Bay Area style!", allow_flagging="never") launched with demo.launch(server_name="0.0.0.0", server_port=7860).

+ Replacement version (604 lines), in full:
from pathlib import Path
import json
import sys
import os
import logging
import traceback
from typing import Dict, List, Any, Optional, Union, Tuple
from concurrent.futures import ThreadPoolExecutor, as_completed
import time

# Advanced logging configuration (set up before the guarded imports below so
# their warnings go through these handlers instead of logging's implicit default)
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s:%(lineno)d - %(message)s",
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler("geo_extractor.log")
    ]
)
logger = logging.getLogger("geo_metadata_extractor")

# Third-party imports with robust error handling
try:
    from PIL import Image, ExifTags
    HAS_PIL = True
except ImportError:
    HAS_PIL = False
    logging.warning("PIL not installed - image processing disabled")

try:
    import gradio as gr
    HAS_GRADIO = True
except ImportError:
    HAS_GRADIO = False
    logging.warning("Gradio not installed - UI disabled")

try:
    from datasets import Dataset
    HAS_DATASETS = True
except ImportError:
    HAS_DATASETS = False
    logging.warning("Datasets library not installed - HF upload disabled")
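# Note: the HAS_* flags above let the module degrade gracefully. A missing
# `datasets` package only disables the Hugging Face upload path, a missing
# Gradio falls back to CLI mode in main(), and a missing PIL aborts main().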
# Configurable settings with environment variable overrides and validation
class Config:
    """Configuration container with validation and defaults"""

    DEFAULT_IMAGE_DIR = Path(os.environ.get("IMAGE_DIR", "./images"))
    DEFAULT_OUTPUT_FILE = Path(os.environ.get("OUTPUT_METADATA_FILE", "./metadata.jsonl"))
    HF_USERNAME = os.environ.get("HF_USERNAME", "latterworks")
    DATASET_NAME = os.environ.get("DATASET_NAME", "geo-metadata")
    MAX_WORKERS = int(os.environ.get("MAX_WORKERS", "4"))
    BATCH_SIZE = int(os.environ.get("BATCH_SIZE", "100"))

    # Image formats with EXIF support prioritized first
    SUPPORTED_EXTENSIONS = {
        # Primary formats with good EXIF support
        '.jpg', '.jpeg', '.tiff', '.tif',
        # Secondary formats with limited metadata support
        '.png', '.heic', '.bmp', '.webp'
    }

    @classmethod
    def validate(cls) -> List[str]:
        """Validate configuration settings and return warnings"""
        warnings = []

        if cls.MAX_WORKERS < 1:
            cls.MAX_WORKERS = 1
            warnings.append(f"Invalid MAX_WORKERS value, reset to {cls.MAX_WORKERS}")

        if cls.BATCH_SIZE < 10:
            cls.BATCH_SIZE = 10
            warnings.append(f"BATCH_SIZE too small, reset to {cls.BATCH_SIZE}")

        return warnings

# Run config validation at import time
config_warnings = Config.validate()
for warning in config_warnings:
    logger.warning(warning)
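# Example (hypothetical shell invocation) of overriding the defaults above
# through environment variables before launch:
#   IMAGE_DIR=/data/photos MAX_WORKERS=8 BATCH_SIZE=200 python app.py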
class GeoMetadataExtractor:
    """Core metadata extraction logic with advanced error handling"""

    @staticmethod
    def convert_to_degrees(value: Union[tuple, list]) -> Optional[float]:
        """
        Convert GPS coordinates (degrees, minutes, seconds) to decimal degrees

        Args:
            value: Tuple of degrees, minutes, seconds

        Returns:
            Decimal degrees as float, or None if conversion fails
        """
        try:
            if not isinstance(value, (tuple, list)) or len(value) != 3:
                raise ValueError(f"GPS value must be a tuple of 3 elements, got {type(value)}")

            d, m, s = value
            degrees = float(d) + (float(m) / 60.0) + (float(s) / 3600.0)

            # Validate range
            if not -180 <= degrees <= 180:
                raise ValueError(f"GPS degrees out of valid range: {degrees}")

            return degrees
        except (TypeError, ValueError, ZeroDivisionError) as e:
            logger.error(f"Failed to convert GPS coordinates: {e}")
            return None

    @staticmethod
    def extract_gps_info(gps_info: Dict[int, Any]) -> Optional[Dict[str, Any]]:
        """
        Extract and format GPS metadata from EXIF

        Args:
            gps_info: Dictionary of GPS EXIF tags

        Returns:
            Formatted GPS data including decimal latitude/longitude
        """
        if not isinstance(gps_info, dict):
            logger.warning("GPS info is not a dictionary, skipping")
            return None

        gps_data = {}
        try:
            # Extract tag data
            for key, val in gps_info.items():
                tag_name = ExifTags.GPSTAGS.get(key, f"unknown_gps_tag_{key}")
                gps_data[tag_name] = val

            # Process coordinates if available
            if 'GPSLatitude' in gps_data and 'GPSLongitude' in gps_data:
                lat = GeoMetadataExtractor.convert_to_degrees(gps_data['GPSLatitude'])
                lon = GeoMetadataExtractor.convert_to_degrees(gps_data['GPSLongitude'])

                if lat is None or lon is None:
                    logger.error("Failed to convert latitude/longitude, skipping GPS data")
                    return None

                # Apply hemispheric references
                lat_ref = gps_data.get('GPSLatitudeRef', 'N')
                lon_ref = gps_data.get('GPSLongitudeRef', 'E')

                if lat_ref not in {'N', 'S'} or lon_ref not in {'E', 'W'}:
                    logger.warning(f"Invalid GPS reference values: lat_ref={lat_ref}, lon_ref={lon_ref}")
                else:
                    if lat_ref == 'S':
                        lat = -lat
                    if lon_ref == 'W':
                        lon = -lon

                # Add calculated decimal coordinates
                gps_data['Latitude'] = round(lat, 6)  # 6 decimal places ≈ 10cm precision
                gps_data['Longitude'] = round(lon, 6)

                # Add additional derived fields
                if 'GPSAltitude' in gps_data:
                    try:
                        altitude = gps_data['GPSAltitude']
                        if hasattr(altitude, 'numerator') and hasattr(altitude, 'denominator'):
                            gps_data['AltitudeMeters'] = float(altitude.numerator) / float(altitude.denominator)
                    except Exception as e:
                        logger.warning(f"Failed to process altitude: {e}")

            return gps_data
        except Exception as e:
            stack_trace = traceback.format_exc()
            logger.error(f"GPS extraction error: {e}\n{stack_trace}")
            return None

    @staticmethod
    def make_serializable(value: Any) -> Any:
        """
        Recursively convert non-serializable types to JSON-compatible values

        Args:
            value: Any value to convert

        Returns:
            JSON-serializable representation of value
        """
        try:
            # Handle rational numbers (fractions)
            if hasattr(value, 'numerator') and hasattr(value, 'denominator'):
                if value.denominator == 0:
                    return "undefined (division by zero)"
                return float(value.numerator) / float(value.denominator)

            # Handle nested structures
            elif isinstance(value, (tuple, list)):
                return [GeoMetadataExtractor.make_serializable(item) for item in value]

            elif isinstance(value, dict):
                return {str(k): GeoMetadataExtractor.make_serializable(v) for k, v in value.items()}

            # Handle binary data
            elif isinstance(value, bytes):
                return value.decode('utf-8', errors='replace')

            # Test if directly serializable
            json.dumps(value)
            return value

        except Exception as e:
            logger.warning(f"Value serialization failed, converting to string: {e}")
            return str(value)

    @staticmethod
    def get_image_metadata(image_path: Path) -> Dict[str, Any]:
        """
        Extract comprehensive metadata from an image file

        Args:
            image_path: Path to image file

        Returns:
            Dictionary of extracted metadata
        """
        # Core metadata with absolute file path
        metadata = {
            "file_name": str(image_path.absolute()),
            "extraction_time": time.strftime("%Y-%m-%d %H:%M:%S")
        }

        try:
            # Process file system metadata first (always available)
            stat_info = image_path.stat()
            metadata.update({
                "file_size": stat_info.st_size,
                "file_extension": image_path.suffix.lower(),
                "last_modified": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(stat_info.st_mtime)),
                "creation_time": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(stat_info.st_ctime))
            })

            # Exit early if PIL not available
            if not HAS_PIL:
                metadata["error"] = "PIL library not available"
                return metadata

            # Extract image and EXIF data
            with Image.open(image_path) as image:
                # Basic image properties
                metadata.update({
                    "format": image.format or "unknown",
                    "size": list(image.size),
                    "width": image.width,
                    "height": image.height,
                    "mode": image.mode or "unknown",
                    "aspect_ratio": round(image.width / image.height, 3) if image.height > 0 else None
                })

                # Extract EXIF data if available
                exif_data = None
                try:
                    # Different methods depending on image format
                    if hasattr(image, '_getexif'):
                        exif_data = image._getexif()
                    elif hasattr(image, 'getexif'):
                        exif_data = image.getexif()

                    # Some formats like PNG store metadata differently
                    if not exif_data and image.format == 'PNG' and 'exif' in image.info:
                        exif_data = image.info.get('exif')
                        metadata["exif_source"] = "PNG info block"
                except AttributeError:
                    metadata["exif_error"] = "No EXIF extraction method available"
                except Exception as e:
                    metadata["exif_error"] = f"EXIF extraction failed: {str(e)}"

                # Process EXIF data if found. getexif() returns a PIL Exif
                # mapping rather than a plain dict, so test for a mapping
                # interface instead of isinstance(exif_data, dict)
                if exif_data and hasattr(exif_data, 'items'):
                    for tag_id, value in exif_data.items():
                        # Handle GPS data specially
                        if tag_id in ExifTags.TAGS and ExifTags.TAGS[tag_id] == "GPSInfo":
                            gps_info = GeoMetadataExtractor.extract_gps_info(value)
                            if gps_info:
                                metadata["gps_info"] = GeoMetadataExtractor.make_serializable(gps_info)
                        else:
                            # Get tag name or use numeric ID with tag_ prefix
                            tag_name = ExifTags.TAGS.get(tag_id, f"tag_{tag_id}").lower()
                            metadata[tag_name] = GeoMetadataExtractor.make_serializable(value)

            # Add camera model and date taken for convenience if available
            if 'model' in metadata:
                metadata["camera_model"] = metadata['model']
            if 'datetimeoriginal' in metadata:
                metadata["date_taken"] = metadata['datetimeoriginal']

            return metadata
        except Exception as e:
            # Capture full stack trace for debugging
            stack_trace = traceback.format_exc()
            logger.error(f"Image {image_path} processing failed: {e}\n{stack_trace}")

            # Return partial metadata with error information
            metadata["error"] = str(e)
            metadata["error_trace"] = stack_trace
            return metadata
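# Worked example (illustrative values) for convert_to_degrees above: an EXIF
# latitude of (37, 46, 30.0), i.e. 37° 46' 30", yields 37 + 46/60 + 30/3600
# = 37.775; a GPSLatitudeRef of 'S' in extract_gps_info then negates it to -37.775.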
class MetadataProcessor:
    """Handles batch processing and file operations"""

    @staticmethod
    def process_images(input_path: Union[str, Path]) -> List[Dict[str, Any]]:
        """
        Process image files to extract metadata

        Args:
            input_path: Path to image file or directory

        Returns:
            List of metadata dictionaries for all processed images
        """
        metadata_list = []
        input_path = Path(input_path)
        start_time = time.time()

        # Handle single file case
        if input_path.is_file() and input_path.suffix.lower() in Config.SUPPORTED_EXTENSIONS:
            logger.info(f"Processing single image: {input_path}")
            metadata = GeoMetadataExtractor.get_image_metadata(input_path)
            if metadata:
                metadata_list.append(metadata)

        # Handle directory case
        elif input_path.is_dir():
            logger.info(f"Processing directory: {input_path}")

            # Collect all image files first
            image_paths = [
                path for path in input_path.rglob("*")
                if path.is_file() and path.suffix.lower() in Config.SUPPORTED_EXTENSIONS
            ]

            total_images = len(image_paths)
            logger.info(f"Found {total_images} images to process")

            # Process in parallel with progress tracking
            if total_images > 0:
                processed = 0
                with ThreadPoolExecutor(max_workers=Config.MAX_WORKERS) as executor:
                    # Submit all tasks
                    future_to_path = {
                        executor.submit(GeoMetadataExtractor.get_image_metadata, path): path
                        for path in image_paths
                    }

                    # Process as they complete
                    for future in as_completed(future_to_path):
                        path = future_to_path[future]
                        try:
                            metadata = future.result()
                            if metadata:
                                metadata_list.append(metadata)

                            # Update progress
                            processed += 1
                            if processed % 10 == 0 or processed == total_images:
                                elapsed = time.time() - start_time
                                rate = processed / elapsed if elapsed > 0 else 0
                                logger.info(f"Processed {processed}/{total_images} images ({processed/total_images*100:.1f}%) - {rate:.2f} images/sec")

                        except Exception as e:
                            logger.error(f"Error processing {path}: {e}")
            else:
                logger.warning(f"No images found in directory: {input_path}")
        else:
            logger.error(f"Invalid input: {input_path} is not a file or directory")
            return [{"error": f"Invalid input: {input_path} is not a file or directory"}]

        # Summarize results
        elapsed = time.time() - start_time
        images_per_second = len(metadata_list) / elapsed if elapsed > 0 else 0
        logger.info(f"Completed processing {len(metadata_list)} images in {elapsed:.2f} seconds ({images_per_second:.2f} images/sec)")

        return metadata_list

    @staticmethod
    def save_metadata_to_jsonl(metadata_list: List[Dict[str, Any]], output_file: Path) -> bool:
        """
        Save metadata to JSONL format with error handling

        Args:
            metadata_list: List of metadata dictionaries
            output_file: Path to output file

        Returns:
            True if save was successful, False otherwise
        """
        try:
            # Create directory if needed
            output_file.parent.mkdir(parents=True, exist_ok=True)

            # Write to file
            with output_file.open('w', encoding='utf-8') as f:
                for entry in metadata_list:
                    f.write(json.dumps(entry, ensure_ascii=False) + '\n')

            logger.info(f"Successfully saved {len(metadata_list)} entries to {output_file}")
            return True

        except Exception as e:
            stack_trace = traceback.format_exc()
            logger.error(f"Failed to save metadata: {e}\n{stack_trace}")
            return False

    @staticmethod
    def upload_to_huggingface(metadata_file: Path, username: str, dataset_name: str) -> str:
        """
        Upload metadata to Hugging Face as a dataset

        Args:
            metadata_file: Path to JSONL file
            username: Hugging Face username
            dataset_name: Dataset name to create/update

        Returns:
            Status message
        """
        if not HAS_DATASETS:
            return "Hugging Face datasets library not installed"

        try:
            # Read metadata
            metadata_list = []
            with metadata_file.open('r', encoding='utf-8') as f:
                for line in f:
                    metadata_list.append(json.loads(line))

            if not metadata_list:
                return "No metadata to upload"

            # Create dataset
            logger.info(f"Creating dataset with {len(metadata_list)} entries")
            dataset = Dataset.from_dict({
                "images": [entry.get("file_name", "unknown") for entry in metadata_list],
                "metadata": metadata_list
            })

            # Push to Hub
            dataset_path = f"{username}/{dataset_name}"
            logger.info(f"Pushing dataset to {dataset_path}")
            dataset.push_to_hub(dataset_path, private=False)

            return f"Successfully uploaded to {dataset_path} with {len(metadata_list)} entries"

        except Exception as e:
            stack_trace = traceback.format_exc()
            logger.error(f"Upload failed: {e}\n{stack_trace}")
            return f"Upload failed: {str(e)}"
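# Example JSONL line as written by save_metadata_to_jsonl (values illustrative,
# keys as produced by get_image_metadata above):
# {"file_name": "/images/photo.jpg", "extraction_time": "2024-01-01 12:00:00",
#  "file_size": 204800, "format": "JPEG", "gps_info": {"Latitude": 37.775, "Longitude": -122.419}}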
class GradioInterface:
    """Gradio UI interface"""

    @staticmethod
    def create_interface():
        """
        Create the Gradio interface

        Returns:
            Gradio interface object
        """
        if not HAS_GRADIO:
            logger.error("Gradio not installed, cannot create interface")
            return None

        def process_input(image_file, dir_path: str, username: str, dataset_name: str) -> str:
            """
            Process inputs from Gradio UI

            Args:
                image_file: Uploaded file object or None
                dir_path: Directory path string
                username: Hugging Face username
                dataset_name: Dataset name

            Returns:
                Results as formatted text
            """
            output_lines = []
            metadata_list = []

            # Handle single image upload
            if image_file:
                image_path = Path(image_file.name)
                output_lines.append(f"## Processing Single Image: {image_path.name}")

                single_metadata = MetadataProcessor.process_images(image_path)
                metadata_list.extend(single_metadata)

                # Format first entry for display
                if single_metadata:
                    output_lines.append("### Image Metadata:")
                    output_lines.append("```json")
                    output_lines.append(json.dumps(single_metadata[0], indent=2))
                    output_lines.append("```")

            # Handle directory processing
            if dir_path:
                dir_path = Path(dir_path)
                if dir_path.is_dir():
                    output_lines.append(f"## Processing Directory: {dir_path}")
                    dir_metadata = MetadataProcessor.process_images(dir_path)

                    # Add to full list
                    metadata_list.extend(dir_metadata)

                    # Summarize results
                    output_lines.append("### Directory Results:")
                    output_lines.append(f"- Processed {len(dir_metadata)} images")

                    # Location data summary (guard against an empty directory)
                    location_count = sum(1 for entry in dir_metadata if entry.get("gps_info") is not None)
                    location_pct = (location_count / len(dir_metadata) * 100) if dir_metadata else 0
                    output_lines.append(f"- Found location data in {location_count} images ({location_pct:.1f}%)")

                    # Show a few examples if available
                    if dir_metadata:
                        output_lines.append("\n### Sample Entry:")
                        output_lines.append("```json")
                        output_lines.append(json.dumps(dir_metadata[0], indent=2))
                        output_lines.append("```")
                else:
                    output_lines.append(f"⚠️ Error: {dir_path} is not a directory")

            # Save and upload if we have metadata
            if metadata_list:
                temp_output_file = Path("temp_metadata.jsonl")
                output_lines.append("\n## Saving and Uploading")

                if MetadataProcessor.save_metadata_to_jsonl(metadata_list, temp_output_file):
                    output_lines.append(f"✅ Saved metadata to {temp_output_file}")

                    # Upload to Hugging Face
                    upload_result = MetadataProcessor.upload_to_huggingface(
                        temp_output_file, username, dataset_name
                    )
                    output_lines.append(f"📤 {upload_result}")
                else:
                    output_lines.append("❌ Failed to save metadata")

            return "\n".join(output_lines) if output_lines else "Please upload an image or provide a directory path"

        # Create the interface
        demo = gr.Interface(
            fn=process_input,
            inputs=[
                gr.File(label="Upload Image", file_types=list(Config.SUPPORTED_EXTENSIONS)),
                gr.Textbox(label="Image Directory", placeholder=str(Config.DEFAULT_IMAGE_DIR), value=str(Config.DEFAULT_IMAGE_DIR)),
                gr.Textbox(label="Hugging Face Username", value=Config.HF_USERNAME),
                gr.Textbox(label="Dataset Name", value=Config.DATASET_NAME)
            ],
            outputs=gr.Markdown(label="Results"),
            title="Enhanced Geo-Metadata Extractor",
            description=(
                "Upload an image or process a directory to extract location metadata and other EXIF data. "
                "Results can be automatically uploaded to Hugging Face Datasets."
            ),
            allow_flagging="never",
            examples=[
                [None, "sample_images", Config.HF_USERNAME, "sample-geo-metadata"]
            ]
        )

        return demo
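# Note: the 0.0.0.0:7860 launch arguments used in main() below match the
# default address and port Hugging Face Spaces expects from a Gradio app.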
def main():
    """Main entry point"""
    logger.info("Starting Geo-Metadata Extractor")

    # Check dependencies
    if not HAS_PIL:
        logger.error("PIL is required for image processing. Please install: pip install pillow")
        sys.exit(1)

    # Create and launch the UI if running directly
    if HAS_GRADIO:
        logger.info("Creating Gradio interface")
        demo = GradioInterface.create_interface()
        if demo:
            logger.info("Launching Gradio interface")
            demo.launch(server_name="0.0.0.0", server_port=7860)
        else:
            logger.error("Failed to create Gradio interface")
    else:
        logger.warning("Gradio not installed, running in CLI mode")

        # Process default directory as fallback
        if Config.DEFAULT_IMAGE_DIR.exists():
            logger.info(f"Processing default directory: {Config.DEFAULT_IMAGE_DIR}")
            metadata = MetadataProcessor.process_images(Config.DEFAULT_IMAGE_DIR)

            if metadata:
                logger.info(f"Saving {len(metadata)} entries to {Config.DEFAULT_OUTPUT_FILE}")
                MetadataProcessor.save_metadata_to_jsonl(metadata, Config.DEFAULT_OUTPUT_FILE)
                logger.info(f"Metadata saved to {Config.DEFAULT_OUTPUT_FILE}")
        else:
            logger.error(f"Default directory not found: {Config.DEFAULT_IMAGE_DIR}")

if __name__ == "__main__":
    main()
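For a quick sanity check outside the Space UI, the same pipeline can be driven directly from Python. A minimal sketch, assuming the file above is saved as app.py and a ./photos directory of images exists (both names are illustrative):

    from pathlib import Path
    from app import MetadataProcessor

    # Extract metadata for every supported image under ./photos (recursively)
    entries = MetadataProcessor.process_images(Path("./photos"))

    # Persist one JSON object per line, mirroring what the Space does
    MetadataProcessor.save_metadata_to_jsonl(entries, Path("./metadata.jsonl"))
    print(f"Extracted metadata for {len(entries)} images")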