latterworks committed
Commit 2328870 Β· verified Β· 1 Parent(s): b9f8627

Update app.py

Files changed (1):
app.py +289 -15
app.py CHANGED
@@ -6,7 +6,10 @@ import os
 import logging
 import time
 from datasets import Dataset
-from huggingface_hub import HfApi
+from huggingface_hub import HfApi, create_repo, repo_exists, CommitOperationAdd
+from huggingface_hub.utils import tqdm
+import threading
+import sys
 
 # Setup logging with timestamp
 logging.basicConfig(
@@ -19,21 +22,168 @@ logging.basicConfig(
 )
 logger = logging.getLogger(__name__)
 
-# Constants - put your shit here
+# Constants - edit these for your setup
 HF_TOKEN = os.environ.get("HF_TOKEN")
-HF_USERNAME = "latterworks" # Your username
-DATASET_NAME = "geo-metadata" # Your dataset name
+HF_USERNAME = os.environ.get("HF_USERNAME", "latterworks")
+DATASET_NAME = os.environ.get("DATASET_NAME", "geo-metadata")
 SUPPORTED_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.heic', '.tiff', '.bmp', '.webp'}
+CHECK_INTERVAL = int(os.environ.get("CHECK_INTERVAL", "3600")) # Check for files hourly by default
 
-# Status tracking
+# Global state
 STATS = {
     "uploads": 0,
     "total_files": 0,
-    "files_with_gps": 0
+    "files_with_gps": 0,
+    "last_upload": 0,
+    "startup_time": int(time.time())
 }
 
+def ensure_dataset_exists():
+    """Create dataset repository if it doesn't exist"""
+    if not HF_TOKEN:
+        logger.error("HF_TOKEN not set. Cannot create or check dataset.")
+        return False
+
+    try:
+        api = HfApi(token=HF_TOKEN)
+        repo_id = f"{HF_USERNAME}/{DATASET_NAME}"
+
+        # Check if repo exists
+        if not repo_exists(repo_id, repo_type="dataset", token=HF_TOKEN):
+            logger.info(f"Creating dataset repository: {repo_id}")
+            create_repo(
+                repo_id=repo_id,
+                repo_type="dataset",
+                private=False,
+                token=HF_TOKEN
+            )
+
+            # Create initial README
+            readme_content = f"""# {DATASET_NAME}
+
+Automatically collected geo-metadata from images using the Geo-Metadata Extractor.
+
+## Statistics
+- Total files processed: 0
+- Files with GPS data: 0
+- Last updated: {time.strftime('%Y-%m-%d %H:%M:%S')}
+
+## Data Format
+Each entry contains:
+- Basic image metadata (size, format, mode)
+- EXIF data when available
+- GPS coordinates extracted from EXIF when available
+"""
+
+            # Upload README
+            api.upload_file(
+                path_or_fileobj=readme_content.encode(),
+                path_in_repo="README.md",
+                repo_id=repo_id,
+                repo_type="dataset",
+                token=HF_TOKEN,
+                commit_message="Initial commit with README"
+            )
+
+            # Create folder structure
+            for folder in ["batches", "images", "scripts"]:
+                api.upload_file(
+                    path_or_fileobj=b"",
+                    path_in_repo=f"{folder}/.gitkeep",
+                    repo_id=repo_id,
+                    repo_type="dataset",
+                    token=HF_TOKEN,
+                    commit_message=f"Create {folder} directory"
+                )
+
+            # Upload this script to the repository
+            try:
+                script_path = os.path.abspath(sys.argv[0])
+                if os.path.exists(script_path):
+                    with open(script_path, "rb") as f:
+                        script_content = f.read()
+
+                    api.upload_file(
+                        path_or_fileobj=script_content,
+                        path_in_repo="scripts/geo_metadata_extractor.py",
+                        repo_id=repo_id,
+                        repo_type="dataset",
+                        token=HF_TOKEN,
+                        commit_message="Upload metadata extractor script"
+                    )
+            except Exception as e:
+                logger.error(f"Failed to upload script: {e}")
+
+            logger.info(f"Dataset repository created: {repo_id}")
+        else:
+            logger.info(f"Dataset repository already exists: {repo_id}")
+
+        return True
+    except Exception as e:
+        logger.error(f"Error ensuring dataset exists: {e}")
+        return False
+
+def update_readme_stats():
+    """Update README with current statistics"""
+    if not HF_TOKEN:
+        return
+
+    try:
+        api = HfApi(token=HF_TOKEN)
+        repo_id = f"{HF_USERNAME}/{DATASET_NAME}"
+
+        # Create updated README content
+        readme_content = f"""# {DATASET_NAME}
+
+Automatically collected geo-metadata from images using the Geo-Metadata Extractor.
+
+## Statistics
+- Total files processed: {STATS["total_files"]}
+- Files with GPS data: {STATS["files_with_gps"]}
+- Upload batches: {STATS["uploads"]}
+- Last updated: {time.strftime('%Y-%m-%d %H:%M:%S')}
+- Uptime: {format_duration(int(time.time()) - STATS["startup_time"])}
+
+## Data Format
+Each entry contains:
+- Basic image metadata (size, format, mode)
+- EXIF data when available
+- GPS coordinates extracted from EXIF when available
+"""
+
+        # Upload updated README
+        api.upload_file(
+            path_or_fileobj=readme_content.encode(),
+            path_in_repo="README.md",
+            repo_id=repo_id,
+            repo_type="dataset",
+            token=HF_TOKEN,
+            commit_message="Update statistics"
+        )
+
+        logger.info("Updated README with current statistics")
+    except Exception as e:
+        logger.error(f"Error updating README: {e}")
+
+def format_duration(seconds):
+    """Format seconds into readable duration"""
+    days, remainder = divmod(seconds, 86400)
+    hours, remainder = divmod(remainder, 3600)
+    minutes, seconds = divmod(remainder, 60)
+
+    parts = []
+    if days > 0:
+        parts.append(f"{days}d")
+    if hours > 0:
+        parts.append(f"{hours}h")
+    if minutes > 0:
+        parts.append(f"{minutes}m")
+    parts.append(f"{seconds}s")
+
+    return " ".join(parts)
+
 def convert_to_degrees(value):
-    """Convert GPS coordinates to decimal degrees - handles all the edge cases"""
+    """Convert GPS coordinates to decimal degrees"""
     try:
         if not isinstance(value, (tuple, list)) or len(value) != 3:
             raise ValueError(f"GPS needs 3 values, got {type(value)}")
@@ -109,7 +259,13 @@ def make_serializable(value):
 
 def get_image_metadata(image_path):
     """Extract all metadata from an image file"""
-    metadata = {"file_name": str(Path(image_path).absolute())}
+    file_path = Path(image_path)
+    metadata = {
+        "file_name": str(file_path.absolute()),
+        "file_basename": file_path.name,
+        "image_path_in_repo": f"images/{file_path.name}" # Path where image will be stored in repo
+    }
+
     try:
         with Image.open(image_path) as image:
             metadata.update({
@@ -140,7 +296,7 @@ def get_image_metadata(image_path):
 
         # Add file details
         metadata["file_size"] = os.path.getsize(image_path)
-        metadata["file_extension"] = Path(image_path).suffix.lower()
+        metadata["file_extension"] = file_path.suffix.lower()
         metadata["extraction_timestamp"] = int(time.time())
 
         # Test serialization
@@ -148,12 +304,19 @@ def get_image_metadata(image_path):
         return metadata
     except Exception as e:
         logger.error(f"Error processing {image_path}: {e}")
-        return {"file_name": str(Path(image_path).absolute()), "error": str(e)}
+        return {"file_name": str(file_path.absolute()), "error": str(e)}
 
 def process_images(image_files):
     """Process images and upload metadata to Hugging Face"""
     if not image_files:
         return "🚫 Upload some fucking images first! πŸ“·", None
+
+    # Ensure dataset exists
+    if not ensure_dataset_exists():
+        return "❌ Failed to create or verify dataset repository. Check logs.", None
+
+    # Create temp directory for storing files if needed
+    os.makedirs("temp_uploads", exist_ok=True)
 
     # Reset stats for this batch
     batch_stats = {
@@ -216,6 +379,7 @@ def process_images(image_files):
     # Create dataset object with both filenames and full metadata
     dataset = Dataset.from_dict({
         "filename": filenames,
+        "image_path": [f"images/{f}" for f in filenames], # Path to actual image in repo
        "metadata": metadata_list
    })
 
@@ -227,7 +391,7 @@ def process_images(image_files):
         )
 
         # Upload raw JSONL file
-        api = HfApi()
+        api = HfApi(token=HF_TOKEN)
         api.upload_file(
             path_or_fileobj=output_file,
             path_in_repo=f"batches/metadata_{timestamp}.jsonl",
@@ -237,8 +401,63 @@ def process_images(image_files):
             commit_message=f"Raw metadata batch {timestamp}"
         )
 
+        # Upload the actual image files
+        logger.info(f"Uploading {len(image_files)} image files...")
+        operations = []
+
+        # Process images in batches to avoid memory issues with large datasets
+        MAX_BATCH_SIZE = 20 # Maximum images per commit
+        total_uploaded = 0
+
+        # Group image files into batches
+        image_batches = [image_files[i:i+MAX_BATCH_SIZE] for i in range(0, len(image_files), MAX_BATCH_SIZE)]
+
+        for batch_idx, img_batch in enumerate(image_batches):
+            operations = []
+
+            for img_file in tqdm(img_batch, desc=f"Preparing batch {batch_idx+1}/{len(image_batches)}"):
+                try:
+                    file_path = img_file.name
+                    file_name = os.path.basename(file_path)
+                    target_path = f"images/{file_name}"
+
+                    # Add file to operations list
+                    with open(file_path, "rb") as f:
+                        content = f.read()
+                    operations.append(
+                        CommitOperationAdd(
+                            path_in_repo=target_path,
+                            path_or_fileobj=content
+                        )
+                    )
+                except Exception as e:
+                    logger.error(f"Error preparing image {img_file.name} for upload: {e}")
+
+            # Commit this batch of images
+            if operations:
+                try:
+                    logger.info(f"Committing batch {batch_idx+1}/{len(image_batches)} with {len(operations)} images...")
+                    api.create_commit(
+                        repo_id=f"{HF_USERNAME}/{DATASET_NAME}",
+                        repo_type="dataset",
+                        operations=operations,
+                        commit_message=f"Upload {len(operations)} images (batch {batch_idx+1}/{len(image_batches)}) from upload {timestamp}"
+                    )
+                    total_uploaded += len(operations)
+                    logger.info(f"Successfully uploaded batch {batch_idx+1} ({total_uploaded}/{len(image_files)} total)")
+                except Exception as e:
+                    logger.error(f"Failed to upload image batch {batch_idx+1}: {e}")
+
+        logger.info(f"Image upload complete: {total_uploaded}/{len(image_files)} files uploaded")
+
+        # Update stats
         STATS["uploads"] += 1
+        STATS["last_upload"] = timestamp
         upload_status = "βœ… success"
+
+        # Update README in background thread
+        threading.Thread(target=update_readme_stats).start()
+
     except Exception as e:
         logger.error(f"HF upload failed: {e}")
         upload_status = f"❌ failed: {str(e)[:100]}..."
@@ -254,12 +473,53 @@ def process_images(image_files):
         f"πŸ“Š TOTAL STATS πŸ“Š\n"
         f"Total files: {STATS['total_files']}\n"
         f"Files with GPS: {STATS['files_with_gps']}\n"
-        f"Upload batches: {STATS['uploads']}"
+        f"Upload batches: {STATS['uploads']}\n"
+        f"Uptime: {format_duration(int(time.time()) - STATS['startup_time'])}"
     )
 
     return result, output_file
 
-# Create the UI that actually fucking works
+def scan_and_process_directory(directory_path):
+    """Scan directory for images and process them"""
+    if not os.path.isdir(directory_path):
+        logger.error(f"Not a directory: {directory_path}")
+        return
+
+    logger.info(f"Scanning directory: {directory_path}")
+    image_files = []
+
+    # Find all image files in directory
+    for root, _, files in os.walk(directory_path):
+        for file in files:
+            file_path = os.path.join(root, file)
+            if Path(file_path).suffix.lower() in SUPPORTED_EXTENSIONS:
+                image_files.append(file_path)
+
+    if not image_files:
+        logger.info(f"No image files found in {directory_path}")
+        return
+
+    logger.info(f"Found {len(image_files)} image files in {directory_path}")
+
+    # Create file-like objects for processing
+    class FileObject:
+        def __init__(self, path):
+            self.name = path
+
+    process_images([FileObject(path) for path in image_files])
+
+def schedule_directory_scan():
+    """Check for new files in directory periodically"""
+    watch_dir = os.environ.get("WATCH_DIRECTORY")
+
+    if watch_dir and os.path.isdir(watch_dir):
+        logger.info(f"Scheduled scan of directory: {watch_dir}")
+        scan_and_process_directory(watch_dir)
+
+    # Schedule next check
+    threading.Timer(CHECK_INTERVAL, schedule_directory_scan).start()
+
+# Create the UI
 demo = gr.Interface(
     fn=process_images,
     inputs=gr.Files(label="DROP IMAGES HERE πŸ“Έ", file_types=["image"], file_count="multiple"),
@@ -277,6 +537,20 @@ demo = gr.Interface(
     theme="huggingface"
 )
 
-# Only launch when run directly
+# Launch app and start background processes
 if __name__ == "__main__":
-    demo.launch(server_name="0.0.0.0", server_port=7860)
+    # Ensure dataset exists on startup
+    ensure_dataset_exists()
+
+    # Start directory watcher if configured
+    if os.environ.get("WATCH_DIRECTORY"):
+        threading.Thread(target=schedule_directory_scan).start()
+        logger.info(f"Starting directory watcher for {os.environ.get('WATCH_DIRECTORY')}")
+
+    # Log startup info
+    logger.info(f"=== Application Startup at {time.strftime('%Y-%m-%d %H:%M:%S')} ===")
+    logger.info(f"Dataset: {HF_USERNAME}/{DATASET_NAME}")
+    logger.info(f"Token available: {bool(HF_TOKEN)}")
+
+    # Launch Gradio app
+    demo.launch(server_name="0.0.0.0", server_port=7860)
 
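The upload path in this commit groups images into one commit per 20 files with CommitOperationAdd and create_commit, rather than calling upload_file once per image. A minimal standalone sketch of the same pattern, assuming a hypothetical repo id and plain file paths (this is not code from the commit):

from pathlib import Path
from huggingface_hub import CommitOperationAdd, HfApi

REPO_ID = "your-username/your-dataset"  # hypothetical target repo
MAX_BATCH_SIZE = 20  # images per commit, mirroring app.py

def upload_images_in_batches(api, paths):
    """Commit files in fixed-size chunks so no single commit grows unbounded."""
    batches = [paths[i:i + MAX_BATCH_SIZE] for i in range(0, len(paths), MAX_BATCH_SIZE)]
    uploaded = 0
    for idx, batch in enumerate(batches):
        operations = [
            CommitOperationAdd(path_in_repo=f"images/{Path(p).name}", path_or_fileobj=p)
            for p in batch
        ]
        api.create_commit(
            repo_id=REPO_ID,
            repo_type="dataset",
            operations=operations,
            commit_message=f"Upload {len(operations)} images (batch {idx + 1}/{len(batches)})",
        )
        uploaded += len(operations)
    return uploaded

# Usage with hypothetical paths:
# api = HfApi(token="hf_...")  # your token
# upload_images_in_batches(api, ["photos/a.jpg", "photos/b.jpg"])

Passing a path (or file object) to CommitOperationAdd lets huggingface_hub read the file at commit time, which avoids holding every image's bytes in memory the way the read-then-append version in the commit does.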
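Only the validation lines of convert_to_degrees appear in this diff; the rest of the function presumably applies the standard degrees/minutes/seconds arithmetic. For reference, a self-contained sketch of that conversion, assuming plain numeric DMS values (real EXIF often stores rationals instead):

def dms_to_decimal(value):
    """Convert a (degrees, minutes, seconds) triple to decimal degrees."""
    if not isinstance(value, (tuple, list)) or len(value) != 3:
        raise ValueError(f"GPS needs 3 values, got {type(value)}")
    degrees, minutes, seconds = (float(v) for v in value)
    return degrees + minutes / 60 + seconds / 3600

# 40deg 44' 54.36" -> 40.7484; the sign for S/W hemispheres would come from the GPS ref tag.
assert round(dms_to_decimal((40, 44, 54.36)), 4) == 40.7484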
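All runtime settings in the new version come from environment variables: HF_TOKEN, HF_USERNAME, DATASET_NAME, WATCH_DIRECTORY, and CHECK_INTERVAL. A sketch of launching the app with the watcher enabled, using illustrative values only:

import os
import subprocess

env = {
    **os.environ,  # HF_TOKEN must already be set here for uploads to succeed
    "HF_USERNAME": "your-username",     # falls back to "latterworks" if unset
    "DATASET_NAME": "geo-metadata",
    "WATCH_DIRECTORY": "/data/photos",  # hypothetical path; enables the periodic scan
    "CHECK_INTERVAL": "3600",           # seconds between scans
}
subprocess.run(["python", "app.py"], env=env, check=True)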
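Each run writes batches/metadata_<timestamp>.jsonl into the dataset repo, so a stored batch should be loadable back with huggingface_hub plus datasets. A sketch, where the filename's timestamp is hypothetical:

from datasets import load_dataset
from huggingface_hub import hf_hub_download

# Download one metadata batch from the dataset repo, then parse it as JSONL.
path = hf_hub_download(
    repo_id="latterworks/geo-metadata",
    filename="batches/metadata_1700000000.jsonl",  # hypothetical timestamp
    repo_type="dataset",
)
ds = load_dataset("json", data_files=path, split="train")
print(ds[0]["file_name"])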