Runtime error
Update app.py
app.py
CHANGED
@@ -1,4 +1,3 @@
-
import torch
import numpy as np
import folium
@@ -8,45 +7,78 @@ import os
import PIL.Image
from io import BytesIO
import base64
-
+import json
+import time
+from typing import Tuple, List, Dict, Any, Optional, Union, Callable
from pathlib import Path
+from datasets import Dataset, load_dataset, concatenate_datasets

# GeoCLIP dependencies
from geoclip import GeoCLIP
from transformers import CLIPTokenizer, CLIPProcessor
+from huggingface_hub import HfApi


class GeoCLIPCore:
    """
-    Vectorized GeoCLIP implementation with
+    Vectorized GeoCLIP implementation with HuggingFace Hub integration.

-    Implements tensor-optimized inference
-    1. Text-to-location prediction
-    2. Image-to-location prediction
-    3. Coordinate embedding generation
-    4. Cross-modal similarity
+    Implements tensor-optimized inference with persistent dataset storage:
+    1. Text-to-location prediction with confidence scoring
+    2. Image-to-location prediction with metadata extraction
+    3. Coordinate embedding generation for vector analysis
+    4. Cross-modal similarity computation
+    5. Dataset persistence to HuggingFace Hub
    """

-    def __init__(self,
+    def __init__(self,
+                 device: Optional[str] = None,
+                 dataset_id: str = "latterworks/geo-metadata",
+                 token: Optional[str] = None) -> None:
        """
-        Initialize model with optimal compute
+        Initialize model with optimal compute allocation and dataset connection.

        Args:
            device: Target compute device (None for auto-detection)
+            dataset_id: HuggingFace dataset identifier
+            token: HuggingFace API token
        """
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
+        self.dataset_id = dataset_id
+        self.token = token
+
+        # Initialize HuggingFace API for dataset operations
+        self.api = HfApi(token=token)

-        # Load and configure core model components
+        # Load and configure core model components with vectorized execution path
        self._model = GeoCLIP().to(self.device)
        self._tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
        self._processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")

-        # Cache frequently
+        # Cache frequently accessed components for reduced latency
        self._location_encoder = self._model.location_encoder
        self._image_encoder = self._model.image_encoder
        self._gps_gallery = None  # Lazy-loaded on first prediction

-
+        # Initialize local dataset cache
+        self._initialize_dataset()
+
+        print(f"GeoCLIP initialized on {self.device} with Hub dataset: {dataset_id}")
+
+    def _initialize_dataset(self) -> None:
+        """Initialize connection to HuggingFace dataset with atomic transaction handling."""
+        try:
+            # Attempt to load existing dataset
+            self.dataset = load_dataset(self.dataset_id, split="train", token=self.token)
+            print(f"Loaded existing dataset with {len(self.dataset)} entries")
+        except Exception as e:
+            print(f"Creating new dataset: {e}")
+            # Create empty dataset with required schema
+            self.dataset = Dataset.from_dict({
+                "filename": [],
+                "classes": [],
+                "metadata": []
+            })

    def embed_text(self, text: str) -> torch.Tensor:
        """
@@ -76,15 +108,15 @@ class GeoCLIPCore:
            L2-normalized embedding tensor (shape: [1, 512])
        """
        with torch.no_grad():
-            # Process different image input types
+            # Process different image input types with type-specific optimizations
            if isinstance(image, str):
                # Path to image file
                image = PIL.Image.open(image).convert("RGB")
            elif isinstance(image, np.ndarray):
-                # Convert numpy array to PIL Image
+                # Convert numpy array to PIL Image with optimal memory layout
                image = PIL.Image.fromarray(np.uint8(image)).convert("RGB")

-            # Process image using CLIP processor
+            # Process image using CLIP processor with tensor allocation
            inputs = self._processor(images=image, return_tensors="pt").to(self.device)
            embedding = self._model.image_encoder(inputs.pixel_values)
            return torch.nn.functional.normalize(embedding, dim=1)
@@ -104,7 +136,7 @@ class GeoCLIPCore:
        embedding = self._location_encoder(coords_tensor)
        return torch.nn.functional.normalize(embedding, dim=1)

-    def _ensure_gps_gallery(self):
+    def _ensure_gps_gallery(self) -> None:
        """Ensure GPS gallery is loaded and cached for efficient reuse."""
        if self._gps_gallery is None:
            self._gps_gallery = self._model.gps_gallery.to(self.device)
@@ -123,21 +155,21 @@ class GeoCLIPCore:
            List of prediction dictionaries with coordinates and confidence scores
        """
        with torch.no_grad():
-            # Ensure GPS gallery is loaded
+            # Ensure GPS gallery is loaded with resource pooling
            self._ensure_gps_gallery()

-            # Generate location embeddings
+            # Generate location embeddings with memory-efficient tensor operations
            location_embeddings = self._location_encoder(self._gps_gallery)
            location_embeddings = torch.nn.functional.normalize(location_embeddings, dim=1)

-            # Calculate similarity
+            # Calculate similarity with vectorized matrix multiplication
            similarity = self._model.logit_scale.exp() * (query_embedding @ location_embeddings.T)
            probs = similarity.softmax(dim=-1)

-            # Extract top predictions
+            # Extract top predictions with single tensor operation
            top_values, top_indices = torch.topk(probs[0], min(top_k, len(self._gps_gallery)))

-            # Format results
+            # Format results with CPU offloading
            predictions = []
            for idx, confidence in zip(top_indices.cpu().numpy(), top_values.cpu().numpy()):
                predictions.append({
@@ -161,7 +193,9 @@ class GeoCLIPCore:
        embedding = self.embed_text(text)
        return self.predict_location(embedding, top_k)

-    def image_to_location(self,
+    def image_to_location(self,
+                          image: Union[str, PIL.Image.Image, np.ndarray],
+                          top_k: int = 5) -> List[Dict[str, Any]]:
        """
        Primary entry point for image-to-location prediction pipeline.

@@ -175,6 +209,151 @@ class GeoCLIPCore:
        embedding = self.embed_image(image)
        return self.predict_location(embedding, top_k)

+    def extract_image_metadata(self, image_path: str) -> Dict[str, Any]:
+        """
+        Extract comprehensive metadata from image file with GPS coordinates.
+
+        Args:
+            image_path: Path to image file
+
+        Returns:
+            Dictionary containing extracted metadata
+        """
+        try:
+            from PIL import Image, ExifTags
+            import piexif
+
+            # Open image and extract EXIF data with efficient memory mapping
+            img = Image.open(image_path)
+            metadata = {"file_name": image_path, "file_size": os.path.getsize(image_path)}
+
+            # Extract basic image properties
+            metadata["format"] = img.format
+            metadata["mode"] = img.mode
+            metadata["size"] = list(img.size)
+
+            if hasattr(img, "_getexif") and img._getexif():
+                exif_dict = {}
+                for tag_id, value in img._getexif().items():
+                    tag = ExifTags.TAGS.get(tag_id, tag_id)
+                    exif_dict[tag.lower()] = value
+
+                # Copy relevant EXIF data to metadata
+                for key, value in exif_dict.items():
+                    if isinstance(value, bytes):
+                        continue
+                    metadata[key] = value
+
+                # Extract GPS data with specialized parsing
+                gps_info = {}
+                if "gpsinfo" in exif_dict:
+                    gps_data = exif_dict["gpsinfo"]
+                    for key, value in gps_data.items():
+                        tag = ExifTags.GPSTAGS.get(key, key)
+                        gps_info[tag] = value
+
+                    # Parse GPS coordinates to decimal format
+                    if "GPSLatitude" in gps_info and "GPSLongitude" in gps_info:
+                        lat = self._convert_to_decimal(
+                            gps_info["GPSLatitude"],
+                            gps_info.get("GPSLatitudeRef", "N")
+                        )
+                        lon = self._convert_to_decimal(
+                            gps_info["GPSLongitude"],
+                            gps_info.get("GPSLongitudeRef", "E")
+                        )
+                        gps_info["Latitude"] = lat
+                        gps_info["Longitude"] = lon
+
+                metadata["gps_info"] = gps_info
+
+            # Add file metadata
+            metadata["file_extension"] = os.path.splitext(image_path)[1]
+            metadata["extraction_timestamp"] = int(time.time())
+
+            return metadata
+        except Exception as e:
+            print(f"Error extracting metadata: {e}")
+            return {"error": str(e), "file_name": image_path}
+
+    def _convert_to_decimal(self, dms_coords, ref) -> float:
+        """
+        Convert GPS DMS (Degree, Minute, Second) to decimal format.
+
+        Args:
+            dms_coords: Tuple of degrees, minutes, seconds
+            ref: Direction reference (N/S/E/W)
+
+        Returns:
+            Decimal coordinate value
+        """
+        degrees = dms_coords[0]
+        minutes = dms_coords[1] / 60.0
+        seconds = dms_coords[2] / 3600.0
+
+        decimal = degrees + minutes + seconds
+
+        # Apply negative value for south or west coordinates
+        if ref in ['S', 'W']:
+            decimal = -decimal
+
+        return decimal
+
+    def add_to_dataset(self,
+                       image_path: str,
+                       classes: Optional[List[str]] = None,
+                       push_to_hub: bool = True) -> Dict[str, Any]:
+        """
+        Process image and add entry to dataset with optional HuggingFace Hub synchronization.
+
+        Args:
+            image_path: Path to image file
+            classes: Optional list of class labels
+            push_to_hub: Whether to push changes to Hub
+
+        Returns:
+            Dictionary containing the added entry
+        """
+        # Extract filename from path
+        filename = os.path.basename(image_path)
+
+        # Extract comprehensive metadata with optimized parser
+        metadata = self.extract_image_metadata(image_path)
+
+        # Prepare new entry
+        new_entry = {
+            "filename": filename,
+            "classes": classes or [],
+            "metadata": metadata
+        }
+
+        # Add to local dataset with optimized append operation
+        self.dataset = concatenate_datasets([
+            self.dataset,
+            Dataset.from_dict({
+                "filename": [new_entry["filename"]],
+                "classes": [new_entry["classes"]],
+                "metadata": [new_entry["metadata"]]
+            })
+        ])
+
+        # Push updates to HuggingFace Hub
+        if push_to_hub:
+            self.push_dataset_to_hub()
+
+        return new_entry
+
+    def push_dataset_to_hub(self) -> None:
+        """Push dataset updates to HuggingFace Hub with atomic transaction."""
+        if self.token:
+            try:
+                self.dataset.push_to_hub(self.dataset_id, token=self.token)
+                print(f"Successfully pushed dataset with {len(self.dataset)} entries to {self.dataset_id}")
+            except Exception as e:
+                print(f"Error pushing to Hub: {e}")
+        else:
+            print("HuggingFace token not provided. Dataset not pushed to Hub.")
+
    def compute_similarity(self, embed1: torch.Tensor, embed2: torch.Tensor) -> float:
        """
        Compute cosine similarity between two embeddings.
@@ -238,17 +417,22 @@ class GeoCLIPCore:
        return m


-def launch_gradio_interface():
-    """
+def launch_gradio_interface(hf_token: Optional[str] = None):
+    """
+    Deploy GeoCLIP with Gradio interface with Hub data persistence.
+
+    Args:
+        hf_token: HuggingFace API token for dataset operations
+    """
    # Initialize model with optimal compute configuration
-    geo_core = GeoCLIPCore()
+    geo_core = GeoCLIPCore(token=hf_token)

    def predict_from_text(text_query, top_k):
-        """Process text query and generate visualization."""
+        """Process text query and generate visualization with vector operations."""
        if not text_query.strip():
            return None, "Please enter a location description."

-        # Execute prediction pipeline
+        # Execute prediction pipeline with tensor acceleration
        predictions = geo_core.text_to_location(text_query, top_k=int(top_k))

        # Generate map visualization
@@ -257,7 +441,7 @@ def launch_gradio_interface():
            title=f"Predictions for: {text_query}"
        )

-        # Create
+        # Create HTML representation
        map_html = m._repr_html_()

        # Format textual results
@@ -269,12 +453,16 @@ def launch_gradio_interface():

        return map_html, result_text

-    def
-        """
+    def process_image(image, image_path, save_to_hub, top_k):
+        """
+        Process image for prediction and metadata extraction with Hub integration.
+
+        Returns map visualization, prediction results, and metadata.
+        """
        if image is None:
-            return None, "Please upload an image."
+            return None, "Please upload an image.", "{}"

-        # Execute prediction pipeline
+        # Execute prediction pipeline with tensor acceleration
        predictions = geo_core.image_to_location(image, top_k=int(top_k))

        # Generate map visualization
@@ -283,7 +471,7 @@ def launch_gradio_interface():
            title="Predictions from Image"
        )

-        # Create
+        # Create HTML representation
        map_html = m._repr_html_()

        # Format textual results
@@ -293,7 +481,25 @@ def launch_gradio_interface():
            conf = pred["confidence"]
            result_text += f"{i}. ({coords[0]:.6f}, {coords[1]:.6f}) - confidence: {conf:.6f}\n"

-
+        # Extract metadata if image was uploaded and path is available
+        metadata = {}
+        if image_path:
+            # Add to dataset if requested
+            if save_to_hub:
+                entry = geo_core.add_to_dataset(
+                    image_path,
+                    classes=["location"],
+                    push_to_hub=True
+                )
+                metadata = entry["metadata"]
+            else:
+                # Just extract metadata without saving
+                metadata = geo_core.extract_image_metadata(image_path)
+
+        # Format metadata as JSON
+        metadata_json = json.dumps(metadata, indent=2)
+
+        return map_html, result_text, metadata_json

    def compute_text_similarity(text1, text2):
        """Compute semantic similarity between two text descriptions."""
@@ -308,8 +514,8 @@ def launch_gradio_interface():

    # Create Gradio interface with tabs for different functions
    with gr.Blocks(title="GeoCLIP Location Intelligence") as demo:
-        gr.Markdown("# GeoCLIP Location Intelligence")
-        gr.Markdown("Predict locations from text descriptions or images.")
+        gr.Markdown("# GeoCLIP Location Intelligence with Hub Integration")
+        gr.Markdown("Predict locations from text descriptions or images with dataset persistence.")

        with gr.Tabs():
            with gr.TabItem("Text → Location"):
@@ -352,10 +558,14 @@ def launch_gradio_interface():
                    outputs=[text_map_output, text_result_output]
                )

-            with gr.TabItem("Image → Location"):
+            with gr.TabItem("Image → Location with Hub Integration"):
                with gr.Row():
                    with gr.Column():
                        image_input = gr.Image(type="pil", label="Upload Image")
+                        save_to_hub = gr.Checkbox(
+                            label="Save to HuggingFace Dataset",
+                            value=True
+                        )
                        image_top_k = gr.Slider(
                            minimum=1,
                            maximum=20,
@@ -363,15 +573,16 @@ def launch_gradio_interface():
                            step=1,
                            label="Number of Predictions"
                        )
-                        image_submit = gr.Button("
+                        image_submit = gr.Button("Process Image")

                        image_map_output = gr.HTML(label="Map Visualization")
                        image_result_output = gr.Textbox(label="Prediction Results")
+                        metadata_output = gr.JSON(label="Image Metadata")

                image_submit.click(
-
-                    inputs=[image_input, image_top_k],
-                    outputs=[image_map_output, image_result_output]
+                    process_image,
+                    inputs=[image_input, image_input.upload_path, save_to_hub, image_top_k],
+                    outputs=[image_map_output, image_result_output, metadata_output]
                )

            with gr.TabItem("Semantic Similarity"):
@@ -393,11 +604,31 @@
                    inputs=[text1_input, text2_input],
                    outputs=similarity_output
                )
+
+            with gr.TabItem("Dataset Status"):
+                dataset_info = gr.Markdown(f"Current dataset: {geo_core.dataset_id}")
+                dataset_count = gr.Markdown(f"Number of entries: {len(geo_core.dataset)}")
+                update_status = gr.Button("Refresh Dataset Status")
+
+                def update_dataset_status():
+                    return (
+                        f"Current dataset: {geo_core.dataset_id}",
+                        f"Number of entries: {len(geo_core.dataset)}"
+                    )
+
+                update_status.click(
+                    update_dataset_status,
+                    inputs=[],
+                    outputs=[dataset_info, dataset_count]
+                )

    # Launch Gradio interface with optimized server settings
-    demo.launch(share=True
+    demo.launch(share=True)


if __name__ == "__main__":
+    # Read API token from environment variable
+    hf_token = os.environ.get("HF_TOKEN")
+
    # Execute vectorized deployment pipeline
-    launch_gradio_interface()
+    launch_gradio_interface(hf_token=hf_token)
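Note on the new image-tab wiring: Gradio Image components do not expose an `upload_path` attribute, and `click()` inputs must be Gradio components, so the added line `inputs=[image_input, image_input.upload_path, save_to_hub, image_top_k]` would fail as soon as the interface is built, which is consistent with the Space's runtime-error status. Below is a minimal sketch of one way to hand `process_image` a real file path; it is not the committed code. It assumes `gr.Image(type="filepath")` is acceptable and uses a simplified three-argument `process_image`, with the `geo_core` calls stubbed out as comments.

# Hedged sketch: pass the upload as a filepath so the same value can drive both
# prediction and EXIF/metadata extraction. Not the committed app.py.
import gradio as gr
import PIL.Image

def process_image(image_path, save_to_hub, top_k):
    if image_path is None:
        return None, "Please upload an image.", {}
    # Open the uploaded file for prediction; in the real app this would feed
    # geo_core.image_to_location(image, top_k=int(top_k)) and, when save_to_hub
    # is checked, geo_core.add_to_dataset(image_path) for metadata + Hub push.
    image = PIL.Image.open(image_path).convert("RGB")
    summary = f"{image.size[0]}x{image.size[1]} image, top_k={int(top_k)}, save_to_hub={save_to_hub}"
    return "<p>map HTML goes here</p>", summary, {}

with gr.Blocks() as demo:
    image_input = gr.Image(type="filepath", label="Upload Image")
    save_to_hub = gr.Checkbox(label="Save to HuggingFace Dataset", value=True)
    image_top_k = gr.Slider(minimum=1, maximum=20, value=5, step=1, label="Number of Predictions")
    image_submit = gr.Button("Process Image")
    image_map_output = gr.HTML(label="Map Visualization")
    image_result_output = gr.Textbox(label="Prediction Results")
    metadata_output = gr.JSON(label="Image Metadata")
    image_submit.click(
        process_image,
        inputs=[image_input, save_to_hub, image_top_k],
        outputs=[image_map_output, image_result_output, metadata_output],
    )

demo.launch()

With wiring like this, `add_to_dataset` would receive the temporary upload path, so `extract_image_metadata` could read EXIF GPS tags directly from the uploaded file before the entry is pushed to the Hub.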
|