latterworks committed on
Commit 52e39e5 · verified · 1 Parent(s): 50428b9

Update app.py

Files changed (1)
  1. app.py +399 -25
app.py CHANGED
@@ -1,29 +1,403 @@
- import gradio as gr
  import torch
  from geoclip import GeoCLIP

- # Load the GeoCLIP model
- model = GeoCLIP()
-
- # Define the function for geolocation prediction
- def predict_location(image_path):
-     top_pred_gps, top_pred_prob = model.predict(image_path, top_k=5)
-     results = []
-     for i in range(5):
-         lat, lon = top_pred_gps[i]
-         prob = top_pred_prob[i]
-         results.append(f"Prediction {i+1}: ({lat:.6f}, {lon:.6f}) | Probability: {prob:.6f}")
-     return "\n".join(results)
-
- # Define Gradio interface
- interface = gr.Interface(
-     fn=predict_location,
-     inputs=gr.Image(type="filepath", label="Upload Image"),
-     outputs=gr.Textbox(label="Predicted Locations"),
-     title="GeoCLIP Geolocation",
-     description="Upload an image, and GeoCLIP will predict the top 5 GPS locations."
- )
-
- # Launch the Gradio app
  if __name__ == "__main__":
-     interface.launch()
+
  import torch
+ import numpy as np
+ import folium
+ from folium.plugins import HeatMap, MarkerCluster
+ import gradio as gr
+ import os
+ import PIL.Image
+ from io import BytesIO
+ import base64
+ from typing import Tuple, List, Dict, Any, Optional, Union
+ from pathlib import Path
+
+ # GeoCLIP dependencies
  from geoclip import GeoCLIP
+ from transformers import CLIPTokenizer, CLIPProcessor
+
+
+ class GeoCLIPCore:
+     """
+     Vectorized GeoCLIP implementation with minimal compute overhead.
+
+     Implements tensor-optimized inference for:
+     1. Text-to-location prediction
+     2. Image-to-location prediction
+     3. Coordinate embedding generation
+     4. Cross-modal similarity analysis
+     """
+
+     def __init__(self, device: Optional[str] = None) -> None:
+         """
+         Initialize model with optimal compute resource allocation.
+
+         Args:
+             device: Target compute device (None for auto-detection)
+         """
+         self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
+
+         # Load and configure core model components
+         self._model = GeoCLIP().to(self.device)
+         self._tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
+         self._processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
+
+         # Cache frequently used components for performance
+         self._location_encoder = self._model.location_encoder
+         self._image_encoder = self._model.image_encoder
+         self._gps_gallery = None  # Lazy-loaded on first prediction
+
+         print(f"GeoCLIP initialized on {self.device}")
+
+     def embed_text(self, text: str) -> torch.Tensor:
+         """
+         Generate normalized embedding for text input using vectorized operations.
+
+         Args:
+             text: Text description to encode
+
+         Returns:
+             L2-normalized embedding tensor (shape: [1, 512])
+         """
+         with torch.no_grad():
+             tokens = self._tokenizer(text, return_tensors="pt", padding=True).to(self.device)
+             embedding = self._model.image_encoder.mlp(
+                 self._model.image_encoder.CLIP.get_text_features(**tokens)
+             )
+             return torch.nn.functional.normalize(embedding, dim=1)
+
+     def embed_image(self, image: Union[str, PIL.Image.Image, np.ndarray]) -> torch.Tensor:
+         """
+         Generate normalized embedding for image input using vectorized operations.
+
+         Args:
+             image: Input image (PIL Image, file path, or numpy array)
+
+         Returns:
+             L2-normalized embedding tensor (shape: [1, 512])
+         """
+         with torch.no_grad():
+             # Process different image input types
+             if isinstance(image, str):
+                 # Path to image file
+                 image = PIL.Image.open(image).convert("RGB")
+             elif isinstance(image, np.ndarray):
+                 # Convert numpy array to PIL Image
+                 image = PIL.Image.fromarray(np.uint8(image)).convert("RGB")
+
+             # Process image using CLIP processor
+             inputs = self._processor(images=image, return_tensors="pt").to(self.device)
+             embedding = self._model.image_encoder(inputs.pixel_values)
+             return torch.nn.functional.normalize(embedding, dim=1)
+
+     def embed_coordinates(self, coords: Tuple[float, float]) -> torch.Tensor:
+         """
+         Generate normalized embedding for geographical coordinates.
+
+         Args:
+             coords: Coordinate pair (latitude, longitude)
+
+         Returns:
+             L2-normalized embedding tensor (shape: [1, 512])
+         """
+         with torch.no_grad():
+             coords_tensor = torch.tensor([coords], dtype=torch.float32).to(self.device)
+             embedding = self._location_encoder(coords_tensor)
+             return torch.nn.functional.normalize(embedding, dim=1)
+
+     def _ensure_gps_gallery(self):
+         """Ensure GPS gallery is loaded and cached for efficient reuse."""
+         if self._gps_gallery is None:
+             self._gps_gallery = self._model.gps_gallery.to(self.device)
+
+     def predict_location(self,
+                          query_embedding: torch.Tensor,
+                          top_k: int = 5) -> List[Dict[str, Any]]:
+         """
+         Execute cosine similarity-based location retrieval against GPS gallery.
+
+         Args:
+             query_embedding: L2-normalized query embedding
+             top_k: Number of top predictions to return
+
+         Returns:
+             List of prediction dictionaries with coordinates and confidence scores
+         """
+         with torch.no_grad():
+             # Ensure GPS gallery is loaded
+             self._ensure_gps_gallery()
+
+             # Generate location embeddings
+             location_embeddings = self._location_encoder(self._gps_gallery)
+             location_embeddings = torch.nn.functional.normalize(location_embeddings, dim=1)
+
+             # Calculate similarity and softmax probabilities
+             similarity = self._model.logit_scale.exp() * (query_embedding @ location_embeddings.T)
+             probs = similarity.softmax(dim=-1)
+
+             # Extract top predictions
+             top_values, top_indices = torch.topk(probs[0], min(top_k, len(self._gps_gallery)))
+
+             # Format results
+             predictions = []
+             for idx, confidence in zip(top_indices.cpu().numpy(), top_values.cpu().numpy()):
+                 predictions.append({
+                     "coordinates": tuple(self._gps_gallery[idx].cpu().numpy()),
+                     "confidence": float(confidence)
+                 })
+
+             return predictions
+
+     def text_to_location(self, text: str, top_k: int = 5) -> List[Dict[str, Any]]:
+         """
+         Primary entry point for text-to-location prediction pipeline.
+
+         Args:
+             text: Text description to predict location for
+             top_k: Number of top predictions to return
+
+         Returns:
+             List of prediction dictionaries with coordinates and confidence scores
+         """
+         embedding = self.embed_text(text)
+         return self.predict_location(embedding, top_k)
+
+     def image_to_location(self, image: Union[str, PIL.Image.Image, np.ndarray], top_k: int = 5) -> List[Dict[str, Any]]:
+         """
+         Primary entry point for image-to-location prediction pipeline.
+
+         Args:
+             image: Input image (PIL Image, file path, or numpy array)
+             top_k: Number of top predictions to return
+
+         Returns:
+             List of prediction dictionaries with coordinates and confidence scores
+         """
+         embedding = self.embed_image(image)
+         return self.predict_location(embedding, top_k)
+
+     def compute_similarity(self, embed1: torch.Tensor, embed2: torch.Tensor) -> float:
+         """
+         Compute cosine similarity between two embeddings.
+
+         Args:
+             embed1: First embedding tensor
+             embed2: Second embedding tensor
+
+         Returns:
+             Cosine similarity score in the range -1 to 1
+         """
+         return float(torch.nn.functional.cosine_similarity(embed1, embed2).item())
+
+     def create_map_visualization(self,
+                                  predictions: List[Dict[str, Any]],
+                                  title: str = "",
+                                  cluster: bool = False) -> folium.Map:
+         """
+         Generate geospatial visualization of prediction results.
+
+         Args:
+             predictions: List of prediction dictionaries
+             title: Optional map title
+             cluster: Whether to cluster nearby markers
+
+         Returns:
+             Folium map object with marker and heatmap layers
+         """
+         # Initialize map centered on highest confidence prediction
+         center_coords = predictions[0]["coordinates"]
+         m = folium.Map(location=center_coords, zoom_start=5, tiles="OpenStreetMap")
+
+         # Add title if provided
+         if title:
+             title_html = f'<h3 style="text-align:center">{title}</h3>'
+             m.get_root().html.add_child(folium.Element(title_html))
+
+         # Create marker cluster if requested
+         marker_group = MarkerCluster() if cluster else m
+
+         # Add markers with confidence metadata
+         for i, pred in enumerate(predictions):
+             color = 'red' if i == 0 else 'blue' if i < 3 else 'green'
+
+             folium.Marker(
+                 location=pred["coordinates"],
+                 popup=f"Prediction #{i+1}<br>Confidence: {pred['confidence']:.6f}",
+                 icon=folium.Icon(color=color)
+             ).add_to(marker_group if cluster else m)
+
+         # Add marker cluster to map if used
+         if cluster:
+             m.add_child(marker_group)
+
+         # Add heatmap layer for visual density representation
+         if len(predictions) >= 3:
+             heat_data = [[p["coordinates"][0], p["coordinates"][1], p["confidence"]]
+                          for p in predictions]
+             HeatMap(heat_data, radius=15, blur=10).add_to(m)
+
+         return m
+
+
+ def launch_gradio_interface():
+     """Deploy GeoCLIP with Gradio interface for both text and image inputs."""
+     # Initialize model with optimal compute configuration
+     geo_core = GeoCLIPCore()
+
+     def predict_from_text(text_query, top_k):
+         """Process text query and generate visualization."""
+         if not text_query.strip():
+             return None, "Please enter a location description."
+
+         # Execute prediction pipeline
+         predictions = geo_core.text_to_location(text_query, top_k=int(top_k))
+
+         # Generate map visualization
+         m = geo_core.create_map_visualization(
+             predictions,
+             title=f"Predictions for: {text_query}"
+         )
+
+         # Render the map to an HTML string for display
+         map_html = m._repr_html_()
+
+         # Format textual results
+         result_text = f"Top predictions for: '{text_query}'\n\n"
+         for i, pred in enumerate(predictions, 1):
+             coords = pred["coordinates"]
+             conf = pred["confidence"]
+             result_text += f"{i}. ({coords[0]:.6f}, {coords[1]:.6f}) - confidence: {conf:.6f}\n"
+
+         return map_html, result_text
+
+     def predict_from_image(image, top_k):
+         """Process image input and generate visualization."""
+         if image is None:
+             return None, "Please upload an image."
+
+         # Execute prediction pipeline
+         predictions = geo_core.image_to_location(image, top_k=int(top_k))
+
+         # Generate map visualization
+         m = geo_core.create_map_visualization(
+             predictions,
+             title="Predictions from Image"
+         )
+
+         # Render the map to an HTML string for display
+         map_html = m._repr_html_()
+
+         # Format textual results
+         result_text = "Top predictions from image:\n\n"
+         for i, pred in enumerate(predictions, 1):
+             coords = pred["coordinates"]
+             conf = pred["confidence"]
+             result_text += f"{i}. ({coords[0]:.6f}, {coords[1]:.6f}) - confidence: {conf:.6f}\n"
+
+         return map_html, result_text
+
+     def compute_text_similarity(text1, text2):
+         """Compute semantic similarity between two text descriptions."""
+         if not text1.strip() or not text2.strip():
+             return "Please enter both text descriptions."
+
+         embed1 = geo_core.embed_text(text1)
+         embed2 = geo_core.embed_text(text2)
+
+         similarity = geo_core.compute_similarity(embed1, embed2)
+         return f"Similarity between the texts: {similarity:.4f} (range: -1 to 1)"
+
+     # Create Gradio interface with tabs for different functions
+     with gr.Blocks(title="GeoCLIP Location Intelligence") as demo:
+         gr.Markdown("# GeoCLIP Location Intelligence")
+         gr.Markdown("Predict locations from text descriptions or images.")
+
+         with gr.Tabs():
+             with gr.TabItem("Text → Location"):
+                 with gr.Row():
+                     with gr.Column():
+                         text_input = gr.Textbox(
+                             lines=3,
+                             placeholder="Enter location description...",
+                             label="Location Description"
+                         )
+                         text_top_k = gr.Slider(
+                             minimum=1,
+                             maximum=20,
+                             value=10,
+                             step=1,
+                             label="Number of Predictions"
+                         )
+                         text_submit = gr.Button("Predict Location")
+
+                     with gr.Column():
+                         text_examples = gr.Examples(
+                             examples=[
+                                 "ancient pyramids in desert",
+                                 "Eiffel Tower in Paris",
+                                 "beach resort with palm trees",
+                                 "technology hub with startups",
+                                 "busy downtown with skyscrapers",
+                                 "mountain with snow and ski slopes",
+                                 "tropical island with clear water"
+                             ],
+                             inputs=text_input
+                         )
+
+                 text_map_output = gr.HTML(label="Map Visualization")
+                 text_result_output = gr.Textbox(label="Prediction Results")
+
+                 text_submit.click(
+                     predict_from_text,
+                     inputs=[text_input, text_top_k],
+                     outputs=[text_map_output, text_result_output]
+                 )
+
+             with gr.TabItem("Image → Location"):
+                 with gr.Row():
+                     with gr.Column():
+                         image_input = gr.Image(type="pil", label="Upload Image")
+                         image_top_k = gr.Slider(
+                             minimum=1,
+                             maximum=20,
+                             value=10,
+                             step=1,
+                             label="Number of Predictions"
+                         )
+                         image_submit = gr.Button("Predict Location")
+
+                 image_map_output = gr.HTML(label="Map Visualization")
+                 image_result_output = gr.Textbox(label="Prediction Results")
+
+                 image_submit.click(
+                     predict_from_image,
+                     inputs=[image_input, image_top_k],
+                     outputs=[image_map_output, image_result_output]
+                 )
+
+             with gr.TabItem("Semantic Similarity"):
+                 text1_input = gr.Textbox(
+                     lines=2,
+                     placeholder="Enter first description...",
+                     label="Text Description 1"
+                 )
+                 text2_input = gr.Textbox(
+                     lines=2,
+                     placeholder="Enter second description...",
+                     label="Text Description 2"
+                 )
+                 similarity_submit = gr.Button("Compute Similarity")
+                 similarity_output = gr.Textbox(label="Similarity Result")
+
+                 similarity_submit.click(
+                     compute_text_similarity,
+                     inputs=[text1_input, text2_input],
+                     outputs=similarity_output
+                 )
+
+     # Launch Gradio interface with optimized server settings
+     demo.launch(share=True, server_name="0.0.0.0")
+

  if __name__ == "__main__":
+     # Launch the Gradio interface
+     launch_gradio_interface()
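
For reference, a minimal usage sketch of the new GeoCLIPCore API outside the Gradio UI. This is illustrative only: the "from app import GeoCLIPCore" import path and the street_photo.jpg file are assumptions, not part of the commit.

# Illustrative sketch (assumed import path and image file; not from the commit)
from app import GeoCLIPCore

core = GeoCLIPCore()  # downloads GeoCLIP and CLIP weights on first run

# Text query -> top-3 candidate coordinates with confidences
for pred in core.text_to_location("old town square with a clock tower", top_k=3):
    lat, lon = pred["coordinates"]
    print(f"({lat:.4f}, {lon:.4f})  confidence={pred['confidence']:.6f}")

# Image query -> same prediction format (file path, PIL image, or numpy array)
image_preds = core.image_to_location("street_photo.jpg", top_k=3)

# Text-text similarity in the shared embedding space
score = core.compute_similarity(
    core.embed_text("alpine ski resort"),
    core.embed_text("snowy mountain village"),
)
print(f"similarity: {score:.3f}")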