Spaces:

WhereIsAI
/

Flickr30k-Search

Running

App Files Files Community

SeanLee97 commited on 16 days ago

Commit

5a30220

1 Parent(s): 490ff19

init commit

Browse files

Files changed (2) hide show

app.py +147 -0
requirements.txt +6 -0

app.py ADDED Viewed

	@@ -0,0 +1,147 @@

+from typing import List, Dict, Optional, Tuple
+import faiss
+import angle_emb
+import torch
+import numpy as np
+from tqdm import tqdm
+from datasets import Dataset
+class FlickrAngleSearch:
+    def __init__(self, model_name: str = "WhereIsAI/UAE-Large-V1", device: str = "cuda:0"):
+        """Initialize the search engine with model and empty index"""
+        self.model = angle_emb.AnglE(model_name, pooling_strategy='cls', device=device)
+        self._index: Optional[faiss.IndexFlatIP] = None
+        self.captions: Optional[List[str]] = None
+        self.caption2image: Optional[Dict[str, int]] = None
+        self.ds: Optional[Dataset] = None
+    def index(self, dataset: Dataset) -> "FlickrAngleSearch":
+        """Build the search index from a dataset"""
+        self.ds = dataset
+        # Extract unique captions and build caption->image mapping
+        captions: List[str] = []
+        caption2image: Dict[str, int] = {}
+        for i, example in enumerate(tqdm(dataset)):
+            for caption in example['caption']:
+                if caption not in caption2image:
+                    captions.append(caption)
+                    caption2image[caption] = i
+        self.captions = captions
+        self.caption2image = caption2image
+        # Encode captions
+        print(f"Encoding {len(captions)} unique captions...")
+        caption_embeddings = self.encode(captions)
+        # Build FAISS index
+        dimension = caption_embeddings.shape[1]
+        self._index = faiss.IndexFlatIP(dimension)
+        self._index.add(caption_embeddings)
+        return self
+    @classmethod
+    def from_preindexed(cls, index_path: str, captions_path: str, caption2image_path: str, dataset: Dataset, device: str = "cpu") -> "FlickrAngleSearch":
+        """Load a pre-built index and mappings"""
+        instance = cls(device=device)
+        instance._index = faiss.read_index(index_path)
+        instance.captions = torch.load(captions_path)
+        instance.caption2image = torch.load(caption2image_path)
+        instance.ds = dataset
+        return instance
+    def save_index(self, index_path: str, captions_path: str, caption2image_path: str) -> None:
+        """Save the index and mappings to disk"""
+        faiss.write_index(self._index, index_path)
+        torch.save(self.captions, captions_path)
+        torch.save(self.caption2image, caption2image_path)
+    def encode(self, texts: List[str], batch_size: int = 32) -> np.ndarray:
+        """Encode a list of texts to embeddings"""
+        embeddings: List[np.ndarray] = []
+        for i in tqdm(range(0, len(texts), batch_size), desc="Encoding texts"):
+            batch = texts[i:i + batch_size]
+            with torch.no_grad():
+                embs = self.model.encode(batch, to_numpy=True, device=self.model.device)
+                embeddings.extend(embs)
+        return np.stack(embeddings)
+    def search(self, query: str, k: int = 5) -> List[Tuple[float, str, int]]:
+        """
+        Search for the top-k most relevant captions and their corresponding images
+        Args:
+            query: Text query to search for
+            k: Number of results to return
+        Returns:
+            List of (score, caption, image_index) tuples
+        """
+        # Encode the query text
+        query_embedding = self.encode([query])
+        # Search the index
+        scores, indices = self._index.search(query_embedding, k)
+        # Get the results
+        results: List[Tuple[float, str, int]] = []
+        for score, idx in zip(scores[0], indices[0]):
+            caption = self.captions[idx]
+            image_idx = self.caption2image[caption]
+            results.append((float(score), caption, image_idx))
+        return results
+if __name__ == "__main__":
+    import os
+    import gradio as gr
+    from datasets import load_dataset
+    from huggingface_hub import snapshot_download
+    local_dir = snapshot_download(repo_id='WhereIsAI/angle-flickr-index')
+    ds = load_dataset("nlphuji/flickr30k", split='test')
+    search = FlickrAngleSearch.from_preindexed(
+        os.path.join(local_dir, 'index.faiss'),
+        os.path.join(local_dir, 'captions.pt'),
+        os.path.join(local_dir, 'caption2image.pt'),
+        ds,
+        device='cpu'
+    )
+    def search_and_display(query, num_results=5):
+        results = search.search(query, k=num_results)
+        images = []
+        captions = []
+        similarities = []
+        for similarity, caption, image_idx in results:
+            image = ds[image_idx]['image']
+            images.append(image)
+            captions.append(caption)
+            similarities.append(f"{similarity:.4f}")
+        return images, captions, similarities
+    demo = gr.Interface(
+        fn=search_and_display,
+        inputs=[
+            gr.Textbox(label="Search Query"),
+            gr.Slider(minimum=1, maximum=10, value=5, step=1, label="Number of Results")
+        ],
+        outputs=[
+            gr.Gallery(label="Top Results"),
+            gr.Dataframe(headers=["Caption"], label="Captions"),
+            gr.Dataframe(headers=["Similarity Score"], label="Similarity Scores")
+        ],
+        title="Flickr Image Search",
+        description="Search through Flickr images using natural language queries"
+    )
+    demo.launch(share=True)

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+torch
+datasets
+faiss-cpu
+gradio
+huggingface-hub
+angle-emb