Upload 2 files
- best_model.pt +3 -0
- eval.py +74 -0
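Both files added in this commit can be fetched locally with huggingface_hub. A minimal sketch; the Space id below is a placeholder, since the actual repository name is not shown in this view:

from huggingface_hub import hf_hub_download

REPO_ID = "user/clip-flickr30k-eval"  # hypothetical Space id

# repo_type="space" targets a Space repository rather than a model repo.
eval_path = hf_hub_download(repo_id=REPO_ID, filename="eval.py", repo_type="space")
ckpt_path = hf_hub_download(repo_id=REPO_ID, filename="best_model.pt", repo_type="space")
print(eval_path, ckpt_path)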
best_model.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ed57591f55d71c06050876296cfabd390e5265ca035dd98e4b8eaecd12203cfe
size 605264460
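The Git LFS pointer stores only the object's SHA-256 digest and byte size. A small sketch (the local path is an assumption) for checking a downloaded copy of best_model.pt against those recorded values:

import hashlib
import os

expected_sha256 = "ed57591f55d71c06050876296cfabd390e5265ca035dd98e4b8eaecd12203cfe"
expected_size = 605264460

path = "best_model.pt"  # hypothetical local path to the downloaded checkpoint
h = hashlib.sha256()
with open(path, "rb") as f:
    # Hash in 1 MiB chunks so the ~600 MB file is never fully loaded into memory.
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)

print("size ok:  ", os.path.getsize(path) == expected_size)
print("sha256 ok:", h.hexdigest() == expected_sha256)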
eval.py
ADDED
@@ -0,0 +1,74 @@
import torch
import clip
from datasets import load_dataset
from PIL import Image
from torchvision import transforms
import requests
from io import BytesIO
import numpy as np
import faiss

device = "cuda" if torch.cuda.is_available() else "cpu"
model_clip, preprocess = clip.load("ViT-B/32", device=device)

# Load Flickr30k test split
dataset = load_dataset("nlphuji/flickr30k", split="test")

image_embeddings = []
text_embeddings = []
ground_truth = []

images = []
captions = []

print("Extracting embeddings...")

for i, example in enumerate(dataset):
    try:
        # Assumes each row exposes an image URL and a single caption string.
        img = Image.open(requests.get(example["image"], stream=True).raw).convert("RGB")
        images.append(img)
        captions.append(example["sentence"])

        img_tensor = preprocess(img).unsqueeze(0).to(device)
        with torch.no_grad():
            img_feat = model_clip.encode_image(img_tensor)
            img_feat /= img_feat.norm(dim=-1, keepdim=True)
            image_embeddings.append(img_feat.cpu())

            # Text encoding also runs under no_grad so no autograd graph is kept around.
            txt_token = clip.tokenize([example["sentence"]]).to(device)
            txt_feat = model_clip.encode_text(txt_token)
            txt_feat /= txt_feat.norm(dim=-1, keepdim=True)
            text_embeddings.append(txt_feat.cpu())

        # Position of this pair in the FAISS indexes (i itself would drift if a row were skipped).
        ground_truth.append(len(image_embeddings) - 1)
    except Exception:
        continue

# FAISS expects float32; CLIP features are fp16 when the model runs on GPU.
image_embeddings = torch.cat(image_embeddings, dim=0).float()
text_embeddings = torch.cat(text_embeddings, dim=0).float()

# Build FAISS indexes (inner product on L2-normalized features == cosine similarity)
image_index = faiss.IndexFlatIP(image_embeddings.shape[1])
image_index.add(image_embeddings.numpy())

text_index = faiss.IndexFlatIP(text_embeddings.shape[1])
text_index.add(text_embeddings.numpy())

# Retrieval accuracy (Recall@1, 5, 10), used for both directions
def compute_recall(query_embeddings, index, ground_truth, k_values=(1, 5, 10)):
    D, I = index.search(query_embeddings.numpy(), max(k_values))
    recalls = {k: 0 for k in k_values}
    for i, gt in enumerate(ground_truth):
        for k in k_values:
            if gt in I[i][:k]:
                recalls[k] += 1
    total = len(ground_truth)
    return {f"Recall@{k}": round((recalls[k] / total) * 100, 2) for k in k_values}

print("Evaluating text-to-image retrieval...")
text_to_image_recall = compute_recall(text_embeddings, image_index, ground_truth)
print("Text-to-Image:", text_to_image_recall)

print("Evaluating image-to-text retrieval...")
image_to_text_recall = compute_recall(image_embeddings, text_index, ground_truth)
print("Image-to-Text:", image_to_text_recall)
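As a standalone sanity check of the Recall@k logic above (not part of the Space itself), here is a toy example on synthetic unit vectors: each "caption" embedding is a noisy copy of its paired "image" embedding, so Recall@1 should come out near 100%.

import numpy as np
import faiss

# Synthetic paired embeddings: caption i is a slightly perturbed copy of image i.
rng = np.random.default_rng(0)
n, d = 8, 32
img = rng.normal(size=(n, d)).astype("float32")
img /= np.linalg.norm(img, axis=1, keepdims=True)
txt = (img + 0.05 * rng.normal(size=(n, d))).astype("float32")
txt /= np.linalg.norm(txt, axis=1, keepdims=True)

index = faiss.IndexFlatIP(d)   # inner product == cosine similarity on unit vectors
index.add(img)
_, I = index.search(txt, 5)    # top-5 image ids for each caption

for k in (1, 5):
    hits = sum(gt in I[i][:k] for i, gt in enumerate(range(n)))
    print(f"Recall@{k}: {100.0 * hits / n:.1f}%")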