Upload 3 files
added inferencing files
- clip_inferencing.py +65 -0
- clip_model.py +53 -0
- configuration.py +33 -0
clip_inferencing.py
ADDED
@@ -0,0 +1,65 @@
import torch
import torch.nn.functional as F
from transformers import DistilBertTokenizer
from tqdm.autonotebook import tqdm
import pickle

from clip_model import CLIPModel
from configuration import CFG

import matplotlib.pyplot as plt
import cv2

def load_model(model_path):
    # Load the trained weights onto the configured device and switch to eval mode
    model = CLIPModel().to(CFG.device)
    model.load_state_dict(torch.load(model_path, map_location=CFG.device))
    model.eval()
    return model

def load_df():
    with open("pickles/valid_df.pkl", 'rb') as file:
        valid_df = pickle.load(file)
    return valid_df

def load_image_embeddings():
    with open("pickles/image_embeddings.pkl", 'rb') as file:
        image_embeddings = pickle.load(file)
    return image_embeddings

def find_matches(model, image_embeddings, query, image_filenames, n=9):
    # Encode the text query with the same tokenizer used during training
    tokenizer = DistilBertTokenizer.from_pretrained(CFG.text_tokenizer)
    encoded_query = tokenizer([query])
    batch = {
        key: torch.tensor(values).to(CFG.device)
        for key, values in encoded_query.items()
    }
    with torch.no_grad():
        text_features = model.text_encoder(
            input_ids=batch["input_ids"], attention_mask=batch["attention_mask"]
        )
        text_embeddings = model.text_projection(text_features)

    # Cosine similarity between the query and every precomputed image embedding
    image_embeddings_n = F.normalize(image_embeddings, p=2, dim=-1)
    text_embeddings_n = F.normalize(text_embeddings, p=2, dim=-1)
    dot_similarity = text_embeddings_n @ image_embeddings_n.T

    # Take the top n*5 candidates and keep every 5th so near-duplicate rows are skipped
    values, indices = torch.topk(dot_similarity.squeeze(0), n * 5)
    matches = [image_filenames[idx] for idx in indices[::5]]

    _, axes = plt.subplots(3, 3, figsize=(10, 10))
    for match, ax in zip(matches, axes.flatten()):
        image = cv2.imread(f"Images/{match}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        ax.imshow(image)
        ax.axis("off")

    plt.show()

def inference():
    valid_df = load_df()
    image_embeddings = load_image_embeddings()
    find_matches(load_model(model_path="model/best.pt"),
                 image_embeddings,
                 query="dogs on the grass",
                 image_filenames=valid_df['image'].values, n=9)
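The module defines inference() but no entry point. A minimal way to run it directly might look like the sketch below; this is an illustrative addition rather than part of the uploaded file, and it assumes the pickles/, model/ and Images/ paths referenced above already exist.

# Hypothetical entry point, shown only as a usage sketch
if __name__ == "__main__":
    inference()  # expects pickles/valid_df.pkl, pickles/image_embeddings.pkl, model/best.pt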
clip_model.py
ADDED
@@ -0,0 +1,53 @@
from torch import nn
import torch.nn.functional as F

from image_encoder import ImageEncoder
from text_encoder import TextEncoder
from projection_head import ProjectionHead
from configuration import CFG


class CLIPModel(nn.Module):
    def __init__(
        self,
        temperature=CFG.temperature,
        image_embedding=CFG.image_embedding,
        text_embedding=CFG.text_embedding,
    ):
        super().__init__()
        self.image_encoder = ImageEncoder()
        self.text_encoder = TextEncoder()
        self.image_projection = ProjectionHead(embedding_dim=image_embedding)
        self.text_projection = ProjectionHead(embedding_dim=text_embedding)
        self.temperature = temperature

    def forward(self, batch):
        # Getting Image and Text Features
        image_features = self.image_encoder(batch["image"])
        text_features = self.text_encoder(
            input_ids=batch["input_ids"], attention_mask=batch["attention_mask"]
        )
        # Getting Image and Text Embeddings (with same dimension)
        image_embeddings = self.image_projection(image_features)
        text_embeddings = self.text_projection(text_features)

        # Calculating the Loss
        logits = (text_embeddings @ image_embeddings.T) / self.temperature
        images_similarity = image_embeddings @ image_embeddings.T
        texts_similarity = text_embeddings @ text_embeddings.T
        targets = F.softmax(
            (images_similarity + texts_similarity) / 2 * self.temperature, dim=-1
        )
        texts_loss = cross_entropy(logits, targets, reduction='none')
        images_loss = cross_entropy(logits.T, targets.T, reduction='none')
        loss = (images_loss + texts_loss) / 2.0  # shape: (batch_size)
        return loss.mean()


def cross_entropy(preds, targets, reduction='none'):
    log_softmax = nn.LogSoftmax(dim=-1)
    loss = (-targets * log_softmax(preds)).sum(1)
    if reduction == "none":
        return loss
    elif reduction == "mean":
        return loss.mean()
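To make the symmetric contrastive loss above easier to follow in isolation, here is a small self-contained sketch that reproduces the same computation on random embeddings, without ImageEncoder, TextEncoder, or ProjectionHead (which live in files not included in this upload). Tensor names mirror forward() above; the random inputs are placeholders, not real model outputs.

import torch
import torch.nn.functional as F

torch.manual_seed(0)
batch_size, dim, temperature = 4, 256, 1.0

# Stand-ins for the projected embeddings produced inside CLIPModel.forward
image_embeddings = torch.randn(batch_size, dim)
text_embeddings = torch.randn(batch_size, dim)

# Same steps as forward(): soft targets from the two self-similarity matrices,
# then a cross-entropy in both the text->image and image->text directions
logits = (text_embeddings @ image_embeddings.T) / temperature
images_similarity = image_embeddings @ image_embeddings.T
texts_similarity = text_embeddings @ text_embeddings.T
targets = F.softmax((images_similarity + texts_similarity) / 2 * temperature, dim=-1)

texts_loss = (-targets * F.log_softmax(logits, dim=-1)).sum(1)
images_loss = (-targets.T * F.log_softmax(logits.T, dim=-1)).sum(1)
loss = ((texts_loss + images_loss) / 2.0).mean()
print(loss)  # scalar loss for this dummy batch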
configuration.py
ADDED
@@ -0,0 +1,33 @@
import torch

class CFG:
    debug = False
    batch_size = 32
    num_workers = 2
    head_lr = 1e-3
    image_encoder_lr = 1e-4
    text_encoder_lr = 1e-5
    weight_decay = 1e-3
    patience = 1
    factor = 0.8
    epochs = 1  # 4
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model_name = 'resnet50'
    image_embedding = 2048
    text_encoder_model = "distilbert-base-uncased"
    text_embedding = 768
    text_tokenizer = "distilbert-base-uncased"
    max_length = 200

    pretrained = True  # for both image encoder and text encoder
    trainable = True   # for both image encoder and text encoder
    temperature = 1.0

    # image size
    size = 224

    # for projection head; used for both image and text encoders
    num_projection_layers = 1
    projection_dim = 256
    dropout = 0.1
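As a quick sanity check of these settings, a sketch like the following could confirm that the tokenizer name, max_length, and device resolve correctly before running inference; it is illustrative only and not part of the upload.

# Illustrative check: CFG.text_tokenizer loads and CFG.device resolves
import torch
from transformers import DistilBertTokenizer
from configuration import CFG

tokenizer = DistilBertTokenizer.from_pretrained(CFG.text_tokenizer)
tokens = tokenizer(["dogs on the grass"], padding=True, truncation=True,
                   max_length=CFG.max_length, return_tensors="pt")
print(tokens["input_ids"].shape, CFG.device)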