Update app.py
app.py CHANGED
@@ -10,6 +10,7 @@ print(torch.__version__)
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 print(device)
 
+print('importing tokenizer')
 from transformers import GPT2Tokenizer,GPT2LMHeadModel,DataCollatorWithPadding
 
 tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
@@ -132,12 +133,9 @@ class ClipCaptionModel(nn.Module):
 
         # prepare mask
         if mask.shape[1] != embedding_cat.shape[1]:
-            dummy_mask = torch.ones(tokens.shape[0],self.prefix_length, dtype=torch.int64, device=
+            dummy_mask = torch.ones(tokens.shape[0],self.prefix_length, dtype=torch.int64, device=mask.device)
             mask = torch.cat([dummy_mask,mask],dim=1)
 
-        if labels is not None:
-            dummy_token = torch.zeros(tokens.shape[0],self.prefix_length, dtype=torch.int64, device=device)
-            labels = torch.cat((dummy_token, tokens), dim=1)
 
         return self.gpt(inputs_embeds=embedding_cat,
                         labels=labels,
@@ -167,6 +165,7 @@ class ClipCaptionModel(nn.Module):
                               dropout_rate = dropout_rate)
 
 
+print('loading model')
 ## Prepare Model
 CliPGPT = ClipCaptionModel()
 path = "model_epoch_1.pt"
@@ -176,6 +175,7 @@ state_dict = torch.load(path)
 CliPGPT.load_state_dict(state_dict)
 CliPGPT.to(device)
 
+print('importing CLIP')
 from transformers import CLIPProcessor, CLIPModel
 
 model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
@@ -247,6 +247,7 @@ def generate(image,
     return tokens[0].replace('#','').strip()
 
 
+print('app starts')
 st.title("CLIP GPT2 Image Captionning")
 st.write("This is a web app for generating captions for images using a model built with CLIP & GPT2.")
 
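For context, here is a minimal standalone sketch of what the rewritten dummy_mask line does. It is an illustration, not the app's actual forward() method: pad_mask_for_prefix is a hypothetical helper name, while tokens, mask, embedding_cat and prefix_length are the names that appear in the diff above.

# Minimal sketch (assumption: this mirrors the mask handling in ClipCaptionModel.forward).
import torch

def pad_mask_for_prefix(tokens: torch.Tensor,
                        mask: torch.Tensor,
                        embedding_cat: torch.Tensor,
                        prefix_length: int) -> torch.Tensor:
    # The prefix embeddings are concatenated in front of the token embeddings,
    # so the attention mask must grow by prefix_length columns to match.
    if mask.shape[1] != embedding_cat.shape[1]:
        # device=mask.device keeps the padding on the same device as the
        # incoming batch rather than relying on a module-level `device` global.
        dummy_mask = torch.ones(tokens.shape[0], prefix_length,
                                dtype=torch.int64, device=mask.device)
        mask = torch.cat([dummy_mask, mask], dim=1)
    return mask

Inside the model, the equivalent call would be mask = pad_mask_for_prefix(tokens, mask, embedding_cat, self.prefix_length) just before the self.gpt(...) call shown in the diff.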