Spaces:

mshukor
/

eP-ALM

Runtime error

App Files Community

mshukor committed on Jul 14, 2023

Commit

902be23

1 Parent(s): 78ad2cd

vqa

Browse files

Files changed (1) hide show

app.py +41 -9

app.py CHANGED Viewed

@@ -80,13 +80,9 @@ checkpoint = torch.load(checkpoint_path, map_location='cpu')
 state_dict = checkpoint['model']
 msg = model_caption.load_state_dict(state_dict,strict=False)
 ###### VQA
 config = 'configs/image/ePALM_vqa.yaml'
 config = yaml.load(open(config, 'r'))
@@ -112,6 +108,28 @@ state_dict = checkpoint['model']
 msg = model_vqa.load_state_dict(state_dict,strict=False)
@@ -148,8 +166,7 @@ num_beams=3
 max_length=30
-model_caption.bfloat16()
-model_vqa.bfloat16()
 def inference(image, audio, video, task_type, instruction):
@@ -157,11 +174,26 @@ def inference(image, audio, video, task_type, instruction):
     if task_type == 'Image Captioning':
         text = ['']
         text_input = tokenizer(text, padding='longest', return_tensors="pt").to(device)
-        model = model_caption
     elif task_type == 'Visual Question Answering':
         question = instruction+'?'+special_answer_token
         text_input = tokenizer(question, padding='longest', return_tensors="pt").to(device)
-        model = model_vqa
     else:
         raise NotImplemented

 state_dict = checkpoint['model']
 msg = model_caption.load_state_dict(state_dict,strict=False)
+model_caption.bfloat16()
 ###### VQA
 config = 'configs/image/ePALM_vqa.yaml'
 config = yaml.load(open(config, 'r'))
 msg = model_vqa.load_state_dict(state_dict,strict=False)
+model_vqa.bfloat16()
+# Video Captioning
+checkpoint_path = 'checkpoints/float32/ePALM_video_caption_msrvtt/checkpoint_best.pth'
+# checkpoint_path = '/data/mshukor/logs/eplam/models/accelerate/ePALM_pt_L_acc_caption/checkpoint_best.pth'
+checkpoint = torch.load(checkpoint_path, map_location='cpu')
+state_dict_video_caption = checkpoint['model']
+# Video QA
+checkpoint_path = 'checkpoints/float32/ePALM_video_qa_msrvtt/checkpoint_best.pth'
+# checkpoint_path = '/data/mshukor/logs/eplam/models/accelerate/ePALM_pt_L_acc_caption/checkpoint_best.pth'
+checkpoint = torch.load(checkpoint_path, map_location='cpu')
+state_dict_video_qa = checkpoint['model']
+# Audio Captioning
+checkpoint_path = 'checkpoints/float32/ePALM_audio_caption/checkpoint_best.pth'
+# checkpoint_path = '/data/mshukor/logs/eplam/models/accelerate/ePALM_pt_L_acc_caption/checkpoint_best.pth'
+checkpoint = torch.load(checkpoint_path, map_location='cpu')
+state_dict_audio_caption = checkpoint['model']
 max_length=30
 def inference(image, audio, video, task_type, instruction):
     if task_type == 'Image Captioning':
         text = ['']
         text_input = tokenizer(text, padding='longest', return_tensors="pt").to(device)
+        model = model_caption.clone()
+    elif task_type == 'Video Captioning':
+        text = ['']
+        text_input = tokenizer(text, padding='longest', return_tensors="pt").to(device)
+        model_caption = model_caption.load_state_dict(state_dict_video_caption,strict=False)
+        model = model_caption.clone()
+    elif task_type == 'Audio Captioning':
+        text = ['']
+        text_input = tokenizer(text, padding='longest', return_tensors="pt").to(device)
+        model_caption = model_caption.load_state_dict(state_dict_audio_caption,strict=False)
+        model = model_caption.clone()
+    elif task_type == 'Visual Question Answering':
+        question = instruction+'?'+special_answer_token
+        text_input = tokenizer(question, padding='longest', return_tensors="pt").to(device)
+        model = model_vqa.clone()
     elif task_type == 'Visual Question Answering':
         question = instruction+'?'+special_answer_token
         text_input = tokenizer(question, padding='longest', return_tensors="pt").to(device)
+        model_vqa = model_vqa.load_state_dict(state_dict_video_qa,strict=False)
+        model = model_vqa.clone()
     else:
         raise NotImplemented