mshukor committed
Commit d5f4cd4 · Parent: 02eb18e

Update app.py

Files changed (1)
  app.py: +8 -13
app.py CHANGED
@@ -63,7 +63,7 @@ vision_model_name = 'vit_base_patch16_224'
 start_layer_idx = 19
 end_layer_idx = 31
 low_cpu = True
-model_caption = ePALM(opt_model_name=text_model,
+model = ePALM(opt_model_name=text_model,
     vision_model_name=vision_model_name,
     use_vis_prefix=True,
     start_layer_idx=start_layer_idx,
@@ -73,15 +73,15 @@ model_caption = ePALM(opt_model_name=text_model,
     low_cpu=low_cpu
     )
 print("Model Built")
-model_caption.to(device)
+model.to(device)
 
 checkpoint_path = 'checkpoints/float32/ePALM_caption/checkpoint_best.pth'
 # checkpoint_path = '/data/mshukor/logs/eplam/models/accelerate/ePALM_pt_L_acc_caption/checkpoint_best.pth'
 checkpoint = torch.load(checkpoint_path, map_location='cpu')
 state_dict = checkpoint['model']
-msg = model_caption.load_state_dict(state_dict,strict=False)
+msg = model.load_state_dict(state_dict,strict=False)
 
-model_caption.bfloat16()
+model.bfloat16()
 
 # ###### VQA
 # config = 'configs/image/ePALM_vqa.yaml'
@@ -242,27 +242,22 @@ def inference(image, audio, video, task_type, instruction):
     if task_type == 'Image Captioning':
         text = ['']
         text_input = tokenizer(text, padding='longest', return_tensors="pt").to(device)
-        model = model_caption
     elif task_type == 'Video Captioning':
         text = ['']
         text_input = tokenizer(text, padding='longest', return_tensors="pt").to(device)
-        model_caption = model_caption.load_state_dict(state_dict_video_caption,strict=False)
-        model = model_caption
+        model = model.load_state_dict(state_dict_video_caption,strict=False)
     elif task_type == 'Audio Captioning':
         text = ['']
         text_input = tokenizer(text, padding='longest', return_tensors="pt").to(device)
-        model_caption = model_caption.load_state_dict(state_dict_audio_caption,strict=False)
-        model = model_caption
+        model = model.load_state_dict(state_dict_audio_caption,strict=False)
     elif task_type == 'Visual Question Answering':
         question = instruction+'?'+special_answer_token
         text_input = tokenizer(question, padding='longest', return_tensors="pt").to(device)
-        model_caption = model_caption.load_state_dict(state_dict_vqa,strict=False)
-        model = model_caption
+        model = model.load_state_dict(state_dict_vqa,strict=False)
     elif task_type == 'Visual Question Answering':
         question = instruction+'?'+special_answer_token
         text_input = tokenizer(question, padding='longest', return_tensors="pt").to(device)
-        model_caption = model_caption.load_state_dict(state_dict_video_qa,strict=False)
-        model = model_caption
+        model = model.load_state_dict(state_dict_video_qa,strict=False)
     else:
         raise NotImplemented
 
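
The renamed startup path (build ePALM, load the float32 caption checkpoint, move to the accelerator, cast to bfloat16) relies on the fact that `Module.load_state_dict`, `Module.to`, and `Module.bfloat16` all operate on the module in place, with the latter two also returning it. Below is a minimal sketch of that sequence with a stand-in `nn.Linear` instead of ePALM, since the real constructor and checkpoint live in the Space's repo.

```python
import torch
import torch.nn as nn

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Stand-in for ePALM(...); any nn.Module behaves the same for this sequence.
model = nn.Linear(8, 8)

# In app.py the state dict comes from torch.load(checkpoint_path, map_location='cpu');
# a toy dict stands in for it here. strict=False tolerates partial key overlap.
state_dict = {'weight': torch.zeros(8, 8), 'bias': torch.zeros(8)}
msg = model.load_state_dict(state_dict, strict=False)

# .to() and .bfloat16() mutate in place and return the module, so the two
# separate statements in app.py could equally be chained like this.
model = model.to(device).bfloat16()
print(next(model.parameters()).dtype)  # torch.bfloat16
```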
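
One behaviour worth noting for the new per-task branches: `nn.Module.load_state_dict` copies the weights into the module in place and returns a namedtuple of `missing_keys` and `unexpected_keys`, not the module itself, so rebinding its result to `model` leaves `model` pointing at that tuple for the rest of the request. A small sketch of the swap written so the module reference is preserved; the helper name `swap_task_weights` is illustrative, not something defined in app.py.

```python
import torch
import torch.nn as nn

def swap_task_weights(model: nn.Module, state_dict: dict) -> nn.Module:
    # load_state_dict works in place; strict=False allows keys that are
    # missing on either side (e.g. frozen backbone weights kept from startup).
    msg = model.load_state_dict(state_dict, strict=False)
    print('missing:', msg.missing_keys, 'unexpected:', msg.unexpected_keys)
    return model  # the same module object, now carrying the task weights

# Toy usage with a Linear layer standing in for ePALM.
toy = nn.Linear(4, 4)
toy = swap_task_weights(toy, {'weight': torch.zeros(4, 4)})
assert isinstance(toy, nn.Module)  # still a module, not a key-report tuple
```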
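
Two smaller observations on the dispatch chain: the two `elif task_type == 'Visual Question Answering':` tests are identical, so the branch that loads `state_dict_video_qa` is unreachable, and `raise NotImplemented` raises a `TypeError` in Python 3 because `NotImplemented` is a constant rather than an exception class (`NotImplementedError` is the intended one). A hedged sketch of the same selection as a lookup table follows; the function name and signature are illustrative rather than part of app.py.

```python
from typing import Mapping
import torch.nn as nn

def prepare_model(model: nn.Module, task_type: str,
                  task_weights: Mapping[str, dict]) -> nn.Module:
    # 'Image Captioning' keeps the weights loaded at startup; every other task
    # swaps in its own state dict with strict=False, as in the committed branches.
    if task_type == 'Image Captioning':
        return model
    if task_type not in task_weights:
        raise NotImplementedError(task_type)  # not `raise NotImplemented`
    model.load_state_dict(task_weights[task_type], strict=False)
    return model
```

With app.py's module-level objects, `task_weights` would map 'Video Captioning' to `state_dict_video_caption`, 'Audio Captioning' to `state_dict_audio_caption`, 'Visual Question Answering' to `state_dict_vqa`, and, assuming that was the intended label for the unreachable branch, 'Video Question Answering' to `state_dict_video_qa`.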