LN1996 committed
Commit 6fd64e4 · verified · 1 Parent(s): 0aeb624

Create app.py

Files changed (1)
  1. app.py +191 -0
app.py ADDED
@@ -0,0 +1,191 @@
import pandas as pd
import torch
import gradio as gr  # required for the gr.Interface UI built at the bottom of this file
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig

# Load the base Phi-2 language model on CPU.
model_name = "microsoft/phi-2"
phi2_model_pretrained = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map='cpu'
)

phi2_model_pretrained.config.use_cache = False

# Tokenizer setup: reuse the EOS token as both PAD and BOS.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=False)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.bos_token = tokenizer.eos_token
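# Phi-2 uses a 2560-dimensional embedding space, which is why the image projection defined
# further down maps CLIP's 768-dim patch features to 2560. The tokenizer has no dedicated
# pad token, hence the EOS token is reused above.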
def convert_text_input_embeds(text):
    # Tokenize the text and map the token ids to Phi-2 input embeddings.
    in_tokens = tokenizer(text, return_tensors="pt", return_attention_mask=False)
    in_embeds = phi2_model_pretrained.get_input_embeddings()(in_tokens.input_ids)
    return in_embeds
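# Illustrative usage (not executed here): for a short prompt the helper above returns a
# float tensor of shape [1, num_tokens, 2560], e.g.
#   convert_text_input_embeds("a quick test")  # -> torch.Size([1, 4, 2560]) or similar
# i.e. one embedding row per token, in the same space the image projection targets.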
import whisperx

# WhisperX ASR model used to transcribe the audio context.
whisper_model = whisperx.load_model('small', device='cpu', compute_type='float32')

def convert_audio_file_text_embeds(fname):
    # Note: despite its name, this helper returns the transcribed text (not embeddings);
    # the caller passes the transcript through convert_text_input_embeds() afterwards.
    result = whisper_model.transcribe(fname)
    full_text = ''
    for seg in result['segments']:
        full_text = full_text + seg['text']
    return full_text.strip()
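# For reference, whisper_model.transcribe() returns a dict roughly of the form
#   {'segments': [{'text': ' hello there', 'start': 0.0, 'end': 1.2}, ...], 'language': 'en'}
# which is why the loop above simply concatenates the per-segment 'text' fields.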
from transformers import CLIPVisionModel, CLIPImageProcessor

# CLIP vision tower used as a frozen image encoder; its patch features have
# shape torch.Size([1, 49, 768]) for this checkpoint.
vision_tower_name = 'openai/clip-vit-base-patch32'
image_processor = CLIPImageProcessor.from_pretrained(vision_tower_name)
vision_tower = CLIPVisionModel.from_pretrained(vision_tower_name)

def feature_select(image_forward_outs):
    # Take the last hidden layer and drop the CLS token, keeping only the patch tokens.
    image_features = image_forward_outs.hidden_states[-1]
    image_features = image_features[:, 1:, :]
    return image_features  # [1, 49, 768]

def image_CLIP_embed(image):
    # CLIP stays frozen; it is only used as a feature extractor.
    _ = vision_tower.requires_grad_(False)
    image = image_processor(images=image, return_tensors="pt")
    image_forward_out = vision_tower(image['pixel_values'].to(device=vision_tower.device), output_hidden_states=True)
    image_feature = feature_select(image_forward_out)
    return image_feature
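# Shape sketch for the image path (values specific to clip-vit-base-patch32 at 224x224 input):
#   image_CLIP_embed(img) -> [1, 49, 768]   # 7x7 patch grid, CLS token dropped by feature_select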
import torch
import torch.nn as nn
import torch.nn.functional as F

class CustomGELU(nn.Module):
    def forward(self, x):
        return F.gelu(x.clone())

class SimpleResBlock(nn.Module):
    # Residual MLP block applied after the linear image projection.
    def __init__(self, input_size):
        super().__init__()
        self.pre_norm = nn.LayerNorm(input_size)
        self.proj = nn.Sequential(
            nn.Linear(input_size, input_size),
            nn.GELU(),
            nn.Linear(input_size, input_size)
        )

    def forward(self, x):
        x = self.pre_norm(x)
        return x + self.proj(x)

class CLIPembed_projection(nn.Module):
    # Projects CLIP patch features (768-d) into the Phi-2 embedding space (2560-d).
    def __init__(self, input_dim_CLIP=768, input_dim_phi2=2560):
        super(CLIPembed_projection, self).__init__()
        self.input_dim_CLIP = input_dim_CLIP
        self.input_dim_phi2 = input_dim_phi2
        self.projection_img = nn.Linear(self.input_dim_CLIP, self.input_dim_phi2,
                                        bias=False)
        self.resblock = SimpleResBlock(self.input_dim_phi2)

    def forward(self, x):
        x = self.projection_img(x)
        x = self.resblock(x)
        return x
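# Quick sanity check for the projection stack above (illustrative only, with random weights):
#   probe = torch.randn(1, 49, 768)          # stand-in for CLIP patch features
#   CLIPembed_projection()(probe).shape      # -> torch.Size([1, 49, 2560])
# The output width matches Phi-2's token-embedding width, so the projected image tokens can be
# concatenated with text-token embeddings along the sequence dimension.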
Image_projection_layer = CLIPembed_projection()

location_projection_img_p1 = './weights/stage_2/run2_projection_img.pth'
location_projection_img_p2 = './weights/stage_2/run2_resblock.pth'

# Load projection_img and resblock weights trained in stage 2.
Image_projection_layer.projection_img.load_state_dict(torch.load(location_projection_img_p1, map_location='cpu'))
Image_projection_layer.resblock.load_state_dict(torch.load(location_projection_img_p2, map_location='cpu'))

def img_input_embed(image):
    # CLIP features -> projection into the Phi-2 embedding space.
    clip_embed = image_CLIP_embed(image)
    post_projection = Image_projection_layer(clip_embed)
    return post_projection

device = 'cpu'

user = "LN1996"  # put your user name here
model_name = "peft-qlora-run2"
model_id = f"{user}/{model_name}"

import peft

# Attach the QLoRA adapters fine-tuned on top of Phi-2.
phi2_model_pretrained_peft = peft.PeftModel.from_pretrained(phi2_model_pretrained, model_id)
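# PeftModel wraps the base Phi-2 model, so get_input_embeddings() on the wrapped model below
# resolves to the same token-embedding table used in convert_text_input_embeds() (assuming the
# LoRA adapters in this checkpoint do not retarget the embedding layer).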
def input_multimodel(image=None, audio=None, text=None, query=None):
    # Treat empty strings coming from the Gradio textboxes as missing inputs.
    if text is not None and len(text) == 0:
        text = None

    if query is not None and len(query) == 0:
        query = None

    if query is None:
        print('Please ask a query')
        return None

    if image is None and audio is None and text is None:
        print('Please provide context in the form of an image, audio, or text')
        return None

    # Start the prompt with "Context: " in Phi-2's embedding space.
    bos = tokenizer("Context: ", return_tensors="pt", return_attention_mask=False)
    input_embeds_stage_2 = phi2_model_pretrained_peft.get_input_embeddings()(bos.input_ids)

    # Append the projected CLIP embeddings for the image context.
    if image is not None:
        image_embeds = img_input_embed(image)
        input_embeds_stage_2 = torch.cat((input_embeds_stage_2, image_embeds), dim=1)

    # Transcribe the audio context and append its text embeddings.
    if audio is not None:
        audio_transcribed = convert_audio_file_text_embeds(audio)
        audio_embeds = convert_text_input_embeds(audio_transcribed)
        input_embeds_stage_2 = torch.cat((input_embeds_stage_2, audio_embeds), dim=1)

    # Append the text context embeddings.
    if text is not None:
        text_embeds = convert_text_input_embeds(text)
        input_embeds_stage_2 = torch.cat((input_embeds_stage_2, text_embeds), dim=1)

    # Append the question and the "Answer:" cue.
    qus = tokenizer(" Question: " + query, return_tensors="pt",
                    return_attention_mask=False)
    qus_embeds = phi2_model_pretrained_peft.get_input_embeddings()(qus.input_ids)
    input_embeds_stage_2 = torch.cat((input_embeds_stage_2, qus_embeds), dim=1)

    ans = tokenizer(" Answer: ", return_tensors="pt", return_attention_mask=False)
    ans_embeds = phi2_model_pretrained_peft.get_input_embeddings()(ans.input_ids)
    input_embeds_stage_2 = torch.cat((input_embeds_stage_2, ans_embeds), dim=1)

    # Generate directly from the assembled input embeddings.
    result = phi2_model_pretrained_peft.generate(inputs_embeds=input_embeds_stage_2,
                                                 bos_token_id=tokenizer.bos_token_id)

    # Decode and return the first non-empty chunk around the EOS token.
    process = tokenizer.batch_decode(result)[0]
    process = process.split(tokenizer.eos_token)

    if process[0] == '':
        return process[1]
    else:
        return process[0]
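# Prompt layout assembled above, entirely in embedding space:
#   "Context: " [image patches] [audio transcript] [text] " Question: <query>" " Answer: "
# Generation continues from the " Answer: " cue, and the first non-empty EOS-delimited chunk
# of the decoded output is returned to the UI.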
title = "Multi-Modal Phi-2"
description = "A simple Gradio interface for a multi-modal (image, text, audio) version of Microsoft Phi-2"

demo = gr.Interface(input_multimodel,
                    inputs=[gr.Image(label="Input context Image"),
                            gr.Audio(label="Input context Audio", source="microphone", type="filepath"),
                            gr.Textbox(label="Input context Text"),
                            gr.Textbox(label="Input Query"),
                            ],
                    outputs=[
                        gr.Textbox(label='Answer'),
                    ],
                    title=title,
                    description=description,
                    )
demo.launch(share=True)
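# Note: share=True additionally requests a temporary public gradio.live link; it is not needed
# when the app runs on a hosted Space, where the app URL is already public.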