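# app.py -- Gradio demo for a multi-modal (image / audio / text) assistant built on
# microsoft/phi-2: image context is encoded by a frozen CLIP vision tower and projected
# into Phi-2's embedding space, audio context is transcribed with WhisperX, and a
# QLoRA (PEFT) adapter on top of Phi-2 generates the answer.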
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Base Phi-2 model, loaded on CPU with the KV cache disabled.
model_name = "microsoft/phi-2"
phi2_model_pretrained = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map='cpu',
)
phi2_model_pretrained.config.use_cache = False

# Reuse the EOS token as both the pad and BOS token.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=False)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.bos_token = tokenizer.eos_token
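# Text context goes straight through Phi-2's input-embedding layer so it can be
# concatenated with the other modalities along the sequence dimension.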
def convert_text_input_embeds(text):
    in_tokens = tokenizer(text, return_tensors="pt", return_attention_mask=False)
    in_embeds = phi2_model_pretrained.get_input_embeddings()(in_tokens.input_ids)
    return in_embeds
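# Audio context is handled by transcribing it with WhisperX and then embedding the
# transcript as ordinary text, so convert_audio_file_text_embeds returns plain text
# rather than embeddings despite its name.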
import whisperx

whisper_model = whisperx.load_model('small', device='cpu', compute_type='float32')

def convert_audio_file_text_embeds(fname):
    result = whisper_model.transcribe(fname)
    full_text = ''
    for seg in result['segments']:
        full_text = full_text + seg['text']
    return full_text.strip()
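# Image context: a frozen CLIP ViT-B/32 vision tower yields one 768-d feature per image
# patch (49 patches once the CLS token is dropped).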
from transformers import CLIPVisionModel, CLIPImageProcessor

vision_tower_name = 'openai/clip-vit-base-patch32'  ## torch.Size([1, 49, 768])
image_processor = CLIPImageProcessor.from_pretrained(vision_tower_name)
vision_tower = CLIPVisionModel.from_pretrained(vision_tower_name)
vision_tower.requires_grad_(False)  # frozen, inference only

def feature_select(image_forward_outs):
    image_features = image_forward_outs.hidden_states[-1]  # last layer
    image_features = image_features[:, 1:, :]  # drop the CLS token
    return image_features  # [1, 49, 768]

def image_CLIP_embed(image):
    image = image_processor(images=image, return_tensors="pt")
    image_forward_out = vision_tower(image['pixel_values'].to(device=vision_tower.device),
                                     output_hidden_states=True)
    image_feature = feature_select(image_forward_out)
    return image_feature
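# Projection network: a bias-free linear layer followed by a residual MLP block maps the
# 768-d CLIP patch features into Phi-2's 2560-d embedding space.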
import torch.nn as nn
import torch.nn.functional as F

# Note: CustomGELU is kept for reference but SimpleResBlock below uses nn.GELU directly.
class CustomGELU(nn.Module):
    def forward(self, x):
        return F.gelu(x.clone())
class SimpleResBlock(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        self.pre_norm = nn.LayerNorm(input_size)
        self.proj = nn.Sequential(
            nn.Linear(input_size, input_size),
            nn.GELU(),
            nn.Linear(input_size, input_size)
        )

    def forward(self, x):
        x = self.pre_norm(x)
        return x + self.proj(x)
class CLIPembed_projection(nn.Module):
    def __init__(self, input_dim_CLIP=768, input_dim_phi2=2560):
        super().__init__()
        self.input_dim_CLIP = input_dim_CLIP
        self.input_dim_phi2 = input_dim_phi2
        self.projection_img = nn.Linear(self.input_dim_CLIP, self.input_dim_phi2, bias=False)
        self.resblock = SimpleResBlock(self.input_dim_phi2)

    def forward(self, x):
        x = self.projection_img(x)
        x = self.resblock(x)
        return x
Image_projection_layer = CLIPembed_projection()

# Load projection_img and resblock weights from stage 2.
location_projection_img_p1 = './weights/stage_2/run2_projection_img.pth'
location_projection_img_p2 = './weights/stage_2/run2_resblock.pth'
Image_projection_layer.projection_img.load_state_dict(torch.load(location_projection_img_p1, map_location='cpu'))
Image_projection_layer.resblock.load_state_dict(torch.load(location_projection_img_p2, map_location='cpu'))
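# End-to-end image path: raw image -> CLIP features [1, 49, 768] -> projection -> [1, 49, 2560].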
def img_input_embed(image):
    clip_embed = image_CLIP_embed(image)
    post_projection = Image_projection_layer(clip_embed)
    return post_projection
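# Attach the QLoRA (PEFT) adapter hosted on the Hugging Face Hub to the Phi-2 base model.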
device = 'cpu'
user = "LN1996"  # put your user name here
adapter_name = "peft-qlora-run2"
model_id = f"{user}/{adapter_name}"

import peft
phi2_model_pretrained_peft = peft.PeftModel.from_pretrained(phi2_model_pretrained, model_id)
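# Inference: the prompt is assembled entirely in embedding space as
#   "Context: " + <image/audio/text embeddings> + " Question: <query>" + " Answer: "
# and passed to generate() via inputs_embeds.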
def input_multimodel(image=None, audio=None, text=None, query=None):
    # Treat empty strings from the Gradio textboxes as missing inputs.
    if not text:
        text = None
    if not query:
        query = None
    if query is None:
        return 'Please ask a query'
    if image is None and audio is None and text is None:
        return 'Please provide context in the form of an image, audio, or text'

    bos = tokenizer("Context: ", return_tensors="pt", return_attention_mask=False)
    input_embeds_stage_2 = phi2_model_pretrained_peft.get_input_embeddings()(bos.input_ids)

    if image is not None:
        image_embeds = img_input_embed(image)
        input_embeds_stage_2 = torch.cat((input_embeds_stage_2, image_embeds), dim=1)
    if audio is not None:
        audio_transcribed = convert_audio_file_text_embeds(audio)
        audio_embeds = convert_text_input_embeds(audio_transcribed)
        input_embeds_stage_2 = torch.cat((input_embeds_stage_2, audio_embeds), dim=1)
    if text is not None:
        text_embeds = convert_text_input_embeds(text)
        input_embeds_stage_2 = torch.cat((input_embeds_stage_2, text_embeds), dim=1)

    qus = tokenizer(" Question: " + query, return_tensors="pt",
                    return_attention_mask=False)
    qus_embeds = phi2_model_pretrained_peft.get_input_embeddings()(qus.input_ids)
    input_embeds_stage_2 = torch.cat((input_embeds_stage_2, qus_embeds), dim=1)

    ans = tokenizer(" Answer: ", return_tensors="pt", return_attention_mask=False)
    ans_embeds = phi2_model_pretrained_peft.get_input_embeddings()(ans.input_ids)
    input_embeds_stage_2 = torch.cat((input_embeds_stage_2, ans_embeds), dim=1)

    result = phi2_model_pretrained_peft.generate(inputs_embeds=input_embeds_stage_2,
                                                 bos_token_id=tokenizer.bos_token_id)

    # Decode and return the text generated before the first EOS token.
    process = tokenizer.batch_decode(result)[0]
    process = process.split(tokenizer.eos_token)
    if process[0] == '':
        return process[1]
    else:
        return process[0]
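# Gradio UI: three optional context inputs (image, audio, text) plus a required query box;
# the decoded answer appears in a single output textbox.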
import gradio as gr

title = "Multi-Modal Phi-2"
description = "A simple Gradio interface for a custom multi-modal (image, text, audio) version of Microsoft Phi-2."

demo = gr.Interface(
    input_multimodel,
    inputs=[
        gr.Image(label="Input context Image"),
        gr.Audio(label="Input context Audio", sources=["microphone", "upload"], type="filepath"),
        gr.Textbox(label="Input context Text"),
        gr.Textbox(label="Input Query"),
    ],
    outputs=[
        gr.Textbox(label='Answer'),
    ],
    title=title,
    description=description,
)
demo.launch()