import pandas as pd
import torch
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig

# ---------------------------------------------------------------------------
# Base language model: Microsoft Phi-2 (loaded on CPU, KV-cache disabled)
# ---------------------------------------------------------------------------
model_name = "microsoft/phi-2"

phi2_model_pretrained = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map='cpu',
)
phi2_model_pretrained.config.use_cache = False

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=False)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.bos_token = tokenizer.eos_token


def convert_text_input_embeds(text):
    """Tokenize raw text and return the corresponding Phi-2 input embeddings."""
    in_tokens = tokenizer(text, return_tensors="pt", return_attention_mask=False)
    in_embeds = phi2_model_pretrained.get_input_embeddings()(in_tokens.input_ids)
    return in_embeds


# ---------------------------------------------------------------------------
# Audio pathway: transcribe speech to text with WhisperX, then reuse the text
# embedding helper above.
# ---------------------------------------------------------------------------
import whisperx

whisper_model = whisperx.load_model('small', device='cpu', compute_type='float32')


def convert_audio_file_text_embeds(fname):
    """Transcribe an audio file and return the concatenated transcript text."""
    result = whisper_model.transcribe(fname)
    full_text = ''
    for seg in result['segments']:
        full_text = full_text + seg['text']
    return full_text.strip()


# ---------------------------------------------------------------------------
# Image pathway: CLIP ViT-B/32 vision tower producing patch embeddings of
# shape [1, 49, 768] (CLS token dropped).
# ---------------------------------------------------------------------------
from transformers import CLIPVisionModel, CLIPImageProcessor

vision_tower_name = 'openai/clip-vit-base-patch32'
image_processor = CLIPImageProcessor.from_pretrained(vision_tower_name)
vision_tower = CLIPVisionModel.from_pretrained(vision_tower_name)


def feature_select(image_forward_outs):
    """Keep the last hidden layer and drop the CLS token -> [1, 49, 768]."""
    image_features = image_forward_outs.hidden_states[-1]
    image_features = image_features[:, 1:, :]
    return image_features


def image_CLIP_embed(image):
    """Run an image through the frozen CLIP vision tower and select patch features."""
    _ = vision_tower.requires_grad_(False)
    image = image_processor(images=image, return_tensors="pt")
    image_forward_out = vision_tower(
        image['pixel_values'].to(device=vision_tower.device),
        output_hidden_states=True,
    )
    image_feature = feature_select(image_forward_out)
    return image_feature


# ---------------------------------------------------------------------------
# Projection from CLIP space (768) into Phi-2 embedding space (2560)
# ---------------------------------------------------------------------------
import torch.nn as nn
import torch.nn.functional as F


class CustomGELU(nn.Module):
    def forward(self, x):
        return F.gelu(x.clone())


class SimpleResBlock(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        self.pre_norm = nn.LayerNorm(input_size)
        self.proj = nn.Sequential(
            nn.Linear(input_size, input_size),
            nn.GELU(),
            nn.Linear(input_size, input_size),
        )

    def forward(self, x):
        x = self.pre_norm(x)
        return x + self.proj(x)


class CLIPembed_projection(nn.Module):
    def __init__(self, input_dim_CLIP=768, input_dim_phi2=2560):
        super(CLIPembed_projection, self).__init__()
        self.input_dim_CLIP = input_dim_CLIP
        self.input_dim_phi2 = input_dim_phi2
        self.projection_img = nn.Linear(self.input_dim_CLIP, self.input_dim_phi2, bias=False)
        self.resblock = SimpleResBlock(self.input_dim_phi2)

    def forward(self, x):
        x = self.projection_img(x)
        x = self.resblock(x)
        return x


Image_projection_layer = CLIPembed_projection()

# Load the projection_img and resblock weights trained in stage 2.
location_projection_img_p1 = './weights/stage_2/run2_projection_img.pth'
location_projection_img_p2 = './weights/stage_2/run2_resblock.pth'

Image_projection_layer.projection_img.load_state_dict(
    torch.load(location_projection_img_p1, map_location='cpu'))
Image_projection_layer.resblock.load_state_dict(
    torch.load(location_projection_img_p2, map_location='cpu'))


def img_input_embed(image):
    """CLIP patch features projected into Phi-2 embedding space."""
    clip_embed = image_CLIP_embed(image)
    post_projection = Image_projection_layer(clip_embed)
    return post_projection


# ---------------------------------------------------------------------------
# QLoRA adapter (stage 2) applied on top of the Phi-2 base model
# ---------------------------------------------------------------------------
device = 'cpu'
user = "LN1996"  # put your user name here
model_name = "peft-qlora-run2"
model_id = f"{user}/{model_name}"

import peft

phi2_model_pretrained_peft = peft.PeftModel.from_pretrained(phi2_model_pretrained, model_id)
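# Optional sanity check (a minimal sketch, not part of the original pipeline):
# the image pathway should map CLIP patch features of shape [1, 49, 768] into
# Phi-2's embedding space, [1, 49, 2560], so they can be concatenated with the
# text embeddings assembled below. Uncomment to verify the projection wiring:
#
# with torch.no_grad():
#     dummy_clip = torch.randn(1, 49, 768)          # stands in for image_CLIP_embed(...)
#     assert Image_projection_layer(dummy_clip).shape == (1, 49, 2560)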
def input_multimodel(image=None, audio=None, text=None, query=None):
    """Assemble a 'Context: ... Question: ... Answer:' prompt in embedding space
    from any combination of image, audio, and text context, then generate."""
    # Treat empty strings from the Gradio textboxes as missing inputs.
    if text is not None and len(text) == 0:
        text = None
    if query is not None and len(query) == 0:
        query = None

    if query is None:
        return 'Please ask a query'
    if image is None and audio is None and text is None:
        return 'Please provide context in form of image, audio, text'

    # Start the prompt with "Context: ".
    bos = tokenizer("Context: ", return_tensors="pt", return_attention_mask=False)
    input_embeds_stage_2 = phi2_model_pretrained_peft.get_input_embeddings()(bos.input_ids)

    # Append image context: CLIP features projected into Phi-2 embedding space.
    if image is not None:
        image_embeds = img_input_embed(image)
        input_embeds_stage_2 = torch.cat((input_embeds_stage_2, image_embeds), dim=1)

    # Append audio context: transcribe with WhisperX, then embed the transcript.
    if audio is not None:
        audio_transcribed = convert_audio_file_text_embeds(audio)
        audio_embeds = convert_text_input_embeds(audio_transcribed)
        input_embeds_stage_2 = torch.cat((input_embeds_stage_2, audio_embeds), dim=1)

    # Append raw text context.
    if text is not None:
        text_embeds = convert_text_input_embeds(text)
        input_embeds_stage_2 = torch.cat((input_embeds_stage_2, text_embeds), dim=1)

    # Append the question and the "Answer:" cue.
    qus = tokenizer(" Question: " + query, return_tensors="pt", return_attention_mask=False)
    qus_embeds = phi2_model_pretrained_peft.get_input_embeddings()(qus.input_ids)
    input_embeds_stage_2 = torch.cat((input_embeds_stage_2, qus_embeds), dim=1)

    ans = tokenizer(" Answer: ", return_tensors="pt", return_attention_mask=False)
    ans_embeds = phi2_model_pretrained_peft.get_input_embeddings()(ans.input_ids)
    input_embeds_stage_2 = torch.cat((input_embeds_stage_2, ans_embeds), dim=1)

    # Generate directly from the assembled embeddings.
    result = phi2_model_pretrained_peft.generate(
        inputs_embeds=input_embeds_stage_2,
        bos_token_id=tokenizer.bos_token_id,
    )

    # Decode and return the text before the first EOS token (or after it, if
    # generation starts with EOS).
    process = tokenizer.batch_decode(result)[0]
    process = process.split(tokenizer.eos_token)
    if process[0] == '':
        return process[1]
    else:
        return process[0]


# ---------------------------------------------------------------------------
# Gradio interface
# ---------------------------------------------------------------------------
import gradio as gr

title = "Multi-Modal Phi-2"
description = ("A simple Gradio interface for a custom multi-modal (image, text, audio) "
               "version of Microsoft Phi-2")

demo = gr.Interface(
    input_multimodel,
    inputs=[
        gr.Image(label="Input context Image"),
        gr.Audio(label="Input context Audio", sources=["microphone", "upload"], type="filepath"),
        gr.Textbox(label="Input context Text"),
        gr.Textbox(label="Input Query"),
    ],
    outputs=[
        gr.Textbox(label='Answer'),
    ],
    title=title,
    description=description,
)
demo.launch()
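# Optional: calling the model without the Gradio UI (a minimal sketch). The file
# paths below are hypothetical placeholders; note that demo.launch() above blocks,
# so run this instead of launching the interface.
#
# from PIL import Image
#
# answer = input_multimodel(
#     image=Image.open("example.jpg"),              # hypothetical image path
#     audio=None,                                   # or a path such as "example.wav"
#     text="A dog is playing fetch in the park.",
#     query="What is the dog doing?",
# )
# print(answer)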