sagar007 committed
Commit ccb9319 · verified · 1 Parent(s): 15cd21c

Update app.py

Files changed (1):
  app.py +31 -20
app.py CHANGED
@@ -69,15 +69,15 @@ def load_sarvam():
 @spaces.GPU
 def load_vision_model():
     try:
-        model_id = "microsoft/Phi-3.5-vision-instruct"
-        model = AutoModelForCausalLM.from_pretrained(
-            model_id, trust_remote_code=True, torch_dtype=torch.float16, use_flash_attention_2=False
-        )
-        processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True, num_crops=16)
-        return model, processor
+        model_id = "microsoft/phi-2" # Changed to phi-2 as it's more widely available
+        model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, torch_dtype=torch.float16)
+        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+        image_processor = AutoImageProcessor.from_pretrained("microsoft/resnet-50")
+        return model, tokenizer, image_processor
     except Exception as e:
         print(f"Error loading vision model: {e}")
-        return None, None
+        return None, None, None
+

 # Process audio input within a GPU-decorated function
 @spaces.GPU
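
Note: microsoft/phi-2 is a text-only causal LM, and the ResNet-50 AutoImageProcessor produces pixel values that phi-2's forward pass has no input for, so the loader now returns a mismatched triple. A dedicated captioning model would keep the model and processor paired. A minimal sketch, assuming the Space could instead use Salesforce/blip-image-captioning-base; this is not the committed code:

    import torch
    from transformers import BlipProcessor, BlipForConditionalGeneration

    def load_caption_model():
        # Matched model/processor pair from one captioning checkpoint
        # (assumed checkpoint; mirrors load_vision_model's error handling).
        try:
            model_id = "Salesforce/blip-image-captioning-base"
            processor = BlipProcessor.from_pretrained(model_id)
            model = BlipForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16)
            return model, processor
        except Exception as e:
            print(f"Error loading caption model: {e}")
            return None, None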
@@ -94,24 +94,35 @@ def process_audio_input(audio, whisper_processor, whisper_model):
     except Exception as e:
         return f"Error processing audio: {str(e)}. Please type your message instead."

-# Process image input
+# Updated process_image_input function
 @spaces.GPU
-def process_image_input(image, vision_model, vision_processor):
-    if vision_model is None or vision_processor is None:
+def process_image_input(image, vision_model, tokenizer, image_processor):
+    if vision_model is None or tokenizer is None or image_processor is None:
         return "Error: Vision model is not available."

     try:
-        # Add a generic prompt for image description
-        prompt = "Describe this image in detail."
+        # Process the image
+        image = Image.open(io.BytesIO(image)).convert('RGB')
+        image_features = image_processor(images=image, return_tensors="pt")["pixel_values"].to(vision_model.device)

-        inputs = vision_processor(images=image, text=prompt, return_tensors="pt")
-        inputs = {k: v.to(vision_model.device) for k, v in inputs.items()}
+        # Create a prompt
+        prompt = "Describe this image in detail:\n"
+        input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(vision_model.device)

+        # Generate text
         with torch.no_grad():
-            outputs = vision_model.generate(**inputs, max_new_tokens=512, do_sample=True, top_k=50, top_p=0.95)
+            outputs = vision_model.generate(
+                input_ids,
+                max_new_tokens=100,
+                do_sample=True,
+                top_k=50,
+                top_p=0.95,
+                num_return_sequences=1,
+                image_features=image_features
+            )

-        generated_text = vision_processor.batch_decode(outputs, skip_special_tokens=True)[0]
-        return generated_text
+        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        return generated_text.replace(prompt, "") # Remove the prompt from the output
     except Exception as e:
         return f"Error processing image: {str(e)}"
 # Generate response within a GPU-decorated function
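
Note: phi-2's generate() has no image_features parameter (recent transformers versions reject unused model kwargs), so this call fails at runtime and the except branch returns the error string; the image never reaches the model. Image.open(io.BytesIO(image)) also assumes raw bytes, while gr.Image normally delivers a PIL image, NumPy array, or filepath. With the BLIP pair sketched above, captioning would reduce to the following; this sketch assumes gr.Image(type="pil") and is not the committed code:

    import torch

    def caption_image(image, model, processor):
        # `image` is assumed to be a PIL.Image, e.g. from gr.Image(type="pil").
        if model is None or processor is None:
            return "Error: Vision model is not available."
        try:
            inputs = processor(images=image.convert("RGB"), return_tensors="pt").to(model.device, torch.float16)
            with torch.no_grad():
                output_ids = model.generate(**inputs, max_new_tokens=100)
            return processor.decode(output_ids[0], skip_special_tokens=True)
        except Exception as e:
            return f"Error processing image: {str(e)}"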
@@ -168,17 +179,16 @@ def detect_language(text):
 @spaces.GPU
 def indic_vision_assistant(input_type, audio_input, text_input, image_input):
     try:
-        # Load models within the GPU-decorated function
         whisper_processor, whisper_model = load_whisper()
         sarvam_pipe = load_sarvam()
-        vision_model, vision_processor = load_vision_model()
+        vision_model, tokenizer, image_processor = load_vision_model()

         if input_type == "audio" and audio_input is not None:
             transcription = process_audio_input(audio_input, whisper_processor, whisper_model)
         elif input_type == "text" and text_input:
             transcription = text_input
         elif input_type == "image" and image_input is not None:
-            transcription = process_image_input(image_input, vision_model, vision_processor)
+            transcription = process_image_input(image_input, vision_model, tokenizer, image_processor)
         else:
             return "Please provide either audio, text, or image input.", "No input provided.", None

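Note: indic_vision_assistant still reloads Whisper, Sarvam, and the vision stack on every request. A minimal sketch of a process-level cache, assuming the load_* functions are safe to call once and reuse (whether this interacts cleanly with the @spaces.GPU decorators on the loaders is an assumption worth testing):

    from functools import lru_cache

    @lru_cache(maxsize=1)
    def get_models():
        # Build each stack once per process; later calls return the cached tuple.
        whisper_processor, whisper_model = load_whisper()
        sarvam_pipe = load_sarvam()
        vision_model, tokenizer, image_processor = load_vision_model()
        return (whisper_processor, whisper_model, sarvam_pipe,
                vision_model, tokenizer, image_processor)

indic_vision_assistant would then unpack get_models() instead of calling the three loaders on each request.
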
@@ -191,6 +201,7 @@ def indic_vision_assistant(input_type, audio_input, text_input, image_input):
         error_message = f"An error occurred: {str(e)}"
         return error_message, error_message, None

+
 # Custom CSS
 custom_css = """
 body {