sagar007 committed · verified
Commit bb4d7fc · 1 Parent(s): ccb9319

Update app.py

Files changed (1):
  1. app.py +25 -22
app.py CHANGED

@@ -69,14 +69,23 @@ def load_sarvam():
 @spaces.GPU
 def load_vision_model():
     try:
-        model_id = "microsoft/phi-2"  # Changed to phi-2 as it's more widely available
-        model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, torch_dtype=torch.float16)
-        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
-        image_processor = AutoImageProcessor.from_pretrained("microsoft/resnet-50")
-        return model, tokenizer, image_processor
+        print("Starting to load vision model...")
+        model_id = "microsoft/Phi-3.5-vision-instruct"
+        print(f"Loading model from {model_id}")
+        model = AutoModelForCausalLM.from_pretrained(
+            model_id,
+            trust_remote_code=True,
+            torch_dtype=torch.float16,
+            use_flash_attention_2=False
+        )
+        print("Model loaded successfully")
+        print("Loading processor...")
+        processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True, num_crops=16)
+        print("Processor loaded successfully")
+        return model, processor
     except Exception as e:
-        print(f"Error loading vision model: {e}")
-        return None, None, None
+        print(f"Detailed error in loading vision model: {str(e)}")
+        return None, None


 # Process audio input within a GPU-decorated function
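Review note: `use_flash_attention_2` is a deprecated `from_pretrained` kwarg in recent transformers releases; the Phi-3.5-vision model card selects the attention backend via `_attn_implementation` instead. Below is a minimal loading sketch along the model card's lines, not the committed code; the eager fallback and `num_crops=4` (the card's suggestion for single-image inputs, versus 16 for multi-frame) are assumptions.

```python
# Minimal sketch of Phi-3.5-vision loading per the model card; not the committed code.
import torch
from transformers import AutoModelForCausalLM, AutoProcessor

model_id = "microsoft/Phi-3.5-vision-instruct"

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    _attn_implementation="eager",  # swap for "flash_attention_2" on supported GPUs
)

# num_crops=4 is the model card's suggestion for single-image use (16 for multi-frame).
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True, num_crops=4)
```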
@@ -96,33 +105,28 @@ def process_audio_input(audio, whisper_processor, whisper_model):

 # Updated process_image_input function
 @spaces.GPU
-def process_image_input(image, vision_model, tokenizer, image_processor):
-    if vision_model is None or tokenizer is None or image_processor is None:
+def process_image_input(image, vision_model, processor):
+    if vision_model is None or processor is None:
         return "Error: Vision model is not available."

     try:
         # Process the image
         image = Image.open(io.BytesIO(image)).convert('RGB')
-        image_features = image_processor(images=image, return_tensors="pt")["pixel_values"].to(vision_model.device)
-
-        # Create a prompt
-        prompt = "Describe this image in detail:\n"
-        input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(vision_model.device)
+        inputs = processor(images=image, return_tensors="pt").to(vision_model.device)

         # Generate text
         with torch.no_grad():
             outputs = vision_model.generate(
-                input_ids,
+                **inputs,
                 max_new_tokens=100,
                 do_sample=True,
                 top_k=50,
                 top_p=0.95,
-                num_return_sequences=1,
-                image_features=image_features
+                num_return_sequences=1
             )

-        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
-        return generated_text.replace(prompt, "")  # Remove the prompt from the output
+        generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
+        return generated_text
     except Exception as e:
         return f"Error processing image: {str(e)}"
 # Generate response within a GPU-decorated function
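Review note: the updated `process_image_input` calls the processor with `images=` only. The Phi-3.5-vision model card pairs every image with a text prompt containing an `<|image_1|>` placeholder, rendered through the chat template; without it, generation may fail or ignore the image. A hedged sketch of that pattern, reusing this function's `vision_model`/`processor` names (the prompt wording is an assumption):

```python
# Sketch of the model-card prompt pattern for Phi-3.5-vision; not the committed code.
messages = [{"role": "user", "content": "<|image_1|>\nDescribe this image in detail."}]
prompt = processor.tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

# The processor takes the prompt and the image(s) together.
inputs = processor(prompt, [image], return_tensors="pt").to(vision_model.device)

with torch.no_grad():
    outputs = vision_model.generate(
        **inputs,
        max_new_tokens=100,
        eos_token_id=processor.tokenizer.eos_token_id,
    )

# Drop the prompt tokens so only the newly generated description is decoded.
generated_ids = outputs[:, inputs["input_ids"].shape[1]:]
description = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
```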
@@ -181,14 +185,14 @@ def indic_vision_assistant(input_type, audio_input, text_input, image_input):
     try:
         whisper_processor, whisper_model = load_whisper()
         sarvam_pipe = load_sarvam()
-        vision_model, tokenizer, image_processor = load_vision_model()
+        vision_model, processor = load_vision_model()

         if input_type == "audio" and audio_input is not None:
             transcription = process_audio_input(audio_input, whisper_processor, whisper_model)
         elif input_type == "text" and text_input:
             transcription = text_input
         elif input_type == "image" and image_input is not None:
-            transcription = process_image_input(image_input, vision_model, tokenizer, image_processor)
+            transcription = process_image_input(image_input, vision_model, processor)
         else:
             return "Please provide either audio, text, or image input.", "No input provided.", None

@@ -201,7 +205,6 @@ def indic_vision_assistant(input_type, audio_input, text_input, image_input):
         error_message = f"An error occurred: {str(e)}"
         return error_message, error_message, None

-
 # Custom CSS
 custom_css = """
 body {
 
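Finally, a hypothetical smoke test for the updated image path. The positional call and raw-bytes input are assumptions inferred from the diff (`process_image_input` opens the image via `Image.open(io.BytesIO(image))`, and the function returns a 3-tuple):

```python
# Hypothetical smoke test; "sample.jpg" and the variable names are placeholders.
with open("sample.jpg", "rb") as f:
    image_bytes = f.read()

# indic_vision_assistant(input_type, audio_input, text_input, image_input)
transcription, response, audio_out = indic_vision_assistant("image", None, None, image_bytes)
print(transcription)
print(response)
```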