sagar007 committed · verified
Commit 11cd804 · 1 Parent(s): 4c05f69

Update app.py

Files changed (1): app.py (+46, -7)

app.py CHANGED
@@ -9,6 +9,9 @@ from gtts import gTTS
 import gradio as gr
 from PIL import Image
 import os
+import base64
+from io import BytesIO
+
 import io
 import subprocess
 from langdetect import detect
@@ -105,18 +108,27 @@ def process_audio_input(audio, whisper_processor, whisper_model):
 
 # Updated process_image_input function
 @spaces.GPU
+@spaces.GPU
 def process_image_input(image, text_prompt, vision_model, processor):
     if vision_model is None or processor is None:
         return "Error: Vision model is not available."
 
     try:
-        # Check if image is already a PIL Image
-        if not isinstance(image, Image.Image):
-            # If it's not, assume it's a file path or bytes and open it
-            image = Image.open(image).convert('RGB')
+        # Convert image to base64
+        if isinstance(image, Image.Image):
+            buffered = BytesIO()
+            image.save(buffered, format="PNG")
+            img_str = base64.b64encode(buffered.getvalue()).decode()
+        else:
+            # If it's not a PIL Image, assume it's a file path
+            with open(image, "rb") as image_file:
+                img_str = base64.b64encode(image_file.read()).decode()
+
+        # Format the input with image tag
+        formatted_prompt = f"{text_prompt}\n<image>data:image/png;base64,{img_str}</image>"
 
-        # Process the image and text
-        inputs = processor(images=image, text=text_prompt, return_tensors="pt").to(vision_model.device)
+        # Process the formatted prompt
+        inputs = processor(text=formatted_prompt, return_tensors="pt").to(vision_model.device)
 
         # Generate text
         with torch.no_grad():
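For reference, a minimal sketch of the PIL-to-base64 round trip the new code performs; the helper name `image_to_data_uri` and the in-memory test image are illustrative, not part of the commit. Whether the vision model's processor actually honors an inline `<image>...base64...</image>` tag depends on its prompt format, so passing the image only through `text=` is this commit's assumption.

```python
# Sketch of the PIL -> base64 data-URI conversion used in the new
# process_image_input. The helper name image_to_data_uri is hypothetical.
import base64
from io import BytesIO

from PIL import Image


def image_to_data_uri(image: Image.Image) -> str:
    """Encode a PIL image as a PNG data URI for inline embedding in a prompt."""
    buffered = BytesIO()
    image.save(buffered, format="PNG")  # serialize to PNG in memory
    img_str = base64.b64encode(buffered.getvalue()).decode()
    return f"data:image/png;base64,{img_str}"


if __name__ == "__main__":
    img = Image.new("RGB", (4, 4), "red")      # tiny in-memory test image
    print(image_to_data_uri(img)[:40], "...")  # data:image/png;base64,iVBOR...
```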
@@ -133,6 +145,7 @@ def process_image_input(image, text_prompt, vision_model, processor):
         return generated_text
     except Exception as e:
         return f"Error processing image: {str(e)}"
+
 # Generate response within a GPU-decorated function
 @spaces.GPU
 def generate_response(transcription, sarvam_pipe):
@@ -236,7 +249,33 @@ body {
 #custom-header h1 .pink {
     color: #f472b6;
 }
-#custom-header h2 {
+#custom-header h2 {@spaces.GPU
+def indic_vision_assistant(input_type, audio_input, text_input, image_input):
+    try:
+        whisper_processor, whisper_model = load_whisper()
+        sarvam_pipe = load_sarvam()
+        vision_model, processor = load_vision_model()
+
+        if input_type == "audio" and audio_input is not None:
+            transcription = process_audio_input(audio_input, whisper_processor, whisper_model)
+        elif input_type == "text" and text_input:
+            transcription = text_input
+        elif input_type == "image" and image_input is not None:
+            # Use a default prompt if no text input is provided
+            text_prompt = text_input if text_input else "Describe this image in detail."
+            transcription = process_image_input(image_input, text_prompt, vision_model, processor)
+        else:
+            return "Please provide either audio, text, or image input.", "No input provided.", None
+
+        response = generate_response(transcription, sarvam_pipe)
+        lang = detect_language(response)
+        audio_response = text_to_speech(response, lang)
+
+        return transcription, response, audio_response
+    except Exception as e:
+        error_message = f"An error occurred: {str(e)}"
+        return error_message, error_message, None
+
     font-size: 1.5rem;
     color: #94a3b8;
 }
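As committed, the new function is spliced into the middle of the `#custom-header h2` CSS rule (the `{@spaces.GPU` fusion above); if that CSS lives inside app.py's style string, the function is never actually defined as Python. For readability, the added code at module level:

```python
# The function added by this commit, shown at module level for clarity.
# load_whisper, load_sarvam, load_vision_model, process_audio_input,
# process_image_input, generate_response, detect_language, and
# text_to_speech are defined elsewhere in app.py.
@spaces.GPU
def indic_vision_assistant(input_type, audio_input, text_input, image_input):
    try:
        whisper_processor, whisper_model = load_whisper()
        sarvam_pipe = load_sarvam()
        vision_model, processor = load_vision_model()

        # Dispatch on the selected input modality
        if input_type == "audio" and audio_input is not None:
            transcription = process_audio_input(audio_input, whisper_processor, whisper_model)
        elif input_type == "text" and text_input:
            transcription = text_input
        elif input_type == "image" and image_input is not None:
            # Use a default prompt if no text input is provided
            text_prompt = text_input if text_input else "Describe this image in detail."
            transcription = process_image_input(image_input, text_prompt, vision_model, processor)
        else:
            return "Please provide either audio, text, or image input.", "No input provided.", None

        response = generate_response(transcription, sarvam_pipe)
        lang = detect_language(response)
        audio_response = text_to_speech(response, lang)

        return transcription, response, audio_response
    except Exception as e:
        error_message = f"An error occurred: {str(e)}"
        return error_message, error_message, None
```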
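A minimal sketch of how `indic_vision_assistant` could be wired into the Gradio UI; the commit does not touch the interface definition, so every component and label below is an assumption:

```python
# Hypothetical wiring of indic_vision_assistant into a Gradio interface.
# app.py's real UI may differ; components and labels here are assumptions,
# and indic_vision_assistant is the function added by this commit.
import gradio as gr

demo = gr.Interface(
    fn=indic_vision_assistant,
    inputs=[
        gr.Radio(["audio", "text", "image"], value="text", label="Input type"),
        gr.Audio(type="filepath", label="Audio input"),
        gr.Textbox(label="Text input"),
        gr.Image(type="pil", label="Image input"),
    ],
    outputs=[
        gr.Textbox(label="Transcription / prompt"),
        gr.Textbox(label="Response"),
        gr.Audio(label="Audio response"),
    ],
    title="Indic Vision Assistant",
)

if __name__ == "__main__":
    demo.launch()
```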