Spaces:
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -9,6 +9,9 @@ from gtts import gTTS
|
|
9 |
import gradio as gr
|
10 |
from PIL import Image
|
11 |
import os
|
|
|
|
|
|
|
12 |
import io
|
13 |
import subprocess
|
14 |
from langdetect import detect
|
@@ -105,18 +108,27 @@ def process_audio_input(audio, whisper_processor, whisper_model):
|
|
105 |
|
106 |
# Updated process_image_input function
|
107 |
@spaces.GPU
|
|
|
108 |
def process_image_input(image, text_prompt, vision_model, processor):
|
109 |
if vision_model is None or processor is None:
|
110 |
return "Error: Vision model is not available."
|
111 |
|
112 |
try:
|
113 |
-
#
|
114 |
-
if
|
115 |
-
|
116 |
-
image =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
117 |
|
118 |
-
# Process the
|
119 |
-
inputs = processor(
|
120 |
|
121 |
# Generate text
|
122 |
with torch.no_grad():
|
@@ -133,6 +145,7 @@ def process_image_input(image, text_prompt, vision_model, processor):
|
|
133 |
return generated_text
|
134 |
except Exception as e:
|
135 |
return f"Error processing image: {str(e)}"
|
|
|
136 |
# Generate response within a GPU-decorated function
|
137 |
@spaces.GPU
|
138 |
def generate_response(transcription, sarvam_pipe):
|
@@ -236,7 +249,33 @@ body {
|
|
236 |
#custom-header h1 .pink {
|
237 |
color: #f472b6;
|
238 |
}
|
239 |
-
#custom-header h2 {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
240 |
font-size: 1.5rem;
|
241 |
color: #94a3b8;
|
242 |
}
|
|
|
9 |
import gradio as gr
|
10 |
from PIL import Image
|
11 |
import os
|
12 |
+
import base64
|
13 |
+
from io import BytesIO
|
14 |
+
|
15 |
import io
|
16 |
import subprocess
|
17 |
from langdetect import detect
|
|
|
108 |
|
109 |
# Updated process_image_input function
|
110 |
@spaces.GPU
|
111 |
+
@spaces.GPU
|
112 |
def process_image_input(image, text_prompt, vision_model, processor):
|
113 |
if vision_model is None or processor is None:
|
114 |
return "Error: Vision model is not available."
|
115 |
|
116 |
try:
|
117 |
+
# Convert image to base64
|
118 |
+
if isinstance(image, Image.Image):
|
119 |
+
buffered = BytesIO()
|
120 |
+
image.save(buffered, format="PNG")
|
121 |
+
img_str = base64.b64encode(buffered.getvalue()).decode()
|
122 |
+
else:
|
123 |
+
# If it's not a PIL Image, assume it's a file path
|
124 |
+
with open(image, "rb") as image_file:
|
125 |
+
img_str = base64.b64encode(image_file.read()).decode()
|
126 |
+
|
127 |
+
# Format the input with image tag
|
128 |
+
formatted_prompt = f"{text_prompt}\n<image>data:image/png;base64,{img_str}</image>"
|
129 |
|
130 |
+
# Process the formatted prompt
|
131 |
+
inputs = processor(text=formatted_prompt, return_tensors="pt").to(vision_model.device)
|
132 |
|
133 |
# Generate text
|
134 |
with torch.no_grad():
|
|
|
145 |
return generated_text
|
146 |
except Exception as e:
|
147 |
return f"Error processing image: {str(e)}"
|
148 |
+
|
149 |
# Generate response within a GPU-decorated function
|
150 |
@spaces.GPU
|
151 |
def generate_response(transcription, sarvam_pipe):
|
|
|
249 |
#custom-header h1 .pink {
|
250 |
color: #f472b6;
|
251 |
}
|
252 |
+
#custom-header h2 {@spaces.GPU
|
253 |
+
def indic_vision_assistant(input_type, audio_input, text_input, image_input):
|
254 |
+
try:
|
255 |
+
whisper_processor, whisper_model = load_whisper()
|
256 |
+
sarvam_pipe = load_sarvam()
|
257 |
+
vision_model, processor = load_vision_model()
|
258 |
+
|
259 |
+
if input_type == "audio" and audio_input is not None:
|
260 |
+
transcription = process_audio_input(audio_input, whisper_processor, whisper_model)
|
261 |
+
elif input_type == "text" and text_input:
|
262 |
+
transcription = text_input
|
263 |
+
elif input_type == "image" and image_input is not None:
|
264 |
+
# Use a default prompt if no text input is provided
|
265 |
+
text_prompt = text_input if text_input else "Describe this image in detail."
|
266 |
+
transcription = process_image_input(image_input, text_prompt, vision_model, processor)
|
267 |
+
else:
|
268 |
+
return "Please provide either audio, text, or image input.", "No input provided.", None
|
269 |
+
|
270 |
+
response = generate_response(transcription, sarvam_pipe)
|
271 |
+
lang = detect_language(response)
|
272 |
+
audio_response = text_to_speech(response, lang)
|
273 |
+
|
274 |
+
return transcription, response, audio_response
|
275 |
+
except Exception as e:
|
276 |
+
error_message = f"An error occurred: {str(e)}"
|
277 |
+
return error_message, error_message, None
|
278 |
+
|
279 |
font-size: 1.5rem;
|
280 |
color: #94a3b8;
|
281 |
}
|