Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -105,7 +105,7 @@ def process_audio_input(audio, whisper_processor, whisper_model):
|
|
105 |
|
106 |
# Updated process_image_input function
|
107 |
@spaces.GPU
|
108 |
-
def process_image_input(image, vision_model, processor):
|
109 |
if vision_model is None or processor is None:
|
110 |
return "Error: Vision model is not available."
|
111 |
|
@@ -115,8 +115,8 @@ def process_image_input(image, vision_model, processor):
|
|
115 |
# If it's not, assume it's a file path or bytes and open it
|
116 |
image = Image.open(image).convert('RGB')
|
117 |
|
118 |
-
# Process the image
|
119 |
-
inputs = processor(images=image, return_tensors="pt").to(vision_model.device)
|
120 |
|
121 |
# Generate text
|
122 |
with torch.no_grad():
|
@@ -133,7 +133,6 @@ def process_image_input(image, vision_model, processor):
|
|
133 |
return generated_text
|
134 |
except Exception as e:
|
135 |
return f"Error processing image: {str(e)}"
|
136 |
-
|
137 |
# Generate response within a GPU-decorated function
|
138 |
@spaces.GPU
|
139 |
def generate_response(transcription, sarvam_pipe):
|
@@ -197,8 +196,9 @@ def indic_vision_assistant(input_type, audio_input, text_input, image_input):
|
|
197 |
elif input_type == "text" and text_input:
|
198 |
transcription = text_input
|
199 |
elif input_type == "image" and image_input is not None:
|
200 |
-
#
|
201 |
-
|
|
|
202 |
else:
|
203 |
return "Please provide either audio, text, or image input.", "No input provided.", None
|
204 |
|
@@ -211,6 +211,7 @@ def indic_vision_assistant(input_type, audio_input, text_input, image_input):
|
|
211 |
error_message = f"An error occurred: {str(e)}"
|
212 |
return error_message, error_message, None
|
213 |
|
|
|
214 |
# Custom CSS
|
215 |
custom_css = """
|
216 |
body {
|
@@ -314,7 +315,7 @@ custom_suggestions = """
|
|
314 |
</div>
|
315 |
</div>
|
316 |
"""
|
317 |
-
#
|
318 |
with gr.Blocks(css=custom_css, theme=gr.themes.Base().set(
|
319 |
body_background_fill="#0b0f19",
|
320 |
body_text_color="#e2e8f0",
|
@@ -333,7 +334,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Base().set(
|
|
333 |
|
334 |
input_type = gr.Radio(["audio", "text", "image"], label="Input Type", value="audio")
|
335 |
audio_input = gr.Audio(type="filepath", label="Speak (if audio input selected)")
|
336 |
-
text_input = gr.Textbox(label="Type your message
|
337 |
image_input = gr.Image(type="pil", label="Upload an image (if image input selected)")
|
338 |
|
339 |
submit_btn = gr.Button("Submit")
|
@@ -348,6 +349,5 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Base().set(
|
|
348 |
outputs=[output_transcription, output_response, output_audio]
|
349 |
)
|
350 |
gr.HTML("<footer>Powered by Indic Language AI with Vision Capabilities</footer>")
|
351 |
-
|
352 |
# Launch the app
|
353 |
iface.launch()
|
|
|
105 |
|
106 |
# Updated process_image_input function
|
107 |
@spaces.GPU
|
108 |
+
def process_image_input(image, text_prompt, vision_model, processor):
|
109 |
if vision_model is None or processor is None:
|
110 |
return "Error: Vision model is not available."
|
111 |
|
|
|
115 |
# If it's not, assume it's a file path or bytes and open it
|
116 |
image = Image.open(image).convert('RGB')
|
117 |
|
118 |
+
# Process the image and text
|
119 |
+
inputs = processor(images=image, text=text_prompt, return_tensors="pt").to(vision_model.device)
|
120 |
|
121 |
# Generate text
|
122 |
with torch.no_grad():
|
|
|
133 |
return generated_text
|
134 |
except Exception as e:
|
135 |
return f"Error processing image: {str(e)}"
|
|
|
136 |
# Generate response within a GPU-decorated function
|
137 |
@spaces.GPU
|
138 |
def generate_response(transcription, sarvam_pipe):
|
|
|
196 |
elif input_type == "text" and text_input:
|
197 |
transcription = text_input
|
198 |
elif input_type == "image" and image_input is not None:
|
199 |
+
# Use a default prompt if no text input is provided
|
200 |
+
text_prompt = text_input if text_input else "Describe this image in detail."
|
201 |
+
transcription = process_image_input(image_input, text_prompt, vision_model, processor)
|
202 |
else:
|
203 |
return "Please provide either audio, text, or image input.", "No input provided.", None
|
204 |
|
|
|
211 |
error_message = f"An error occurred: {str(e)}"
|
212 |
return error_message, error_message, None
|
213 |
|
214 |
+
|
215 |
# Custom CSS
|
216 |
custom_css = """
|
217 |
body {
|
|
|
315 |
</div>
|
316 |
</div>
|
317 |
"""
|
318 |
+
# Update the Gradio interface to allow text input for image processing
|
319 |
with gr.Blocks(css=custom_css, theme=gr.themes.Base().set(
|
320 |
body_background_fill="#0b0f19",
|
321 |
body_text_color="#e2e8f0",
|
|
|
334 |
|
335 |
input_type = gr.Radio(["audio", "text", "image"], label="Input Type", value="audio")
|
336 |
audio_input = gr.Audio(type="filepath", label="Speak (if audio input selected)")
|
337 |
+
text_input = gr.Textbox(label="Type your message or image prompt")
|
338 |
image_input = gr.Image(type="pil", label="Upload an image (if image input selected)")
|
339 |
|
340 |
submit_btn = gr.Button("Submit")
|
|
|
349 |
outputs=[output_transcription, output_response, output_audio]
|
350 |
)
|
351 |
gr.HTML("<footer>Powered by Indic Language AI with Vision Capabilities</footer>")
|
|
|
352 |
# Launch the app
|
353 |
iface.launch()
|