sagar007 committed · verified
Commit 11cd804 · 1 Parent(s): 4c05f69

Update app.py

Files changed (1): app.py (+46, -7)

app.py CHANGED
@@ -9,6 +9,9 @@ from gtts import gTTS
 import gradio as gr
 from PIL import Image
 import os
+import base64
+from io import BytesIO
+
 import io
 import subprocess
 from langdetect import detect
@@ -105,18 +108,27 @@ def process_audio_input(audio, whisper_processor, whisper_model):
 
 # Updated process_image_input function
 @spaces.GPU
+@spaces.GPU
 def process_image_input(image, text_prompt, vision_model, processor):
     if vision_model is None or processor is None:
         return "Error: Vision model is not available."
 
     try:
-        # Check if image is already a PIL Image
-        if not isinstance(image, Image.Image):
-            # If it's not, assume it's a file path or bytes and open it
-            image = Image.open(image).convert('RGB')
+        # Convert image to base64
+        if isinstance(image, Image.Image):
+            buffered = BytesIO()
+            image.save(buffered, format="PNG")
+            img_str = base64.b64encode(buffered.getvalue()).decode()
+        else:
+            # If it's not a PIL Image, assume it's a file path
+            with open(image, "rb") as image_file:
+                img_str = base64.b64encode(image_file.read()).decode()
+
+        # Format the input with image tag
+        formatted_prompt = f"{text_prompt}\n<image>data:image/png;base64,{img_str}</image>"
 
-        # Process the image and text
-        inputs = processor(images=image, text=text_prompt, return_tensors="pt").to(vision_model.device)
+        # Process the formatted prompt
+        inputs = processor(text=formatted_prompt, return_tensors="pt").to(vision_model.device)
 
         # Generate text
         with torch.no_grad():
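For reference, a minimal sketch of the PIL-to-base64 round trip the new code performs; the helper name `image_to_data_uri` and the in-memory test image are illustrative, not part of the commit. Whether the vision model's processor actually honors an inline `<image>...base64...</image>` tag depends on its prompt format, so passing the image only through `text=` is this commit's assumption.

```python
# Sketch of the PIL -> base64 data-URI conversion used in the new
# process_image_input. The helper name image_to_data_uri is hypothetical.
import base64
from io import BytesIO

from PIL import Image


def image_to_data_uri(image: Image.Image) -> str:
    """Encode a PIL image as a PNG data URI for inline embedding in a prompt."""
    buffered = BytesIO()
    image.save(buffered, format="PNG")  # serialize to PNG in memory
    img_str = base64.b64encode(buffered.getvalue()).decode()
    return f"data:image/png;base64,{img_str}"


if __name__ == "__main__":
    img = Image.new("RGB", (4, 4), "red")      # tiny in-memory test image
    print(image_to_data_uri(img)[:40], "...")  # data:image/png;base64,iVBOR...
```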
@@ -133,6 +145,7 @@ def process_image_input(image, text_prompt, vision_model, processor):
         return generated_text
     except Exception as e:
         return f"Error processing image: {str(e)}"
+
 # Generate response within a GPU-decorated function
 @spaces.GPU
 def generate_response(transcription, sarvam_pipe):
@@ -236,7 +249,33 @@ body {
 #custom-header h1 .pink {
     color: #f472b6;
 }
-#custom-header h2 {
+#custom-header h2 {@spaces.GPU
+def indic_vision_assistant(input_type, audio_input, text_input, image_input):
+    try:
+        whisper_processor, whisper_model = load_whisper()
+        sarvam_pipe = load_sarvam()
+        vision_model, processor = load_vision_model()
+
+        if input_type == "audio" and audio_input is not None:
+            transcription = process_audio_input(audio_input, whisper_processor, whisper_model)
+        elif input_type == "text" and text_input:
+            transcription = text_input
+        elif input_type == "image" and image_input is not None:
+            # Use a default prompt if no text input is provided
+            text_prompt = text_input if text_input else "Describe this image in detail."
+            transcription = process_image_input(image_input, text_prompt, vision_model, processor)
+        else:
+            return "Please provide either audio, text, or image input.", "No input provided.", None
+
+        response = generate_response(transcription, sarvam_pipe)
+        lang = detect_language(response)
+        audio_response = text_to_speech(response, lang)
+
+        return transcription, response, audio_response
+    except Exception as e:
+        error_message = f"An error occurred: {str(e)}"
+        return error_message, error_message, None
+
     font-size: 1.5rem;
     color: #94a3b8;
 }
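As committed, the new function is spliced into the middle of the `#custom-header h2` CSS rule (the `{@spaces.GPU` fusion above); if that CSS lives inside app.py's style string, the function is never actually defined as Python. For readability, the added code at module level:

```python
# The function added by this commit, shown at module level for clarity.
# load_whisper, load_sarvam, load_vision_model, process_audio_input,
# process_image_input, generate_response, detect_language, and
# text_to_speech are defined elsewhere in app.py.
@spaces.GPU
def indic_vision_assistant(input_type, audio_input, text_input, image_input):
    try:
        whisper_processor, whisper_model = load_whisper()
        sarvam_pipe = load_sarvam()
        vision_model, processor = load_vision_model()

        # Dispatch on the selected input modality
        if input_type == "audio" and audio_input is not None:
            transcription = process_audio_input(audio_input, whisper_processor, whisper_model)
        elif input_type == "text" and text_input:
            transcription = text_input
        elif input_type == "image" and image_input is not None:
            # Use a default prompt if no text input is provided
            text_prompt = text_input if text_input else "Describe this image in detail."
            transcription = process_image_input(image_input, text_prompt, vision_model, processor)
        else:
            return "Please provide either audio, text, or image input.", "No input provided.", None

        response = generate_response(transcription, sarvam_pipe)
        lang = detect_language(response)
        audio_response = text_to_speech(response, lang)

        return transcription, response, audio_response
    except Exception as e:
        error_message = f"An error occurred: {str(e)}"
        return error_message, error_message, None
```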
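A minimal sketch of how `indic_vision_assistant` could be wired into the Gradio UI; the commit does not touch the interface definition, so every component and label below is an assumption:

```python
# Hypothetical wiring of indic_vision_assistant into a Gradio interface.
# app.py's real UI may differ; components and labels here are assumptions,
# and indic_vision_assistant is the function added by this commit.
import gradio as gr

demo = gr.Interface(
    fn=indic_vision_assistant,
    inputs=[
        gr.Radio(["audio", "text", "image"], value="text", label="Input type"),
        gr.Audio(type="filepath", label="Audio input"),
        gr.Textbox(label="Text input"),
        gr.Image(type="pil", label="Image input"),
    ],
    outputs=[
        gr.Textbox(label="Transcription / prompt"),
        gr.Textbox(label="Response"),
        gr.Audio(label="Audio response"),
    ],
    title="Indic Vision Assistant",
)

if __name__ == "__main__":
    demo.launch()
```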