Spaces:

sagar007
/

Multimodal_App

Running on Zero

App Files Community

sagar007 committed on Aug 24, 2024

Commit

4c05f69

verified ·

1 Parent(s): 7d4688b

Update app.py

Browse files

Files changed (1) hide show

app.py +9 -9

app.py CHANGED Viewed

@@ -105,7 +105,7 @@ def process_audio_input(audio, whisper_processor, whisper_model):
 # Updated process_image_input function
 @spaces.GPU
-def process_image_input(image, vision_model, processor):
     if vision_model is None or processor is None:
         return "Error: Vision model is not available."
@@ -115,8 +115,8 @@ def process_image_input(image, vision_model, processor):
             # If it's not, assume it's a file path or bytes and open it
             image = Image.open(image).convert('RGB')
-        # Process the image
-        inputs = processor(images=image, return_tensors="pt").to(vision_model.device)
         # Generate text
         with torch.no_grad():
@@ -133,7 +133,6 @@ def process_image_input(image, vision_model, processor):
         return generated_text
     except Exception as e:
         return f"Error processing image: {str(e)}"
 # Generate response within a GPU-decorated function
 @spaces.GPU
 def generate_response(transcription, sarvam_pipe):
@@ -197,8 +196,9 @@ def indic_vision_assistant(input_type, audio_input, text_input, image_input):
         elif input_type == "text" and text_input:
             transcription = text_input
         elif input_type == "image" and image_input is not None:
-            # Directly pass the image_input to process_image_input
-            transcription = process_image_input(image_input, vision_model, processor)
         else:
             return "Please provide either audio, text, or image input.", "No input provided.", None
@@ -211,6 +211,7 @@ def indic_vision_assistant(input_type, audio_input, text_input, image_input):
         error_message = f"An error occurred: {str(e)}"
         return error_message, error_message, None
 # Custom CSS
 custom_css = """
 body {
@@ -314,7 +315,7 @@ custom_suggestions = """
     </div>
 </div>
 """
-# Create Gradio interface
 with gr.Blocks(css=custom_css, theme=gr.themes.Base().set(
     body_background_fill="#0b0f19",
     body_text_color="#e2e8f0",
@@ -333,7 +334,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Base().set(
     input_type = gr.Radio(["audio", "text", "image"], label="Input Type", value="audio")
     audio_input = gr.Audio(type="filepath", label="Speak (if audio input selected)")
-    text_input = gr.Textbox(label="Type your message (if text input selected)")
     image_input = gr.Image(type="pil", label="Upload an image (if image input selected)")
     submit_btn = gr.Button("Submit")
@@ -348,6 +349,5 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Base().set(
         outputs=[output_transcription, output_response, output_audio]
     )
     gr.HTML("<footer>Powered by Indic Language AI with Vision Capabilities</footer>")
 # Launch the app
 iface.launch()

 # Updated process_image_input function
 @spaces.GPU
+def process_image_input(image, text_prompt, vision_model, processor):
     if vision_model is None or processor is None:
         return "Error: Vision model is not available."
             # If it's not, assume it's a file path or bytes and open it
             image = Image.open(image).convert('RGB')
+        # Process the image and text
+        inputs = processor(images=image, text=text_prompt, return_tensors="pt").to(vision_model.device)
         # Generate text
         with torch.no_grad():
         return generated_text
     except Exception as e:
         return f"Error processing image: {str(e)}"
 # Generate response within a GPU-decorated function
 @spaces.GPU
 def generate_response(transcription, sarvam_pipe):
         elif input_type == "text" and text_input:
             transcription = text_input
         elif input_type == "image" and image_input is not None:
+            # Use a default prompt if no text input is provided
+            text_prompt = text_input if text_input else "Describe this image in detail."
+            transcription = process_image_input(image_input, text_prompt, vision_model, processor)
         else:
             return "Please provide either audio, text, or image input.", "No input provided.", None
         error_message = f"An error occurred: {str(e)}"
         return error_message, error_message, None
 # Custom CSS
 custom_css = """
 body {
     </div>
 </div>
 """
+# Update the Gradio interface to allow text input for image processing
 with gr.Blocks(css=custom_css, theme=gr.themes.Base().set(
     body_background_fill="#0b0f19",
     body_text_color="#e2e8f0",
     input_type = gr.Radio(["audio", "text", "image"], label="Input Type", value="audio")
     audio_input = gr.Audio(type="filepath", label="Speak (if audio input selected)")
+    text_input = gr.Textbox(label="Type your message or image prompt")
     image_input = gr.Image(type="pil", label="Upload an image (if image input selected)")
     submit_btn = gr.Button("Submit")
         outputs=[output_transcription, output_response, output_audio]
     )
     gr.HTML("<footer>Powered by Indic Language AI with Vision Capabilities</footer>")
 # Launch the app
 iface.launch()