sagar007 committed (verified)
Commit 4c05f69 · 1 Parent(s): 7d4688b

Update app.py

Files changed (1): app.py (+9 -9)
app.py CHANGED
@@ -105,7 +105,7 @@ def process_audio_input(audio, whisper_processor, whisper_model):
 
 # Updated process_image_input function
 @spaces.GPU
-def process_image_input(image, vision_model, processor):
+def process_image_input(image, text_prompt, vision_model, processor):
     if vision_model is None or processor is None:
         return "Error: Vision model is not available."
 
@@ -115,8 +115,8 @@ def process_image_input(image, vision_model, processor):
             # If it's not, assume it's a file path or bytes and open it
             image = Image.open(image).convert('RGB')
 
-        # Process the image
-        inputs = processor(images=image, return_tensors="pt").to(vision_model.device)
+        # Process the image and text
+        inputs = processor(images=image, text=text_prompt, return_tensors="pt").to(vision_model.device)
 
         # Generate text
         with torch.no_grad():
@@ -133,7 +133,6 @@ def process_image_input(image, vision_model, processor):
         return generated_text
     except Exception as e:
         return f"Error processing image: {str(e)}"
-
 # Generate response within a GPU-decorated function
 @spaces.GPU
 def generate_response(transcription, sarvam_pipe):
@@ -197,8 +196,9 @@ def indic_vision_assistant(input_type, audio_input, text_input, image_input):
     elif input_type == "text" and text_input:
         transcription = text_input
     elif input_type == "image" and image_input is not None:
-        # Directly pass the image_input to process_image_input
-        transcription = process_image_input(image_input, vision_model, processor)
+        # Use a default prompt if no text input is provided
+        text_prompt = text_input if text_input else "Describe this image in detail."
+        transcription = process_image_input(image_input, text_prompt, vision_model, processor)
     else:
         return "Please provide either audio, text, or image input.", "No input provided.", None
 
@@ -211,6 +211,7 @@ def indic_vision_assistant(input_type, audio_input, text_input, image_input):
         error_message = f"An error occurred: {str(e)}"
         return error_message, error_message, None
 
+
 # Custom CSS
 custom_css = """
 body {
@@ -314,7 +315,7 @@ custom_suggestions = """
     </div>
 </div>
 """
-# Create Gradio interface
+# Update the Gradio interface to allow text input for image processing
 with gr.Blocks(css=custom_css, theme=gr.themes.Base().set(
     body_background_fill="#0b0f19",
     body_text_color="#e2e8f0",
@@ -333,7 +334,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Base().set(
 
     input_type = gr.Radio(["audio", "text", "image"], label="Input Type", value="audio")
     audio_input = gr.Audio(type="filepath", label="Speak (if audio input selected)")
-    text_input = gr.Textbox(label="Type your message (if text input selected)")
+    text_input = gr.Textbox(label="Type your message or image prompt")
    image_input = gr.Image(type="pil", label="Upload an image (if image input selected)")
 
     submit_btn = gr.Button("Submit")
@@ -348,6 +349,5 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Base().set(
         outputs=[output_transcription, output_response, output_audio]
     )
     gr.HTML("<footer>Powered by Indic Language AI with Vision Capabilities</footer>")
-
 # Launch the app
 iface.launch()
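For context, a minimal sketch of the call pattern this commit introduces: the processor now receives both the image and a text prompt, and the assistant falls back to a default prompt when the text box is empty. The checkpoint name and standalone wiring below are illustrative assumptions, not necessarily what this Space actually loads.

# Minimal sketch of the updated image + prompt path, assuming a BLIP-style
# checkpoint (illustrative; the Space's actual vision model may differ).
import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
vision_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

image = Image.open("example.jpg").convert("RGB")  # hypothetical input image
text_input = ""  # an empty text box in the UI
# Same default-prompt logic as indic_vision_assistant after this commit
text_prompt = text_input if text_input else "Describe this image in detail."

# Image and text go through the processor together, as in the updated
# process_image_input
inputs = processor(images=image, text=text_prompt, return_tensors="pt").to(vision_model.device)
with torch.no_grad():
    output_ids = vision_model.generate(**inputs, max_new_tokens=64)
print(processor.decode(output_ids[0], skip_special_tokens=True))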
 