sagar007 committed on
Commit
2b390ac
·
verified ·
1 Parent(s): c51ef31

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -18
app.py CHANGED
@@ -92,7 +92,7 @@ def stream_text_chat(message, history, system_prompt, temperature=0.8, max_new_t
92
  yield history + [[message, buffer]], audio_path
93
 
94
  @spaces.GPU
95
- def process_vision_query(image, text_input):
96
  prompt = f"<|user|>\n<|image_1|>\n{text_input}<|end|>\n<|assistant|>\n"
97
 
98
  # Check if image is already a PIL Image
@@ -115,26 +115,35 @@ def process_vision_query(image, text_input):
115
 
116
  generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
117
  response = vision_processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
118
- return response
 
 
 
 
 
119
  except RuntimeError as e:
120
  if "CUDA out of memory" in str(e):
121
- return "Error: GPU out of memory. Try processing a smaller image or freeing up GPU resources."
 
122
  else:
123
  raise e
124
 
125
- @spaces.GPU
126
- def generate_speech(prompt, description):
127
  input_ids = tts_tokenizer(description, return_tensors="pt").input_ids.to(tts_device)
128
- prompt_input_ids = tts_tokenizer(prompt, return_tensors="pt").input_ids.to(tts_device)
129
 
130
- generation = tts_model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
 
 
131
  audio_arr = generation.cpu().numpy().squeeze()
132
 
133
- output_path = f"output_audio_{hash(prompt)}.wav"
134
  sf.write(output_path, audio_arr, tts_model.config.sampling_rate)
135
 
136
  return output_path
137
 
 
138
  # Custom CSS
139
  custom_css = """
140
  body { background-color: #0b0f19; color: #e2e8f0; font-family: 'Arial', sans-serif;}
@@ -214,16 +223,20 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Base().set(
214
  outputs=[chatbot, audio_output])
215
  clear_btn.click(lambda: (None, None), None, [chatbot, audio_output], queue=False)
216
 
217
- with gr.Tab("Vision Model (Phi-3.5-vision)"):
218
- with gr.Row():
219
- with gr.Column(scale=1):
220
- vision_input_img = gr.Image(label="Upload an Image", type="pil")
221
- vision_text_input = gr.Textbox(label="Ask a question about the image", placeholder="What do you see in this image?")
222
- vision_submit_btn = gr.Button("Analyze Image", variant="primary")
223
- with gr.Column(scale=1):
224
- vision_output_text = gr.Textbox(label="AI Analysis", lines=10)
225
-
226
- vision_submit_btn.click(process_vision_query, [vision_input_img, vision_text_input], [vision_output_text])
 
 
 
 
227
 
228
  with gr.Tab("Text-to-Speech (Parler-TTS)"):
229
  with gr.Row():
 
92
  yield history + [[message, buffer]], audio_path
93
 
94
  @spaces.GPU
95
+ def process_vision_query(image, text_input, generate_speech=True):
96
  prompt = f"<|user|>\n<|image_1|>\n{text_input}<|end|>\n<|assistant|>\n"
97
 
98
  # Check if image is already a PIL Image
 
115
 
116
  generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
117
  response = vision_processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
118
+
119
+ if generate_speech:
120
+ audio_path = generate_speech_from_text(response)
121
+ return response, audio_path
122
+ else:
123
+ return response, None
124
  except RuntimeError as e:
125
  if "CUDA out of memory" in str(e):
126
+ error_message = "Error: GPU out of memory. Try processing a smaller image or freeing up GPU resources."
127
+ return error_message, None
128
  else:
129
  raise e
130
 
131
+
132
+ def generate_speech_from_text(text, description="A clear voice reads out the response."):
133
  input_ids = tts_tokenizer(description, return_tensors="pt").input_ids.to(tts_device)
134
+ prompt_input_ids = tts_tokenizer(text, return_tensors="pt").input_ids.to(tts_device)
135
 
136
+ with torch.no_grad():
137
+ generation = tts_model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
138
+
139
  audio_arr = generation.cpu().numpy().squeeze()
140
 
141
+ output_path = f"output_audio_{hash(text)}.wav"
142
  sf.write(output_path, audio_arr, tts_model.config.sampling_rate)
143
 
144
  return output_path
145
 
146
+
147
  # Custom CSS
148
  custom_css = """
149
  body { background-color: #0b0f19; color: #e2e8f0; font-family: 'Arial', sans-serif;}
 
223
  outputs=[chatbot, audio_output])
224
  clear_btn.click(lambda: (None, None), None, [chatbot, audio_output], queue=False)
225
 
226
+ with gr.Tab("Vision Model with TTS (Phi-3.5-vision)"):
227
+ with gr.Row():
228
+ with gr.Column(scale=1):
229
+ vision_input_img = gr.Image(label="Upload an Image", type="pil")
230
+ vision_text_input = gr.Textbox(label="Ask a question about the image", placeholder="What do you see in this image?")
231
+ vision_submit_btn = gr.Button("Analyze Image and Generate Speech", variant="primary")
232
+ with gr.Column(scale=1):
233
+ vision_output_text = gr.Textbox(label="AI Analysis", lines=10)
234
+ vision_output_audio = gr.Audio(label="Generated Speech")
235
+
236
+ vision_submit_btn.click(process_vision_query,
237
+ inputs=[vision_input_img, vision_text_input],
238
+ outputs=[vision_output_text, vision_output_audio])
239
+
240
 
241
  with gr.Tab("Text-to-Speech (Parler-TTS)"):
242
  with gr.Row():