Update app.py
app.py
CHANGED
@@ -92,7 +92,7 @@ def stream_text_chat(message, history, system_prompt, temperature=0.8, max_new_t
         yield history + [[message, buffer]], audio_path
 
 @spaces.GPU
-def process_vision_query(image, text_input):
+def process_vision_query(image, text_input, generate_speech=True):
     prompt = f"<|user|>\n<|image_1|>\n{text_input}<|end|>\n<|assistant|>\n"
 
     # Check if image is already a PIL Image
@@ -115,26 +115,35 @@ def process_vision_query(image, text_input):
 
         generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
         response = vision_processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-
+
+        if generate_speech:
+            audio_path = generate_speech_from_text(response)
+            return response, audio_path
+        else:
+            return response, None
     except RuntimeError as e:
         if "CUDA out of memory" in str(e):
-
+            error_message = "Error: GPU out of memory. Try processing a smaller image or freeing up GPU resources."
+            return error_message, None
         else:
             raise e
 
-
-def
+
+def generate_speech_from_text(text, description="A clear voice reads out the response."):
     input_ids = tts_tokenizer(description, return_tensors="pt").input_ids.to(tts_device)
-    prompt_input_ids = tts_tokenizer(
+    prompt_input_ids = tts_tokenizer(text, return_tensors="pt").input_ids.to(tts_device)
 
-
+    with torch.no_grad():
+        generation = tts_model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
+
     audio_arr = generation.cpu().numpy().squeeze()
 
-    output_path = f"output_audio_{hash(
+    output_path = f"output_audio_{hash(text)}.wav"
    sf.write(output_path, audio_arr, tts_model.config.sampling_rate)
 
     return output_path
 
+
 # Custom CSS
 custom_css = """
 body { background-color: #0b0f19; color: #e2e8f0; font-family: 'Arial', sans-serif;}
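The new generate_speech_from_text helper leans on module-level tts_tokenizer, tts_model, and tts_device objects that are defined earlier in app.py, outside this diff. For reference, a minimal sketch of how such globals are typically initialized for Parler-TTS is below; the checkpoint name is an illustrative assumption, not something this diff confirms:

import torch
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer

# Assumed setup for the module-level objects used by generate_speech_from_text.
# The checkpoint actually loaded in app.py may differ;
# "parler-tts/parler_tts_mini_v0.1" is an illustrative choice.
tts_device = "cuda" if torch.cuda.is_available() else "cpu"
tts_model = ParlerTTSForConditionalGeneration.from_pretrained(
    "parler-tts/parler_tts_mini_v0.1"
).to(tts_device)
tts_tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler_tts_mini_v0.1")

With objects like these in place, the function's two tokenizer calls map onto Parler-TTS's two-prompt generate API: the voice description becomes input_ids and the text to be spoken becomes prompt_input_ids.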
@@ -214,16 +223,20 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Base().set(
         outputs=[chatbot, audio_output])
     clear_btn.click(lambda: (None, None), None, [chatbot, audio_output], queue=False)
 
-    with gr.Tab("Vision Model (Phi-3.5-vision)"):
-
-
-
-
-
-
-
-
-
+    with gr.Tab("Vision Model with TTS (Phi-3.5-vision)"):
+        with gr.Row():
+            with gr.Column(scale=1):
+                vision_input_img = gr.Image(label="Upload an Image", type="pil")
+                vision_text_input = gr.Textbox(label="Ask a question about the image", placeholder="What do you see in this image?")
+                vision_submit_btn = gr.Button("Analyze Image and Generate Speech", variant="primary")
+            with gr.Column(scale=1):
+                vision_output_text = gr.Textbox(label="AI Analysis", lines=10)
+                vision_output_audio = gr.Audio(label="Generated Speech")
+
+        vision_submit_btn.click(process_vision_query,
+                                inputs=[vision_input_img, vision_text_input],
+                                outputs=[vision_output_text, vision_output_audio])
+
 
     with gr.Tab("Text-to-Speech (Parler-TTS)"):
         with gr.Row():
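After this change process_vision_query always returns a (text, audio) pair: (response, path) by default, (response, None) when generate_speech=False, or (error_message, None) on a CUDA OOM. That is why the new tab binds both vision_output_text and vision_output_audio as outputs. A hypothetical standalone check of that contract (example.jpg is a placeholder path):

from PIL import Image

image = Image.open("example.jpg")  # placeholder input image

# Default path: analysis text plus the path of a generated .wav file.
response, audio_path = process_vision_query(image, "What do you see in this image?")

# Text-only path: skip TTS entirely; the audio slot is filled with None.
response_only, no_audio = process_vision_query(image, "Describe the scene.", generate_speech=False)
assert no_audio is None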
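One caveat on the new filename scheme: Python salts str hashes per process, so hash(text) in output_audio_{hash(text)}.wav yields a different (possibly negative) number on every restart of the Space. If stable, collision-resistant names ever matter, a hashlib-based variant is a small change; this is a sketch, not part of the commit:

import hashlib

# Deterministic across processes, always non-negative hex:
output_path = f"output_audio_{hashlib.md5(text.encode()).hexdigest()[:12]}.wav"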