Spaces: Running on Zero
Update app.py
Browse files
app.py
CHANGED
@@ -49,14 +49,10 @@ model_m = Qwen2VLForConditionalGeneration.from_pretrained(
|
|
49 |
torch_dtype=torch.float16
|
50 |
).to("cuda").eval()
|
51 |
|
52 |
-
#
|
53 |
-
# Use any of these tags at the start of your prompt to trigger TTS.
|
54 |
TTS_VOICE_MAP = {
|
55 |
"@jennyneural": "en-US-JennyNeural",
|
56 |
"@guyneural": "en-US-GuyNeural",
|
57 |
-
"@arianeural": "en-US-AriaNeural",
|
58 |
-
"@michaelneural": "en-US-MichaelNeural",
|
59 |
-
"@olivianeural": "en-US-OliviaNeural",
|
60 |
}
|
61 |
|
62 |
async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
|
@@ -129,7 +125,7 @@ def generate(input_dict: dict, chat_history: list[dict],
|
|
129 |
repetition_penalty: float = 1.2):
|
130 |
"""
|
131 |
Generates chatbot responses with support for multimodal input, video processing,
|
132 |
-
and Edge TTS when using the new tags
|
133 |
Special command:
|
134 |
- "@video-infer": triggers video processing using Callisto OCR3.
|
135 |
"""
|
@@ -285,13 +281,10 @@ demo = gr.ChatInterface(
|
|
285 |
examples=[
|
286 |
["Write the code that converts temperatures between Celsius and Fahrenheit in short"],
|
287 |
[{"text": "Create a short story based on the image.", "files": ["examples/1.jpg"]}],
|
288 |
-
["@
|
289 |
[{"text": "@video-infer Describe the video", "files": ["examples/Missing.mp4"]}],
|
290 |
[{"text": "@video-infer Describe the Ad", "files": ["examples/coca.mp4"]}],
|
291 |
-
["@
|
292 |
-
["@AriaNeural Provide an overview of the solar system."],
|
293 |
-
["@MichaelNeural Summarize the benefits of a healthy lifestyle."],
|
294 |
-
["@OliviaNeural Tell me a joke."]
|
295 |
],
|
296 |
cache_examples=False,
|
297 |
description="# **Pocket Llama**",
|
|
|
49 |
torch_dtype=torch.float16
|
50 |
).to("cuda").eval()
|
51 |
|
52 |
+
# Edge TTS voices mapping for new tags.
|
|
|
53 |
TTS_VOICE_MAP = {
|
54 |
"@jennyneural": "en-US-JennyNeural",
|
55 |
"@guyneural": "en-US-GuyNeural",
|
|
|
|
|
|
|
56 |
}
|
57 |
|
58 |
async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
|
|
|
125 |
repetition_penalty: float = 1.2):
|
126 |
"""
|
127 |
Generates chatbot responses with support for multimodal input, video processing,
|
128 |
+
and Edge TTS when using the new tags @JennyNeural or @GuyNeural.
|
129 |
Special command:
|
130 |
- "@video-infer": triggers video processing using Callisto OCR3.
|
131 |
"""
|
|
|
281 |
examples=[
|
282 |
["Write the code that converts temperatures between Celsius and Fahrenheit in short"],
|
283 |
[{"text": "Create a short story based on the image.", "files": ["examples/1.jpg"]}],
|
284 |
+
["@JennyNeural Who was Nikola Tesla and what were his contributions?"],
|
285 |
[{"text": "@video-infer Describe the video", "files": ["examples/Missing.mp4"]}],
|
286 |
[{"text": "@video-infer Describe the Ad", "files": ["examples/coca.mp4"]}],
|
287 |
+
["@GuyNeural Explain how rainbows are formed."]
|
|
|
|
|
|
|
288 |
],
|
289 |
cache_examples=False,
|
290 |
description="# **Pocket Llama**",
|