Spaces: Running on Zero
Update app.py
Browse files
app.py
CHANGED
@@ -49,21 +49,14 @@ model_m = Qwen2VLForConditionalGeneration.from_pretrained(
|
|
49 |
torch_dtype=torch.float16
|
50 |
).to("cuda").eval()
|
51 |
|
52 |
-
#
|
|
|
53 |
TTS_VOICE_MAP = {
|
54 |
"@jennyneural": "en-US-JennyNeural",
|
55 |
"@guyneural": "en-US-GuyNeural",
|
56 |
-
"@
|
57 |
-
"@
|
58 |
-
"@
|
59 |
-
"@mekdesneural": "am-ET-MekdesNeural",
|
60 |
-
"@fatimaneural": "ar-AE-FatimaNeural",
|
61 |
-
"@hamdanneural": "ar-AE-HamdanNeural",
|
62 |
-
"@alineural": "ar-BH-AliNeural",
|
63 |
-
"@lailaneural": "ar-BH-LailaNeural",
|
64 |
-
"@aminaneural": "ar-DZ-AminaNeural",
|
65 |
-
"@ismaelneural": "ar-DZ-IsmaelNeural",
|
66 |
-
"@salmaneural": "ar-EG-SalmaNeural",
|
67 |
}
|
68 |
|
69 |
async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
|
@@ -136,7 +129,7 @@ def generate(input_dict: dict, chat_history: list[dict],
|
|
136 |
repetition_penalty: float = 1.2):
|
137 |
"""
|
138 |
Generates chatbot responses with support for multimodal input, video processing,
|
139 |
-
and Edge TTS when using the new tags for
|
140 |
Special command:
|
141 |
- "@video-infer": triggers video processing using Callisto OCR3.
|
142 |
"""
|
@@ -292,24 +285,16 @@ demo = gr.ChatInterface(
|
|
292 |
examples=[
|
293 |
["Write the code that converts temperatures between Celsius and Fahrenheit in short"],
|
294 |
[{"text": "Create a short story based on the image.", "files": ["examples/1.jpg"]}],
|
|
|
295 |
[{"text": "@video-infer Describe the video", "files": ["examples/Missing.mp4"]}],
|
296 |
[{"text": "@video-infer Describe the Ad", "files": ["examples/coca.mp4"]}],
|
297 |
["@JennyNeural Who was Nikola Tesla and what were his contributions?"],
|
298 |
-
["@
|
299 |
-
["@
|
300 |
-
["@
|
301 |
-
["@AmehaNeural What are the main features of Ethiopian culture?"],
|
302 |
-
["@MekdesNeural Share a short story about innovation."],
|
303 |
-
["@FatimaNeural Explain the importance of renewable energy."],
|
304 |
-
["@HamdanNeural Describe the evolution of modern technology."],
|
305 |
-
["@AliNeural What causes thunderstorms?"],
|
306 |
-
["@LailaNeural Describe the process of photosynthesis."],
|
307 |
-
["@AminaNeural Summarize the history of North Africa."],
|
308 |
-
["@IsmaelNeural What are the benefits of meditation?"],
|
309 |
-
["@SalmaNeural Tell me about the influence of ancient Egyptian culture."]
|
310 |
],
|
311 |
cache_examples=False,
|
312 |
-
description="# **Pocket Llama
|
313 |
type="messages",
|
314 |
fill_height=True,
|
315 |
textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple"),
|
|
|
49 |
torch_dtype=torch.float16
|
50 |
).to("cuda").eval()
|
51 |
|
52 |
+
# Extended Edge TTS voices mapping for new tags.
|
53 |
+
# Use any of these tags at the start of your prompt to trigger TTS.
|
54 |
TTS_VOICE_MAP = {
|
55 |
"@jennyneural": "en-US-JennyNeural",
|
56 |
"@guyneural": "en-US-GuyNeural",
|
57 |
+
"@arianeural": "en-US-AriaNeural",
|
58 |
+
"@michaelneural": "en-US-MichaelNeural",
|
59 |
+
"@olivianeural": "en-US-OliviaNeural",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
60 |
}
|
61 |
|
62 |
async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
|
|
|
129 |
repetition_penalty: float = 1.2):
|
130 |
"""
|
131 |
Generates chatbot responses with support for multimodal input, video processing,
|
132 |
+
and Edge TTS when using the new tags for TTS.
|
133 |
Special command:
|
134 |
- "@video-infer": triggers video processing using Callisto OCR3.
|
135 |
"""
|
|
|
285 |
examples=[
|
286 |
["Write the code that converts temperatures between Celsius and Fahrenheit in short"],
|
287 |
[{"text": "Create a short story based on the image.", "files": ["examples/1.jpg"]}],
|
288 |
+
["@GuyNeural Explain how rainbows are formed."],
|
289 |
[{"text": "@video-infer Describe the video", "files": ["examples/Missing.mp4"]}],
|
290 |
[{"text": "@video-infer Describe the Ad", "files": ["examples/coca.mp4"]}],
|
291 |
["@JennyNeural Who was Nikola Tesla and what were his contributions?"],
|
292 |
+
["@AriaNeural Provide an overview of the solar system."],
|
293 |
+
["@MichaelNeural Summarize the benefits of a healthy lifestyle."],
|
294 |
+
["@OliviaNeural Tell me a joke."]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
295 |
],
|
296 |
cache_examples=False,
|
297 |
+
description="# **Pocket Llama**",
|
298 |
type="messages",
|
299 |
fill_height=True,
|
300 |
textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple"),
|