Spaces: Running on Zero
Update app.py
Browse files
app.py
CHANGED
@@ -49,10 +49,21 @@ model_m = Qwen2VLForConditionalGeneration.from_pretrained(
|
|
49 |
torch_dtype=torch.float16
|
50 |
).to("cuda").eval()
|
51 |
|
52 |
-
# Edge TTS voices mapping for new tags.
|
53 |
TTS_VOICE_MAP = {
|
54 |
"@jennyneural": "en-US-JennyNeural",
|
55 |
"@guyneural": "en-US-GuyNeural",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
}
|
57 |
|
58 |
async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
|
@@ -125,7 +136,7 @@ def generate(input_dict: dict, chat_history: list[dict],
|
|
125 |
repetition_penalty: float = 1.2):
|
126 |
"""
|
127 |
Generates chatbot responses with support for multimodal input, video processing,
|
128 |
-
and Edge TTS when using the new tags
|
129 |
Special command:
|
130 |
- "@video-infer": triggers video processing using Callisto OCR3.
|
131 |
"""
|
@@ -284,10 +295,21 @@ demo = gr.ChatInterface(
|
|
284 |
[{"text": "@video-infer Describe the video", "files": ["examples/Missing.mp4"]}],
|
285 |
[{"text": "@video-infer Describe the Ad", "files": ["examples/coca.mp4"]}],
|
286 |
["@JennyNeural Who was Nikola Tesla and what were his contributions?"],
|
287 |
-
["@GuyNeural Explain how rainbows are formed."]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
288 |
],
|
289 |
cache_examples=False,
|
290 |
-
description="# **Pocket Llama**",
|
291 |
type="messages",
|
292 |
fill_height=True,
|
293 |
textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple"),
|
|
|
49 |
torch_dtype=torch.float16
|
50 |
).to("cuda").eval()
|
51 |
|
52 |
+
# Expanded Edge TTS voices mapping for new tags.
|
53 |
TTS_VOICE_MAP = {
|
54 |
"@jennyneural": "en-US-JennyNeural",
|
55 |
"@guyneural": "en-US-GuyNeural",
|
56 |
+
"@adrineural": "af-ZA-AdriNeural",
|
57 |
+
"@willemneural": "af-ZA-WillemNeural",
|
58 |
+
"@amehaneural": "am-ET-AmehaNeural",
|
59 |
+
"@mekdesneural": "am-ET-MekdesNeural",
|
60 |
+
"@fatimaneural": "ar-AE-FatimaNeural",
|
61 |
+
"@hamdanneural": "ar-AE-HamdanNeural",
|
62 |
+
"@alineural": "ar-BH-AliNeural",
|
63 |
+
"@lailaneural": "ar-BH-LailaNeural",
|
64 |
+
"@aminaneural": "ar-DZ-AminaNeural",
|
65 |
+
"@ismaelneural": "ar-DZ-IsmaelNeural",
|
66 |
+
"@salmaneural": "ar-EG-SalmaNeural",
|
67 |
}
|
68 |
|
69 |
async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
|
|
|
136 |
repetition_penalty: float = 1.2):
|
137 |
"""
|
138 |
Generates chatbot responses with support for multimodal input, video processing,
|
139 |
+
and Edge TTS when using the new tags for voices.
|
140 |
Special command:
|
141 |
- "@video-infer": triggers video processing using Callisto OCR3.
|
142 |
"""
|
|
|
295 |
[{"text": "@video-infer Describe the video", "files": ["examples/Missing.mp4"]}],
|
296 |
[{"text": "@video-infer Describe the Ad", "files": ["examples/coca.mp4"]}],
|
297 |
["@JennyNeural Who was Nikola Tesla and what were his contributions?"],
|
298 |
+
["@GuyNeural Explain how rainbows are formed."],
|
299 |
+
["@AdriNeural Provide a brief overview of South African wildlife."],
|
300 |
+
["@WillemNeural Tell me a fun fact about astronomy."],
|
301 |
+
["@AmehaNeural What are the main features of Ethiopian culture?"],
|
302 |
+
["@MekdesNeural Share a short story about innovation."],
|
303 |
+
["@FatimaNeural Explain the importance of renewable energy."],
|
304 |
+
["@HamdanNeural Describe the evolution of modern technology."],
|
305 |
+
["@AliNeural What causes thunderstorms?"],
|
306 |
+
["@LailaNeural Describe the process of photosynthesis."],
|
307 |
+
["@AminaNeural Summarize the history of North Africa."],
|
308 |
+
["@IsmaelNeural What are the benefits of meditation?"],
|
309 |
+
["@SalmaNeural Tell me about the influence of ancient Egyptian culture."]
|
310 |
],
|
311 |
cache_examples=False,
|
312 |
+
description="# **Pocket Llama with Expanded Edge TTS**\n\nUse one of the TTS tags at the beginning of your query (e.g., **@JennyNeural**, **@GuyNeural**, **@AdriNeural**, etc.) to trigger text-to-speech output.",
|
313 |
type="messages",
|
314 |
fill_height=True,
|
315 |
textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple"),
|