prithivMLmods committed on
Commit 0aa3c52 · verified · 1 Parent(s): ce5f63a

Update app.py

Files changed (1):
  1. app.py +26 -4

app.py CHANGED
@@ -49,10 +49,21 @@ model_m = Qwen2VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to("cuda").eval()
 
-# Edge TTS voices mapping for new tags.
+# Expanded Edge TTS voices mapping for new tags.
 TTS_VOICE_MAP = {
     "@jennyneural": "en-US-JennyNeural",
     "@guyneural": "en-US-GuyNeural",
+    "@adrineural": "af-ZA-AdriNeural",
+    "@willemneural": "af-ZA-WillemNeural",
+    "@amehaneural": "am-ET-AmehaNeural",
+    "@mekdesneural": "am-ET-MekdesNeural",
+    "@fatimaneural": "ar-AE-FatimaNeural",
+    "@hamdanneural": "ar-AE-HamdanNeural",
+    "@alineural": "ar-BH-AliNeural",
+    "@lailaneural": "ar-BH-LailaNeural",
+    "@aminaneural": "ar-DZ-AminaNeural",
+    "@ismaelneural": "ar-DZ-IsmaelNeural",
+    "@salmaneural": "ar-EG-SalmaNeural",
 }
 
 async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
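For context on where these voice names end up: the text_to_speech coroutine whose signature closes the hunk above is unchanged by this commit and presumably forwards the selected voice name to edge-tts. A minimal sketch, assuming the standard edge_tts.Communicate API; the body is illustrative, not the repo's actual implementation:

import edge_tts

async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
    # Voice names come straight from TTS_VOICE_MAP, e.g. "en-US-JennyNeural".
    communicate = edge_tts.Communicate(text, voice)
    await communicate.save(output_file)
    return output_file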
@@ -125,7 +136,7 @@ def generate(input_dict: dict, chat_history: list[dict],
     repetition_penalty: float = 1.2):
     """
     Generates chatbot responses with support for multimodal input, video processing,
-    and Edge TTS when using the new tags @JennyNeural or @GuyNeural.
+    and Edge TTS when using the new tags for voices.
     Special command:
     - "@video-infer": triggers video processing using Callisto OCR3.
     """
@@ -284,10 +295,21 @@ demo = gr.ChatInterface(
         [{"text": "@video-infer Describe the video", "files": ["examples/Missing.mp4"]}],
         [{"text": "@video-infer Describe the Ad", "files": ["examples/coca.mp4"]}],
         ["@JennyNeural Who was Nikola Tesla and what were his contributions?"],
-        ["@GuyNeural Explain how rainbows are formed."]
+        ["@GuyNeural Explain how rainbows are formed."],
+        ["@AdriNeural Provide a brief overview of South African wildlife."],
+        ["@WillemNeural Tell me a fun fact about astronomy."],
+        ["@AmehaNeural What are the main features of Ethiopian culture?"],
+        ["@MekdesNeural Share a short story about innovation."],
+        ["@FatimaNeural Explain the importance of renewable energy."],
+        ["@HamdanNeural Describe the evolution of modern technology."],
+        ["@AliNeural What causes thunderstorms?"],
+        ["@LailaNeural Describe the process of photosynthesis."],
+        ["@AminaNeural Summarize the history of North Africa."],
+        ["@IsmaelNeural What are the benefits of meditation?"],
+        ["@SalmaNeural Tell me about the influence of ancient Egyptian culture."]
     ],
     cache_examples=False,
-    description="# **Pocket Llama**",
+    description="# **Pocket Llama with Expanded Edge TTS**\n\nUse one of the TTS tags at the beginning of your query (e.g., **@JennyNeural**, **@GuyNeural**, **@AdriNeural**, etc.) to trigger text-to-speech output.",
     type="messages",
     fill_height=True,
     textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple"),
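As a quick sanity check of the new tags (not part of the commit), the two sketches above compose as follows; the reply string stands in for whatever the model actually generates:

import asyncio

async def demo_roundtrip():
    # Hypothetical wiring, mirroring what the new examples exercise:
    # strip the tag, produce a reply, then synthesize it with the mapped voice.
    voice, prompt = resolve_tts_voice("@SalmaNeural Tell me about the influence of ancient Egyptian culture.")
    if voice:  # -> "ar-EG-SalmaNeural"
        reply = "Ancient Egypt shaped writing, architecture, and art for millennia."
        await text_to_speech(reply, voice, "output.mp3")

asyncio.run(demo_roundtrip())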
 