prithivMLmods committed on
Commit
7d47057
·
verified ·
1 Parent(s): 0aa3c52

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -26
app.py CHANGED
@@ -49,21 +49,14 @@ model_m = Qwen2VLForConditionalGeneration.from_pretrained(
49
  torch_dtype=torch.float16
50
  ).to("cuda").eval()
51
 
52
- # Expanded Edge TTS voices mapping for new tags.
 
53
  TTS_VOICE_MAP = {
54
  "@jennyneural": "en-US-JennyNeural",
55
  "@guyneural": "en-US-GuyNeural",
56
- "@adrineural": "af-ZA-AdriNeural",
57
- "@willemneural": "af-ZA-WillemNeural",
58
- "@amehaneural": "am-ET-AmehaNeural",
59
- "@mekdesneural": "am-ET-MekdesNeural",
60
- "@fatimaneural": "ar-AE-FatimaNeural",
61
- "@hamdanneural": "ar-AE-HamdanNeural",
62
- "@alineural": "ar-BH-AliNeural",
63
- "@lailaneural": "ar-BH-LailaNeural",
64
- "@aminaneural": "ar-DZ-AminaNeural",
65
- "@ismaelneural": "ar-DZ-IsmaelNeural",
66
- "@salmaneural": "ar-EG-SalmaNeural",
67
  }
68
 
69
  async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
@@ -136,7 +129,7 @@ def generate(input_dict: dict, chat_history: list[dict],
136
  repetition_penalty: float = 1.2):
137
  """
138
  Generates chatbot responses with support for multimodal input, video processing,
139
- and Edge TTS when using the new tags for voices.
140
  Special command:
141
  - "@video-infer": triggers video processing using Callisto OCR3.
142
  """
@@ -292,24 +285,16 @@ demo = gr.ChatInterface(
292
  examples=[
293
  ["Write the code that converts temperatures between Celsius and Fahrenheit in short"],
294
  [{"text": "Create a short story based on the image.", "files": ["examples/1.jpg"]}],
 
295
  [{"text": "@video-infer Describe the video", "files": ["examples/Missing.mp4"]}],
296
  [{"text": "@video-infer Describe the Ad", "files": ["examples/coca.mp4"]}],
297
  ["@JennyNeural Who was Nikola Tesla and what were his contributions?"],
298
- ["@GuyNeural Explain how rainbows are formed."],
299
- ["@AdriNeural Provide a brief overview of South African wildlife."],
300
- ["@WillemNeural Tell me a fun fact about astronomy."],
301
- ["@AmehaNeural What are the main features of Ethiopian culture?"],
302
- ["@MekdesNeural Share a short story about innovation."],
303
- ["@FatimaNeural Explain the importance of renewable energy."],
304
- ["@HamdanNeural Describe the evolution of modern technology."],
305
- ["@AliNeural What causes thunderstorms?"],
306
- ["@LailaNeural Describe the process of photosynthesis."],
307
- ["@AminaNeural Summarize the history of North Africa."],
308
- ["@IsmaelNeural What are the benefits of meditation?"],
309
- ["@SalmaNeural Tell me about the influence of ancient Egyptian culture."]
310
  ],
311
  cache_examples=False,
312
- description="# **Pocket Llama with Expanded Edge TTS**\n\nUse one of the TTS tags at the beginning of your query (e.g., **@JennyNeural**, **@GuyNeural**, **@AdriNeural**, etc.) to trigger text-to-speech output.",
313
  type="messages",
314
  fill_height=True,
315
  textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple"),
 
49
  torch_dtype=torch.float16
50
  ).to("cuda").eval()
51
 
52
+ # Extended Edge TTS voices mapping for new tags.
53
+ # Use any of these tags at the start of your prompt to trigger TTS.
54
  TTS_VOICE_MAP = {
55
  "@jennyneural": "en-US-JennyNeural",
56
  "@guyneural": "en-US-GuyNeural",
57
+ "@arianeural": "en-US-AriaNeural",
58
+ "@michaelneural": "en-US-MichaelNeural",
59
+ "@olivianeural": "en-US-OliviaNeural",
 
 
 
 
 
 
 
 
60
  }
61
 
62
  async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
 
129
  repetition_penalty: float = 1.2):
130
  """
131
  Generates chatbot responses with support for multimodal input, video processing,
132
+ and Edge TTS when using the new tags for TTS.
133
  Special command:
134
  - "@video-infer": triggers video processing using Callisto OCR3.
135
  """
 
285
  examples=[
286
  ["Write the code that converts temperatures between Celsius and Fahrenheit in short"],
287
  [{"text": "Create a short story based on the image.", "files": ["examples/1.jpg"]}],
288
+ ["@GuyNeural Explain how rainbows are formed."],
289
  [{"text": "@video-infer Describe the video", "files": ["examples/Missing.mp4"]}],
290
  [{"text": "@video-infer Describe the Ad", "files": ["examples/coca.mp4"]}],
291
  ["@JennyNeural Who was Nikola Tesla and what were his contributions?"],
292
+ ["@AriaNeural Provide an overview of the solar system."],
293
+ ["@MichaelNeural Summarize the benefits of a healthy lifestyle."],
294
+ ["@OliviaNeural Tell me a joke."]
 
 
 
 
 
 
 
 
 
295
  ],
296
  cache_examples=False,
297
+ description="# **Pocket Llama**",
298
  type="messages",
299
  fill_height=True,
300
  textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple"),