prithivMLmods committed on
Commit 6d1a1b7 · verified · 1 Parent(s): 0d63337

Update app.py

Files changed (1)
  1. app.py +37 -8
app.py CHANGED
@@ -11,6 +11,7 @@ import spaces
 import torch
 import numpy as np
 from PIL import Image
+import edge_tts
 import cv2
 
 from transformers import (
@@ -30,7 +31,7 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
 # Load text-only model and tokenizer
-model_id = "prithivMLmods/DeepHermes-3-Llama-3-3B-Preview-abliterated"
+model_id = "prithivMLmods/FastThink-0.5B-Tiny"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
@@ -39,6 +40,11 @@ model = AutoModelForCausalLM.from_pretrained(
 )
 model.eval()
 
+TTS_VOICES = [
+    "en-US-JennyNeural",  # @tts1
+    "en-US-GuyNeural",    # @tts2
+]
+
 MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
 processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 model_m = Qwen2VLForConditionalGeneration.from_pretrained(
@@ -47,6 +53,12 @@ model_m = Qwen2VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to("cuda").eval()
 
+async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
+    """Convert text to speech using Edge TTS and save as MP3"""
+    communicate = edge_tts.Communicate(text, voice)
+    await communicate.save(output_file)
+    return output_file
+
 def clean_chat_history(chat_history):
     """
     Filter out any chat entries whose "content" is not a string.
@@ -59,6 +71,7 @@ def clean_chat_history(chat_history):
     return cleaned
 
 # Environment variables and parameters for Stable Diffusion XL
+# Use : SG161222/RealVisXL_V4.0_Lightning or SG161222/RealVisXL_V5.0_Lightning
 MODEL_ID_SD = os.getenv("MODEL_VAL_PATH") # SDXL Model repository path via env variable
 MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096"))
 USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
@@ -201,8 +214,9 @@ def generate(
     repetition_penalty: float = 1.2,
 ):
     """
-    Generates chatbot responses with support for multimodal input and image generation.
+    Generates chatbot responses with support for multimodal input, TTS, and image generation.
     Special commands:
+    - "@tts1" or "@tts2": triggers text-to-speech.
     - "@image": triggers image generation using the SDXL pipeline.
     - "@qwen2vl-video": triggers video processing using Qwen2VL.
     """
@@ -279,10 +293,20 @@ def generate(
             yield buffer
         return
 
-    # For regular chat (text and multimodal input), process the conversation.
-    text = text.strip()
-    conversation = clean_chat_history(chat_history)
-    conversation.append({"role": "user", "content": text})
+    # Determine if TTS is requested.
+    tts_prefix = "@tts"
+    is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
+    voice_index = next((i for i in range(1, 3) if text.strip().lower().startswith(f"{tts_prefix}{i}")), None)
+
+    if is_tts and voice_index:
+        voice = TTS_VOICES[voice_index - 1]
+        text = text.replace(f"{tts_prefix}{voice_index}", "").strip()
+        conversation = [{"role": "user", "content": text}]
+    else:
+        voice = None
+        text = text.replace(tts_prefix, "").strip()
+        conversation = clean_chat_history(chat_history)
+        conversation.append({"role": "user", "content": text})
 
     if files:
         if len(files) > 1:
@@ -338,6 +362,9 @@ def generate(
             yield "".join(outputs)
         final_response = "".join(outputs)
         yield final_response
+        if is_tts and voice:
+            output_file = asyncio.run(text_to_speech(final_response, voice))
+            yield gr.Audio(output_file, autoplay=True)
 
 demo = gr.ChatInterface(
     fn=generate,
@@ -354,14 +381,16 @@ demo = gr.ChatInterface(
         [{"text": "@video-infer Describe the video", "files": ["examples/Missing.mp4"]}],
         ["@image Chocolate dripping from a donut"],
         ["Python Program for Array Rotation"],
+        ["@tts1 Who is Nikola Tesla, and why did he die?"],
         [{"text": "Extract JSON from the image", "files": ["examples/document.jpg"]}],
         [{"text": "summarize the letter", "files": ["examples/1.png"]}],
+        ["@tts2 What causes rainbows to form?"],
     ],
     cache_examples=False,
     type="messages",
-    description="# **QwQ Edge @video-infer 'prompt..', @image**",
+    description="# **QwQ Edge `@video-infer 'prompt..', @image, @tts1`**",
     fill_height=True,
-    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple", placeholder="‎ @image for image gen, @video-infer for video, default [text, vision]"),
+    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple", placeholder="‎ @tts1, @tts2-voices, @image for image gen, @video-infer for video, default [text, vision]"),
     stop_btn="Stop Generation",
     multimodal=True,
 )
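For reference, a minimal standalone sketch of the TTS path this commit introduces, assuming the edge-tts package is installed. The query string and output file name are illustrative, and the prompt itself is synthesized here (rather than a model response) just to exercise the new helper:

import asyncio
import edge_tts

TTS_VOICES = [
    "en-US-JennyNeural",  # @tts1
    "en-US-GuyNeural",    # @tts2
]

async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
    """Convert text to speech with Edge TTS and save it as an MP3 file."""
    communicate = edge_tts.Communicate(text, voice)
    await communicate.save(output_file)
    return output_file

# Pick the voice from the "@tts1"/"@tts2" prefix, mirroring the routing in generate().
query = "@tts2 What causes rainbows to form?"
voice_index = next(i for i in range(1, 3) if query.lower().startswith(f"@tts{i}"))
prompt = query[len(f"@tts{voice_index}"):].strip()
print(asyncio.run(text_to_speech(prompt, TTS_VOICES[voice_index - 1])))  # -> output.mp3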