Update app.py
app.py CHANGED
@@ -11,6 +11,7 @@ import spaces
 import torch
 import numpy as np
 from PIL import Image
+import edge_tts
 import cv2
 
 from transformers import (
@@ -30,7 +31,7 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
 # Load text-only model and tokenizer
-model_id = "prithivMLmods/
+model_id = "prithivMLmods/FastThink-0.5B-Tiny"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
@@ -39,6 +40,11 @@ model = AutoModelForCausalLM.from_pretrained(
 )
 model.eval()
 
+TTS_VOICES = [
+    "en-US-JennyNeural",  # @tts1
+    "en-US-GuyNeural",    # @tts2
+]
+
 MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
 processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 model_m = Qwen2VLForConditionalGeneration.from_pretrained(
@@ -47,6 +53,12 @@ model_m = Qwen2VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to("cuda").eval()
 
+async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
+    """Convert text to speech using Edge TTS and save as MP3"""
+    communicate = edge_tts.Communicate(text, voice)
+    await communicate.save(output_file)
+    return output_file
+
 def clean_chat_history(chat_history):
     """
     Filter out any chat entries whose "content" is not a string.
@@ -59,6 +71,7 @@ def clean_chat_history(chat_history):
     return cleaned
 
 # Environment variables and parameters for Stable Diffusion XL
+# Use : SG161222/RealVisXL_V4.0_Lightning or SG161222/RealVisXL_V5.0_Lightning
 MODEL_ID_SD = os.getenv("MODEL_VAL_PATH")  # SDXL Model repository path via env variable
 MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096"))
 USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
@@ -201,8 +214,9 @@ def generate(
     repetition_penalty: float = 1.2,
 ):
     """
-    Generates chatbot responses with support for multimodal input and image generation.
+    Generates chatbot responses with support for multimodal input, TTS, and image generation.
     Special commands:
+    - "@tts1" or "@tts2": triggers text-to-speech.
     - "@image": triggers image generation using the SDXL pipeline.
     - "@qwen2vl-video": triggers video processing using Qwen2VL.
     """
@@ -279,10 +293,20 @@ def generate(
             yield buffer
         return
 
-    #
-
-
-
+    # Determine if TTS is requested.
+    tts_prefix = "@tts"
+    is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
+    voice_index = next((i for i in range(1, 3) if text.strip().lower().startswith(f"{tts_prefix}{i}")), None)
+
+    if is_tts and voice_index:
+        voice = TTS_VOICES[voice_index - 1]
+        text = text.replace(f"{tts_prefix}{voice_index}", "").strip()
+        conversation = [{"role": "user", "content": text}]
+    else:
+        voice = None
+        text = text.replace(tts_prefix, "").strip()
+        conversation = clean_chat_history(chat_history)
+        conversation.append({"role": "user", "content": text})
 
     if files:
         if len(files) > 1:
@@ -338,6 +362,9 @@ def generate(
            yield "".join(outputs)
        final_response = "".join(outputs)
        yield final_response
+       if is_tts and voice:
+           output_file = asyncio.run(text_to_speech(final_response, voice))
+           yield gr.Audio(output_file, autoplay=True)
 
 demo = gr.ChatInterface(
     fn=generate,
@@ -354,14 +381,16 @@ demo = gr.ChatInterface(
         [{"text": "@video-infer Describe the video", "files": ["examples/Missing.mp4"]}],
         ["@image Chocolate dripping from a donut"],
         ["Python Program for Array Rotation"],
+        ["@tts1 Who is Nikola Tesla, and why did he die?"],
         [{"text": "Extract JSON from the image", "files": ["examples/document.jpg"]}],
         [{"text": "summarize the letter", "files": ["examples/1.png"]}],
+        ["@tts2 What causes rainbows to form?"],
     ],
     cache_examples=False,
     type="messages",
-    description="# **QwQ Edge
+    description="# **QwQ Edge `@video-infer 'prompt..', @image, @tts1`**",
     fill_height=True,
-    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple", placeholder=" @image for image gen, @video-infer for video, default [text, vision]"),
+    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple", placeholder=" @tts1, @tts2-voices, @image for image gen, @video-infer for video, default [text, vision]"),
     stop_btn="Stop Generation",
     multimodal=True,
 )
|