Update app.py
app.py CHANGED
@@ -21,7 +21,6 @@ from transformers import (
     AutoProcessor,
 )
 from transformers.image_utils import load_image
-from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler

 # Additional imports for new TTS
 from snac import SNAC
@@ -43,7 +42,7 @@ hermes_llm_model = AutoModelForCausalLM.from_pretrained(
 )
 hermes_llm_model.eval()

-# Load Qwen2-VL processor and model for multimodal tasks
+# Load Qwen2-VL processor and model for multimodal tasks (e.g. video processing)
 MODEL_ID_QWEN = "prithivMLmods/Qwen2-VL-OCR2-2B-Instruct"
 processor = AutoProcessor.from_pretrained(MODEL_ID_QWEN, trust_remote_code=True)
 model_m = Qwen2VLForConditionalGeneration.from_pretrained(
@@ -84,32 +83,12 @@ orpheus_tts_model.to(tts_device)
 orpheus_tts_tokenizer = AutoTokenizer.from_pretrained(tts_model_name)
 print(f"Orpheus TTS model loaded to {tts_device}")

-# Some global parameters for chat
+# Some global parameters for chat responses
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))

-#
-MODEL_ID_SD = os.getenv("MODEL_VAL_PATH") # e.g. SG161222/RealVisXL_V5.0_Lightning
-MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096"))
-USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
-ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
-BATCH_SIZE = int(os.getenv("BATCH_SIZE", "1"))
-
-sd_pipe = StableDiffusionXLPipeline.from_pretrained(
-    MODEL_ID_SD,
-    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-    use_safetensors=True,
-    add_watermarker=False,
-).to(device)
-sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config)
-
-if torch.cuda.is_available():
-    sd_pipe.text_encoder = sd_pipe.text_encoder.half()
-if USE_TORCH_COMPILE:
-    sd_pipe.compile()
-if ENABLE_CPU_OFFLOAD:
-    sd_pipe.enable_model_cpu_offload()
+# (Image generation related code has been fully removed.)

 MAX_SEED = np.iinfo(np.int32).max

@@ -164,50 +143,6 @@ def clean_chat_history(chat_history):
         cleaned.append(msg)
     return cleaned

-@spaces.GPU(duration=60, enable_queue=True)
-def generate_image_fn(
-    prompt: str,
-    negative_prompt: str = "",
-    use_negative_prompt: bool = False,
-    seed: int = 1,
-    width: int = 1024,
-    height: int = 1024,
-    guidance_scale: float = 3,
-    num_inference_steps: int = 25,
-    randomize_seed: bool = False,
-    use_resolution_binning: bool = True,
-    num_images: int = 1,
-    progress=gr.Progress(track_tqdm=True),
-):
-    seed = int(randomize_seed_fn(seed, randomize_seed))
-    generator = torch.Generator(device=device).manual_seed(seed)
-    options = {
-        "prompt": [prompt] * num_images,
-        "negative_prompt": [negative_prompt] * num_images if use_negative_prompt else None,
-        "width": width,
-        "height": height,
-        "guidance_scale": guidance_scale,
-        "num_inference_steps": num_inference_steps,
-        "generator": generator,
-        "output_type": "pil",
-    }
-    if use_resolution_binning:
-        options["use_resolution_binning"] = True
-    images = []
-    for i in range(0, num_images, BATCH_SIZE):
-        batch_options = options.copy()
-        batch_options["prompt"] = options["prompt"][i:i+BATCH_SIZE]
-        if "negative_prompt" in batch_options and batch_options["negative_prompt"] is not None:
-            batch_options["negative_prompt"] = options["negative_prompt"][i:i+BATCH_SIZE]
-        if device.type == "cuda":
-            with torch.autocast("cuda", dtype=torch.float16):
-                outputs = sd_pipe(**batch_options)
-        else:
-            outputs = sd_pipe(**batch_options)
-        images.extend(outputs.images)
-    image_paths = [save_image(img) for img in images]
-    return image_paths, seed
-
 # New TTS functions (SNAC/Orpheus pipeline)
 def process_prompt(prompt, voice, tokenizer, device):
     prompt = f"{voice}: {prompt}"
@@ -298,11 +233,10 @@ def generate(
     repetition_penalty: float = 1.2,
 ):
     """
-    Generates chatbot responses with support for multimodal input,
+    Generates chatbot responses with support for multimodal input, video processing,
     TTS, and LLM-augmented TTS.

     Trigger commands:
-      - "@image": generate an image.
       - "@video-infer": process video.
       - "@<voice>-tts": directly convert text to speech.
       - "@<voice>-llm": infer with the DeepHermes Llama model then convert to speech.
@@ -311,26 +245,6 @@ def generate(
     files = input_dict.get("files", [])
     lower_text = text.strip().lower()

-    # Branch for image generation.
-    if lower_text.startswith("@image"):
-        prompt = text[len("@image"):].strip()
-        yield progress_bar_html("Generating Image")
-        image_paths, used_seed = generate_image_fn(
-            prompt=prompt,
-            negative_prompt="",
-            use_negative_prompt=False,
-            seed=1,
-            width=1024,
-            height=1024,
-            guidance_scale=3,
-            num_inference_steps=25,
-            randomize_seed=True,
-            use_resolution_binning=True,
-            num_images=1,
-        )
-        yield gr.Image(image_paths[0])
-        return
-
     # Branch for video processing.
     if lower_text.startswith("@video-infer"):
         prompt = text[len("@video-infer"):].strip()
@@ -424,28 +338,30 @@ def generate(
     # Default branch for regular chat (text and multimodal without TTS).
     conversation = clean_chat_history(chat_history)
     conversation.append({"role": "user", "content": text})
+    # If files are provided, only non-image files (e.g. video) are processed via Qwen2VL.
     if files:
+        # Process files using the processor (this branch no longer handles image generation)
         if len(files) > 1:
-
+            inputs_list = [load_image(image) for image in files]
         elif len(files) == 1:
-
+            inputs_list = [load_image(files[0])]
         else:
-
+            inputs_list = []
         messages = [{
             "role": "user",
             "content": [
-                *[{"type": "image", "image":
+                *[{"type": "image", "image": img} for img in inputs_list],
                 {"type": "text", "text": text},
             ]
         }]
         prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        inputs = processor(text=[prompt_full], images=
+        inputs = processor(text=[prompt_full], images=inputs_list, return_tensors="pt", padding=True).to("cuda")
         streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
         generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
         thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
         thread.start()
         buffer = ""
-        yield progress_bar_html("Processing Qwen2VL")
+        yield progress_bar_html("Processing with Qwen2VL")
         for new_text in streamer:
             buffer += new_text.replace("<|im_end|>", "")
             time.sleep(0.01)
@@ -496,16 +412,14 @@ demo = gr.ChatInterface(
         ["@dan-tts Yo, I’m Dan, [groan] and yes, I can even sound annoyed if I have to."],
         ["Write python program for array rotation"],
         ["@tara-tts Hey there, my name is Tara, [laugh] and I’m a speech generation model that can sound just like you!"],
-        [{"text": "summarize the letter", "files": ["examples/1.png"]}],
         ["@tara-llm Who is Nikola Tesla, and why did he die?"],
         ["@emma-llm Explain the causes of rainbows"],
-        ["@image Chocolate dripping from a donut"],
         [{"text": "@video-infer Summarize the event in video", "files": ["examples/sky.mp4"]}],
         [{"text": "@video-infer Describe the video", "files": ["examples/Missing.mp4"]}],
     ],
     cache_examples=False,
     type="messages",
-    description="# **Orpheus Edge🧤** `voice: tara, dan, emma, josh` \n `emotion: <laugh>, <chuckle>, <sigh>, <cough>, <sniffle>, <groan>, <yawn>, <gasp>. Use @video-infer,
+    description="# **Orpheus Edge🧤** `voice: tara, dan, emma, josh` \n `emotion: <laugh>, <chuckle>, <sigh>, <cough>, <sniffle>, <groan>, <yawn>, <gasp>. Use @video-infer, orpheus: @<voice>-tts, or @<voice>-llm triggers llm response`",
     fill_height=True,
     textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple", placeholder=" Use @tara-tts/@dan-tts for direct TTS or @tara-llm/@dan-llm for LLM+TTS, etc."),
     stop_btn="Stop Generation",
@@ -513,4 +427,4 @@ demo = gr.ChatInterface(
 )

 if __name__ == "__main__":
-    demo.queue(max_size=
+    demo.queue(max_size=30).launch(share=True)