prithivMLmods committed
Commit 40993df · verified · 1 Parent(s): fff10dd

Update app.py

Files changed (1): app.py (+14, −100)
app.py CHANGED
@@ -21,7 +21,6 @@ from transformers import (
     AutoProcessor,
 )
 from transformers.image_utils import load_image
-from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
 
 # Additional imports for new TTS
 from snac import SNAC
@@ -43,7 +42,7 @@ hermes_llm_model = AutoModelForCausalLM.from_pretrained(
 )
 hermes_llm_model.eval()
 
-# Load Qwen2-VL processor and model for multimodal tasks
+# Load Qwen2-VL processor and model for multimodal tasks (e.g. video processing)
 MODEL_ID_QWEN = "prithivMLmods/Qwen2-VL-OCR2-2B-Instruct"
 processor = AutoProcessor.from_pretrained(MODEL_ID_QWEN, trust_remote_code=True)
 model_m = Qwen2VLForConditionalGeneration.from_pretrained(
@@ -84,32 +83,12 @@ orpheus_tts_model.to(tts_device)
 orpheus_tts_tokenizer = AutoTokenizer.from_pretrained(tts_model_name)
 print(f"Orpheus TTS model loaded to {tts_device}")
 
-# Some global parameters for chat and image generation
+# Some global parameters for chat responses
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 
-# Stable Diffusion XL setup
-MODEL_ID_SD = os.getenv("MODEL_VAL_PATH")  # e.g. SG161222/RealVisXL_V5.0_Lightning
-MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096"))
-USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
-ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
-BATCH_SIZE = int(os.getenv("BATCH_SIZE", "1"))
-
-sd_pipe = StableDiffusionXLPipeline.from_pretrained(
-    MODEL_ID_SD,
-    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-    use_safetensors=True,
-    add_watermarker=False,
-).to(device)
-sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config)
-
-if torch.cuda.is_available():
-    sd_pipe.text_encoder = sd_pipe.text_encoder.half()
-if USE_TORCH_COMPILE:
-    sd_pipe.compile()
-if ENABLE_CPU_OFFLOAD:
-    sd_pipe.enable_model_cpu_offload()
+# (Image generation related code has been fully removed.)
 
 MAX_SEED = np.iinfo(np.int32).max
 
@@ -164,50 +143,6 @@ def clean_chat_history(chat_history):
             cleaned.append(msg)
     return cleaned
 
-@spaces.GPU(duration=60, enable_queue=True)
-def generate_image_fn(
-    prompt: str,
-    negative_prompt: str = "",
-    use_negative_prompt: bool = False,
-    seed: int = 1,
-    width: int = 1024,
-    height: int = 1024,
-    guidance_scale: float = 3,
-    num_inference_steps: int = 25,
-    randomize_seed: bool = False,
-    use_resolution_binning: bool = True,
-    num_images: int = 1,
-    progress=gr.Progress(track_tqdm=True),
-):
-    seed = int(randomize_seed_fn(seed, randomize_seed))
-    generator = torch.Generator(device=device).manual_seed(seed)
-    options = {
-        "prompt": [prompt] * num_images,
-        "negative_prompt": [negative_prompt] * num_images if use_negative_prompt else None,
-        "width": width,
-        "height": height,
-        "guidance_scale": guidance_scale,
-        "num_inference_steps": num_inference_steps,
-        "generator": generator,
-        "output_type": "pil",
-    }
-    if use_resolution_binning:
-        options["use_resolution_binning"] = True
-    images = []
-    for i in range(0, num_images, BATCH_SIZE):
-        batch_options = options.copy()
-        batch_options["prompt"] = options["prompt"][i:i+BATCH_SIZE]
-        if "negative_prompt" in batch_options and batch_options["negative_prompt"] is not None:
-            batch_options["negative_prompt"] = options["negative_prompt"][i:i+BATCH_SIZE]
-        if device.type == "cuda":
-            with torch.autocast("cuda", dtype=torch.float16):
-                outputs = sd_pipe(**batch_options)
-        else:
-            outputs = sd_pipe(**batch_options)
-        images.extend(outputs.images)
-    image_paths = [save_image(img) for img in images]
-    return image_paths, seed
-
 # New TTS functions (SNAC/Orpheus pipeline)
 def process_prompt(prompt, voice, tokenizer, device):
     prompt = f"{voice}: {prompt}"
@@ -298,11 +233,10 @@ def generate(
     repetition_penalty: float = 1.2,
 ):
     """
-    Generates chatbot responses with support for multimodal input, image generation,
+    Generates chatbot responses with support for multimodal input, video processing,
     TTS, and LLM-augmented TTS.
 
     Trigger commands:
-    - "@image": generate an image.
     - "@video-infer": process video.
     - "@<voice>-tts": directly convert text to speech.
     - "@<voice>-llm": infer with the DeepHermes Llama model then convert to speech.
@@ -311,26 +245,6 @@ def generate(
     files = input_dict.get("files", [])
     lower_text = text.strip().lower()
 
-    # Branch for image generation.
-    if lower_text.startswith("@image"):
-        prompt = text[len("@image"):].strip()
-        yield progress_bar_html("Generating Image")
-        image_paths, used_seed = generate_image_fn(
-            prompt=prompt,
-            negative_prompt="",
-            use_negative_prompt=False,
-            seed=1,
-            width=1024,
-            height=1024,
-            guidance_scale=3,
-            num_inference_steps=25,
-            randomize_seed=True,
-            use_resolution_binning=True,
-            num_images=1,
-        )
-        yield gr.Image(image_paths[0])
-        return
-
     # Branch for video processing.
     if lower_text.startswith("@video-infer"):
         prompt = text[len("@video-infer"):].strip()
@@ -424,28 +338,30 @@ def generate(
     # Default branch for regular chat (text and multimodal without TTS).
     conversation = clean_chat_history(chat_history)
     conversation.append({"role": "user", "content": text})
+    # If files are provided, only non-image files (e.g. video) are processed via Qwen2VL.
     if files:
+        # Process files using the processor (this branch no longer handles image generation)
         if len(files) > 1:
-            images = [load_image(image) for image in files]
+            inputs_list = [load_image(image) for image in files]
         elif len(files) == 1:
-            images = [load_image(files[0])]
+            inputs_list = [load_image(files[0])]
        else:
-            images = []
+            inputs_list = []
         messages = [{
             "role": "user",
             "content": [
-                *[{"type": "image", "image": image} for image in images],
+                *[{"type": "image", "image": img} for img in inputs_list],
                 {"type": "text", "text": text},
             ]
         }]
         prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        inputs = processor(text=[prompt_full], images=images, return_tensors="pt", padding=True).to("cuda")
+        inputs = processor(text=[prompt_full], images=inputs_list, return_tensors="pt", padding=True).to("cuda")
         streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
         generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
         thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
         thread.start()
         buffer = ""
-        yield progress_bar_html("Processing Qwen2VL")
+        yield progress_bar_html("Processing with Qwen2VL")
         for new_text in streamer:
             buffer += new_text.replace("<|im_end|>", "")
             time.sleep(0.01)
@@ -496,16 +412,14 @@ demo = gr.ChatInterface(
         ["@dan-tts Yo, I’m Dan, [groan] and yes, I can even sound annoyed if I have to."],
         ["Write python program for array rotation"],
         ["@tara-tts Hey there, my name is Tara, [laugh] and I’m a speech generation model that can sound just like you!"],
-        [{"text": "summarize the letter", "files": ["examples/1.png"]}],
         ["@tara-llm Who is Nikola Tesla, and why did he die?"],
         ["@emma-llm Explain the causes of rainbows"],
-        ["@image Chocolate dripping from a donut"],
         [{"text": "@video-infer Summarize the event in video", "files": ["examples/sky.mp4"]}],
         [{"text": "@video-infer Describe the video", "files": ["examples/Missing.mp4"]}],
     ],
     cache_examples=False,
     type="messages",
-    description="# **Orpheus Edge🧤** `voice: tara, dan, emma, josh` \n `emotion: <laugh>, <chuckle>, <sigh>, <cough>, <sniffle>, <groan>, <yawn>, <gasp>. Use @video-infer, @image, orpheus: @<voice>-tts, or @<voice>-llm triggers llm response`",
+    description="# **Orpheus Edge🧤** `voice: tara, dan, emma, josh` \n `emotion: <laugh>, <chuckle>, <sigh>, <cough>, <sniffle>, <groan>, <yawn>, <gasp>. Use @video-infer, orpheus: @<voice>-tts, or @<voice>-llm triggers llm response`",
     fill_height=True,
     textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple", placeholder="‎ Use @tara-tts/@dan-tts for direct TTS or @tara-llm/@dan-llm for LLM+TTS, etc."),
     stop_btn="Stop Generation",
@@ -513,4 +427,4 @@ demo = gr.ChatInterface(
 )
 
 if __name__ == "__main__":
-    demo.queue(max_size=20).launch(share=True)
+    demo.queue(max_size=30).launch(share=True)
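
The trigger commands kept in generate() after this change ("@video-infer", "@<voice>-tts", "@<voice>-llm") are dispatched by prefix matching on the lowercased user text, as the docstring and branch code in the diff show. Below is a minimal, self-contained sketch of that routing; route_query, the returned tags, and the VOICES tuple are hypothetical illustrations, not code from app.py.

# Hypothetical sketch of the prefix-based routing in generate(); names are illustrative only.
VOICES = ("tara", "dan", "emma", "josh")

def route_query(text: str):
    lower_text = text.strip().lower()
    # "@video-infer <prompt>" -> Qwen2-VL video branch
    if lower_text.startswith("@video-infer"):
        return ("video-infer", text[len("@video-infer"):].strip())
    for voice in VOICES:
        # "@<voice>-tts <text>" -> direct Orpheus/SNAC text-to-speech
        if lower_text.startswith(f"@{voice}-tts"):
            return ("tts", voice, text[len(f"@{voice}-tts"):].strip())
        # "@<voice>-llm <prompt>" -> DeepHermes response first, then TTS
        if lower_text.startswith(f"@{voice}-llm"):
            return ("llm-tts", voice, text[len(f"@{voice}-llm"):].strip())
    # Anything else falls through to the regular chat / multimodal branch.
    return ("chat", text)

For example, route_query("@tara-llm Who is Nikola Tesla?") would return ("llm-tts", "tara", "Who is Nikola Tesla?"), matching the behavior described by the updated docstring.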