Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -11,7 +11,6 @@ import spaces
 import torch
 import numpy as np
 from PIL import Image
-import edge_tts
 import cv2

 from transformers import (
@@ -24,61 +23,92 @@ from transformers import (
 from transformers.image_utils import load_image
 from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler

-
-
-
+# Additional imports for new TTS
+from snac import SNAC
+from huggingface_hub import snapshot_download
+from dotenv import load_dotenv
+load_dotenv()

+# ---------------------------
+# Set up device
+# ---------------------------
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+tts_device = "cuda" if torch.cuda.is_available() else "cpu"  # for SNAC and Orpheus TTS

-#
-
-
-
-
+# ---------------------------
+# Load DeepHermes Llama (chat/LLM) model
+# ---------------------------
+hermes_model_id = "prithivMLmods/DeepHermes-3-Llama-3-3B-Preview-abliterated"
+hermes_llm_tokenizer = AutoTokenizer.from_pretrained(hermes_model_id)
+hermes_llm_model = AutoModelForCausalLM.from_pretrained(
+    hermes_model_id,
     device_map="auto",
     torch_dtype=torch.bfloat16,
 )
-
+hermes_llm_model.eval()

-
-
-
-
-
-
-processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+# ---------------------------
+# Load Qwen2-VL processor and model for multimodal tasks
+# ---------------------------
+MODEL_ID_QWEN = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
+# (If needed, you can pass extra arguments such as a size dict here if required.)
+processor = AutoProcessor.from_pretrained(MODEL_ID_QWEN, trust_remote_code=True)
 model_m = Qwen2VLForConditionalGeneration.from_pretrained(
-
+    MODEL_ID_QWEN,
     trust_remote_code=True,
     torch_dtype=torch.float16
 ).to("cuda").eval()

-
-
-
-
-
+# ---------------------------
+# Load Orpheus TTS model and SNAC for TTS synthesis
+# ---------------------------
+print("Loading SNAC model...")
+snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
+snac_model = snac_model.to(tts_device)

-
-
-
-
-
-
-
-
-
-
+tts_model_name = "canopylabs/orpheus-3b-0.1-ft"
+# Download only model config and safetensors
+snapshot_download(
+    repo_id=tts_model_name,
+    allow_patterns=[
+        "config.json",
+        "*.safetensors",
+        "model.safetensors.index.json",
+    ],
+    ignore_patterns=[
+        "optimizer.pt",
+        "pytorch_model.bin",
+        "training_args.bin",
+        "scheduler.pt",
+        "tokenizer.json",
+        "tokenizer_config.json",
+        "special_tokens_map.json",
+        "vocab.json",
+        "merges.txt",
+        "tokenizer.*"
+    ]
+)
+orpheus_tts_model = AutoModelForCausalLM.from_pretrained(tts_model_name, torch_dtype=torch.bfloat16)
+orpheus_tts_model.to(tts_device)
+orpheus_tts_tokenizer = AutoTokenizer.from_pretrained(tts_model_name)
+print(f"Orpheus TTS model loaded to {tts_device}")

-#
-#
+# ---------------------------
+# Some global parameters for chat and image generation
+# ---------------------------
+MAX_MAX_NEW_TOKENS = 2048
+DEFAULT_MAX_NEW_TOKENS = 1024
+MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
+
+# ---------------------------
+# Stable Diffusion XL setup
+# ---------------------------
 MODEL_ID_SD = os.getenv("MODEL_VAL_PATH")  # SDXL Model repository path via env variable
 MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096"))
 USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
 ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
 BATCH_SIZE = int(os.getenv("BATCH_SIZE", "1"))  # For batched image generation

-# Load the SDXL pipeline
 sd_pipe = StableDiffusionXLPipeline.from_pretrained(
     MODEL_ID_SD,
     torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
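A note on the `snapshot_download` call above: it prefetches only the config and safetensors weights into the local cache, so the later `from_pretrained` call loads without re-downloading. A minimal standalone sketch (the directory listing is illustrative, not part of the app):

```python
import os
from huggingface_hub import snapshot_download

# snapshot_download returns the local snapshot directory; listing it
# confirms that only config + safetensors files were fetched.
local_dir = snapshot_download(
    repo_id="canopylabs/orpheus-3b-0.1-ft",
    allow_patterns=["config.json", "*.safetensors", "model.safetensors.index.json"],
)
print(sorted(os.listdir(local_dir)))
```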
@@ -87,22 +117,19 @@ sd_pipe = StableDiffusionXLPipeline.from_pretrained(
 ).to(device)
 sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config)

-# Ensure that the text encoder is in half-precision if using CUDA.
 if torch.cuda.is_available():
     sd_pipe.text_encoder = sd_pipe.text_encoder.half()
-
-# Optional: compile the model for speedup if enabled
 if USE_TORCH_COMPILE:
     sd_pipe.compile()
-
-# Optional: offload parts of the model to CPU if needed
 if ENABLE_CPU_OFFLOAD:
     sd_pipe.enable_model_cpu_offload()

 MAX_SEED = np.iinfo(np.int32).max

+# ---------------------------
+# Utility functions
+# ---------------------------
 def save_image(img: Image.Image) -> str:
-    """Save a PIL image with a unique filename and return the path."""
     unique_name = str(uuid.uuid4()) + ".png"
     img.save(unique_name)
     return unique_name
@@ -113,10 +140,6 @@ def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
     return seed

 def progress_bar_html(label: str) -> str:
-    """
-    Returns an HTML snippet for a thin progress bar with a label.
-    The progress bar is styled as a dark red animated bar.
-    """
     return f'''
     <div style="display: flex; align-items: center;">
         <span style="margin-right: 10px; font-size: 14px;">{label}</span>
@@ -133,27 +156,29 @@ def progress_bar_html(label: str) -> str:
     '''

 def downsample_video(video_path):
-    """
-    Downsamples the video to 10 evenly spaced frames.
-    Each frame is returned as a PIL image along with its timestamp.
-    """
     vidcap = cv2.VideoCapture(video_path)
     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
     fps = vidcap.get(cv2.CAP_PROP_FPS)
     frames = []
-    # Sample 10 evenly spaced frames.
     frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
     for i in frame_indices:
         vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
         success, image = vidcap.read()
         if success:
-            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
             pil_image = Image.fromarray(image)
             timestamp = round(i / fps, 2)
             frames.append((pil_image, timestamp))
     vidcap.release()
     return frames

+def clean_chat_history(chat_history):
+    cleaned = []
+    for msg in chat_history:
+        if isinstance(msg, dict) and isinstance(msg.get("content"), str):
+            cleaned.append(msg)
+    return cleaned
+
 @spaces.GPU(duration=60, enable_queue=True)
 def generate_image_fn(
     prompt: str,
@@ -169,10 +194,8 @@ def generate_image_fn(
     num_images: int = 1,
     progress=gr.Progress(track_tqdm=True),
 ):
-    """Generate images using the SDXL pipeline."""
     seed = int(randomize_seed_fn(seed, randomize_seed))
     generator = torch.Generator(device=device).manual_seed(seed)
-
     options = {
         "prompt": [prompt] * num_images,
         "negative_prompt": [negative_prompt] * num_images if use_negative_prompt else None,
@@ -185,15 +208,12 @@
     }
     if use_resolution_binning:
         options["use_resolution_binning"] = True
-
     images = []
-    # Process in batches
     for i in range(0, num_images, BATCH_SIZE):
         batch_options = options.copy()
         batch_options["prompt"] = options["prompt"][i:i+BATCH_SIZE]
         if "negative_prompt" in batch_options and batch_options["negative_prompt"] is not None:
             batch_options["negative_prompt"] = options["negative_prompt"][i:i+BATCH_SIZE]
-        # Wrap the pipeline call in autocast if using CUDA
         if device.type == "cuda":
             with torch.autocast("cuda", dtype=torch.float16):
                 outputs = sd_pipe(**batch_options)
@@ -203,6 +223,93 @@
     image_paths = [save_image(img) for img in images]
     return image_paths, seed

+# ---------------------------
+# New TTS functions (SNAC/Orpheus pipeline)
+# ---------------------------
+def process_prompt(prompt, voice, tokenizer, device):
+    prompt = f"{voice}: {prompt}"
+    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
+    start_token = torch.tensor([[128259]], dtype=torch.int64)  # Start of human
+    end_tokens = torch.tensor([[128009, 128260]], dtype=torch.int64)  # End markers
+    modified_input_ids = torch.cat([start_token, input_ids, end_tokens], dim=1)
+    attention_mask = torch.ones_like(modified_input_ids)
+    return modified_input_ids.to(device), attention_mask.to(device)
+
+def parse_output(generated_ids):
+    token_to_find = 128257
+    token_to_remove = 128258
+    token_indices = (generated_ids == token_to_find).nonzero(as_tuple=True)
+    if len(token_indices[1]) > 0:
+        last_occurrence_idx = token_indices[1][-1].item()
+        cropped_tensor = generated_ids[:, last_occurrence_idx+1:]
+    else:
+        cropped_tensor = generated_ids
+    processed_rows = []
+    for row in cropped_tensor:
+        masked_row = row[row != token_to_remove]
+        processed_rows.append(masked_row)
+    code_lists = []
+    for row in processed_rows:
+        row_length = row.size(0)
+        new_length = (row_length // 7) * 7
+        trimmed_row = row[:new_length]
+        trimmed_row = [t - 128266 for t in trimmed_row]
+        code_lists.append(trimmed_row)
+    return code_lists[0]
+
+def redistribute_codes(code_list, snac_model):
+    device = next(snac_model.parameters()).device
+    layer_1 = []
+    layer_2 = []
+    layer_3 = []
+    for i in range((len(code_list)+1)//7):
+        layer_1.append(code_list[7*i])
+        layer_2.append(code_list[7*i+1]-4096)
+        layer_3.append(code_list[7*i+2]-(2*4096))
+        layer_3.append(code_list[7*i+3]-(3*4096))
+        layer_2.append(code_list[7*i+4]-(4*4096))
+        layer_3.append(code_list[7*i+5]-(5*4096))
+        layer_3.append(code_list[7*i+6]-(6*4096))
+    codes = [
+        torch.tensor(layer_1, device=device).unsqueeze(0),
+        torch.tensor(layer_2, device=device).unsqueeze(0),
+        torch.tensor(layer_3, device=device).unsqueeze(0)
+    ]
+    audio_hat = snac_model.decode(codes)
+    return audio_hat.detach().squeeze().cpu().numpy()
+
+@spaces.GPU()
+def generate_speech(text, voice, temperature, top_p, repetition_penalty, max_new_tokens, progress=gr.Progress()):
+    if not text.strip():
+        return None
+    try:
+        progress(0.1, "Processing text...")
+        input_ids, attention_mask = process_prompt(text, voice, orpheus_tts_tokenizer, tts_device)
+        progress(0.3, "Generating speech tokens...")
+        with torch.no_grad():
+            generated_ids = orpheus_tts_model.generate(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                max_new_tokens=max_new_tokens,
+                do_sample=True,
+                temperature=temperature,
+                top_p=top_p,
+                repetition_penalty=repetition_penalty,
+                num_return_sequences=1,
+                eos_token_id=128258,
+            )
+        progress(0.6, "Processing speech tokens...")
+        code_list = parse_output(generated_ids)
+        progress(0.8, "Converting to audio...")
+        audio_samples = redistribute_codes(code_list, snac_model)
+        return (24000, audio_samples)
+    except Exception as e:
+        print(f"Error generating speech: {e}")
+        return None
+
+# ---------------------------
+# Main generate function for the chat interface
+# ---------------------------
 @spaces.GPU
 def generate(
     input_dict: dict,
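A note on `parse_output` and `redistribute_codes` above: Orpheus emits audio codes in frames of seven tokens, and each position in a frame carries an offset of k * 4096 that maps it back into one of SNAC's three codebook layers (one coarse, two medium, four fine codes per frame). A self-contained sketch with dummy code values (numbers chosen only for illustration):

```python
# One dummy 7-token frame, already shifted down by 128266 as in parse_output.
frame = [10, 4106, 8212, 12318, 16424, 20530, 24636]

layer_1 = [frame[0]]                               # coarse: position 0
layer_2 = [frame[1] - 4096, frame[4] - 4 * 4096]   # medium: positions 1 and 4
layer_3 = [frame[2] - 2 * 4096, frame[3] - 3 * 4096,
           frame[5] - 5 * 4096, frame[6] - 6 * 4096]  # fine: positions 2, 3, 5, 6

print(layer_1, layer_2, layer_3)  # [10] [10, 40] [20, 30, 50, 60]
```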
@@ -214,11 +321,14 @@ def generate(
     repetition_penalty: float = 1.2,
 ):
     """
-    Generates chatbot responses with support for multimodal input,
-
-
-
-    - "@
+    Generates chatbot responses with support for multimodal input, image generation,
+    TTS, and LLM-augmented TTS.
+
+    Trigger commands:
+    - "@image": generate an image.
+    - "@video-infer": process video.
+    - "@<voice>-tts": directly convert text to speech.
+    - "@<voice>-llm": infer with the DeepHermes Llama model then convert to speech.
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])
@@ -226,7 +336,6 @@

     # Branch for image generation.
     if lower_text.startswith("@image"):
-        # Remove the "@image" tag and use the rest as prompt
         prompt = text[len("@image"):].strip()
         yield progress_bar_html("Generating Image")
         image_paths, used_seed = generate_image_fn(
@@ -245,18 +354,16 @@
         yield gr.Image(image_paths[0])
         return

-    #
+    # Branch for video processing.
     if lower_text.startswith("@video-infer"):
         prompt = text[len("@video-infer"):].strip()
         if files:
-            # Assume the first file is a video.
             video_path = files[0]
             frames = downsample_video(video_path)
             messages = [
                 {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
                 {"role": "user", "content": [{"type": "text", "text": prompt}]}
             ]
-            # Append each frame with its timestamp.
             for frame in frames:
                 image, timestamp = frame
                 image_path = f"video_frame_{uuid.uuid4().hex}.png"
|
|
287 |
buffer = ""
|
288 |
yield progress_bar_html("Processing video with Qwen2VL")
|
289 |
for new_text in streamer:
|
290 |
-
buffer += new_text
|
291 |
-
buffer = buffer.replace("<|im_end|>", "")
|
292 |
time.sleep(0.01)
|
293 |
yield buffer
|
294 |
return
|
295 |
|
296 |
-
#
|
297 |
-
|
298 |
-
|
299 |
-
|
300 |
-
|
301 |
-
|
302 |
-
|
303 |
-
|
304 |
-
|
305 |
-
|
306 |
-
|
307 |
-
|
308 |
-
|
309 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
310 |
|
|
|
|
|
|
|
311 |
if files:
|
312 |
if len(files) > 1:
|
313 |
images = [load_image(image) for image in files]
|
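A note on the tag routing above: each trigger is a plain prefix match on the lowercased input, and the first matching tag wins. A stripped-down sketch of the same logic (the `route` helper is hypothetical, written only for this illustration):

```python
# Dummy routing table mirroring the branch above; the TTS call is omitted.
tts_tags = {"@tara-tts": "tara", "@dan-tts": "dan", "@josh-tts": "josh", "@emma-tts": "emma"}

def route(text: str):
    lower_text = text.lower()
    for tag, voice in tts_tags.items():
        if lower_text.startswith(tag):
            return voice, text[len(tag):].strip()
    return None, text  # falls through to the default chat branch

print(route("@tara-tts Hello there"))  # ('tara', 'Hello there')
print(route("Plain chat message"))     # (None, 'Plain chat message')
```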
@@ -331,17 +470,16 @@
             buffer = ""
             yield progress_bar_html("Processing Qwen2VL")
             for new_text in streamer:
-                buffer += new_text
-                buffer = buffer.replace("<|im_end|>", "")
+                buffer += new_text.replace("<|im_end|>", "")
                 time.sleep(0.01)
                 yield buffer
     else:
-        input_ids =
+        input_ids = hermes_llm_tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
         if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
             input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
             gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
-        input_ids = input_ids.to(
-        streamer = TextIteratorStreamer(
+        input_ids = input_ids.to(hermes_llm_model.device)
+        streamer = TextIteratorStreamer(hermes_llm_tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
         generation_kwargs = {
             "input_ids": input_ids,
             "streamer": streamer,
|
|
353 |
"num_beams": 1,
|
354 |
"repetition_penalty": repetition_penalty,
|
355 |
}
|
356 |
-
t = Thread(target=
|
357 |
t.start()
|
358 |
outputs = []
|
359 |
-
yield progress_bar_html("Processing with
|
360 |
for new_text in streamer:
|
361 |
outputs.append(new_text)
|
362 |
yield "".join(outputs)
|
363 |
final_response = "".join(outputs)
|
364 |
yield final_response
|
365 |
-
if is_tts and voice:
|
366 |
-
output_file = asyncio.run(text_to_speech(final_response, voice))
|
367 |
-
yield gr.Audio(output_file, autoplay=True)
|
368 |
|
|
|
|
|
|
|
369 |
demo = gr.ChatInterface(
|
370 |
fn=generate,
|
371 |
additional_inputs=[
|
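A note on the streaming pattern above: `model.generate` blocks until completion, so it runs in a worker thread while `TextIteratorStreamer` yields decoded text incrementally to the UI loop. A self-contained sketch of the same pattern (the `gpt2` model id is a hypothetical stand-in, not the model used in this Space):

```python
from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_id = "gpt2"  # stand-in; any causal LM works the same way
tok = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

inputs = tok("Streaming works by", return_tensors="pt")
streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)

# generate() blocks, so it runs in a worker thread while the main
# thread consumes decoded text pieces as they become available.
Thread(target=model.generate, kwargs={**inputs, "streamer": streamer, "max_new_tokens": 20}).start()
for piece in streamer:
    print(piece, end="", flush=True)
```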
@@ -381,16 +519,17 @@ demo = gr.ChatInterface(
         [{"text": "@video-infer Describe the video", "files": ["examples/Missing.mp4"]}],
         ["@image Chocolate dripping from a donut"],
         ["Python Program for Array Rotation"],
-        ["@
+        ["@tara-tts Who is Nikola Tesla, and why did he die?"],
+        ["@emma-llm Explain the causes of rainbows"],
         [{"text": "Extract JSON from the image", "files": ["examples/document.jpg"]}],
         [{"text": "summarize the letter", "files": ["examples/1.png"]}],
-        ["@
+        ["@josh-tts What causes rainbows to form?"],
     ],
     cache_examples=False,
     type="messages",
-    description="# **
+    description="# **Llama Edge** \n`Use @video-infer, @image, @<voice>-tts, or @<voice>-llm triggers`",
     fill_height=True,
-    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple", placeholder="
+    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple", placeholder=" Use @tara-tts/@dan-tts for direct TTS or @tara-llm/@dan-llm for LLM+TTS, etc."),
     stop_btn="Stop Generation",
     multimodal=True,
 )