Spaces:

prithivMLmods
/

Imgscope-OCR-Mini

Running on Zero

App Files Files Community

prithivMLmods committed on Mar 20

Commit

55f563b

verified ·

1 Parent(s): 4b99608

Update app.py

Browse files

Files changed (1) hide show

app.py +307 -193

app.py CHANGED Viewed

@@ -1,33 +1,115 @@
-import gradio as gr
-from transformers import AutoProcessor, AutoModelForVision2Seq, TextIteratorStreamer
-from transformers.image_utils import load_image
-from threading import Thread
-import re
 import time
-import torch
 import spaces
-import ast
-import html
-import random
-import cv2
 import numpy as np
-import uuid
-from PIL import Image, ImageOps
-from docling_core.types.doc import DoclingDocument
-from docling_core.types.doc.document import DocTagsDocument
-# ---------------------------
-# Helper Functions
-# ---------------------------
 def progress_bar_html(label: str) -> str:
     return f'''
 <div style="display: flex; align-items: center;">
     <span style="margin-right: 10px; font-size: 14px;">{label}</span>
-    <div style="width: 110px; height: 5px; background-color: #F0FFF0; border-radius: 2px; overflow: hidden;">
-        <div style="width: 100%; height: 100%; background-color: #00FF00; animation: loading 1.5s linear infinite;"></div>
     </div>
 </div>
 <style>
@@ -38,218 +120,250 @@ def progress_bar_html(label: str) -> str:
 </style>
     '''
-def downsample_video(video_path, num_frames=10):
-    """Downsamples a video to a fixed number of evenly spaced frames."""
     vidcap = cv2.VideoCapture(video_path)
     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
     fps = vidcap.get(cv2.CAP_PROP_FPS)
     frames = []
-    if total_frames <= 0 or fps <= 0:
-        vidcap.release()
-        return frames
-    # Get indices for num_frames evenly spaced frames.
-    frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
     for i in frame_indices:
         vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
         success, image = vidcap.read()
         if success:
-            # Convert from BGR (OpenCV) to RGB (PIL) and then to PIL Image.
-            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
             pil_image = Image.fromarray(image)
             timestamp = round(i / fps, 2)
             frames.append((pil_image, timestamp))
     vidcap.release()
     return frames
-def add_random_padding(image, min_percent=0.1, max_percent=0.10):
-    image = image.convert("RGB")
-    width, height = image.size
-    pad_w_percent = random.uniform(min_percent, max_percent)
-    pad_h_percent = random.uniform(min_percent, max_percent)
-    pad_w = int(width * pad_w_percent)
-    pad_h = int(height * pad_h_percent)
-    corner_pixel = image.getpixel((0, 0))  # Top-left corner for padding color
-    padded_image = ImageOps.expand(image, border=(pad_w, pad_h, pad_w, pad_h), fill=corner_pixel)
-    return padded_image
-def normalize_values(text, target_max=500):
-    def normalize_list(values):
-        max_value = max(values) if values else 1
-        return [round((v / max_value) * target_max) for v in values]
-    def process_match(match):
-        num_list = ast.literal_eval(match.group(0))
-        normalized = normalize_list(num_list)
-        return "".join([f"<loc_{num}>" for num in normalized])
-    pattern = r"\[([\d\.\s,]+)\]"
-    normalized_text = re.sub(pattern, process_match, text)
-    return normalized_text
-# ---------------------------
-# Model & Processor Setup
-# ---------------------------
-processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview")
-model = AutoModelForVision2Seq.from_pretrained(
-    "ds4sd/SmolDocling-256M-preview",
-    torch_dtype=torch.bfloat16,
-).to("cuda")
-# ---------------------------
-# Main Inference Function
-# ---------------------------
 @spaces.GPU
-def model_inference(input_dict, history):
     text = input_dict["text"]
     files = input_dict.get("files", [])
-    # If there are files, check if any is a video
-    video_extensions = (".mp4", ".mov", ".avi", ".mkv", ".webm")
-    if files and any(str(f).lower().endswith(video_extensions) for f in files):
-        # -------- Video Inference Branch --------
-        video_file = files[0]  # Assume first file is a video
-        frames = downsample_video(video_file)
-        if not frames:
-            yield "Could not process video file."
-            return
-        images = [frame[0] for frame in frames]
-        timestamps = [frame[1] for frame in frames]
-        # Append frame timestamps to the query text.
-        text_with_timestamps = text + " " + " ".join([f"Frame at {ts} seconds." for ts in timestamps])
-        resulting_messages = [{
-            "role": "user",
-            "content": [{"type": "image"} for _ in range(len(images))] + [{"type": "text", "text": text_with_timestamps}]
-        }]
-        prompt = processor.apply_chat_template(resulting_messages, add_generation_prompt=True)
-        inputs = processor(text=prompt, images=[images], return_tensors="pt").to("cuda")
-        yield progress_bar_html("Processing video with SmolDocling")
-        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=False)
-        generation_args = dict(inputs, streamer=streamer, max_new_tokens=8192)
-        thread = Thread(target=model.generate, kwargs=generation_args)
         thread.start()
         buffer = ""
-        full_output = ""
         for new_text in streamer:
-            full_output += new_text
-            buffer += html.escape(new_text)
             yield buffer
-        cleaned_output = full_output.replace("<end_of_utterance>", "").strip()
-        if cleaned_output:
-            doctag_output = cleaned_output
-            yield cleaned_output
-        if any(tag in doctag_output for tag in ["<doctag>", "<otsl>", "<code>", "<chart>", "<formula>"]):
-            doc = DoclingDocument(name="Document")
-            if "<chart>" in doctag_output:
-                doctag_output = doctag_output.replace("<chart>", "<otsl>").replace("</chart>", "</otsl>")
-                doctag_output = re.sub(r'(<loc_500>)(?!.*<loc_500>)<[^>]+>', r'\1', doctag_output)
-            doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctag_output], images)
-            doc.load_from_doctags(doctags_doc)
-            yield f"**MD Output:**\n\n{doc.export_to_markdown()}"
         return
-    elif files:
-        # -------- Image Inference Branch --------
         if len(files) > 1:
-            if "OTSL" in text or "code" in text:
-                images = [add_random_padding(load_image(image)) for image in files]
-            else:
-                images = [load_image(image) for image in files]
         elif len(files) == 1:
-            if "OTSL" in text or "code" in text:
-                images = [add_random_padding(load_image(files[0]))]
-            else:
-                images = [load_image(files[0])]
-        resulting_messages = [{
             "role": "user",
-            "content": [{"type": "image"} for _ in range(len(images))] + [{"type": "text", "text": text}]
         }]
-        prompt = processor.apply_chat_template(resulting_messages, add_generation_prompt=True)
-        inputs = processor(text=prompt, images=[images], return_tensors="pt").to("cuda")
-        yield progress_bar_html("Processing with SmolDocling")
-        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=False)
-        generation_args = dict(inputs, streamer=streamer, max_new_tokens=8192)
-        thread = Thread(target=model.generate, kwargs=generation_args)
         thread.start()
-        yield "..."
         buffer = ""
-        full_output = ""
         for new_text in streamer:
-            full_output += new_text
-            buffer += html.escape(new_text)
             yield buffer
-        cleaned_output = full_output.replace("<end_of_utterance>", "").strip()
-        if cleaned_output:
-            doctag_output = cleaned_output
-            yield cleaned_output
-        if any(tag in doctag_output for tag in ["<doctag>", "<otsl>", "<code>", "<chart>", "<formula>"]):
-            doc = DoclingDocument(name="Document")
-            if "<chart>" in doctag_output:
-                doctag_output = doctag_output.replace("<chart>", "<otsl>").replace("</chart>", "</otsl>")
-                doctag_output = re.sub(r'(<loc_500>)(?!.*<loc_500>)<[^>]+>', r'\1', doctag_output)
-            doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctag_output], images)
-            doc.load_from_doctags(doctags_doc)
-            yield f"**MD Output:**\n\n{doc.export_to_markdown()}"
-        return
     else:
-        # -------- Text-Only Inference Branch --------
-        if text == "":
-            gr.Error("Please input a query and optionally image(s).")
-        resulting_messages = [{
-            "role": "user",
-            "content": [{"type": "text", "text": text}]
-        }]
-        prompt = processor.apply_chat_template(resulting_messages, add_generation_prompt=True)
-        inputs = processor(text=prompt, return_tensors="pt").to("cuda")
-        yield progress_bar_html("Processing text with SmolDocling")
-        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=False)
-        generation_args = dict(inputs, streamer=streamer, max_new_tokens=8192)
-        thread = Thread(target=model.generate, kwargs=generation_args)
-        thread.start()
-        yield "..."
-        buffer = ""
-        full_output = ""
         for new_text in streamer:
-            full_output += new_text
-            buffer += html.escape(new_text)
-            yield buffer
-        cleaned_output = full_output.replace("<end_of_utterance>", "").strip()
-        if cleaned_output:
-            yield cleaned_output
-        return
-# ---------------------------
-# Gradio Interface Setup
-# ---------------------------
-examples = [
-    [{"text": "Convert this page to docling.", "files": ["example_images/2d0fbcc50e88065a040a537b717620e964fb4453314b71d83f3ed3425addcef6.png"]}],
-    [{"text": "Convert this table to OTSL.", "files": ["example_images/image-2.jpg"]}],
-    [{"text": "Convert code to text.", "files": ["example_images/7666.jpg"]}],
-    [{"text": "Convert formula to latex.", "files": ["example_images/2433.jpg"]}],
-    [{"text": "Convert chart to OTSL.", "files": ["example_images/06236926002285.png"]}],
-    [{"text": "OCR the text in location [47, 531, 167, 565]", "files": ["example_images/s2w_example.png"]}],
-    [{"text": "Extract all section header elements on the page.", "files": ["example_images/paper_3.png"]}],
-    [{"text": "Identify element at location [123, 413, 1059, 1061]", "files": ["example_images/redhat.png"]}],
-    [{"text": "Convert this page to docling.", "files": ["example_images/gazette_de_france.jpg"]}],
-    # Example video file (if available)
-    [{"text": "Describe the events in this video.", "files": ["example_videos/sample_video.mp4"]}],
-]
 demo = gr.ChatInterface(
-    fn=model_inference,
-    title="SmolDocling-256M: Ultra-compact VLM for Document Conversion 💫",
-    description=(
-        "Play with [ds4sd/SmolDocling-256M-preview](https://huggingface.co/ds4sd/SmolDocling-256M-preview) in this demo. "
-        "Upload an image, video, and text query or try one of the examples. Each chat starts a new conversation."
-    ),
-    examples=examples,
-    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple"),
     stop_btn="Stop Generation",
     multimodal=True,
-    cache_examples=False
 )
 if __name__ == "__main__":
-    demo.launch(debug=True)

+import os
+import random
+import uuid
+import json
 import time
+import asyncio
+from threading import Thread
+import gradio as gr
 import spaces
+import torch
 import numpy as np
+from PIL import Image
+import cv2
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    TextIteratorStreamer,
+    Qwen2VLForConditionalGeneration,
+    AutoProcessor,
+)
+from transformers.image_utils import load_image
+from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
+# Generation limits: the ceiling and default of the "Max new tokens" slider, and
+# the prompt-length cap (overridable through the MAX_INPUT_TOKEN_LENGTH env variable).
+MAX_MAX_NEW_TOKENS = 2048
+DEFAULT_MAX_NEW_TOKENS = 1024
+MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
+# Use the first CUDA device when one is available, otherwise fall back to CPU.
+device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+# Load text-only model and tokenizer
+model_id = "prithivMLmods/FastThink-0.5B-Tiny"
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    device_map="auto",
+    torch_dtype=torch.bfloat16,
+)
+model.eval()
+# Load the vision-language model and its processor in float16, in eval mode.
+# NOTE(review): this model is moved to "cuda" unconditionally instead of to
+# `device` above, so the CPU fallback does not apply to it — confirm intended.
+MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
+processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+model_m = Qwen2VLForConditionalGeneration.from_pretrained(
+    MODEL_ID,
+    trust_remote_code=True,
+    torch_dtype=torch.float16
+).to("cuda").eval()
+def clean_chat_history(chat_history):
+    """
+    Return the history entries that are dicts with a string "content".
+
+    Every other entry is dropped, so the result can be used as a plain-text
+    conversation without tripping over non-string payloads.
+    """
+    return [
+        entry
+        for entry in chat_history
+        if isinstance(entry, dict) and isinstance(entry.get("content"), str)
+    ]
+# Environment variables and parameters for Stable Diffusion XL
+# Use : SG161222/RealVisXL_V4.0_Lightning or SG161222/RealVisXL_V5.0_Lightning
+MODEL_ID_SD = os.getenv("MODEL_VAL_PATH")  # SDXL Model repository path via env variable
+if not MODEL_ID_SD:
+    # Fail early with an actionable message instead of handing None to the loader.
+    raise RuntimeError(
+        "MODEL_VAL_PATH is not set; point it at an SDXL repository such as "
+        "SG161222/RealVisXL_V5.0_Lightning."
+    )
+MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096"))
+USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
+ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
+BATCH_SIZE = int(os.getenv("BATCH_SIZE", "1"))  # For batched image generation
+# Load the SDXL pipeline (float16 on CUDA, float32 on CPU) and move it to `device`.
+sd_pipe = StableDiffusionXLPipeline.from_pretrained(
+    MODEL_ID_SD,
+    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+    use_safetensors=True,
+    add_watermarker=False,
+).to(device)
+# Swap in the Euler-ancestral scheduler, reusing the loaded scheduler's config.
+sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config)
+# Ensure that the text encoder is in half-precision if using CUDA.
+if torch.cuda.is_available():
+    sd_pipe.text_encoder = sd_pipe.text_encoder.half()
+# Optional: compile the model for speedup if enabled
+if USE_TORCH_COMPILE:
+    # The pipeline object is not an nn.Module and has no compile() of its own;
+    # compile the UNet, which is the module executed on every denoising step.
+    sd_pipe.unet = torch.compile(sd_pipe.unet, mode="reduce-overhead", fullgraph=True)
+# Optional: offload parts of the model to CPU if needed
+# NOTE(review): the pipeline was already moved to `device` above; confirm that
+# enabling offload after an explicit .to() gives the intended memory savings.
+if ENABLE_CPU_OFFLOAD:
+    sd_pipe.enable_model_cpu_offload()
+# Largest seed value handed to the generator (upper bound of a signed 32-bit int).
+MAX_SEED = np.iinfo(np.int32).max
+def save_image(img: Image.Image) -> str:
+    """Write *img* to a PNG named after a fresh UUID4 and return that filename."""
+    filename = f"{uuid.uuid4()}.png"
+    img.save(filename)
+    return filename
+def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
+    """Return *seed* as given, or a random integer in [0, MAX_SEED] when *randomize_seed* is set."""
+    return random.randint(0, MAX_SEED) if randomize_seed else seed
 def progress_bar_html(label: str) -> str:
+    """
+    Returns an HTML snippet for a thin progress bar with a label.
+    The progress bar is styled as a hot pink (#FF69B4) animated bar.
+    """
     return f'''
 <div style="display: flex; align-items: center;">
     <span style="margin-right: 10px; font-size: 14px;">{label}</span>
+    <div style="width: 110px; height: 5px; background-color: #FFF0F5; border-radius: 2px; overflow: hidden;">
+        <div style="width: 100%; height: 100%; background-color: #FF69B4; animation: loading 1.5s linear infinite;"></div>
     </div>
 </div>
 <style>
 </style>
     '''
+def downsample_video(video_path, num_frames=10):
+    """
+    Downsamples the video to ``num_frames`` evenly spaced frames (10 by default).
+
+    Each frame is returned as a ``(PIL image, timestamp in seconds)`` tuple, in
+    playback order; frames that fail to decode are skipped. An empty list is
+    returned when the capture reports no frames or no frame rate.
+    """
     vidcap = cv2.VideoCapture(video_path)
     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
     fps = vidcap.get(cv2.CAP_PROP_FPS)
     frames = []
+    # Without usable metadata the indices below would be meaningless and the
+    # timestamps (index / fps) non-finite, so bail out with no frames instead.
+    if total_frames <= 0 or fps <= 0:
+        vidcap.release()
+        return frames
+    # Sample evenly spaced frame indices across the whole clip.
+    frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
     for i in frame_indices:
         vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
         success, image = vidcap.read()
         if success:
+            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB
             pil_image = Image.fromarray(image)
             timestamp = round(i / fps, 2)
             frames.append((pil_image, timestamp))
     vidcap.release()
     return frames
+@spaces.GPU(duration=60, enable_queue=True)
+def generate_image_fn(
+    prompt: str,
+    negative_prompt: str = "",
+    use_negative_prompt: bool = False,
+    seed: int = 1,
+    width: int = 1024,
+    height: int = 1024,
+    guidance_scale: float = 3,
+    num_inference_steps: int = 25,
+    randomize_seed: bool = False,
+    use_resolution_binning: bool = True,
+    num_images: int = 1,
+    progress=gr.Progress(track_tqdm=True),
+):
+    """
+    Generate images using the SDXL pipeline.
+
+    The prompt is repeated ``num_images`` times and sent to ``sd_pipe`` in
+    slices of ``BATCH_SIZE``; every resulting image is written to disk with
+    ``save_image``.
+
+    Args:
+        prompt: text prompt used for every image.
+        negative_prompt: only forwarded when ``use_negative_prompt`` is true.
+        seed: generator seed; replaced by a random one when ``randomize_seed``.
+        width, height, guidance_scale, num_inference_steps: forwarded to the
+            pipeline call unchanged.
+        use_resolution_binning: when true, forwarded as a pipeline keyword.
+        num_images: number of images to produce.
+        progress: Gradio progress tracker; not referenced in the body.
+
+    Returns:
+        ``(image_paths, seed)`` — the saved file names and the seed actually used.
+    """
+    seed = int(randomize_seed_fn(seed, randomize_seed))
+    generator = torch.Generator(device=device).manual_seed(seed)
+    options = {
+        "prompt": [prompt] * num_images,
+        "negative_prompt": [negative_prompt] * num_images if use_negative_prompt else None,
+        "width": width,
+        "height": height,
+        "guidance_scale": guidance_scale,
+        "num_inference_steps": num_inference_steps,
+        "generator": generator,
+        "output_type": "pil",
+    }
+    # NOTE(review): confirm the SDXL pipeline call accepts this keyword.
+    if use_resolution_binning:
+        options["use_resolution_binning"] = True
+    images = []
+    # Process in batches
+    for i in range(0, num_images, BATCH_SIZE):
+        # Shallow copy: every batch shares the same generator object.
+        batch_options = options.copy()
+        batch_options["prompt"] = options["prompt"][i:i+BATCH_SIZE]
+        # The key is always present (set above), so only the None check matters here.
+        if "negative_prompt" in batch_options and batch_options["negative_prompt"] is not None:
+            batch_options["negative_prompt"] = options["negative_prompt"][i:i+BATCH_SIZE]
+        # Wrap the pipeline call in autocast if using CUDA
+        if device.type == "cuda":
+            with torch.autocast("cuda", dtype=torch.float16):
+                outputs = sd_pipe(**batch_options)
+        else:
+            outputs = sd_pipe(**batch_options)
+        images.extend(outputs.images)
+    image_paths = [save_image(img) for img in images]
+    return image_paths, seed
 @spaces.GPU
+def generate(
+    input_dict: dict,
+    chat_history: list[dict],
+    max_new_tokens: int = 1024,
+    temperature: float = 0.6,
+    top_p: float = 0.9,
+    top_k: int = 50,
+    repetition_penalty: float = 1.2,
+):
+    """
+    Stream a chatbot reply for a multimodal chat message.
+
+    Routing, decided from the message text and its attachments:
+      - text starting with "@image": generate one image with the SDXL pipeline.
+      - text starting with "@video-infer": sample frames from the first attached
+        file and answer with Qwen2VL (runs on the prompt alone without files).
+      - any other message with files: image + text inference with Qwen2VL.
+      - text only: chat with the text-only model, using the cleaned history.
+
+    Args:
+        input_dict: message with a "text" entry and an optional "files" list.
+        chat_history: previous messages; only used by the text-only branch.
+        max_new_tokens: generation budget for the language-model branches.
+        temperature, top_p, top_k, repetition_penalty: sampling settings, used
+            by the video and text-only branches.
+
+    Yields:
+        An HTML progress bar first, then the growing response text (or a
+        ``gr.Image`` for "@image"). A video that yields no frames produces a
+        single error message instead.
+    """
     text = input_dict["text"]
     files = input_dict.get("files", [])
+    stripped_text = text.strip()
+    lower_text = stripped_text.lower()
+    # Branch for image generation.
+    if lower_text.startswith("@image"):
+        # Cut the tag from the stripped text: slicing the raw text would eat
+        # prompt characters whenever the message has leading whitespace.
+        prompt = stripped_text[len("@image"):].strip()
+        yield progress_bar_html("Generating Image")
+        image_paths, _ = generate_image_fn(
+            prompt=prompt,
+            negative_prompt="",
+            use_negative_prompt=False,
+            seed=1,
+            width=1024,
+            height=1024,
+            guidance_scale=3,
+            num_inference_steps=25,
+            randomize_seed=True,
+            use_resolution_binning=True,
+            num_images=1,
+        )
+        yield gr.Image(image_paths[0])
+        return
+    # Branch for video processing with Qwen2VL.
+    if lower_text.startswith("@video-infer"):
+        prompt = stripped_text[len("@video-infer"):].strip()
+        messages = [
+            {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
+            {"role": "user", "content": [{"type": "text", "text": prompt}]}
+        ]
+        frames = []
+        if files:
+            # Assume the first file is a video.
+            frames = downsample_video(files[0])
+            if not frames:
+                # Nothing could be decoded; do not run the model on an empty clip.
+                yield "Could not process video file."
+                return
+        frame_paths = []
+        try:
+            # Append each frame with its timestamp.
+            for image, timestamp in frames:
+                image_path = f"video_frame_{uuid.uuid4().hex}.png"
+                frame_paths.append(image_path)
+                image.save(image_path)
+                messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
+                messages[1]["content"].append({"type": "image", "url": image_path})
+            inputs = processor.apply_chat_template(
+                messages, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt"
+            ).to("cuda")
+        finally:
+            # The frames are only needed while the inputs are built, so the
+            # temporary PNGs are removed instead of piling up on disk.
+            for frame_path in frame_paths:
+                try:
+                    os.remove(frame_path)
+                except OSError:
+                    pass  # best-effort cleanup; a leftover file is harmless
+        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+        generation_kwargs = {
+            **inputs,
+            "streamer": streamer,
+            "max_new_tokens": max_new_tokens,
+            "do_sample": True,
+            "temperature": temperature,
+            "top_p": top_p,
+            "top_k": top_k,
+            "repetition_penalty": repetition_penalty,
+        }
+        # Generation runs in a worker thread so tokens can be streamed from here.
+        thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
         thread.start()
         buffer = ""
+        yield progress_bar_html("Processing video with Qwen2VL")
         for new_text in streamer:
+            buffer += new_text
+            buffer = buffer.replace("<|im_end|>", "")
+            time.sleep(0.01)
             yield buffer
         return
+    # Process as text and/or image input.
+    if files:
+        images = [load_image(image) for image in files]
+        messages = [{
             "role": "user",
+            "content": [
+                *[{"type": "image", "image": image} for image in images],
+                {"type": "text", "text": text},
+            ]
         }]
+        prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        inputs = processor(text=[prompt_full], images=images, return_tensors="pt", padding=True).to("cuda")
+        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+        generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
+        thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
         thread.start()
         buffer = ""
+        # This branch runs the Qwen2VL OCR model, so label it accordingly.
+        yield progress_bar_html("Processing with Qwen2VL Ocr")
         for new_text in streamer:
+            buffer += new_text
+            buffer = buffer.replace("<|im_end|>", "")
+            time.sleep(0.01)
             yield buffer
     else:
+        conversation = clean_chat_history(chat_history)
+        conversation.append({"role": "user", "content": text})
+        input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
+        # Keep only the most recent tokens when the conversation is too long.
+        if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
+            input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
+            gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
+        input_ids = input_ids.to(model.device)
+        streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
+        generation_kwargs = {
+            "input_ids": input_ids,
+            "streamer": streamer,
+            "max_new_tokens": max_new_tokens,
+            "do_sample": True,
+            "top_p": top_p,
+            "top_k": top_k,
+            "temperature": temperature,
+            "num_beams": 1,
+            "repetition_penalty": repetition_penalty,
+        }
+        t = Thread(target=model.generate, kwargs=generation_kwargs)
+        t.start()
+        outputs = []
+        # This branch runs the text-only model, not the vision model.
+        yield progress_bar_html("Thinking...")
         for new_text in streamer:
+            outputs.append(new_text)
+            yield "".join(outputs)
+        final_response = "".join(outputs)
+        yield final_response
+# Chat UI: a multimodal textbox (images and videos, multiple files) wired to
+# `generate`. The sliders below are listed in the same order as the sampling
+# parameters of `generate` (max_new_tokens, temperature, top_p, top_k,
+# repetition_penalty).
 demo = gr.ChatInterface(
+    fn=generate,
+    additional_inputs=[
+        gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS),
+        gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6),
+        gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9),
+        gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50),
+        gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2),
+    ],
+    # One example per route: video inference, image generation, plain text, image + text.
+    examples=[
+        [{"text": "@video-infer Describe the Ad", "files": ["examples/coca.mp4"]}],
+        [{"text": "@video-infer Summarize the event in video", "files": ["examples/sky.mp4"]}],
+        [{"text": "@video-infer Describe the video", "files": ["examples/Missing.mp4"]}],
+        ["@image Chocolate dripping from a donut"],
+        ["Python Program for Array Rotation"],
+        [{"text": "Extract JSON from the image", "files": ["examples/document.jpg"]}],
+        [{"text": "summarize the letter", "files": ["examples/1.png"]}],
+    ],
+    cache_examples=False,
+    type="messages",
+    description="# **Llama Edge** \n`@video-infer 'prompt..', @image`",
+    fill_height=True,
+    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple", placeholder="‎ @image for image gen, @video-infer for video, default [text, vision]"),
     stop_btn="Stop Generation",
     multimodal=True,
 )
+# Start the app with a request queue capped at 20 pending jobs.
+# NOTE(review): share=True asks for a public share link; confirm it is wanted
+# when the app is hosted rather than run locally.
 if __name__ == "__main__":
+    demo.queue(max_size=20).launch(share=True)