Spaces:

prithivMLmods
/

Doc-VLMs-OCR

Running on Zero

App Files Files Community

prithivMLmods committed 5 days ago

Commit

e817668

verified ·

1 Parent(s): 3d4caeb

Update app.py

Browse files

Files changed (1) hide show

app.py +373 -223

app.py CHANGED Viewed

@@ -10,19 +10,24 @@ import gradio as gr
 import spaces
 import torch
 import numpy as np
-from PIL import Image
 import cv2
-import edge_tts
 from transformers import (
-    AutoModelForCausalLM,
-    AutoTokenizer,
-    TextIteratorStreamer,
     Qwen2VLForConditionalGeneration,
     AutoProcessor,
 )
 from transformers.image_utils import load_image
 # Constants for text generation
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
@@ -30,271 +35,416 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-# Load text-only model and tokenizer
-model_id = "prithivMLmods/Galactic-Qwen-14B-Exp2"
-tokenizer = AutoTokenizer.from_pretrained(model_id)
-model = AutoModelForCausalLM.from_pretrained(
-    model_id,
-    device_map="auto",
-    torch_dtype=torch.bfloat16,
-)
-model.eval()
-# Load multimodal processor and model
-MODEL_ID = "prithivMLmods/Imgscope-OCR-2B-0527"
-processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 model_m = Qwen2VLForConditionalGeneration.from_pretrained(
-    MODEL_ID,
     trust_remote_code=True,
     torch_dtype=torch.float16
-).to("cuda").eval()
-# Edge TTS voices mapping for new tags.
-TTS_VOICE_MAP = {
-    "@jennyneural": "en-US-JennyNeural",
-    "@guyneural": "en-US-GuyNeural",
-    "@palomaneural": "es-US-PalomaNeural",
-    "@alonsoneural": "es-US-AlonsoNeural",
-    "@madhurneural": "hi-IN-MadhurNeural"
-}
-async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
-    """
-    Convert text to speech using Edge TTS and save as MP3.
-    """
-    communicate = edge_tts.Communicate(text, voice)
-    await communicate.save(output_file)
-    return output_file
-def clean_chat_history(chat_history):
-    """
-    Filter out any chat entries whose "content" is not a string.
-    This helps prevent errors when concatenating previous messages.
-    """
-    cleaned = []
-    for msg in chat_history:
-        if isinstance(msg, dict) and isinstance(msg.get("content"), str):
-            cleaned.append(msg)
-    return cleaned
 def downsample_video(video_path):
-    """
-    Downsamples the video to 10 evenly spaced frames.
-    Each frame is returned as a PIL image along with its timestamp.
-    """
     vidcap = cv2.VideoCapture(video_path)
     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
     fps = vidcap.get(cv2.CAP_PROP_FPS)
     frames = []
-    # Sample 10 evenly spaced frames.
     frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
     for i in frame_indices:
         vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
         success, image = vidcap.read()
         if success:
-            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB
             pil_image = Image.fromarray(image)
             timestamp = round(i / fps, 2)
             frames.append((pil_image, timestamp))
     vidcap.release()
     return frames
-def progress_bar_html(label: str) -> str:
-    """
-    Returns an HTML snippet for a thin progress bar with a label.
-    The progress bar is styled as a light cyan animated bar.
-    """
-    return f'''
-<div style="display: flex; align-items: center;">
-    <span style="margin-right: 10px; font-size: 14px;">{label}</span>
-    <div style="width: 110px; height: 5px; background-color: #B0E0E6; border-radius: 2px; overflow: hidden;">
-        <div style="width: 100%; height: 100%; background-color: #00FFFF; animation: loading 1.5s linear infinite;"></div>
-    </div>
-</div>
-<style>
-@keyframes loading {{
-    0% {{ transform: translateX(-100%); }}
-    100% {{ transform: translateX(100%); }}
-}}
-</style>
-    '''
 @spaces.GPU
-def generate(input_dict: dict, chat_history: list[dict],
-             max_new_tokens: int = 1024,
-             temperature: float = 0.6,
-             top_p: float = 0.9,
-             top_k: int = 50,
-             repetition_penalty: float = 1.2):
-    """
-    Generates chatbot responses with support for multimodal input, video processing,
-    and Edge TTS when using the new tags @JennyNeural or @GuyNeural.
-    Special command:
-      - "@video-infer": triggers video processing using Imgscope-OCR
-    """
-    text = input_dict["text"]
-    files = input_dict.get("files", [])
-    lower_text = text.strip().lower()
-    # Check for TTS tag in the prompt.
-    tts_voice = None
-    for tag, voice in TTS_VOICE_MAP.items():
-        if lower_text.startswith(tag):
-            tts_voice = voice
-            text = text[len(tag):].strip()  # Remove the tag from the prompt.
-            break
-    # Branch for video processing with Callisto OCR3.
-    if lower_text.startswith("@video-infer"):
-        prompt = text[len("@video-infer"):].strip() if not tts_voice else text
-        if files:
-            # Assume the first file is a video.
-            video_path = files[0]
-            frames = downsample_video(video_path)
-            messages = [
-                {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
-                {"role": "user", "content": [{"type": "text", "text": prompt}]}
-            ]
-            # Append each frame with its timestamp.
-            for frame in frames:
-                image, timestamp = frame
-                image_path = f"video_frame_{uuid.uuid4().hex}.png"
-                image.save(image_path)
-                messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
-                messages[1]["content"].append({"type": "image", "url": image_path})
         else:
-            messages = [
-                {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
-                {"role": "user", "content": [{"type": "text", "text": prompt}]}
-            ]
-        # Enable truncation to avoid token/feature mismatch.
-        inputs = processor.apply_chat_template(
-            messages,
-            tokenize=True,
-            add_generation_prompt=True,
-            return_dict=True,
-            return_tensors="pt",
-            truncation=True,
-            max_length=MAX_INPUT_TOKEN_LENGTH
-        ).to("cuda")
         streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
         generation_kwargs = {
             **inputs,
             "streamer": streamer,
             "max_new_tokens": max_new_tokens,
-            "do_sample": True,
             "temperature": temperature,
             "top_p": top_p,
             "top_k": top_k,
             "repetition_penalty": repetition_penalty,
         }
-        thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
         thread.start()
         buffer = ""
-        yield progress_bar_html("Processing video with Imgscope-OCR")
         for new_text in streamer:
-            buffer += new_text
-            buffer = buffer.replace("<|im_end|>", "")
-            time.sleep(0.01)
             yield buffer
-        return
-    # Multimodal processing when files are provided.
-    if files:
-        if len(files) > 1:
-            images = [load_image(image) for image in files]
-        elif len(files) == 1:
-            images = [load_image(files[0])]
         else:
-            images = []
-        messages = [{
-            "role": "user",
-            "content": [
-                *[{"type": "image", "image": image} for image in images],
-                {"type": "text", "text": text},
-            ]
-        }]
-        prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        # Enable truncation explicitly here as well.
-        inputs = processor(
-            text=[prompt_full],
-            images=images,
-            return_tensors="pt",
-            padding=True,
-            truncation=True,
-            max_length=MAX_INPUT_TOKEN_LENGTH
-        ).to("cuda")
         streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-        generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
-        thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
-        thread.start()
-        buffer = ""
-        yield progress_bar_html("Processing image with Imgscope-OCR")
-        for new_text in streamer:
-            buffer += new_text
-            buffer = buffer.replace("<|im_end|>", "")
-            time.sleep(0.01)
-            yield buffer
-    else:
-        # Normal text conversation processing with Pocket Llama.
-        conversation = clean_chat_history(chat_history)
-        conversation.append({"role": "user", "content": text})
-        input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
-        if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
-            input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
-            gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
-        input_ids = input_ids.to(model.device)
-        streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
         generation_kwargs = {
-            "input_ids": input_ids,
             "streamer": streamer,
             "max_new_tokens": max_new_tokens,
-            "do_sample": True,
             "top_p": top_p,
             "top_k": top_k,
-            "temperature": temperature,
-            "num_beams": 1,
             "repetition_penalty": repetition_penalty,
         }
-        t = Thread(target=model.generate, kwargs=generation_kwargs)
-        t.start()
-        outputs = []
-        yield progress_bar_html("Processing With Galactic Qwen")
         for new_text in streamer:
-            outputs.append(new_text)
-            yield "".join(outputs)
-        final_response = "".join(outputs)
-        yield final_response
-        # If a TTS voice was specified, convert the final response to speech.
-        if tts_voice:
-            output_file = asyncio.run(text_to_speech(final_response, tts_voice))
-            yield gr.Audio(output_file, autoplay=True)
-# Create the Gradio ChatInterface with the custom CSS applied
-demo = gr.ChatInterface(
-    fn=generate,
-    additional_inputs=[
-        gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS),
-        gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6),
-        gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9),
-        gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50),
-        gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2),
-    ],
-    examples=[
-        ["Write the code that converts temperatures between Celsius and Fahrenheit in short"],
-        [{"text": "Create a short story based on the image.", "files": ["examples/1.jpg"]}],
-        ["@JennyNeural Who was Nikola Tesla and what were his contributions?"],
-        [{"text": "@video-infer Describe the video", "files": ["examples/Missing.mp4"]}]
-    ],
-    cache_examples=False,
-    description="# **Imgscope-OCR**",
-    type="messages",
-    fill_height=True,
-    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple"),
-    stop_btn="Stop Generation",
-    multimodal=True,
-)
 if __name__ == "__main__":
-    demo.queue(max_size=20).launch(share=True)

 import spaces
 import torch
 import numpy as np
+from PIL import Image, ImageOps
 import cv2
 from transformers import (
     Qwen2VLForConditionalGeneration,
+    VisionEncoderDecoderModel,
+    AutoModelForVision2Seq,
     AutoProcessor,
+    TextIteratorStreamer,
 )
 from transformers.image_utils import load_image
+from docling_core.types.doc import DoclingDocument, DocTagsDocument
+import re
+import ast
+import html
 # Constants for text generation
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+# Load olmOCR-7B-0225-preview
+# Each model below is loaded with float16 weights, moved to `device` (CUDA
+# when available, otherwise CPU) and switched to eval mode at import time.
+# NOTE(review): trust_remote_code=True allows code shipped in the model
+# repository to run while loading; consider pinning a revision.
+MODEL_ID_M = "allenai/olmOCR-7B-0225-preview"
+processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
 model_m = Qwen2VLForConditionalGeneration.from_pretrained(
+    MODEL_ID_M,
     trust_remote_code=True,
     torch_dtype=torch.float16
+).to(device).eval()
+# Load ByteDance's Dolphin
+# Loaded as an encoder-decoder model; see model_chat for how it is prompted.
+MODEL_ID_K = "ByteDance/Dolphin"
+processor_k = AutoProcessor.from_pretrained(MODEL_ID_K, trust_remote_code=True)
+model_k = VisionEncoderDecoderModel.from_pretrained(
+    MODEL_ID_K,
+    trust_remote_code=True,
+    torch_dtype=torch.float16
+).to(device).eval()
+# Load SmolDocling-256M-preview
+# Its output is post-processed into markdown through docling_core in the
+# generate_* functions when it contains doctags-style tags.
+MODEL_ID_X = "ds4sd/SmolDocling-256M-preview"
+processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
+model_x = AutoModelForVision2Seq.from_pretrained(
+    MODEL_ID_X,
+    trust_remote_code=True,
+    torch_dtype=torch.float16
+).to(device).eval()
+# Preprocessing functions for SmolDocling-256M
+def add_random_padding(image, min_percent=0.1, max_percent=0.10):
+    """Return an RGB copy of `image` surrounded by a border in its top-left pixel colour.
+
+    The left/right border is a fraction of the image width and the top/bottom
+    border a fraction of the image height; each fraction is drawn
+    independently with ``random.uniform(min_percent, max_percent)``. With the
+    default arguments both bounds are 0.1.
+    """
+    rgb = image.convert("RGB")
+    w, h = rgb.size
+    # Draw the horizontal fraction first, then the vertical one.
+    frac_w = random.uniform(min_percent, max_percent)
+    frac_h = random.uniform(min_percent, max_percent)
+    dx = int(w * frac_w)
+    dy = int(h * frac_h)
+    # Border order is (left, top, right, bottom).
+    return ImageOps.expand(rgb, border=(dx, dy, dx, dy), fill=rgb.getpixel((0, 0)))
+def normalize_values(text, target_max=500):
+    """Rewrite bracketed number lists in `text` as `<loc_N>` tags scaled to `target_max`.
+
+    Every ``[n1, n2, ...]`` group made of digits, dots, whitespace and commas
+    is parsed as a list, each value is scaled so that the largest becomes
+    `target_max`, and the group is replaced by one ``<loc_N>`` tag per value.
+
+    Args:
+        text: Prompt text that may contain bracketed coordinate lists.
+        target_max: Value the largest number of each list is mapped to.
+
+    Returns:
+        The text with every parseable list replaced. Groups that match the
+        pattern but are not valid lists of numbers (for example ``[1 2]`` or
+        ``[...]``) are left unchanged instead of raising.
+    """
+    def normalize_list(values):
+        # An all-zero list has a maximum of 0; scale by 1 in that case so the
+        # division cannot fail (the result is then all zeros).
+        max_value = max(values, default=0) or 1
+        return [round((v / max_value) * target_max) for v in values]
+    def process_match(match):
+        raw = match.group(0)
+        try:
+            num_list = ast.literal_eval(raw)
+        except (ValueError, SyntaxError):
+            return raw
+        # "[...]" parses to [Ellipsis]; only plain numbers can be scaled.
+        if not all(isinstance(v, (int, float)) for v in num_list):
+            return raw
+        return "".join(f"<loc_{num}>" for num in normalize_list(num_list))
+    pattern = r"\[([\d\.\s,]+)\]"
+    return re.sub(pattern, process_match, text)
 def downsample_video(video_path):
+    """Sample up to 10 evenly spaced frames from a video.
+
+    Args:
+        video_path: Path of the video file, passed to ``cv2.VideoCapture``.
+
+    Returns:
+        A list of ``(PIL.Image, timestamp_seconds)`` tuples in frame order.
+        Frames that cannot be read are skipped, so the list is empty when the
+        capture reports no frames or nothing can be decoded.
+    """
     vidcap = cv2.VideoCapture(video_path)
+    try:
+        total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
+        if total_frames <= 0:
+            return []
+        fps = vidcap.get(cv2.CAP_PROP_FPS)
+        frames = []
+        frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
+        for i in frame_indices:
+            vidcap.set(cv2.CAP_PROP_POS_FRAMES, int(i))
+            success, image = vidcap.read()
+            if not success:
+                continue
+            # OpenCV decodes to BGR; convert so the PIL image is RGB.
+            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+            pil_image = Image.fromarray(image)
+            # CAP_PROP_FPS can come back as 0; report 0.0 rather than
+            # dividing by zero.
+            timestamp = round(int(i) / fps, 2) if fps > 0 else 0.0
+            frames.append((pil_image, timestamp))
+        return frames
+    finally:
+        # Release the capture handle even if decoding raises.
+        vidcap.release()
+# Dolphin-specific functions
+def model_chat(prompt, image):
+    """Run one Dolphin generation for `prompt` on `image` and return the cleaned text.
+
+    The image is encoded with `processor_k`, the prompt is wrapped as
+    ``<s>{prompt} <Answer/>`` and supplied as decoder input to `model_k`, and
+    the decoded sequence is returned with the prompt, ``<pad>`` and ``</s>``
+    removed and surrounding whitespace stripped.
+    """
+    processor = processor_k
+    model = model_k
+    # NOTE(review): this local name shadows the module-level `device`.
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    inputs = processor(image, return_tensors="pt").to(device)
+    # Cast pixels to float16; model_k is loaded with torch_dtype=torch.float16.
+    pixel_values = inputs.pixel_values.half()
+    # The wrapper tokens are written by hand, so the tokenizer must not add
+    # its own special tokens.
+    prompt_inputs = processor.tokenizer(
+        f"<s>{prompt} <Answer/>",
+        add_special_tokens=False,
+        return_tensors="pt"
+    ).to(device)
+    # Deterministic decoding: sampling is off and a single beam is used.
+    outputs = model.generate(
+        pixel_values=pixel_values,
+        decoder_input_ids=prompt_inputs.input_ids,
+        decoder_attention_mask=prompt_inputs.attention_mask,
+        min_length=1,
+        max_length=4096,
+        pad_token_id=processor.tokenizer.pad_token_id,
+        eos_token_id=processor.tokenizer.eos_token_id,
+        use_cache=True,
+        bad_words_ids=[[processor.tokenizer.unk_token_id]],
+        return_dict_in_generate=True,
+        do_sample=False,
+        num_beams=1,
+        repetition_penalty=1.1
+    )
+    # Special tokens are kept while decoding and stripped by hand below, so
+    # the echoed prompt can be removed as an exact string.
+    sequence = processor.tokenizer.batch_decode(outputs.sequences, skip_special_tokens=False)[0]
+    cleaned = sequence.replace(f"<s>{prompt} <Answer/>", "").replace("<pad>", "").replace("</s>", "").strip()
+    return cleaned
+def process_elements(layout_results, image):
+    """Parse Dolphin layout output and recognise each element it lists.
+
+    Args:
+        layout_results: String expected to hold a Python literal list of
+            elements, each either ``(bbox, label)`` with ``bbox`` a 4-item
+            sequence, or a flat ``(x1, y1, x2, y2, label)`` tuple.
+            NOTE(review): this is placeholder parsing; confirm the format
+            Dolphin actually emits.
+        image: PIL image the bounding boxes refer to.
+
+    Returns:
+        A list of dicts with keys ``label``, ``bbox``, ``text`` and
+        ``reading_order``. Elements that cannot be unpacked, cropped or
+        recognised are logged and skipped; unparseable input yields ``[]``.
+    """
+    try:
+        elements = ast.literal_eval(layout_results)
+    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
+        elements = []  # Fallback if parsing fails
+    if not isinstance(elements, (list, tuple)):
+        elements = []
+    # Labels that need a recognition pass, mapped to the prompt used for it.
+    prompts = {
+        "text": "Read text in the image.",
+        "table": "Parse the table in the image.",
+    }
+    recognition_results = []
+    reading_order = 0
+    for element in elements:
+        try:
+            # Unpack inside the try so one malformed element is skipped
+            # instead of aborting the whole page.
+            if len(element) == 5:
+                *bbox, label = element
+            else:
+                bbox, label = element
+            x1, y1, x2, y2 = map(int, bbox)
+            cropped = image.crop((x1, y1, x2, y2))
+            if cropped.size[0] > 0 and cropped.size[1] > 0:
+                if label in ("text", "table"):
+                    text = model_chat(prompts[label], cropped).strip()
+                elif label == "figure":
+                    text = "[Figure]"  # Placeholder for figure content
+                else:
+                    text = None  # Unknown labels produce no entry
+                if text is not None:
+                    recognition_results.append({
+                        "label": label,
+                        "bbox": [x1, y1, x2, y2],
+                        "text": text,
+                        "reading_order": reading_order
+                    })
+            reading_order += 1
+        except Exception as e:
+            # Best-effort per element: log and move on to the next one.
+            print(f"Error processing element: {e}")
+            continue
+    return recognition_results
+def generate_markdown(recognition_results):
+    """Render recognised elements as markdown, ordered by `reading_order`.
+
+    Text and figure entries are emitted as-is, table entries are preceded by
+    a ``**Table:**`` line, and entries with any other label are ignored.
+    Entries are separated by blank lines and the result is stripped.
+    """
+    chunks = []
+    ordered = sorted(recognition_results, key=lambda entry: entry["reading_order"])
+    for entry in ordered:
+        kind = entry["label"]
+        if kind == "table":
+            chunks.append(f"**Table:**\n{entry['text']}\n\n")
+        elif kind in ("text", "figure"):
+            chunks.append(f"{entry['text']}\n\n")
+    return "".join(chunks).strip()
+def process_image_with_dolphin(image):
+    """Run the Dolphin pipeline on one image and return its markdown.
+
+    Asks the model for the layout, recognises each listed element, then
+    renders the recognised elements as markdown.
+    """
+    layout = model_chat("Parse the reading order of this document.", image)
+    return generate_markdown(process_elements(layout, image))
 @spaces.GPU
+def generate_image(model_name: str, text: str, image: Image.Image,
+                   max_new_tokens: int = 1024,
+                   temperature: float = 0.6,
+                   top_p: float = 0.9,
+                   top_k: int = 50,
+                   repetition_penalty: float = 1.2):
+    """Generate responses for image input using the selected model.
+
+    This is a generator. For "ByteDance-s-Dolphin" it yields a single
+    markdown string; for the other two models it yields the accumulated text
+    after every streamed chunk, and for SmolDocling one further final value
+    (converted markdown, or the cleaned raw output). A message string is
+    yielded instead when no image is given or the model name is unknown.
+    The sampling arguments are only used on the streaming path.
+    """
+    if model_name == "ByteDance-s-Dolphin":
+        if image is None:
+            yield "Please upload an image."
+            return
+        markdown_content = process_image_with_dolphin(image)
+        yield markdown_content
+    else:
+        # Existing logic for other models
+        if model_name == "olmOCR-7B-0225-preview":
+            processor = processor_m
+            model = model_m
+        elif model_name == "SmolDocling-256M-preview":
+            processor = processor_x
+            model = model_x
         else:
+            yield "Invalid model selected."
+            return
+        if image is None:
+            yield "Please upload an image."
+            return
+        images = [image]
+        # SmolDocling-only preprocessing, selected by keywords in the query.
+        if model_name == "SmolDocling-256M-preview":
+            if "OTSL" in text or "code" in text:
+                images = [add_random_padding(img) for img in images]
+            if "OCR at text at" in text or "Identify element" in text or "formula" in text:
+                text = normalize_values(text, target_max=500)
+        # One image placeholder per image, followed by the text query.
+        messages = [
+            {
+                "role": "user",
+                "content": [{"type": "image"} for _ in images] + [
+                    {"type": "text", "text": text}
+                ]
+            }
+        ]
+        prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
+        inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
         streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+        # NOTE(review): temperature/top_p/top_k are passed without do_sample;
+        # confirm the model's generation config enables sampling, otherwise
+        # these values presumably have no effect.
         generation_kwargs = {
             **inputs,
             "streamer": streamer,
             "max_new_tokens": max_new_tokens,
             "temperature": temperature,
             "top_p": top_p,
             "top_k": top_k,
             "repetition_penalty": repetition_penalty,
         }
+        # generate() runs in a worker thread so this generator can consume
+        # the streamer while tokens are still being produced.
+        thread = Thread(target=model.generate, kwargs=generation_kwargs)
         thread.start()
         buffer = ""
+        # full_output keeps the unfiltered stream for post-processing.
+        full_output = ""
         for new_text in streamer:
+            full_output += new_text
+            buffer += new_text.replace("<|im_end|>", "")
             yield buffer
+        if model_name == "SmolDocling-256M-preview":
+            cleaned_output = full_output.replace("<end_of_utterance>", "").strip()
+            # Convert to markdown only when doctags-style tags are present.
+            if any(tag in cleaned_output for tag in ["<doctag>", "<otsl>", "<code>", "<chart>", "<formula>"]):
+                if "<chart>" in cleaned_output:
+                    # Charts are re-tagged as OTSL tables before conversion.
+                    cleaned_output = cleaned_output.replace("<chart>", "<otsl>").replace("</chart>", "</otsl>")
+                    # Drops the single tag that follows the last <loc_500>.
+                    cleaned_output = re.sub(r'(<loc_500>)(?!.*<loc_500>)<[^>]+>', r'\1', cleaned_output)
+                doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([cleaned_output], images)
+                doc = DoclingDocument.load_from_doctags(doctags_doc, document_name="Document")
+                markdown_output = doc.export_to_markdown()
+                yield f"**MD Output:**\n\n{markdown_output}"
+            else:
+                yield cleaned_output
+@spaces.GPU
+def generate_video(model_name: str, text: str, video_path: str,
+                   max_new_tokens: int = 1024,
+                   temperature: float = 0.6,
+                   top_p: float = 0.9,
+                   top_k: int = 50,
+                   repetition_penalty: float = 1.2):
+    """Generate responses for video input using the selected model.
+
+    This is a generator. The video is reduced to frames with
+    downsample_video. For "ByteDance-s-Dolphin" each frame is processed
+    separately and the per-frame markdown is joined with blank lines and
+    yielded once; `text` is not used on that path. For the other two models
+    all frames go into a single prompt and the accumulated text is yielded
+    after every streamed chunk, followed for SmolDocling by one final value.
+    A message string is yielded instead when no video is given or the model
+    name is unknown.
+    """
+    if model_name == "ByteDance-s-Dolphin":
+        if video_path is None:
+            yield "Please upload a video."
+            return
+        frames = downsample_video(video_path)
+        markdown_contents = []
+        # Timestamps returned by downsample_video are discarded here.
+        for frame, _ in frames:
+            markdown_content = process_image_with_dolphin(frame)
+            markdown_contents.append(markdown_content)
+        combined_markdown = "\n\n".join(markdown_contents)
+        yield combined_markdown
+    else:
+        # Existing logic for other models
+        if model_name == "olmOCR-7B-0225-preview":
+            processor = processor_m
+            model = model_m
+        elif model_name == "SmolDocling-256M-preview":
+            processor = processor_x
+            model = model_x
         else:
+            yield "Invalid model selected."
+            return
+        if video_path is None:
+            yield "Please upload a video."
+            return
+        frames = downsample_video(video_path)
+        images = [frame for frame, _ in frames]
+        # SmolDocling-only preprocessing, selected by keywords in the query.
+        if model_name == "SmolDocling-256M-preview":
+            if "OTSL" in text or "code" in text:
+                images = [add_random_padding(img) for img in images]
+            if "OCR at text at" in text or "Identify element" in text or "formula" in text:
+                text = normalize_values(text, target_max=500)
+        # One image placeholder per frame, followed by the text query.
+        messages = [
+            {
+                "role": "user",
+                "content": [{"type": "image"} for _ in images] + [
+                    {"type": "text", "text": text}
+                ]
+            }
+        ]
+        prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
+        inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
         streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+        # NOTE(review): temperature/top_p/top_k are passed without do_sample;
+        # confirm the model's generation config enables sampling, otherwise
+        # these values presumably have no effect.
         generation_kwargs = {
+            **inputs,
             "streamer": streamer,
             "max_new_tokens": max_new_tokens,
+            "temperature": temperature,
             "top_p": top_p,
             "top_k": top_k,
             "repetition_penalty": repetition_penalty,
         }
+        # generate() runs in a worker thread so this generator can consume
+        # the streamer while tokens are still being produced.
+        thread = Thread(target=model.generate, kwargs=generation_kwargs)
+        thread.start()
+        buffer = ""
+        # full_output keeps the unfiltered stream for post-processing.
+        full_output = ""
         for new_text in streamer:
+            full_output += new_text
+            buffer += new_text.replace("<|im_end|>", "")
+            yield buffer
+        if model_name == "SmolDocling-256M-preview":
+            cleaned_output = full_output.replace("<end_of_utterance>", "").strip()
+            # Convert to markdown only when doctags-style tags are present.
+            if any(tag in cleaned_output for tag in ["<doctag>", "<otsl>", "<code>", "<chart>", "<formula>"]):
+                if "<chart>" in cleaned_output:
+                    # Charts are re-tagged as OTSL tables before conversion.
+                    cleaned_output = cleaned_output.replace("<chart>", "<otsl>").replace("</chart>", "</otsl>")
+                    # Drops the single tag that follows the last <loc_500>.
+                    cleaned_output = re.sub(r'(<loc_500>)(?!.*<loc_500>)<[^>]+>', r'\1', cleaned_output)
+                # NOTE(review): one doctags string is paired with a list that
+                # holds one image per sampled frame; confirm this pairing is
+                # what from_doctags_and_image_pairs expects.
+                doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([cleaned_output], images)
+                doc = DoclingDocument.load_from_doctags(doctags_doc, document_name="Document")
+                markdown_output = doc.export_to_markdown()
+                yield f"**MD Output:**\n\n{markdown_output}"
+            else:
+                yield cleaned_output
+# Define examples for image and video inference
+# Each row is [query text, file path], in the order of the inputs given to
+# gr.Examples in the UI definition.
+# NOTE(review): image examples are under images/ while video examples are
+# under example/; confirm both directories exist in the Space.
+image_examples = [
+    ["Convert this page to docling", "images/1.png"],
+    ["OCR the image", "images/2.jpg"],
+    ["Convert this page to docling", "images/3.png"],
+]
+video_examples = [
+    ["Explain the ad in detail", "example/1.mp4"],
+    ["Identify the main actions in the coca cola ad...", "example/2.mp4"]
+]
+# Styles for buttons created with elem_classes="submit-btn".
+css = """
+.submit-btn {
+    background-color: #2980b9 !important;
+    color: white !important;
+}
+.submit-btn:hover {
+    background-color: #3498db !important;
+}
+"""
+# Create the Gradio Interface
+# Layout: the left column holds the image/video input tabs and the advanced
+# generation sliders; the right column holds the output box and the model
+# selector, both shared by the two tabs.
+with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
+    gr.Markdown("# **[Core OCR](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
+    with gr.Row():
+        with gr.Column():
+            with gr.Tabs():
+                with gr.TabItem("Image Inference"):
+                    image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
+                    # type="pil" matches the Image.Image annotation of generate_image.
+                    image_upload = gr.Image(type="pil", label="Image")
+                    image_submit = gr.Button("Submit", elem_classes="submit-btn")
+                    gr.Examples(
+                        examples=image_examples,
+                        inputs=[image_query, image_upload]
+                    )
+                with gr.TabItem("Video Inference"):
+                    video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
+                    video_upload = gr.Video(label="Video")
+                    video_submit = gr.Button("Submit", elem_classes="submit-btn")
+                    gr.Examples(
+                        examples=video_examples,
+                        inputs=[video_query, video_upload]
+                    )
+            # Slider defaults mirror the keyword defaults of generate_image
+            # and generate_video.
+            with gr.Accordion("Advanced options", open=False):
+                max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
+                temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
+                top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
+                top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
+                repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
+        with gr.Column():
+            output = gr.Textbox(label="Output", interactive=False, lines=3, scale=2)
+            # These choice strings are compared verbatim in generate_image
+            # and generate_video; keep them in sync.
+            model_choice = gr.Radio(
+                choices=["olmOCR-7B-0225-preview", "SmolDocling-256M-preview", "ByteDance-s-Dolphin"],
+                label="Select Model",
+                value="olmOCR-7B-0225-preview"
+            )
+    # The inputs lists follow the positional parameter order of the handlers.
+    image_submit.click(
+        fn=generate_image,
+        inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
+        outputs=output
+    )
+    video_submit.click(
+        fn=generate_video,
+        inputs=[model_choice, video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
+        outputs=output
+    )
 if __name__ == "__main__":
+    # The request queue is capped at 30 pending jobs.
+    demo.queue(max_size=30).launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)