Spaces:

prithivMLmods
/

Doc-VLMs-OCR

Running on Zero

App Files Files Community

prithivMLmods committed 7 days ago

Commit

914bd4d

verified ·

1 Parent(s): 6e73997

Update app.py

Browse files

Files changed (1) hide show

app.py +265 -338

app.py CHANGED Viewed

@@ -5,12 +5,6 @@ import json
 import time
 import asyncio
 from threading import Thread
-import io
-import base64
-import re
-import ast
-import html
-from collections import namedtuple
 import gradio as gr
 import spaces
@@ -31,9 +25,13 @@ from transformers.image_utils import load_image
 from docling_core.types.doc import DoclingDocument, DocTagsDocument
 # Constants for text generation
-MAX_MAX_NEW_TOKENS = 4096
-DEFAULT_MAX_NEW_TOKENS = 2048
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
@@ -47,6 +45,15 @@ model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
 # Load SmolDocling-256M-preview
 MODEL_ID_X = "ds4sd/SmolDocling-256M-preview"
 processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
@@ -71,21 +78,6 @@ model_g = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
-#------------------------------------------------#
-# Load ByteDance's Dolphin (with specific implementation)
-print("Loading ByteDance/Dolphin model...")
-MODEL_ID_K = "ByteDance/Dolphin"
-processor_k = AutoProcessor.from_pretrained(MODEL_ID_K)
-model_k = VisionEncoderDecoderModel.from_pretrained(MODEL_ID_K)
-model_k.eval()
-model_k.to(device)
-if torch.cuda.is_available():
-    model_k = model_k.half() # Use half-precision on GPU
-tokenizer_k = processor_k.tokenizer
-print("ByteDance/Dolphin model loaded.")
-#------------------------------------------------#
 # Preprocessing functions for SmolDocling-256M
 def add_random_padding(image, min_percent=0.1, max_percent=0.10):
     """Add random padding to an image based on its size."""
@@ -120,12 +112,7 @@ def downsample_video(video_path):
     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
     fps = vidcap.get(cv2.CAP_PROP_FPS)
     frames = []
-    # Take up to 10 frames
-    num_frames_to_sample = min(10, total_frames)
-    if num_frames_to_sample == 0:
-        return []
-    frame_indices = np.linspace(0, total_frames - 1, num_frames_to_sample, dtype=int)
     for i in frame_indices:
         vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
         success, image = vidcap.read()
@@ -137,194 +124,103 @@ def downsample_video(video_path):
     vidcap.release()
     return frames
-# ------------------- Dolphin Model Specific Helper Functions ------------------- #
-ImageDimensions = namedtuple("ImageDimensions", ["width", "height", "new_w", "new_h", "pad_w", "pad_h"])
-class MarkdownConverter:
-    """Converts structured recognition results to a Markdown string."""
-    def convert(self, elements):
-        markdown_str = ""
-        for elem in elements:
-            label = elem["label"]
-            text = elem["text"]
-            if label == "fig":
-                # Embed image as base64
-                markdown_str += f"![figure](data:image/png;base64,{text})\n\n"
-            elif label == "tab":
-                markdown_str += f"### Table\n\n{text}\n\n"
-            else: # text, title, head, foot, etc.
-                markdown_str += f"{text}\n\n"
-        return markdown_str.strip()
-def prepare_image_dolphin(pil_image, target_size=1024):
-    """Pads a PIL image to a square, returning a cv2 image and dimensions."""
-    image = np.array(pil_image.convert('RGB'))
-    h, w, _ = image.shape
-    if h > w:
-        new_h, new_w = target_size, int(w * target_size / h)
-    else:
-        new_h, new_w = int(h * target_size / w), target_size
-    resized_image = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_CUBIC)
-    pad_w = (target_size - new_w) // 2
-    pad_h = (target_size - new_h) // 2
-    padded_image = np.pad(resized_image, ((pad_h, pad_h), (pad_w, pad_w), (0, 0)), 'constant', constant_values=255)
-    dims = ImageDimensions(w, h, new_w, new_h, pad_w, pad_h)
-    return padded_image, dims
-def parse_layout_string_dolphin(layout_string):
-    """Parses the model's layout string into a list of (bbox, label) tuples."""
-    pattern = r'([a-zA-Z_]+)\(((?:\d+,){3}\d+)\)'
-    matches = re.findall(pattern, layout_string)
-    results = []
-    for label, coords_str in matches:
-        coords = tuple(map(int, coords_str.split(',')))
-        results.append((coords, label))
-    return results
-def process_coordinates_dolphin(bbox, padded_image, dims, previous_box):
-    """Converts relative bbox coordinates to absolute pixel coordinates for cropping."""
-    x1, y1, x2, y2 = bbox
-    orig_x1 = int(x1 / 1024 * dims.new_w)
-    orig_y1 = int(y1 / 1024 * dims.new_h)
-    orig_x2 = int(x2 / 1024 * dims.new_w)
-    orig_y2 = int(y2 / 1024 * dims.new_h)
-    x1 = orig_x1 + dims.pad_w
-    y1 = orig_y1 + dims.pad_h
-    x2 = orig_x2 + dims.pad_w
-    y2 = orig_y2 + dims.pad_h
-    return x1, y1, x2, y2, orig_x1, orig_y1, orig_x2, orig_y2, bbox
-@spaces.GPU
-def dolphin_model_chat(model, processor, prompt, image):
-    """Core inference function for the Dolphin model, supports batching."""
-    is_batch = isinstance(image, list)
-    images = image if is_batch else [image]
-    prompts = prompt if isinstance(prompt, list) else [prompt] * len(images)
-    batch_inputs = processor(images, return_tensors="pt", padding=True)
-    pixel_values = batch_inputs.pixel_values.to(device)
-    if torch.cuda.is_available():
-        pixel_values = pixel_values.half()
-    prompts = [f"<s>{p} <Answer/>" for p in prompts]
-    prompt_inputs = tokenizer_k(prompts, add_special_tokens=False, return_tensors="pt")
-    prompt_ids = prompt_inputs.input_ids.to(device)
-    attention_mask = prompt_inputs.attention_mask.to(device)
     outputs = model.generate(
         pixel_values=pixel_values,
-        decoder_input_ids=prompt_ids,
-        decoder_attention_mask=attention_mask,
         max_length=4096,
-        pad_token_id=tokenizer_k.pad_token_id,
-        eos_token_id=tokenizer_k.eos_token_id,
         use_cache=True,
-        bad_words_ids=[[tokenizer_k.unk_token_id]],
         return_dict_in_generate=True,
     )
-    sequences = tokenizer_k.batch_decode(outputs.sequences, skip_special_tokens=False)
-    results = []
-    for i, seq in enumerate(sequences):
-        cleaned = seq.replace(prompts[i], "").replace("<pad>", "").replace("</s>", "").strip()
-        results.append(cleaned)
-    return results[0] if not is_batch else results
-@spaces.GPU
-def process_element_batch_dolphin(elements, prompt, model, processor, max_batch_size=16):
-    """Processes a batch of cropped image elements with the same prompt."""
-    results = []
-    for i in range(0, len(elements), max_batch_size):
-        batch_elements = elements[i:i+max_batch_size]
-        crops_list = [elem["crop"] for elem in batch_elements]
-        prompts_list = [prompt] * len(crops_list)
-        batch_results = dolphin_model_chat(model, processor, prompts_list, crops_list)
-        for j, result in enumerate(batch_results):
-            elem = batch_elements[j]
-            results.append({
-                "label": elem["label"],
-                "bbox": elem["bbox"],
-                "text": result.strip(),
-                "reading_order": elem["reading_order"],
-            })
-    return results
-@spaces.GPU
-def run_dolphin_image_pipeline(pil_image, model, processor):
-    """Runs the full two-stage pipeline for a single image."""
-    try:
-        # Stage 1: Layout Analysis
-        print("Dolphin: Running layout analysis...")
-        layout_output = dolphin_model_chat(model, processor, "Parse the reading order of this document.", pil_image)
-        # Stage 2: Element Recognition
-        print("Dolphin: Parsing layout and processing elements...")
-        padded_image, dims = prepare_image_dolphin(pil_image)
-        layout_results = parse_layout_string_dolphin(layout_output)
-        text_elements, table_elements, figure_results = [], [], []
-        previous_box = None
-        reading_order = 0
-        for bbox, label in layout_results:
-            try:
-                x1, y1, x2, y2, orig_x1, orig_y1, orig_x2, orig_y2, previous_box = process_coordinates_dolphin(
-                    bbox, padded_image, dims, previous_box
-                )
-                cropped = padded_image[y1:y2, x1:x2]
-                if cropped.size > 0:
-                    pil_crop = Image.fromarray(cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB))
-                    element_info = {"crop": pil_crop, "label": label, "bbox": [orig_x1, orig_y1, orig_x2, orig_y2], "reading_order": reading_order}
-                    if label == "fig":
-                        buffered = io.BytesIO()
-                        pil_crop.save(buffered, format="PNG")
-                        img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
-                        figure_results.append({"label": label, "bbox": element_info["bbox"], "text": img_base64, "reading_order": reading_order})
-                    elif label == "tab":
-                        table_elements.append(element_info)
-                    else:
-                        text_elements.append(element_info)
-                reading_order += 1
-            except Exception as e:
-                print(f"Dolphin: Error processing element with label {label}: {e}")
-                continue
-        recognition_results = figure_results.copy()
-        if text_elements:
-            print(f"Dolphin: Recognizing {len(text_elements)} text element(s)...")
-            recognition_results.extend(process_element_batch_dolphin(text_elements, "Read text in the image.", model, processor))
-        if table_elements:
-            print(f"Dolphin: Parsing {len(table_elements)} table(s)...")
-            recognition_results.extend(process_element_batch_dolphin(table_elements, "Parse the table in the image.", model, processor))
-        recognition_results.sort(key=lambda x: x.get("reading_order", 0))
-        # Stage 3: Generate Markdown
-        print("Dolphin: Generating final Markdown output...")
-        converter = MarkdownConverter()
-        markdown_output = converter.convert(recognition_results)
-        return f"**Markdown Output (from Dolphin):**\n\n{markdown_output}"
-    except Exception as e:
-        print(f"Error during Dolphin pipeline: {e}")
-        return f"An error occurred during the Dolphin processing pipeline: {e}"
-# ------------------- End of Dolphin Specific Functions ------------------- #
 @spaces.GPU
 def generate_image(model_name: str, text: str, image: Image.Image,
@@ -334,62 +230,82 @@ def generate_image(model_name: str, text: str, image: Image.Image,
                    top_k: int = 50,
                    repetition_penalty: float = 1.2):
     """Generate responses for image input using the selected model."""
-    if image is None:
-        yield "Please upload an image."
-        return
-    # --- Dolphin Specific Path (Non-streaming, multi-stage) ---
     if model_name == "ByteDance-s-Dolphin":
-        yield run_dolphin_image_pipeline(image, model_k, processor_k)
-        return
-    # --- Generic Path for Other Models (Streaming) ---
-    if model_name == "Nanonets-OCR-s":
-        processor, model = processor_m, model_m
-    elif model_name == "MonkeyOCR-Recognition":
-        processor, model = processor_g, model_g
-    elif model_name == "SmolDocling-256M-preview":
-        processor, model = processor_x, model_x
     else:
-        yield "Invalid model selected."
-        return
-    images = [image]
-    if model_name == "SmolDocling-256M-preview":
-        if "OTSL" in text or "code" in text:
-            images = [add_random_padding(img) for img in images]
-        if "OCR at text at" in text or "Identify element" in text or "formula" in text:
-            text = normalize_values(text, target_max=500)
-    messages = [{"role": "user", "content": [{"type": "image"}] * len(images) + [{"type": "text", "text": text}]}]
-    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
-    inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
-    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-    generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens, "temperature": temperature, "top_p": top_p, "top_k": top_k, "repetition_penalty": repetition_penalty}
-    thread = Thread(target=model.generate, kwargs=generation_kwargs)
-    thread.start()
-    buffer = ""
-    full_output = ""
-    for new_text in streamer:
-        full_output += new_text
-        buffer += new_text.replace("<|im_end|>", "")
-        yield buffer
-    if model_name == "SmolDocling-256M-preview":
-        cleaned_output = full_output.replace("<end_of_utterance>", "").strip()
-        if any(tag in cleaned_output for tag in ["<doctag>", "<otsl>", "<code>", "<chart>", "<formula>"]):
-            if "<chart>" in cleaned_output:
-                cleaned_output = cleaned_output.replace("<chart>", "<otsl>").replace("</chart>", "</otsl>")
-                cleaned_output = re.sub(r'(<loc_500>)(?!.*<loc_500>)<[^>]+>', r'\1', cleaned_output)
-            doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([cleaned_output], images)
-            doc = DoclingDocument.load_from_doctags(doctags_doc, document_name="Document")
-            markdown_output = doc.export_to_markdown()
-            yield f"**MD Output:**\n\n{markdown_output}"
         else:
-            yield cleaned_output
 @spaces.GPU
 def generate_video(model_name: str, text: str, video_path: str,
@@ -399,76 +315,88 @@ def generate_video(model_name: str, text: str, video_path: str,
                    top_k: int = 50,
                    repetition_penalty: float = 1.2):
     """Generate responses for video input using the selected model."""
-    if video_path is None:
-        yield "Please upload a video."
-        return
-    frames_with_ts = downsample_video(video_path)
-    if not frames_with_ts:
-        yield "Could not extract frames from the video."
-        return
-    images = [frame for frame, _ in frames_with_ts]
-    timestamps = [ts for _, ts in frames_with_ts]
-    # --- Dolphin Specific Path (Batch processing frames) ---
     if model_name == "ByteDance-s-Dolphin":
-        if not text:
-            yield "Please provide a query for the video analysis (e.g., 'Describe what you see')."
             return
-        prompts = [text] * len(images)
-        yield "Analyzing video frames with Dolphin... (this may take a moment)"
-        results = dolphin_model_chat(model_k, processor_k, prompts, images)
-        full_output = "### Dolphin Video Analysis (per-frame)\n\n"
-        for i, res in enumerate(results):
-            full_output += f"**Frame at {timestamps[i]:.2f}s:**\n{res.strip()}\n\n---\n"
-        yield full_output
-        return
-    # --- Generic Path for Other Models (Streaming) ---
-    if model_name == "Nanonets-OCR-s":
-        processor, model = processor_m, model_m
-    elif model_name == "MonkeyOCR-Recognition":
-        processor, model = processor_g, model_g
-    elif model_name == "SmolDocling-256M-preview":
-        processor, model = processor_x, model_x
     else:
-        yield "Invalid model selected."
-        return
-    if model_name == "SmolDocling-256M-preview":
-        if "OTSL" in text or "code" in text:
-            images = [add_random_padding(img) for img in images]
-        if "OCR at text at" in text or "Identify element" in text or "formula" in text:
-            text = normalize_values(text, target_max=500)
-    messages = [{"role": "user", "content": [{"type": "image"}] * len(images) + [{"type": "text", "text": text}]}]
-    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
-    inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
-    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-    generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens, "temperature": temperature, "top_p": top_p, "top_k": top_k, "repetition_penalty": repetition_penalty}
-    thread = Thread(target=model.generate, kwargs=generation_kwargs)
-    thread.start()
-    buffer = ""
-    full_output = ""
-    for new_text in streamer:
-        full_output += new_text
-        buffer += new_text.replace("<|im_end|>", "")
-        yield buffer
-    if model_name == "SmolDocling-256M-preview":
-        cleaned_output = full_output.replace("<end_of_utterance>", "").strip()
-        if any(tag in cleaned_output for tag in ["<doctag>", "<otsl>", "<code>", "<chart>", "<formula>"]):
-            if "<chart>" in cleaned_output:
-                cleaned_output = cleaned_output.replace("<chart>", "<otsl>").replace("</chart>", "</otsl>")
-                cleaned_output = re.sub(r'(<loc_500>)(?!.*<loc_500>)<[^>]+>', r'\1', cleaned_output)
-            doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([cleaned_output], images)
-            doc = DoclingDocument.load_from_doctags(doctags_doc, document_name="Document")
-            markdown_output = doc.export_to_markdown()
-            yield f"**MD Output:**\n\n{markdown_output}"
         else:
-            yield cleaned_output
 # Define examples for image and video inference
 image_examples = [
@@ -495,17 +423,11 @@ css = """
 # Create the Gradio Interface
 with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
     gr.Markdown("# **[Core OCR](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
-    gr.Markdown("A multi-model OCR and Document AI interface. Select 'ByteDance-s-Dolphin' for advanced, two-stage document layout analysis on images.")
     with gr.Row():
         with gr.Column():
-            model_choice = gr.Radio(
-                choices=["Nanonets-OCR-s", "SmolDocling-256M-preview", "MonkeyOCR-Recognition", "ByteDance-s-Dolphin"],
-                label="Select Model",
-                value="Nanonets-OCR-s"
-            )
             with gr.Tabs():
                 with gr.TabItem("Image Inference"):
-                    image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here... For Dolphin, leave blank to run full document analysis or ask a question about the image.")
                     image_upload = gr.Image(type="pil", label="Image")
                     image_submit = gr.Button("Submit", elem_classes="submit-btn")
                     gr.Examples(
@@ -520,15 +442,20 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
                         examples=video_examples,
                         inputs=[video_query, video_upload]
                     )
-            with gr.Accordion("Advanced options (for streaming models)", open=False):
                 max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
                 temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
                 top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
                 top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
                 repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
         with gr.Column():
-            output = gr.Markdown(label="Output", interactive=False)
     image_submit.click(
         fn=generate_image,
         inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
@@ -541,4 +468,4 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
     )
 if __name__ == "__main__":
-    demo.queue(max_size=30).launch(share=True)

 import time
 import asyncio
 from threading import Thread
 import gradio as gr
 import spaces
 from docling_core.types.doc import DoclingDocument, DocTagsDocument
+import re
+import ast
+import html
 # Constants for text generation
+MAX_MAX_NEW_TOKENS = 2048
+DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
     torch_dtype=torch.float16
 ).to(device).eval()
+# Load ByteDance's Dolphin
+MODEL_ID_K = "ByteDance/Dolphin"
+processor_k = AutoProcessor.from_pretrained(MODEL_ID_K, trust_remote_code=True)
+model_k = VisionEncoderDecoderModel.from_pretrained(
+    MODEL_ID_K,
+    trust_remote_code=True,
+    torch_dtype=torch.float16
+).to(device).eval()
 # Load SmolDocling-256M-preview
 MODEL_ID_X = "ds4sd/SmolDocling-256M-preview"
 processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
     torch_dtype=torch.float16
 ).to(device).eval()
 # Preprocessing functions for SmolDocling-256M
 def add_random_padding(image, min_percent=0.1, max_percent=0.10):
     """Add random padding to an image based on its size."""
     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
     fps = vidcap.get(cv2.CAP_PROP_FPS)
     frames = []
+    frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
     for i in frame_indices:
         vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
         success, image = vidcap.read()
     vidcap.release()
     return frames
+# Dolphin-specific functions
def model_chat(prompt, image):
    """Run one Dolphin inference pass and return the cleaned, decoded answer.

    The prompt is wrapped as ``<s>{prompt} <Answer/>`` and supplied to the
    decoder as its starting tokens, while the processed image is supplied as
    half-precision pixel values. Decoding is greedy (sampling disabled, one
    beam) with a mild repetition penalty.

    Args:
        prompt: Instruction text for the model.
        image: Image to hand to the Dolphin processor.

    Returns:
        The first decoded sequence with the wrapped prompt, ``<pad>`` and
        ``</s>`` markers removed and surrounding whitespace stripped.
    """
    tokenizer = processor_k.tokenizer
    target_device = "cuda" if torch.cuda.is_available() else "cpu"
    wrapped_prompt = f"<s>{prompt} <Answer/>"

    encoded_image = processor_k(image, return_tensors="pt").to(target_device)
    encoded_prompt = tokenizer(
        wrapped_prompt,
        add_special_tokens=False,
        return_tensors="pt",
    ).to(target_device)

    generation = model_k.generate(
        pixel_values=encoded_image.pixel_values.half(),
        decoder_input_ids=encoded_prompt.input_ids,
        decoder_attention_mask=encoded_prompt.attention_mask,
        min_length=1,
        max_length=4096,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        use_cache=True,
        # Forbid the unknown token so it never appears in the output.
        bad_words_ids=[[tokenizer.unk_token_id]],
        return_dict_in_generate=True,
        do_sample=False,
        num_beams=1,
        repetition_penalty=1.1,
    )

    # Special tokens are kept during decoding, so strip the echoed prompt and
    # the padding / end-of-sequence markers by hand.
    decoded = tokenizer.batch_decode(generation.sequences, skip_special_tokens=False)[0]
    for marker in (wrapped_prompt, "<pad>", "</s>"):
        decoded = decoded.replace(marker, "")
    return decoded.strip()
def process_elements(layout_results, image):
    """Parse Dolphin layout output and recognize each element in the image.

    ``layout_results`` is expected to be the text of a Python literal listing
    the detected elements. Each element may be written either as
    ``((x1, y1, x2, y2), label)`` or as the flat ``(x1, y1, x2, y2, label)``.
    This is placeholder parsing: anything that is not a valid literal list
    yields an empty result instead of raising.

    Args:
        layout_results: Layout string produced by the layout-analysis pass.
        image: Image exposing ``crop((x1, y1, x2, y2))`` whose result has a
            ``size`` pair (callers in this file pass it on unchanged).

    Returns:
        A list of dicts with keys ``label``, ``bbox``, ``text`` and
        ``reading_order``. Only ``text``, ``table`` and ``figure`` elements
        with a non-empty crop produce an entry; malformed elements are
        logged and skipped.
    """
    try:
        # literal_eval only builds literals; it never executes model output.
        parsed = ast.literal_eval(layout_results)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return []
    if not isinstance(parsed, (list, tuple)):
        return []

    # Labels that need a second model pass, mapped to the prompt to use.
    prompts = {
        "text": "Read text in the image.",
        "table": "Parse the table in the image.",
    }

    recognition_results = []
    reading_order = 0
    for element in parsed:
        try:
            # Accept both the nested and the flat element layout.
            if len(element) == 2:
                bbox, label = element
            else:
                *bbox, label = element
            x1, y1, x2, y2 = map(int, bbox)
            cropped = image.crop((x1, y1, x2, y2))
            if cropped.size[0] > 0 and cropped.size[1] > 0:
                prompt = prompts.get(label) if isinstance(label, str) else None
                if prompt is not None:
                    content = model_chat(prompt, cropped).strip()
                elif label == "figure":
                    content = "[Figure]"  # Placeholder for figure content
                else:
                    content = None
                if content is not None:
                    recognition_results.append({
                        "label": label,
                        "bbox": [x1, y1, x2, y2],
                        "text": content,
                        "reading_order": reading_order,
                    })
            # Only elements processed without error consume an order slot.
            reading_order += 1
        except Exception as e:
            # Best effort: one bad element must not abort the whole page.
            print(f"Error processing element: {e}")
            continue
    return recognition_results
def generate_markdown(recognition_results):
    """Render recognized elements as one Markdown string in reading order.

    Args:
        recognition_results: Iterable of dicts with ``label`` and ``text``
            keys and an optional ``reading_order`` key (treated as 0 when
            absent).

    Returns:
        The elements joined by blank lines and stripped of surrounding
        whitespace. ``table`` elements are prefixed with a ``**Table:**``
        line, ``text`` and ``figure`` elements are emitted as-is, and any
        other label is omitted.
    """
    ordered = sorted(recognition_results, key=lambda item: item.get("reading_order", 0))
    sections = []
    for element in ordered:
        label = element["label"]
        if label == "table":
            sections.append(f"**Table:**\n{element['text']}")
        elif label in ("text", "figure"):
            sections.append(f"{element['text']}")
    return "\n\n".join(sections).strip()
def process_image_with_dolphin(image):
    """Convert a single image to Markdown using the Dolphin pipeline.

    Asks the model for the document's reading order, recognizes each
    element found in that layout, and renders the results as Markdown.

    Args:
        image: The image to analyze; passed unchanged to both stages.

    Returns:
        The Markdown string built from the recognized elements.
    """
    layout = model_chat("Parse the reading order of this document.", image)
    return generate_markdown(process_elements(layout, image))
 @spaces.GPU
 def generate_image(model_name: str, text: str, image: Image.Image,
                    top_k: int = 50,
                    repetition_penalty: float = 1.2):
     """Generate responses for image input using the selected model."""
     if model_name == "ByteDance-s-Dolphin":
+        if image is None:
+            yield "Please upload an image."
+            return
+        markdown_content = process_image_with_dolphin(image)
+        yield markdown_content
     else:
+        # Existing logic for other models
+        if model_name == "Nanonets-OCR-s":
+            processor = processor_m
+            model = model_m
+        elif model_name == "MonkeyOCR-Recognition":
+            processor = processor_g
+            model = model_g
+        elif model_name == "SmolDocling-256M-preview":
+            processor = processor_x
+            model = model_x
         else:
+            yield "Invalid model selected."
+            return
+        if image is None:
+            yield "Please upload an image."
+            return
+        images = [image]
+        if model_name == "SmolDocling-256M-preview":
+            if "OTSL" in text or "code" in text:
+                images = [add_random_padding(img) for img in images]
+            if "OCR at text at" in text or "Identify element" in text or "formula" in text:
+                text = normalize_values(text, target_max=500)
+        messages = [
+            {
+                "role": "user",
+                "content": [{"type": "image"} for _ in images] + [
+                    {"type": "text", "text": text}
+                ]
+            }
+        ]
+        prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
+        inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
+        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+        generation_kwargs = {
+            **inputs,
+            "streamer": streamer,
+            "max_new_tokens": max_new_tokens,
+            "temperature": temperature,
+            "top_p": top_p,
+            "top_k": top_k,
+            "repetition_penalty": repetition_penalty,
+        }
+        thread = Thread(target=model.generate, kwargs=generation_kwargs)
+        thread.start()
+        buffer = ""
+        full_output = ""
+        for new_text in streamer:
+            full_output += new_text
+            buffer += new_text.replace("<|im_end|>", "")
+            yield buffer
+        if model_name == "SmolDocling-256M-preview":
+            cleaned_output = full_output.replace("<end_of_utterance>", "").strip()
+            if any(tag in cleaned_output for tag in ["<doctag>", "<otsl>", "<code>", "<chart>", "<formula>"]):
+                if "<chart>" in cleaned_output:
+                    cleaned_output = cleaned_output.replace("<chart>", "<otsl>").replace("</chart>", "</otsl>")
+                    cleaned_output = re.sub(r'(<loc_500>)(?!.*<loc_500>)<[^>]+>', r'\1', cleaned_output)
+                doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([cleaned_output], images)
+                doc = DoclingDocument.load_from_doctags(doctags_doc, document_name="Document")
+                markdown_output = doc.export_to_markdown()
+                yield f"**MD Output:**\n\n{markdown_output}"
+            else:
+                yield cleaned_output
 @spaces.GPU
 def generate_video(model_name: str, text: str, video_path: str,
                    top_k: int = 50,
                    repetition_penalty: float = 1.2):
     """Generate responses for video input using the selected model."""
     if model_name == "ByteDance-s-Dolphin":
+        if video_path is None:
+            yield "Please upload a video."
             return
+        frames = downsample_video(video_path)
+        markdown_contents = []
+        for frame, _ in frames:
+            markdown_content = process_image_with_dolphin(frame)
+            markdown_contents.append(markdown_content)
+        combined_markdown = "\n\n".join(markdown_contents)
+        yield combined_markdown
     else:
+        # Existing logic for other models
+        if model_name == "Nanonets-OCR-s":
+            processor = processor_m
+            model = model_m
+        elif model_name == "MonkeyOCR-Recognition":
+            processor = processor_g
+            model = model_g
+        elif model_name == "SmolDocling-256M-preview":
+            processor = processor_x
+            model = model_x
         else:
+            yield "Invalid model selected."
+            return
+        if video_path is None:
+            yield "Please upload a video."
+            return
+        frames = downsample_video(video_path)
+        images = [frame for frame, _ in frames]
+        if model_name == "SmolDocling-256M-preview":
+            if "OTSL" in text or "code" in text:
+                images = [add_random_padding(img) for img in images]
+            if "OCR at text at" in text or "Identify element" in text or "formula" in text:
+                text = normalize_values(text, target_max=500)
+        messages = [
+            {
+                "role": "user",
+                "content": [{"type": "image"} for _ in images] + [
+                    {"type": "text", "text": text}
+                ]
+            }
+        ]
+        prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
+        inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
+        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+        generation_kwargs = {
+            **inputs,
+            "streamer": streamer,
+            "max_new_tokens": max_new_tokens,
+            "temperature": temperature,
+            "top_p": top_p,
+            "top_k": top_k,
+            "repetition_penalty": repetition_penalty,
+        }
+        thread = Thread(target=model.generate, kwargs=generation_kwargs)
+        thread.start()
+        buffer = ""
+        full_output = ""
+        for new_text in streamer:
+            full_output += new_text
+            buffer += new_text.replace("<|im_end|>", "")
+            yield buffer
+        if model_name == "SmolDocling-256M-preview":
+            cleaned_output = full_output.replace("<end_of_utterance>", "").strip()
+            if any(tag in cleaned_output for tag in ["<doctag>", "<otsl>", "<code>", "<chart>", "<formula>"]):
+                if "<chart>" in cleaned_output:
+                    cleaned_output = cleaned_output.replace("<chart>", "<otsl>").replace("</chart>", "</otsl>")
+                    cleaned_output = re.sub(r'(<loc_500>)(?!.*<loc_500>)<[^>]+>', r'\1', cleaned_output)
+                doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([cleaned_output], images)
+                doc = DoclingDocument.load_from_doctags(doctags_doc, document_name="Document")
+                markdown_output = doc.export_to_markdown()
+                yield f"**MD Output:**\n\n{markdown_output}"
+            else:
+                yield cleaned_output
 # Define examples for image and video inference
 image_examples = [
 # Create the Gradio Interface
 with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
     gr.Markdown("# **[Core OCR](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
     with gr.Row():
         with gr.Column():
             with gr.Tabs():
                 with gr.TabItem("Image Inference"):
+                    image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
                     image_upload = gr.Image(type="pil", label="Image")
                     image_submit = gr.Button("Submit", elem_classes="submit-btn")
                     gr.Examples(
                         examples=video_examples,
                         inputs=[video_query, video_upload]
                     )
+            with gr.Accordion("Advanced options", open=False):
                 max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
                 temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
                 top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
                 top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
                 repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
         with gr.Column():
+            output = gr.Textbox(label="Output", interactive=False, lines=3, scale=2)
+            model_choice = gr.Radio(
+                choices=["Nanonets-OCR-s", "SmolDocling-256M-preview", "MonkeyOCR-Recognition", "ByteDance-s-Dolphin"],
+                label="Select Model",
+                value="Nanonets-OCR-s"
+            )
     image_submit.click(
         fn=generate_image,
         inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
     )
 if __name__ == "__main__":
+    demo.queue(max_size=30).launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)