Spaces:

prithivMLmods
/

Doc-VLMs-OCR

Starting on Zero

App Files Files Community

prithivMLmods committed 8 days ago

Commit

7019b95

verified ·

1 Parent(s): 94d3a2b

Update app.py

Browse files

Files changed (1) hide show

app.py +138 -62

app.py CHANGED Viewed

@@ -10,19 +10,24 @@ import gradio as gr
 import spaces
 import torch
 import numpy as np
-from PIL import Image
 import cv2
 from transformers import (
     Qwen2VLForConditionalGeneration,
     Qwen2_5_VLForConditionalGeneration,
     AutoModelForVision2Seq,
-    AutoModelForImageTextToText,
     AutoProcessor,
     TextIteratorStreamer,
 )
 from transformers.image_utils import load_image
 # Constants for text generation
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
@@ -48,30 +53,51 @@ model_x = AutoModelForVision2Seq.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
-#--------------------------------------------------------------------------------------#
-#Load MonkeyOCR
 MODEL_ID_G = "echo840/MonkeyOCR"
 SUBFOLDER = "Recognition"
 processor_g = AutoProcessor.from_pretrained(
     MODEL_ID_G,
     trust_remote_code=True,
     subfolder=SUBFOLDER
 )
 model_g = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID_G,
     trust_remote_code=True,
     subfolder=SUBFOLDER,
     torch_dtype=torch.float16
 ).to(device).eval()
-#--------------------------------------------------------------------------------------#
 def downsample_video(video_path):
-    """
-    Downsamples the video to evenly spaced frames.
-    Each frame is returned as a PIL image along with its timestamp.
-    """
     vidcap = cv2.VideoCapture(video_path)
     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
     fps = vidcap.get(cv2.CAP_PROP_FPS)
@@ -95,18 +121,17 @@ def generate_image(model_name: str, text: str, image: Image.Image,
                    top_p: float = 0.9,
                    top_k: int = 50,
                    repetition_penalty: float = 1.2):
-    """
-    Generates responses using the selected model for image input.
-    """
     if model_name == "Nanonets-OCR-s":
         processor = processor_m
         model = model_m
-    elif model_name == "SmolDocling-256M-preview":
-        processor = processor_x
-        model = model_x
     elif model_name == "MonkeyOCR-Recognition":
         processor = processor_g
         model = model_g
     else:
         yield "Invalid model selected."
         return
@@ -115,33 +140,64 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         yield "Please upload an image."
         return
-    messages = [{
-        "role": "user",
-        "content": [
-            {"type": "image", "image": image},
-            {"type": "text", "text": text},
-        ]
-    }]
-    prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    inputs = processor(
-        text=[prompt_full],
-        images=[image],
-        return_tensors="pt",
-        padding=True,
-        truncation=False,
-        max_length=MAX_INPUT_TOKEN_LENGTH
-    ).to(device)
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-    generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
     buffer = ""
     for new_text in streamer:
-        buffer += new_text
-        buffer = buffer.replace("<|im_end|>", "")
-        time.sleep(0.01)
         yield buffer
 @spaces.GPU
 def generate_video(model_name: str, text: str, video_path: str,
                    max_new_tokens: int = 1024,
@@ -149,18 +205,17 @@ def generate_video(model_name: str, text: str, video_path: str,
                    top_p: float = 0.9,
                    top_k: int = 50,
                    repetition_penalty: float = 1.2):
-    """
-    Generates responses using the selected model for video input.
-    """
     if model_name == "Nanonets-OCR-s":
         processor = processor_m
         model = model_m
-    elif model_name == "SmolDocling-256M-preview":
-        processor = processor_x
-        model = model_x
     elif model_name == "MonkeyOCR-Recognition":
         processor = processor_g
         model = model_g
     else:
         yield "Invalid model selected."
         return
@@ -169,30 +224,35 @@ def generate_video(model_name: str, text: str, video_path: str,
         yield "Please upload a video."
         return
     frames = downsample_video(video_path)
     messages = [
-        {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
-        {"role": "user", "content": [{"type": "text", "text": text}]}
     ]
-    for frame in frames:
-        image, timestamp = frame
-        messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
-        messages[1]["content"].append({"type": "image", "image": image})
-    inputs = processor.apply_chat_template(
-        messages,
-        tokenize=True,
-        add_generation_prompt=True,
-        return_dict=True,
-        return_tensors="pt",
-        truncation=False,
-        max_length=MAX_INPUT_TOKEN_LENGTH
-    ).to(device)
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {
         **inputs,
         "streamer": streamer,
         "max_new_tokens": max_new_tokens,
-        "do_sample": True,
         "temperature": temperature,
         "top_p": top_p,
         "top_k": top_k,
@@ -200,13 +260,29 @@ def generate_video(model_name: str, text: str, video_path: str,
     }
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
     buffer = ""
     for new_text in streamer:
-        buffer += new_text
-        buffer = buffer.replace("<|im_end|>", "")
-        time.sleep(0.01)
         yield buffer
 # Define examples for image and video inference
 image_examples = [
     ["fill the correct numbers", "example/image3.png"],

 import spaces
 import torch
 import numpy as np
+from PIL import Image, ImageOps
 import cv2
 from transformers import (
     Qwen2VLForConditionalGeneration,
     Qwen2_5_VLForConditionalGeneration,
     AutoModelForVision2Seq,
     AutoProcessor,
     TextIteratorStreamer,
 )
 from transformers.image_utils import load_image
+from docling_core.types.doc import DoclingDocument, DocTagsDocument
+import re
+import ast
+import html
 # Constants for text generation
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
     torch_dtype=torch.float16
 ).to(device).eval()
+# Load MonkeyOCR
 MODEL_ID_G = "echo840/MonkeyOCR"
 SUBFOLDER = "Recognition"
 processor_g = AutoProcessor.from_pretrained(
     MODEL_ID_G,
     trust_remote_code=True,
     subfolder=SUBFOLDER
 )
 model_g = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID_G,
     trust_remote_code=True,
     subfolder=SUBFOLDER,
     torch_dtype=torch.float16
 ).to(device).eval()
+# Preprocessing functions for SmolDocling-256M
def add_random_padding(image, min_percent=0.1, max_percent=0.10):
    """Return an RGB copy of ``image`` surrounded by a randomly sized border.

    The horizontal and vertical border sizes are each an independently drawn
    fraction (between ``min_percent`` and ``max_percent``) of the image's
    width and height respectively, truncated to whole pixels. The border is
    filled with the colour of the image's top-left pixel.

    Args:
        image: Image object providing ``convert``, ``size`` and ``getpixel``
            (presumably a ``PIL.Image.Image`` -- confirm against callers).
        min_percent: Lower bound of the padding fraction.
        max_percent: Upper bound of the padding fraction.

    Returns:
        The padded image produced by ``ImageOps.expand``.
    """
    rgb = image.convert("RGB")
    width, height = rgb.size
    # Draw the width fraction first and the height fraction second so the
    # sequence of random numbers consumed stays the same.
    horizontal = int(width * random.uniform(min_percent, max_percent))
    vertical = int(height * random.uniform(min_percent, max_percent))
    return ImageOps.expand(
        rgb,
        border=(horizontal, vertical, horizontal, vertical),
        fill=rgb.getpixel((0, 0)),  # top-left corner colour
    )
def normalize_values(text, target_max=500):
    """Rewrite bracketed number lists in ``text`` as ``<loc_N>`` tags.

    Every ``[a, b, ...]`` group consisting only of digits, dots, whitespace
    and commas is parsed as a list literal, scaled so that its largest value
    becomes ``target_max``, rounded, and replaced by one ``<loc_N>`` tag per
    value (concatenated without separators).

    Args:
        text: Prompt text that may contain bracketed number lists.
        target_max: Value the largest entry of each list is scaled to.

    Returns:
        The text with each parseable list replaced by its tags. A group that
        matches the pattern but is not a valid list literal (for example
        ``[.]`` or ``[1 2]``) is left unchanged. An empty list is replaced
        by an empty string.
    """
    def normalize_list(values):
        # An all-zero list has a maximum of 0; fall back to 1 so every value
        # maps to 0 instead of raising ZeroDivisionError.
        max_value = max(values, default=0) or 1
        return [round((v / max_value) * target_max) for v in values]

    def process_match(match):
        try:
            num_list = ast.literal_eval(match.group(0))
        except (ValueError, SyntaxError):
            # The regex is looser than Python's list syntax; keep the text
            # as written rather than failing the whole request.
            return match.group(0)
        return "".join(f"<loc_{num}>" for num in normalize_list(num_list))

    pattern = r"\[([\d\.\s,]+)\]"
    return re.sub(pattern, process_match, text)
 def downsample_video(video_path):
+    """Downsample a video to evenly spaced frames, returning PIL images with timestamps."""
     vidcap = cv2.VideoCapture(video_path)
     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
     fps = vidcap.get(cv2.CAP_PROP_FPS)
                    top_p: float = 0.9,
                    top_k: int = 50,
                    repetition_penalty: float = 1.2):
+    """Generate responses for image input using the selected model."""
+    # Model selection
     if model_name == "Nanonets-OCR-s":
         processor = processor_m
         model = model_m
     elif model_name == "MonkeyOCR-Recognition":
         processor = processor_g
         model = model_g
+    elif model_name == "SmolDocling-256M-preview":
+        processor = processor_x
+        model = model_x
     else:
         yield "Invalid model selected."
         return
         yield "Please upload an image."
         return
+    # Prepare images as a list (single image for image inference)
+    images = [image]
+    # SmolDocling-256M specific preprocessing
+    if model_name == "SmolDocling-256M-preview":
+        if "OTSL" in text or "code" in text:
+            images = [add_random_padding(img) for img in images]
+        if "OCR at text at" in text or "Identify element" in text or "formula" in text:
+            text = normalize_values(text, target_max=500)
+    # Unified message structure for all models
+    messages = [
+        {
+            "role": "user",
+            "content": [{"type": "image"} for _ in images] + [
+                {"type": "text", "text": text}
+            ]
+        }
+    ]
+    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
+    inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
+    # Generation with streaming
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+    generation_kwargs = {
+        **inputs,
+        "streamer": streamer,
+        "max_new_tokens": max_new_tokens,
+        "temperature": temperature,
+        "top_p": top_p,
+        "top_k": top_k,
+        "repetition_penalty": repetition_penalty,
+    }
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
+    # Stream output and collect full response
     buffer = ""
+    full_output = ""
     for new_text in streamer:
+        full_output += new_text
+        buffer += new_text.replace("<|im_end|>", "")
         yield buffer
+    # SmolDocling-256M specific postprocessing
+    if model_name == "SmolDocling-256M-preview":
+        cleaned_output = full_output.replace("<end_of_utterance>", "").strip()
+        if any(tag in cleaned_output for tag in ["<doctag>", "<otsl>", "<code>", "<chart>", "<formula>"]):
+            if "<chart>" in cleaned_output:
+                cleaned_output = cleaned_output.replace("<chart>", "<otsl>").replace("</chart>", "</otsl>")
+                cleaned_output = re.sub(r'(<loc_500>)(?!.*<loc_500>)<[^>]+>', r'\1', cleaned_output)
+            doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([cleaned_output], images)
+            doc = DoclingDocument.load_from_doctags(doctags_doc, document_name="Document")
+            markdown_output = doc.export_to_markdown()
+            yield f"**MD Output:**\n\n{markdown_output}"
+        else:
+            yield cleaned_output
 @spaces.GPU
 def generate_video(model_name: str, text: str, video_path: str,
                    max_new_tokens: int = 1024,
                    top_p: float = 0.9,
                    top_k: int = 50,
                    repetition_penalty: float = 1.2):
+    """Generate responses for video input using the selected model."""
+    # Model selection
     if model_name == "Nanonets-OCR-s":
         processor = processor_m
         model = model_m
     elif model_name == "MonkeyOCR-Recognition":
         processor = processor_g
         model = model_g
+    elif model_name == "SmolDocling-256M-preview":
+        processor = processor_x
+        model = model_x
     else:
         yield "Invalid model selected."
         return
         yield "Please upload a video."
         return
+    # Extract frames from video
     frames = downsample_video(video_path)
+    images = [frame for frame, _ in frames]
+    # SmolDocling-256M specific preprocessing
+    if model_name == "SmolDocling-256M-preview":
+        if "OTSL" in text or "code" in text:
+            images = [add_random_padding(img) for img in images]
+        if "OCR at text at" in text or "Identify element" in text or "formula" in text:
+            text = normalize_values(text, target_max=500)
+    # Unified message structure for all models
     messages = [
+        {
+            "role": "user",
+            "content": [{"type": "image"} for _ in images] + [
+                {"type": "text", "text": text}
+            ]
+        }
     ]
+    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
+    inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
+    # Generation with streaming
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {
         **inputs,
         "streamer": streamer,
         "max_new_tokens": max_new_tokens,
         "temperature": temperature,
         "top_p": top_p,
         "top_k": top_k,
     }
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
+    # Stream output and collect full response
     buffer = ""
+    full_output = ""
     for new_text in streamer:
+        full_output += new_text
+        buffer += new_text.replace("<|im_end|>", "")
         yield buffer
+    # SmolDocling-256M specific postprocessing
+    if model_name == "SmolDocling-256M-preview":
+        cleaned_output = full_output.replace("<end_of_utterance>", "").strip()
+        if any(tag in cleaned_output for tag in ["<doctag>", "<otsl>", "<code>", "<chart>", "<formula>"]):
+            if "<chart>" in cleaned_output:
+                cleaned_output = cleaned_output.replace("<chart>", "<otsl>").replace("</chart>", "</otsl>")
+                cleaned_output = re.sub(r'(<loc_500>)(?!.*<loc_500>)<[^>]+>', r'\1', cleaned_output)
+            doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([cleaned_output], images)
+            doc = DoclingDocument.load_from_doctags(doctags_doc, document_name="Document")
+            markdown_output = doc.export_to_markdown()
+            yield f"**MD Output:**\n\n{markdown_output}"
+        else:
+            yield cleaned_output
 # Define examples for image and video inference
 image_examples = [
     ["fill the correct numbers", "example/image3.png"],