Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -62,7 +62,6 @@ model_x = AutoModelForVision2Seq.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
 
-
 # Preprocessing functions for SmolDocling-256M
 def add_random_padding(image, min_percent=0.1, max_percent=0.10):
     """Add random padding to an image based on its size."""
@@ -110,18 +109,30 @@ def downsample_video(video_path):
     return frames
 
 # Dolphin-specific functions
-def model_chat(prompt, image):
-    """Use Dolphin model for inference."""
+def model_chat(prompt, image, is_batch=False):
+    """Use Dolphin model for inference, supporting both single and batch processing."""
     processor = processor_k
     model = model_k
     device = "cuda" if torch.cuda.is_available() else "cpu"
-    inputs = processor(image, return_tensors="pt").to(device)
+
+    if not is_batch:
+        images = [image]
+        prompts = [prompt]
+    else:
+        images = image
+        prompts = prompt if isinstance(prompt, list) else [prompt] * len(images)
+
+    inputs = processor(images, return_tensors="pt", padding=True).to(device)
     pixel_values = inputs.pixel_values.half()
+
+    prompts = [f"<s>{p} <Answer/>" for p in prompts]
     prompt_inputs = processor.tokenizer(
-        f"<s>{prompt} <Answer/>",
+        prompts,
         add_special_tokens=False,
-        return_tensors="pt"
+        return_tensors="pt",
+        padding=True
     ).to(device)
+
     outputs = model.generate(
         pixel_values=pixel_values,
         decoder_input_ids=prompt_inputs.input_ids,
@@ -137,20 +148,48 @@ def model_chat(prompt, image):
         num_beams=1,
         repetition_penalty=1.1
     )
-    sequence = processor.tokenizer.batch_decode(outputs.sequences, skip_special_tokens=False)[0]
-    cleaned = sequence.replace(f"<s>{prompt} <Answer/>", "").replace("<pad>", "").replace("</s>", "").strip()
-    return cleaned
+    sequences = processor.tokenizer.batch_decode(outputs.sequences, skip_special_tokens=False)
+
+    results = []
+    for i, sequence in enumerate(sequences):
+        cleaned = sequence.replace(prompts[i], "").replace("<pad>", "").replace("</s>", "").strip()
+        results.append(cleaned)
+
+    return results[0] if not is_batch else results
+
+def process_element_batch(elements, prompt, max_batch_size=16):
+    """Process a batch of elements with the same prompt."""
+    results = []
+    batch_size = min(len(elements), max_batch_size)
+
+    for i in range(0, len(elements), batch_size):
+        batch_elements = elements[i:i + batch_size]
+        crops_list = [elem["crop"] for elem in batch_elements]
+        prompts_list = [prompt] * len(crops_list)
+
+        batch_results = model_chat(prompts_list, crops_list, is_batch=True)
+
+        for j, result in enumerate(batch_results):
+            elem = batch_elements[j]
+            results.append({
+                "label": elem["label"],
+                "bbox": elem["bbox"],
+                "text": result.strip(),
+                "reading_order": elem["reading_order"],
+            })
+
+    return results
 
 def process_elements(layout_results, image):
     """Parse layout results and extract elements from the image."""
-    # Placeholder parsing logic based on expected Dolphin output
-    # Assuming layout_results is a string like "[(x1,y1,x2,y2,label), ...]"
     try:
         elements = ast.literal_eval(layout_results)
     except:
-        elements = []
+        elements = []
 
-    recognition_results = []
+    text_elements = []
+    table_elements = []
+    figure_results = []
     reading_order = 0
 
     for bbox, label in elements:
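The rewritten model_chat above normalizes single and batched inputs onto one code path, and the new process_element_batch chunks same-prompt crops into fixed-size batches. Below is a minimal, self-contained sketch of those two patterns, assuming a stand-in recognize step in place of the real Dolphin processor/model.generate call; the names chat and run_in_batches are illustrative, not from app.py.

# Sketch of the single-vs-batch normalization and fixed-size chunking
# introduced in this commit. The list comprehension stands in for inference.
def chat(prompt, image, is_batch=False):
    images = image if is_batch else [image]
    prompts = prompt if isinstance(prompt, list) else [prompt] * len(images)
    results = [f"{p} -> {img}" for p, img in zip(prompts, images)]  # stand-in inference
    return results if is_batch else results[0]

def run_in_batches(items, prompt, max_batch_size=16):
    # Chunk the work so no single call exceeds max_batch_size items.
    results = []
    for i in range(0, len(items), max_batch_size):
        chunk = items[i:i + max_batch_size]
        results.extend(chat([prompt] * len(chunk), chunk, is_batch=True))
    return results

print(chat("Read text in the image.", "crop0"))  # single call, unwrapped result
print(run_in_batches([f"crop{i}" for i in range(5)], "Read text in the image.", max_batch_size=2))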
@@ -158,27 +197,21 @@ def process_elements(layout_results, image):
             x1, y1, x2, y2 = map(int, bbox)
             cropped = image.crop((x1, y1, x2, y2))
             if cropped.size[0] > 0 and cropped.size[1] > 0:
+                element_info = {
+                    "crop": cropped,
+                    "label": label,
+                    "bbox": [x1, y1, x2, y2],
+                    "reading_order": reading_order,
+                }
                 if label == "text":
-                    text = model_chat("Read text in the image.", cropped)
-                    recognition_results.append({
-                        "label": label,
-                        "bbox": [x1, y1, x2, y2],
-                        "text": text.strip(),
-                        "reading_order": reading_order
-                    })
+                    text_elements.append(element_info)
                 elif label == "table":
-                    table_text = model_chat("Parse the table in the image.", cropped)
-                    recognition_results.append({
-                        "label": label,
-                        "bbox": [x1, y1, x2, y2],
-                        "text": table_text.strip(),
-                        "reading_order": reading_order
-                    })
+                    table_elements.append(element_info)
                 elif label == "figure":
-                    recognition_results.append({
+                    figure_results.append({
                         "label": label,
                         "bbox": [x1, y1, x2, y2],
-                        "text": "[Figure]",
+                        "text": "[Figure]",
                         "reading_order": reading_order
                     })
                 reading_order += 1
@@ -186,12 +219,23 @@ def process_elements(layout_results, image):
             print(f"Error processing element: {e}")
             continue
 
+    recognition_results = figure_results.copy()
+
+    if text_elements:
+        text_results = process_element_batch(text_elements, "Read text in the image.")
+        recognition_results.extend(text_results)
+
+    if table_elements:
+        table_results = process_element_batch(table_elements, "Parse the table in the image.")
+        recognition_results.extend(table_results)
+
+    recognition_results.sort(key=lambda x: x["reading_order"])
     return recognition_results
 
 def generate_markdown(recognition_results):
     """Generate markdown from extracted elements."""
     markdown = ""
-    for element in recognition_results:
+    for element in recognition_results:
         if element["label"] == "text":
             markdown += f"{element['text']}\n\n"
         elif element["label"] == "table":
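The net effect of the process_elements changes: per-element model calls are deferred, elements are first bucketed by label, the text and table buckets are recognized in batches, and the merged results are re-sorted by reading_order so document order survives the regrouping. A self-contained illustration of that regroup-then-restore-order flow follows; the dicts mirror element_info from the diff, and recognition itself is faked.

elements = [
    {"label": "text",   "reading_order": 0},
    {"label": "figure", "reading_order": 1},
    {"label": "table",  "reading_order": 2},
    {"label": "text",   "reading_order": 3},
]

# Bucket by label, as the new process_elements does.
text_elements  = [e for e in elements if e["label"] == "text"]
table_elements = [e for e in elements if e["label"] == "table"]
figure_results = [dict(e, text="[Figure]") for e in elements if e["label"] == "figure"]

# Batched recognition would fill in "text" for these two buckets; faked here.
text_results  = [dict(e, text="recognized text")  for e in text_elements]
table_results = [dict(e, text="recognized table") for e in table_elements]

merged = figure_results + text_results + table_results
merged.sort(key=lambda e: e["reading_order"])  # restore document order
print([e["label"] for e in merged])            # ['text', 'figure', 'table', 'text']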
@@ -222,7 +266,6 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         markdown_content = process_image_with_dolphin(image)
         yield markdown_content
     else:
-        # Existing logic for other models
         if model_name == "olmOCR-7B-0225-preview":
             processor = processor_m
             model = model_m
@@ -309,7 +352,6 @@ def generate_video(model_name: str, text: str, video_path: str,
         combined_markdown = "\n\n".join(markdown_contents)
         yield combined_markdown
     else:
-        # Existing logic for other models
        if model_name == "olmOCR-7B-0225-preview":
            processor = processor_m
            model = model_m
@@ -401,7 +443,7 @@ css = """
 
 # Create the Gradio Interface
 with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
-    gr.Markdown("# **[
+    gr.Markdown("# **[Docling-VLMs](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
     with gr.Row():
         with gr.Column():
             with gr.Tabs():
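For context, here is a minimal runnable Gradio skeleton matching the layout this last hunk edits: a Markdown title at the top of a Blocks app, with the row/column/tabs nesting below. The css string and the bethecloud/storj_theme from app.py are omitted, and the tab name is illustrative rather than taken from the source.

import gradio as gr

with gr.Blocks() as demo:
    gr.Markdown("# **[Docling-VLMs](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
    with gr.Row():
        with gr.Column():
            with gr.Tabs():
                with gr.Tab("Image Inference"):  # hypothetical tab name
                    gr.Markdown("Model inputs and outputs go here.")

if __name__ == "__main__":
    demo.launch()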