Spaces:

geekyrakshit
/

medrag

Runtime error

App Files Files Community

mratanusarkar committed on Oct 19, 2024

Commit

331f289

1 Parent(s): f37090a

add: marker image loader + docs + corrections

Browse files

Files changed (8) hide show

docs/document_loader/image_loader/marker_img_loader.md +4 -0
docs/document_loader/image_loader/pdf2image_img_loader.md +1 -1
medrag_multi_modal/document_loader/__init__.py +2 -1
medrag_multi_modal/document_loader/image_loader/__init__.py +2 -2
medrag_multi_modal/document_loader/image_loader/base_img_loader.py +3 -3
medrag_multi_modal/document_loader/image_loader/marker_img_loader.py +74 -0
medrag_multi_modal/document_loader/image_loader/pdf2image_img_loader.py +1 -1
mkdocs.yml +1 -0

docs/document_loader/image_loader/marker_img_loader.md ADDED Viewed

	@@ -0,0 +1,4 @@


1	+ # Load images from PDF files (using Marker)
2	+
3	+ ::: medrag_multi_modal.document_loader.image_loader.marker_img_loader
4	+

docs/document_loader/image_loader/pdf2image_img_loader.md CHANGED Viewed

@@ -1,3 +1,3 @@
-# Load images from PDF files (using pdf2image)
 ::: medrag_multi_modal.document_loader.image_loader.pdf2image_img_loader


1	+ # Load images from PDF files (using PDF2Image)
2
3	::: medrag_multi_modal.document_loader.image_loader.pdf2image_img_loader

medrag_multi_modal/document_loader/__init__.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from .image_loader import PDF2ImageLoader
 from .text_loader import (
     MarkerTextLoader,
     PDFPlumberTextLoader,
@@ -12,4 +12,5 @@ __all__ = [
     "PDFPlumberTextLoader",
     "MarkerTextLoader",
     "PDF2ImageLoader",
 ]

+from .image_loader import MarkerImageLoader, PDF2ImageLoader
 from .text_loader import (
     MarkerTextLoader,
     PDFPlumberTextLoader,
     "PDFPlumberTextLoader",
     "MarkerTextLoader",
     "PDF2ImageLoader",
+    "MarkerImageLoader",
 ]

medrag_multi_modal/document_loader/image_loader/__init__.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from .base_img_loader import BaseImageLoader
 from .pdf2image_img_loader import PDF2ImageLoader
-__all__ = ["PDF2ImageLoader", "BaseImageLoader"]

+from .marker_img_loader import MarkerImageLoader
 from .pdf2image_img_loader import PDF2ImageLoader
+__all__ = ["PDF2ImageLoader", "MarkerImageLoader"]

medrag_multi_modal/document_loader/image_loader/base_img_loader.py CHANGED Viewed

@@ -47,7 +47,7 @@ class BaseImageLoader(BaseTextLoader):
         """
         Asynchronously loads images from a PDF file specified by a URL or local file path.
         The overrided processing abstract method then processes the images,
-        and optionally publishes it to a Weave artifact.
         This function downloads a PDF from a given URL if it does not already exist locally,
         reads the specified range of pages, scans each page's content to extract images, and
@@ -58,12 +58,12 @@ class BaseImageLoader(BaseTextLoader):
         each page, extract the image content from the PDF, and convert it to png format.
         It processes pages concurrently using `asyncio` for efficiency.
-        If a wandb_artifact_name is provided, the processed pages are published to a Weave artifact.
         Args:
             start_page (Optional[int]): The starting page index (0-based) to process. Defaults to the first page.
             end_page (Optional[int]): The ending page index (0-based) to process. Defaults to the last page.
-            wandb_artifact_name (Optional[str]): The name of the Weave artifact to publish the pages to, if provided.
             image_save_dir (str): The directory to save the extracted images.
             cleanup (bool): Whether to remove extracted images from `image_save_dir`, if uploading to wandb artifact.
             **kwargs: Additional keyword arguments that will be passed to extract_page_data method and the underlying library.

         """
         Asynchronously loads images from a PDF file specified by a URL or local file path.
         The overrided processing abstract method then processes the images,
+        and optionally publishes it to a WandB artifact.
         This function downloads a PDF from a given URL if it does not already exist locally,
         reads the specified range of pages, scans each page's content to extract images, and
         each page, extract the image content from the PDF, and convert it to png format.
         It processes pages concurrently using `asyncio` for efficiency.
+        If a wandb_artifact_name is provided, the processed pages are published to a WandB artifact.
         Args:
             start_page (Optional[int]): The starting page index (0-based) to process. Defaults to the first page.
             end_page (Optional[int]): The ending page index (0-based) to process. Defaults to the last page.
+            wandb_artifact_name (Optional[str]): The name of the WandB artifact to publish the pages to, if provided.
             image_save_dir (str): The directory to save the extracted images.
             cleanup (bool): Whether to remove extracted images from `image_save_dir`, if uploading to wandb artifact.
             **kwargs: Additional keyword arguments that will be passed to extract_page_data method and the underlying library.

medrag_multi_modal/document_loader/image_loader/marker_img_loader.py ADDED Viewed

	@@ -0,0 +1,74 @@

+import os
+from typing import Any, Dict
+from marker.convert import convert_single_pdf
+from marker.models import load_all_models
+from .base_img_loader import BaseImageLoader
+class MarkerImageLoader(BaseImageLoader):
+    """
+    `MarkerImageLoader` is a class that extends the `BaseImageLoader` class to handle the extraction and
+    loading of pages from a PDF file as images using the marker library.
+    This class provides functionality to extract images from a PDF file using marker library,
+    and optionally publish these images to a WandB artifact.
+    Args:
+        url (str): The URL of the PDF document.
+        document_name (str): The name of the document.
+        document_file_path (str): The path to the PDF file.
+    """
+    def __init__(self, url: str, document_name: str, document_file_path: str):
+        super().__init__(url, document_name, document_file_path)
+    async def extract_page_data(
+        self, page_idx: int, image_save_dir: str, **kwargs
+    ) -> Dict[str, Any]:
+        """
+        Extracts a single page from the PDF as an image using marker library.
+        Args:
+            page_idx (int): The index of the page to process.
+            image_save_dir (str): The directory to save the extracted image.
+            **kwargs: Additional keyword arguments that may be used by marker.
+        Returns:
+            Dict[str, Any]: A dictionary containing the processed page data.
+            The dictionary will have the following keys and values:
+            - "page_idx": (int) the index of the page.
+            - "document_name": (str) the name of the document.
+            - "file_path": (str) the local file path where the PDF is stored.
+            - "file_url": (str) the URL of the PDF file.
+            - "image_file_path": (str) the local file path where the image is stored.
+        """
+        model_lst = load_all_models()
+        _, images, out_meta = convert_single_pdf(
+            self.document_file_path,
+            model_lst,
+            max_pages=1,
+            batch_multiplier=1,
+            start_page=page_idx,
+            ocr_all_pages=True,
+            **kwargs,
+        )
+        image_file_paths = []
+        for img_idx, (_, image) in enumerate(images.items()):
+            image_file_name = f"page{page_idx}_fig{img_idx}.png"
+            image_file_path = os.path.join(image_save_dir, image_file_name)
+            image.save(image_file_path, "png")
+            image_file_paths.append(image_file_path)
+        return {
+            "page_idx": page_idx,
+            "document_name": self.document_name,
+            "file_path": self.document_file_path,
+            "file_url": self.url,
+            "image_file_paths": image_file_paths,
+            "meta": out_meta,
+        }

medrag_multi_modal/document_loader/image_loader/pdf2image_img_loader.py CHANGED Viewed

@@ -12,7 +12,7 @@ class PDF2ImageLoader(BaseImageLoader):
     loading of pages from a PDF file as images using the pdf2image library.
     This class provides functionality to convert specific pages of a PDF document into images
-    and optionally publish these images to a Weave artifact.
     It is like a snapshot image version of each of the pages from the PDF.
     Args:

     loading of pages from a PDF file as images using the pdf2image library.
     This class provides functionality to convert specific pages of a PDF document into images
+    and optionally publish these images to a WandB artifact.
     It is like a snapshot image version of each of the pages from the PDF.
     Args:

mkdocs.yml CHANGED Viewed

@@ -72,6 +72,7 @@ nav:
     - Image Loader:
       - Base: 'document_loader/image_loader/base_img_loader.md'
       - PDF2Image: 'document_loader/image_loader/pdf2image_img_loader.md'
   - Chunking: 'chunking.md'
   - Retrieval:
     - Multi-Modal Retrieval: 'retreival/multi_modal_retrieval.md'

     - Image Loader:
       - Base: 'document_loader/image_loader/base_img_loader.md'
       - PDF2Image: 'document_loader/image_loader/pdf2image_img_loader.md'
+      - Marker: 'document_loader/image_loader/marker_img_loader.md'
   - Chunking: 'chunking.md'
   - Retrieval:
     - Multi-Modal Retrieval: 'retreival/multi_modal_retrieval.md'