Spaces:

geekyrakshit
/

medrag

Runtime error

App Files Files Community

geekyrakshit committed on Oct 18, 2024

Commit

56d3953

unverified ·

2 Parent(s): bb79bf4 07a16a7

Merge pull request #9 from soumik12345/feat/ensemble-of-text-loaders

Browse files

Files changed (21) hide show

.gitignore +13 -5
docs/document_loader/load_text.md +0 -3
docs/document_loader/load_text_image.md +0 -3
docs/document_loader/text_loader/base_text_loader.md +3 -0
docs/document_loader/text_loader/marker_text_loader.md +23 -0
docs/document_loader/text_loader/pdfplumber_text_loader.md +22 -0
docs/document_loader/text_loader/pymupdf4llm_text_loader.md +23 -0
docs/document_loader/text_loader/pypdf2_text_loader.md +23 -0
medrag_multi_modal/document_loader/__init__.py +14 -2
medrag_multi_modal/document_loader/load_image.py +3 -3
medrag_multi_modal/document_loader/load_text_image.py +0 -137
medrag_multi_modal/document_loader/text_loader/__init__.py +11 -0
medrag_multi_modal/document_loader/{load_text.py → text_loader/base_text_loader.py} +62 -46
medrag_multi_modal/document_loader/text_loader/marker_text_loader.py +92 -0
medrag_multi_modal/document_loader/text_loader/pdfplumber_text_loader.py +81 -0
medrag_multi_modal/document_loader/text_loader/pymupdf4llm_text_loader.py +80 -0
medrag_multi_modal/document_loader/text_loader/pypdf2_text_loader.py +82 -0
medrag_multi_modal/retrieval/multi_modal_retrieval.py +6 -6
mkdocs.yml +6 -2
pyproject.toml +5 -1
uv.lock +0 -0

.gitignore CHANGED Viewed

@@ -1,12 +1,20 @@
 .venv/
 .env
-cursor_prompt.txt
-**egg-info/
 **pycache**
 .ruff_cache/
-test.py
-**.pdf
 images/
 wandb/
 .byaldi/
-artifacts/

+# Virtual environments and environment files
 .venv/
 .env
+# Python-related
 **pycache**
+**egg-info/
 .ruff_cache/
+# Project-specific directories
+artifacts/
 images/
 wandb/
+# Temporary and generated files
+**.pdf
 .byaldi/
+cursor_prompt.txt
+test.py
+uv.lock

docs/document_loader/load_text.md DELETED Viewed

@@ -1,3 +0,0 @@
-## Load text from PDF files
-::: medrag_multi_modal.document_loader.load_text

docs/document_loader/load_text_image.md DELETED Viewed

@@ -1,3 +0,0 @@
-## Load text and images from PDF files
-::: medrag_multi_modal.document_loader.load_text_image

docs/document_loader/text_loader/base_text_loader.md ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ ## Load text from PDF files
2	+
3	+ ::: medrag_multi_modal.document_loader.text_loader.base_text_loader

docs/document_loader/text_loader/marker_text_loader.md ADDED Viewed

	@@ -0,0 +1,23 @@

+## Load text from PDF files (using Marker)
+??? note "Note"
+    **Underlying Library:** `marker-pdf`
+    Convert PDF to markdown quickly and accurately using a pipeline of deep learning models.
+    You can interact with the underlying library and fine-tune the outputs via `**kwargs`.
+    Use it in our library with:
+    ```python
+    from medrag_multi_modal.document_loader.text_loader import MarkerTextLoader
+    ```
+    For details and available `**kwargs`, please refer to the sources below.
+    **Sources:**
+    - [DataLab](https://www.datalab.to)
+    - [GitHub](https://github.com/VikParuchuri/marker)
+    - [PyPI](https://pypi.org/project/marker-pdf/)
+::: medrag_multi_modal.document_loader.text_loader.marker_text_loader

docs/document_loader/text_loader/pdfplumber_text_loader.md ADDED Viewed

	@@ -0,0 +1,22 @@

+## Load text from PDF files (using PDFPlumber)
+??? note "Note"
+    **Underlying Library:** `pdfplumber`
+    Plumb a PDF for detailed information about each char, rectangle, line, et cetera — and easily extract text and tables.
+    You can interact with the underlying library and fine-tune the outputs via `**kwargs`.
+    Use it in our library with:
+    ```python
+    from medrag_multi_modal.document_loader.text_loader import PDFPlumberTextLoader
+    ```
+    For details and available `**kwargs`, please refer to the sources below.
+    **Sources:**
+    - [GitHub](https://github.com/jsvine/pdfplumber)
+    - [PyPI](https://pypi.org/project/pdfplumber/)
+::: medrag_multi_modal.document_loader.text_loader.pdfplumber_text_loader

docs/document_loader/text_loader/pymupdf4llm_text_loader.md ADDED Viewed

	@@ -0,0 +1,23 @@

+## Load text from PDF files (using PyMuPDF4LLM)
+??? note "Note"
+    **Underlying Library:** `pymupdf4llm`
+    PyMuPDF is a high performance Python library for data extraction, analysis, conversion & manipulation of PDF (and other) documents.
+    You can interact with the underlying library and fine-tune the outputs via `**kwargs`.
+    Use it in our library with:
+    ```python
+    from medrag_multi_modal.document_loader.text_loader import PyMuPDF4LLMTextLoader
+    ```
+    For details and available `**kwargs`, please refer to the sources below.
+    **Sources:**
+    - [Docs](https://pymupdf.readthedocs.io/en/latest/pymupdf4llm/)
+    - [GitHub](https://github.com/pymupdf/PyMuPDF)
+    - [PyPI](https://pypi.org/project/pymupdf4llm/)
+::: medrag_multi_modal.document_loader.text_loader.pymupdf4llm_text_loader

docs/document_loader/text_loader/pypdf2_text_loader.md ADDED Viewed

	@@ -0,0 +1,23 @@

+## Load text from PDF files (using PyPDF2)
+??? note "Note"
+    **Underlying Library:** `pypdf2`
+    A pure-Python PDF library capable of splitting, merging, cropping, and transforming the pages of PDF files.
+    You can interact with the underlying library and fine-tune the outputs via `**kwargs`.
+    Use it in our library with:
+    ```python
+    from medrag_multi_modal.document_loader.text_loader import PyPDF2TextLoader
+    ```
+    For details and available `**kwargs`, please refer to the sources below.
+    **Sources:**
+    - [Docs](https://pypdf2.readthedocs.io/en/3.x/)
+    - [GitHub](https://github.com/py-pdf/pypdf)
+    - [PyPI](https://pypi.org/project/PyPDF2/)
+::: medrag_multi_modal.document_loader.text_loader.pypdf2_text_loader

medrag_multi_modal/document_loader/__init__.py CHANGED Viewed

@@ -1,5 +1,17 @@
 from .load_image import ImageLoader
-from .load_text import TextLoader
 from .load_text_image import TextImageLoader
-__all__ = ["TextLoader", "TextImageLoader", "ImageLoader"]

 from .load_image import ImageLoader
 from .load_text_image import TextImageLoader
+from .text_loader import (
+    MarkerTextLoader,
+    PDFPlumberTextLoader,
+    PyMuPDF4LLMTextLoader,
+    PyPDF2TextLoader,
+)
+__all__ = [
+    "PyMuPDF4LLMTextLoader",
+    "PyPDF2TextLoader",
+    "PDFPlumberTextLoader",
+    "MarkerTextLoader",
+    "ImageLoader",
+    "TextImageLoader",
+]

medrag_multi_modal/document_loader/load_image.py CHANGED Viewed

@@ -3,15 +3,15 @@ import os
 from typing import Optional
 import rich
 import weave
 from pdf2image.pdf2image import convert_from_path
 from PIL import Image
-import wandb
-from medrag_multi_modal.document_loader.load_text import TextLoader
-class ImageLoader(TextLoader):
     """
     `ImageLoader` is a class that extends the `TextLoader` class to handle the extraction and
     loading of pages from a PDF file as images.

 from typing import Optional
 import rich
+import wandb
 import weave
 from pdf2image.pdf2image import convert_from_path
 from PIL import Image
+from medrag_multi_modal.document_loader.text_loader import PyMuPDF4LLMTextLoader
+class ImageLoader(PyMuPDF4LLMTextLoader):
     """
     `ImageLoader` is a class that extends the `PyMuPDF4LLMTextLoader` class to handle the extraction and
     loading of pages from a PDF file as images.

medrag_multi_modal/document_loader/load_text_image.py DELETED Viewed

@@ -1,137 +0,0 @@
-import asyncio
-import os
-from glob import glob
-from typing import Optional
-import pymupdf4llm
-import rich
-import weave
-from PIL import Image
-from medrag_multi_modal.document_loader.load_text import TextLoader
-class TextImageLoader(TextLoader):
-    """
-    A class for loading and processing text and images from a document.
-    The TextImageLoader class extends the TextLoader class to provide
-    functionality for extracting both text and images from a document
-    specified by a URL, document name, and file path. It processes the
-    document asynchronously, allowing for efficient handling of large
-    documents.
-    !!! example "Example Usage"
-        ```python
-        import asyncio
-        import weave
-        from medrag_multi_modal.document_loader import TextImageLoader
-        weave.init(project_name="ml-colabs/medrag-multi-modal")
-        url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
-        loader = TextImageLoader(
-            url=url,
-            document_name="Gray's Anatomy",
-            document_file_path="grays_anatomy.pdf",
-        )
-        asyncio.run(
-            loader.load_data(
-                start_page=20,
-                end_page=25,
-                weave_dataset_name="grays-anatomy-text",
-            )
-        )
-        ```
-    Args:
-        url (str): The URL of the document to be processed.
-        document_name (str): The name of the document.
-        document_file_path (str): The file path where the document is stored.
-    """
-    def __init__(self, url: str, document_name: str, document_file_path: str):
-        super().__init__(url, document_name, document_file_path)
-    async def load_data(
-        self,
-        start_page: Optional[int] = None,
-        end_page: Optional[int] = None,
-        weave_dataset_name: Optional[str] = None,
-        image_path: Optional[str] = "./images",
-        dpi: int = 300,
-    ):
-        """
-        Asynchronously loads and processes text and images from a specified range of pages
-        in a document. This function extracts text in markdown format and images in PNG
-        format from the document, storing them in a list of dictionaries, each representing
-        a page. Optionally, the processed data can be published to a Weave dataset.
-        The function first determines the page indices to process using the
-        `get_page_indices` method. It then defines an asynchronous inner function,
-        `process_page`, which handles the extraction of text and images for a single page.
-        The text is extracted using the `pymupdf4llm.to_markdown` function, and images are
-        retrieved from the specified image path. The processed data is appended to the
-        `pages` list.
-        The function creates a list of tasks for processing each page asynchronously and
-        awaits their completion. If a `weave_dataset_name` is provided, the processed data
-        is published to a Weave dataset. Finally, the function returns the list of processed
-        pages.
-        Args:
-            start_page (Optional[int]): The starting page index for processing. If None,
-                defaults to the first page of the document.
-            end_page (Optional[int]): The ending page index for processing. If None,
-                defaults to the last page of the document.
-            weave_dataset_name (Optional[str]): The name of the Weave dataset to publish
-                the processed data to. If None, the data is not published.
-            image_path (Optional[str]): The directory path where extracted images are
-                stored. Defaults to "./images".
-            dpi (int): The resolution in dots per inch for image extraction. Defaults to 300.
-        Returns:
-            List[Dict]: A list of dictionaries, each containing the extracted text, page
-            index, document name, file path, file URL, and a list of images for each page
-            processed.
-        """
-        start_page, end_page = self.get_page_indices(start_page, end_page)
-        pages = []
-        processed_pages_counter: int = 1
-        total_pages = end_page - start_page
-        async def process_page(page_idx):
-            nonlocal processed_pages_counter
-            text = pymupdf4llm.to_markdown(
-                doc=self.document_file_path,
-                pages=[page_idx],
-                show_progress=False,
-                write_images=True,
-                image_format="png",
-                dpi=dpi,
-                image_path=image_path,
-            )
-            image_paths = glob(
-                os.path.join(image_path, f"{self.document_file_path}-{page_idx}-*.png")
-            )
-            print(image_paths)
-            pages.append(
-                {
-                    "text": text,
-                    "page_idx": page_idx,
-                    "document_name": self.document_name,
-                    "file_path": self.document_file_path,
-                    "file_url": self.url,
-                    "images": [Image.open(image) for image in image_paths],
-                }
-            )
-            rich.print(f"Processed pages {processed_pages_counter}/{total_pages}")
-            processed_pages_counter += 1
-        tasks = [process_page(page_idx) for page_idx in range(start_page, end_page)]
-        for task in asyncio.as_completed(tasks):
-            await task
-        if weave_dataset_name:
-            weave.publish(weave.Dataset(name=weave_dataset_name, rows=pages))
-        return pages

medrag_multi_modal/document_loader/text_loader/__init__.py ADDED Viewed

	@@ -0,0 +1,11 @@

+from .marker_text_loader import MarkerTextLoader
+from .pdfplumber_text_loader import PDFPlumberTextLoader
+from .pymupdf4llm_text_loader import PyMuPDF4LLMTextLoader
+from .pypdf2_text_loader import PyPDF2TextLoader
+__all__ = [
+    "PyMuPDF4LLMTextLoader",
+    "PyPDF2TextLoader",
+    "PDFPlumberTextLoader",
+    "MarkerTextLoader",
+]

medrag_multi_modal/document_loader/{load_text.py → text_loader/base_text_loader.py} RENAMED Viewed

@@ -1,41 +1,22 @@
 import asyncio
 import os
-from typing import Optional
-import pymupdf4llm
 import PyPDF2
 import rich
 import weave
 from firerequests import FireRequests
-class TextLoader:
     """
-    A class for loading text from a PDF file, processing it into markdown, and optionally publishing it to a Weave dataset.
     This class handles the downloading of a PDF file from a given URL if it does not already exist locally.
-    It uses PyPDF2 to read the PDF and pymupdf4llm to convert pages to markdown. The processed pages are stored in a list
-    of Page objects, which can be optionally published to a Weave dataset.
-    !!! example "Example Usage"
-        ```python
-        import asyncio
-        import weave
-        from medrag_multi_modal.document_loader import TextLoader
-        weave.init(project_name="ml-colabs/medrag-multi-modal")
-        url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
-        loader = TextLoader(
-            url=url,
-            document_name="Gray's Anatomy",
-            document_file_path="grays_anatomy.pdf",
-        )
-        asyncio.run(
-            loader.load_data(start_page=9, end_page=15, weave_dataset_name="grays-anatomy-text")
-        )
-        ```
     Args:
         url (str): The URL of the PDF file to download if not present locally.
@@ -55,7 +36,18 @@ class TextLoader:
     def get_page_indices(
         self, start_page: Optional[int] = None, end_page: Optional[int] = None
-    ):
         if start_page:
             if start_page > self.page_count:
                 raise ValueError(
@@ -72,30 +64,61 @@ class TextLoader:
             end_page = self.page_count - 1
         return start_page, end_page
     async def load_data(
         self,
         start_page: Optional[int] = None,
         end_page: Optional[int] = None,
         weave_dataset_name: Optional[str] = None,
-    ):
         """
-        Asynchronously loads text from a PDF file specified by a URL or local file path,
-        processes the text into markdown format, and optionally publishes it to a Weave dataset.
         This function downloads a PDF from a given URL if it does not already exist locally,
         reads the specified range of pages, converts each page's content to markdown, and
-        returns a list of Page objects containing the text and metadata. It uses PyPDF2 to read
-        the PDF and pymupdf4llm to convert pages to markdown. It processes pages concurrently using
-        `asyncio` for efficiency. If a weave_dataset_name is provided, the processed pages are published
-        to a Weave dataset.
         Args:
             start_page (Optional[int]): The starting page index (0-based) to process. Defaults to the first page.
             end_page (Optional[int]): The ending page index (0-based) to process. Defaults to the last page.
             weave_dataset_name (Optional[str]): The name of the Weave dataset to publish the pages to, if provided.
         Returns:
-            list[Page]: A list of Page objects, each containing the text and metadata for a processed page.
         Raises:
             ValueError: If the specified start_page or end_page is out of bounds of the document's page count.
@@ -107,24 +130,17 @@ class TextLoader:
         async def process_page(page_idx):
             nonlocal processed_pages_counter
-            text = pymupdf4llm.to_markdown(
-                doc=self.document_file_path, pages=[page_idx], show_progress=False
-            )
-            pages.append(
-                {
-                    "text": text,
-                    "page_idx": page_idx,
-                    "document_name": self.document_name,
-                    "file_path": self.document_file_path,
-                    "file_url": self.url,
-                }
             )
-            rich.print(f"Processed pages {processed_pages_counter}/{total_pages}")
             processed_pages_counter += 1
         tasks = [process_page(page_idx) for page_idx in range(start_page, end_page)]
         for task in asyncio.as_completed(tasks):
             await task
         if weave_dataset_name:
             weave.publish(weave.Dataset(name=weave_dataset_name, rows=pages))
         return pages

 import asyncio
 import os
+from abc import ABC, abstractmethod
+from typing import Dict, List, Optional
 import PyPDF2
 import rich
 import weave
 from firerequests import FireRequests
+class BaseTextLoader(ABC):
     """
+    An abstract base class for loading text from a PDF file, processing it into markdown, and optionally publishing it to a Weave dataset.
     This class handles the downloading of a PDF file from a given URL if it does not already exist locally.
+    Subclasses should implement the specific PDF reading, text extraction, and markdown conversion methods.
+    The processed pages are finally stored in a list of Page objects, which can be optionally published to a Weave dataset.
     Args:
         url (str): The URL of the PDF file to download if not present locally.
     def get_page_indices(
         self, start_page: Optional[int] = None, end_page: Optional[int] = None
+    ) -> tuple[int, int]:
+        """
+        Get the start and end page indices for processing.
+        Args:
+            start_page (Optional[int]): The starting page index (0-based) to process. Defaults to the first page.
+            end_page (Optional[int]): The ending page index (0-based) to process. Defaults to the last page.
+        Returns:
+            tuple[int, int]: A tuple containing the start and end page indices.
+        """
         if start_page:
             if start_page > self.page_count:
                 raise ValueError(
             end_page = self.page_count - 1
         return start_page, end_page
+    @abstractmethod
+    async def extract_page_data(self, page_idx: int, **kwargs) -> Dict[str, str]:
+        """
+        Abstract method to process a single page of the PDF and extract the text data.
+        Override this method in the subclass to provide the actual implementation and
+        processing logic for each page of the PDF using various PDF processing libraries.
+        Args:
+            page_idx (int): The index of the page to process.
+            **kwargs: Additional keyword arguments that may be used by underlying libraries.
+        Returns:
+            Dict[str, str]: A dictionary containing the processed page data.
+        """
+        pass
     async def load_data(
         self,
         start_page: Optional[int] = None,
         end_page: Optional[int] = None,
         weave_dataset_name: Optional[str] = None,
+        **kwargs,
+    ) -> List[Dict[str, str]]:
         """
+        Asynchronously loads text from a PDF file specified by a URL or local file path.
+        The subclass's override of the abstract `extract_page_data` method then processes the text into markdown format,
+        and optionally publishes it to a Weave dataset.
         This function downloads a PDF from a given URL if it does not already exist locally,
         reads the specified range of pages, converts each page's content to markdown, and
+        returns a list of Page objects containing the text and metadata.
+        It uses `PyPDF2` to calculate the number of pages in the PDF and the
+        overridden `extract_page_data` method provides the actual implementation to process
+        each page, extract the text from the PDF, and convert it to markdown.
+        It processes pages concurrently using `asyncio` for efficiency.
+        If a weave_dataset_name is provided, the processed pages are published to a Weave dataset.
         Args:
             start_page (Optional[int]): The starting page index (0-based) to process. Defaults to the first page.
             end_page (Optional[int]): The ending page index (0-based) to process. Defaults to the last page.
             weave_dataset_name (Optional[str]): The name of the Weave dataset to publish the pages to, if provided.
+            **kwargs: Additional keyword arguments that will be passed to extract_page_data method and the underlying library.
         Returns:
+            List[Dict[str, str]]: A list of dictionaries, each containing the text and metadata for a processed page.
+            Each dictionary will have the following keys and values:
+            - "text": (str) the processed page data in markdown format.
+            - "page_idx": (int) the index of the page.
+            - "document_name": (str) the name of the document.
+            - "file_path": (str) the local file path where the PDF is stored.
+            - "file_url": (str) the URL of the PDF file.
         Raises:
             ValueError: If the specified start_page or end_page is out of bounds of the document's page count.
         async def process_page(page_idx):
             nonlocal processed_pages_counter
+            page_data = await self.extract_page_data(page_idx, **kwargs)
+            pages.append(page_data)
+            rich.print(
+                f"Processed page idx: {page_idx}, progress: {processed_pages_counter}/{total_pages}"
             )
             processed_pages_counter += 1
         tasks = [process_page(page_idx) for page_idx in range(start_page, end_page)]
         for task in asyncio.as_completed(tasks):
             await task
         if weave_dataset_name:
             weave.publish(weave.Dataset(name=weave_dataset_name, rows=pages))
         return pages

medrag_multi_modal/document_loader/text_loader/marker_text_loader.py ADDED Viewed

	@@ -0,0 +1,92 @@

+from typing import Dict
+from marker.convert import convert_single_pdf
+from marker.models import load_all_models
+from .base_text_loader import BaseTextLoader
+class MarkerTextLoader(BaseTextLoader):
+    """
+    A concrete implementation of the BaseTextLoader for loading text from a PDF file
+    using `marker-pdf`, processing it into a structured text format, and optionally publishing
+    it to a Weave dataset.
+    This class extends the BaseTextLoader and implements the abstract methods to
+    load and process pages from a PDF file using marker-pdf, which is a pipeline of deep learning models.
+    This class will handle the downloading of a PDF file from a given URL if it does not already exist locally.
+    It uses marker-pdf to read the PDF and extract structured text from each page. The processed pages are stored
+    in a list of Page objects, which can be optionally published to a Weave dataset.
+    !!! example "Example Usage"
+        ```python
+        import asyncio
+        import weave
+        from medrag_multi_modal.document_loader.text_loader import MarkerTextLoader
+        weave.init(project_name="ml-colabs/medrag-multi-modal")
+        url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
+        loader = MarkerTextLoader(
+            url=url,
+            document_name="Gray's Anatomy",
+            document_file_path="grays_anatomy.pdf",
+        )
+        asyncio.run(
+            loader.load_data(
+                start_page=31,
+                end_page=36,
+                weave_dataset_name="grays-anatomy-text",
+            )
+        )
+        ```
+    Args:
+        url (str): The URL of the PDF file to download if not present locally.
+        document_name (str): The name of the document for metadata purposes.
+        document_file_path (str): The local file path where the PDF is stored or will be downloaded.
+    """
+    async def extract_page_data(self, page_idx: int, **kwargs) -> Dict[str, str]:
+        """
+        Process a single page of the PDF and extract its structured text using marker-pdf.
+        Returns a dictionary with the processed page data.
+        The dictionary will have the following keys and values:
+        - "text": (str) the extracted structured text from the page.
+        - "page_idx": (int) the index of the page.
+        - "document_name": (str) the name of the document.
+        - "file_path": (str) the local file path where the PDF is stored.
+        - "file_url": (str) the URL of the PDF file.
+        - "meta": (dict) the metadata extracted from the page by marker-pdf.
+        Args:
+            page_idx (int): The index of the page to process.
+            **kwargs: Additional keyword arguments to be passed to `marker.convert.convert_single_pdf`.
+        Returns:
+            Dict[str, str]: A dictionary containing the processed page data.
+        """
+        model_lst = load_all_models()
+        text, _, out_meta = convert_single_pdf(
+            self.document_file_path,
+            model_lst,
+            max_pages=1,
+            batch_multiplier=1,
+            start_page=page_idx,
+            ocr_all_pages=True,
+            **kwargs,
+        )
+        return {
+            "text": text,
+            "page_idx": page_idx,
+            "document_name": self.document_name,
+            "file_path": self.document_file_path,
+            "file_url": self.url,
+            "meta": out_meta,
+        }

medrag_multi_modal/document_loader/text_loader/pdfplumber_text_loader.py ADDED Viewed

	@@ -0,0 +1,81 @@

+from typing import Dict
+import pdfplumber
+from .base_text_loader import BaseTextLoader
+class PDFPlumberTextLoader(BaseTextLoader):
+    """
+    A concrete implementation of the BaseTextLoader for loading text from a PDF file
+    using `pdfplumber`, processing it into a simple text format, and optionally publishing
+    it to a Weave dataset.
+    This class extends the BaseTextLoader and implements the abstract methods to
+    load and process pages from a PDF file.
+    This class will handle the downloading of a PDF file from a given URL if it does not already exist locally.
+    It uses pdfplumber to read the PDF and extract text from each page. The processed pages are stored in a list
+    of Page objects, which can be optionally published to a Weave dataset.
+    !!! example "Example Usage"
+        ```python
+        import asyncio
+        import weave
+        from medrag_multi_modal.document_loader.text_loader import PDFPlumberTextLoader
+        weave.init(project_name="ml-colabs/medrag-multi-modal")
+        url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
+        loader = PDFPlumberTextLoader(
+            url=url,
+            document_name="Gray's Anatomy",
+            document_file_path="grays_anatomy.pdf",
+        )
+        asyncio.run(
+            loader.load_data(
+                start_page=31,
+                end_page=36,
+                weave_dataset_name="grays-anatomy-text",
+            )
+        )
+        ```
+    Args:
+        url (str): The URL of the PDF file to download if not present locally.
+        document_name (str): The name of the document for metadata purposes.
+        document_file_path (str): The local file path where the PDF is stored or will be downloaded.
+    """
+    async def extract_page_data(self, page_idx: int, **kwargs) -> Dict[str, str]:
+        """
+        Process a single page of the PDF and extract its text using pdfplumber.
+        Returns a dictionary with the processed page data.
+        The dictionary will have the following keys and values:
+        - "text": (str) the extracted text from the page.
+        - "page_idx": (int) the index of the page.
+        - "document_name": (str) the name of the document.
+        - "file_path": (str) the local file path where the PDF is stored.
+        - "file_url": (str) the URL of the PDF file.
+        Args:
+            page_idx (int): The index of the page to process.
+            **kwargs: Additional keyword arguments to be passed to `pdfplumber.Page.extract_text`.
+        Returns:
+            Dict[str, str]: A dictionary containing the processed page data.
+        """
+        with pdfplumber.open(self.document_file_path) as pdf:
+            page = pdf.pages[page_idx]
+            text = page.extract_text(**kwargs)
+        return {
+            "text": text,
+            "page_idx": page_idx,
+            "document_name": self.document_name,
+            "file_path": self.document_file_path,
+            "file_url": self.url,
+        }

medrag_multi_modal/document_loader/text_loader/pymupdf4llm_text_loader.py ADDED Viewed

	@@ -0,0 +1,80 @@

+from typing import Dict
+import pymupdf4llm
+from .base_text_loader import BaseTextLoader
+class PyMuPDF4LLMTextLoader(BaseTextLoader):
+    """
+    A concrete implementation of the BaseTextLoader for loading text from a PDF file,
+    processing it into markdown using `pymupdf4llm`, and optionally publishing it to a Weave dataset.
+    This class extends the BaseTextLoader and implements the abstract methods to load and process pages from a PDF file.
+    This class will handle the downloading of a PDF file from a given URL if it does not already exist locally.
+    It uses PyPDF2 to read the PDF and pymupdf4llm to convert pages to markdown. The processed pages are stored in a list
+    of Page objects, which can be optionally published to a Weave dataset.
+    !!! example "Example Usage"
+        ```python
+        import asyncio
+        import weave
+        from medrag_multi_modal.document_loader.text_loader import (
+            PyMuPDF4LLMTextLoader
+        )
+        weave.init(project_name="ml-colabs/medrag-multi-modal")
+        url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
+        loader = PyMuPDF4LLMTextLoader(
+            url=url,
+            document_name="Gray's Anatomy",
+            document_file_path="grays_anatomy.pdf",
+        )
+        asyncio.run(
+            loader.load_data(
+                start_page=31,
+                end_page=36,
+                weave_dataset_name="grays-anatomy-text",
+            )
+        )
+        ```
+    Args:
+        url (str): The URL of the PDF file to download if not present locally.
+        document_name (str): The name of the document for metadata purposes.
+        document_file_path (str): The local file path where the PDF is stored or will be downloaded.
+    """
+    async def extract_page_data(self, page_idx: int, **kwargs) -> Dict[str, str]:
+        """
+        Process a single page of the PDF and convert it to markdown using `pymupdf4llm`.
+        Returns a dictionary with the processed page data.
+        The dictionary will have the following keys and values:
+        - "text": (str) the processed page data in markdown format.
+        - "page_idx": (int) the index of the page.
+        - "document_name": (str) the name of the document.
+        - "file_path": (str) the local file path where the PDF is stored.
+        - "file_url": (str) the URL of the PDF file.
+        Args:
+            page_idx (int): The index of the page to process.
+            **kwargs: Additional keyword arguments to be passed to `pymupdf4llm.to_markdown`.
+        Returns:
+            Dict[str, str]: A dictionary containing the processed page data. All values are strings except `page_idx`, which is an int.
+        """
+        text = pymupdf4llm.to_markdown(
+            doc=self.document_file_path, pages=[page_idx], show_progress=False, **kwargs
+        )
+        return {
+            "text": text,
+            "page_idx": page_idx,
+            "document_name": self.document_name,
+            "file_path": self.document_file_path,
+            "file_url": self.url,
+        }

medrag_multi_modal/document_loader/text_loader/pypdf2_text_loader.py ADDED Viewed

	@@ -0,0 +1,82 @@

+from typing import Dict
+import PyPDF2
+from .base_text_loader import BaseTextLoader
+class PyPDF2TextLoader(BaseTextLoader):
+    """
+    A concrete implementation of the BaseTextLoader for loading text from a PDF file
+    using `PyPDF2`, processing it into a simple text format, and optionally publishing
+    it to a Weave dataset.
+    This class extends the BaseTextLoader and implements the abstract methods to
+    load and process pages from a PDF file.
+    This class will handle the downloading of a PDF file from a given URL if it does not already exist locally.
+    It uses PyPDF2 to read the PDF and extract text from each page. The processed pages are stored in a list
+    of Page objects, which can be optionally published to a Weave dataset.
+    !!! example "Example Usage"
+        ```python
+        import asyncio
+        import weave
+        from medrag_multi_modal.document_loader.text_loader import PyPDF2TextLoader
+        weave.init(project_name="ml-colabs/medrag-multi-modal")
+        url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
+        loader = PyPDF2TextLoader(
+            url=url,
+            document_name="Gray's Anatomy",
+            document_file_path="grays_anatomy.pdf",
+        )
+        asyncio.run(
+            loader.load_data(
+                start_page=31,
+                end_page=36,
+                weave_dataset_name="grays-anatomy-text",
+            )
+        )
+        ```
+    Args:
+        url (str): The URL of the PDF file to download if not present locally.
+        document_name (str): The name of the document for metadata purposes.
+        document_file_path (str): The local file path where the PDF is stored or will be downloaded.
+    """
+    async def extract_page_data(self, page_idx: int, **kwargs) -> Dict[str, str]:
+        """
+        Process a single page of the PDF and extract its text using PyPDF2.
+        Returns a dictionary with the processed page data.
+        The dictionary will have the following keys and values:
+        - "text": (str) the extracted text from the page.
+        - "page_idx": (int) the index of the page.
+        - "document_name": (str) the name of the document.
+        - "file_path": (str) the local file path where the PDF is stored.
+        - "file_url": (str) the URL of the PDF file.
+        Args:
+            page_idx (int): The index of the page to process.
+            **kwargs: Additional keyword arguments forwarded to the `extract_text` method of the selected page, i.e. `PyPDF2.PdfReader(...).pages[page_idx].extract_text`.
+        Returns:
+            Dict[str, str]: A dictionary containing the processed page data. All values are strings except `page_idx`, which is an int.
+        """
+        with open(self.document_file_path, "rb") as file:
+            pdf_reader = PyPDF2.PdfReader(file)
+            page = pdf_reader.pages[page_idx]
+            text = page.extract_text(**kwargs)
+        return {
+            "text": text,
+            "page_idx": page_idx,
+            "document_name": self.document_name,
+            "file_path": self.document_file_path,
+            "file_url": self.url,
+        }

medrag_multi_modal/retrieval/multi_modal_retrieval.py CHANGED Viewed

@@ -1,23 +1,22 @@
 import os
 from typing import Any, Optional
 import weave
 from byaldi import RAGMultiModalModel
 from PIL import Image
-import wandb
 from ..utils import get_wandb_artifact
 class MultiModalRetriever(weave.Model):
     """
     MultiModalRetriever is a class that facilitates the retrieval of page images using ColPali.
     This class leverages the `byaldi.RAGMultiModalModel` to perform document retrieval tasks.
     It can be initialized with a pre-trained model or from a specified W&B artifact. The class
     also provides methods to index new data and to predict/retrieve documents based on a query.
     !!! example "Indexing Data"
         ```python
         import wandb
@@ -31,14 +30,14 @@ class MultiModalRetriever(weave.Model):
             index_name="grays-anatomy",
         )
         ```
     !!! example "Retrieving Documents"
         ```python
         import weave
         import wandb
         from medrag_multi_modal.retrieval import MultiModalRetriever
         weave.init(project_name="ml-colabs/medrag-multi-modal")
         retriever = MultiModalRetriever.from_artifact(
             index_artifact_name="ml-colabs/medrag-multi-modal/grays-anatomy:v0",
@@ -54,6 +53,7 @@ class MultiModalRetriever(weave.Model):
     Attributes:
         model_name (str): The name of the model to be used for retrieval.
     """
     model_name: str
     _docs_retrieval_model: Optional[RAGMultiModalModel] = None
     _metadata: Optional[dict] = None

 import os
 from typing import Any, Optional
+import wandb
 import weave
 from byaldi import RAGMultiModalModel
 from PIL import Image
 from ..utils import get_wandb_artifact
 class MultiModalRetriever(weave.Model):
     """
     MultiModalRetriever is a class that facilitates the retrieval of page images using ColPali.
     This class leverages the `byaldi.RAGMultiModalModel` to perform document retrieval tasks.
     It can be initialized with a pre-trained model or from a specified W&B artifact. The class
     also provides methods to index new data and to predict/retrieve documents based on a query.
     !!! example "Indexing Data"
         ```python
         import wandb
             index_name="grays-anatomy",
         )
         ```
     !!! example "Retrieving Documents"
         ```python
         import weave
         import wandb
         from medrag_multi_modal.retrieval import MultiModalRetriever
         weave.init(project_name="ml-colabs/medrag-multi-modal")
         retriever = MultiModalRetriever.from_artifact(
             index_artifact_name="ml-colabs/medrag-multi-modal/grays-anatomy:v0",
     Attributes:
         model_name (str): The name of the model to be used for retrieval.
     """
     model_name: str
     _docs_retrieval_model: Optional[RAGMultiModalModel] = None
     _metadata: Optional[dict] = None

mkdocs.yml CHANGED Viewed

@@ -63,8 +63,12 @@ nav:
     - Installation: 'installation/install.md'
     - Development: 'installation/development.md'
   - Document Loader:
-    - Text Loader: 'document_loader/load_text.md'
-    - Text and Image Loader: 'document_loader/load_text_image.md'
     - Image Loader: 'document_loader/load_image.md'
   - Retrieval:
     - Multi-Modal Retrieval: 'retreival/multi_modal_retrieval.md'

     - Installation: 'installation/install.md'
     - Development: 'installation/development.md'
   - Document Loader:
+    - Text Loader:
+      - Base: 'document_loader/text_loader/base_text_loader.md'
+      - PyMuPDF4LLM: 'document_loader/text_loader/pymupdf4llm_text_loader.md'
+      - PyPDF2: 'document_loader/text_loader/pypdf2_text_loader.md'
+      - PDFPlumber: 'document_loader/text_loader/pdfplumber_text_loader.md'
+      - Marker: 'document_loader/text_loader/marker_text_loader.md'
     - Image Loader: 'document_loader/load_image.md'
   - Retrieval:
     - Multi-Modal Retrieval: 'retreival/multi_modal_retrieval.md'

pyproject.toml CHANGED Viewed

@@ -19,6 +19,7 @@ dependencies = [
     "isort>=5.13.2",
     "black>=24.10.0",
     "ruff>=0.6.9",
     "mkdocs>=1.6.1",
     "mkdocstrings>=0.26.1",
     "mkdocstrings-python>=1.11.1",
@@ -27,13 +28,17 @@ dependencies = [
     "mkdocs-glightbox>=0.4.0",
     "mkdocs-jupyter>=0.25.0",
     "jupyter>=1.1.1",
 ]
 [project.optional-dependencies]
 core = [
     "Byaldi>=0.0.5",
     "firerequests>=0.0.7",
     "pdf2image>=1.17.0",
     "python-dotenv>=1.0.1",
     "pymupdf4llm>=0.0.17",
     "torch>=2.4.1",
@@ -42,7 +47,6 @@ core = [
 dev = [
     "pytest>=8.3.3",
-    "PyPDF2>=3.0.1",
     "isort>=5.13.2",
     "black>=24.10.0",
     "ruff>=0.6.9",

     "isort>=5.13.2",
     "black>=24.10.0",
     "ruff>=0.6.9",
+    "marker-pdf>=0.2.17",
     "mkdocs>=1.6.1",
     "mkdocstrings>=0.26.1",
     "mkdocstrings-python>=1.11.1",
     "mkdocs-glightbox>=0.4.0",
     "mkdocs-jupyter>=0.25.0",
     "jupyter>=1.1.1",
+    "pdfplumber>=0.11.4",
 ]
 [project.optional-dependencies]
 core = [
     "Byaldi>=0.0.5",
     "firerequests>=0.0.7",
+    "marker-pdf>=0.2.17",
     "pdf2image>=1.17.0",
+    "pdfplumber>=0.11.4",
+    "PyPDF2>=3.0.1",
     "python-dotenv>=1.0.1",
     "pymupdf4llm>=0.0.17",
     "torch>=2.4.1",
 dev = [
     "pytest>=8.3.3",
     "isort>=5.13.2",
     "black>=24.10.0",
     "ruff>=0.6.9",

uv.lock DELETED Viewed

The diff for this file is too large to render. See raw diff