geekyrakshit committed on
Commit
56d3953
·
unverified ·
2 Parent(s): bb79bf4 07a16a7

Merge pull request #9 from soumik12345/feat/ensemble-of-text-loaders

Browse files
.gitignore CHANGED
@@ -1,12 +1,20 @@
 
1
  .venv/
2
  .env
3
- cursor_prompt.txt
4
- **egg-info/
5
  **pycache**
 
6
  .ruff_cache/
7
- test.py
8
- **.pdf
 
9
  images/
10
  wandb/
 
 
 
11
  .byaldi/
12
- artifacts/
 
 
 
1
+ # Virtual environments and environment files
2
  .venv/
3
  .env
4
+
5
+ # Python-related
6
  **pycache**
7
+ **egg-info/
8
  .ruff_cache/
9
+
10
+ # Project-specific directories
11
+ artifacts/
12
  images/
13
  wandb/
14
+
15
+ # Temporary and generated files
16
+ **.pdf
17
  .byaldi/
18
+ cursor_prompt.txt
19
+ test.py
20
+ uv.lock
docs/document_loader/load_text.md DELETED
@@ -1,3 +0,0 @@
1
- ## Load text from PDF files
2
-
3
- ::: medrag_multi_modal.document_loader.load_text
 
 
 
 
docs/document_loader/load_text_image.md DELETED
@@ -1,3 +0,0 @@
1
- ## Load text and images from PDF files
2
-
3
- ::: medrag_multi_modal.document_loader.load_text_image
 
 
 
 
docs/document_loader/text_loader/base_text_loader.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ ## Load text from PDF files
2
+
3
+ ::: medrag_multi_modal.document_loader.text_loader.base_text_loader
docs/document_loader/text_loader/marker_text_loader.md ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Load text from PDF files (using Marker)
2
+
3
+ ??? note "Note"
4
+ **Underlying Library:** `marker-pdf`
5
+
6
+ Convert PDF to markdown quickly and accurately using a pipeline of deep learning models.
7
+
8
+ You can interact with the underlying library and fine-tune the outputs via `**kwargs`.
9
+
10
+ Use it in our library with:
11
+ ```python
12
+ from medrag_multi_modal.document_loader.text_loader import MarkerTextLoader
13
+ ```
14
+
15
+ For details and available `**kwargs`, please refer to the sources below.
16
+
17
+ **Sources:**
18
+
19
+ - [DataLab](https://www.datalab.to)
20
+ - [GitHub](https://github.com/VikParuchuri/marker)
21
+ - [PyPI](https://pypi.org/project/marker-pdf/)
22
+
23
+ ::: medrag_multi_modal.document_loader.text_loader.marker_text_loader
docs/document_loader/text_loader/pdfplumber_text_loader.md ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Load text from PDF files (using PDFPlumber)
2
+
3
+ ??? note "Note"
4
+ **Underlying Library:** `pdfplumber`
5
+
6
+ Plumb a PDF for detailed information about each char, rectangle, line, et cetera — and easily extract text and tables.
7
+
8
+ You can interact with the underlying library and fine-tune the outputs via `**kwargs`.
9
+
10
+ Use it in our library with:
11
+ ```python
12
+ from medrag_multi_modal.document_loader.text_loader import PDFPlumberTextLoader
13
+ ```
14
+
15
+ For details and available `**kwargs`, please refer to the sources below.
16
+
17
+ **Sources:**
18
+
19
+ - [GitHub](https://github.com/jsvine/pdfplumber)
20
+ - [PyPI](https://pypi.org/project/pdfplumber/)
21
+
22
+ ::: medrag_multi_modal.document_loader.text_loader.pdfplumber_text_loader
docs/document_loader/text_loader/pymupdf4llm_text_loader.md ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Load text from PDF files (using PyMuPDF4LLM)
2
+
3
+ ??? note "Note"
4
+ **Underlying Library:** `pymupdf4llm`
5
+
6
+ PyMuPDF is a high performance Python library for data extraction, analysis, conversion & manipulation of PDF (and other) documents.
7
+
8
+ You can interact with the underlying library and fine-tune the outputs via `**kwargs`.
9
+
10
+ Use it in our library with:
11
+ ```python
12
+ from medrag_multi_modal.document_loader.text_loader import PyMuPDF4LLMTextLoader
13
+ ```
14
+
15
+ For details and available `**kwargs`, please refer to the sources below.
16
+
17
+ **Sources:**
18
+
19
+ - [Docs](https://pymupdf.readthedocs.io/en/latest/pymupdf4llm/)
20
+ - [GitHub](https://github.com/pymupdf/PyMuPDF)
21
+ - [PyPI](https://pypi.org/project/pymupdf4llm/)
22
+
23
+ ::: medrag_multi_modal.document_loader.text_loader.pymupdf4llm_text_loader
docs/document_loader/text_loader/pypdf2_text_loader.md ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Load text from PDF files (using PyPDF2)
2
+
3
+ ??? note "Note"
4
+ **Underlying Library:** `pypdf2`
5
+
6
+ A pure-python PDF library capable of splitting, merging, cropping, and transforming the pages of PDF files.
7
+
8
+ You can interact with the underlying library and fine-tune the outputs via `**kwargs`.
9
+
10
+ Use it in our library with:
11
+ ```python
12
+ from medrag_multi_modal.document_loader.text_loader import PyPDF2TextLoader
13
+ ```
14
+
15
+ For details and available `**kwargs`, please refer to the sources below.
16
+
17
+ **Sources:**
18
+
19
+ - [Docs](https://pypdf2.readthedocs.io/en/3.x/)
20
+ - [GitHub](https://github.com/py-pdf/pypdf)
21
+ - [PyPI](https://pypi.org/project/PyPDF2/)
22
+
23
+ ::: medrag_multi_modal.document_loader.text_loader.pypdf2_text_loader
medrag_multi_modal/document_loader/__init__.py CHANGED
@@ -1,5 +1,17 @@
1
  from .load_image import ImageLoader
2
- from .load_text import TextLoader
3
  from .load_text_image import TextImageLoader
 
 
 
 
 
 
4
 
5
- __all__ = ["TextLoader", "TextImageLoader", "ImageLoader"]
 
 
 
 
 
 
 
 
1
  from .load_image import ImageLoader
 
2
  from .load_text_image import TextImageLoader
3
+ from .text_loader import (
4
+ MarkerTextLoader,
5
+ PDFPlumberTextLoader,
6
+ PyMuPDF4LLMTextLoader,
7
+ PyPDF2TextLoader,
8
+ )
9
 
10
+ __all__ = [
11
+ "PyMuPDF4LLMTextLoader",
12
+ "PyPDF2TextLoader",
13
+ "PDFPlumberTextLoader",
14
+ "MarkerTextLoader",
15
+ "ImageLoader",
16
+ "TextImageLoader",
17
+ ]
medrag_multi_modal/document_loader/load_image.py CHANGED
@@ -3,15 +3,15 @@ import os
3
  from typing import Optional
4
 
5
  import rich
 
6
  import weave
7
  from pdf2image.pdf2image import convert_from_path
8
  from PIL import Image
9
 
10
- import wandb
11
- from medrag_multi_modal.document_loader.load_text import TextLoader
12
 
13
 
14
- class ImageLoader(TextLoader):
15
  """
16
  `ImageLoader` is a class that extends the `TextLoader` class to handle the extraction and
17
  loading of pages from a PDF file as images.
 
3
  from typing import Optional
4
 
5
  import rich
6
+ import wandb
7
  import weave
8
  from pdf2image.pdf2image import convert_from_path
9
  from PIL import Image
10
 
11
+ from medrag_multi_modal.document_loader.text_loader import PyMuPDF4LLMTextLoader
 
12
 
13
 
14
+ class ImageLoader(PyMuPDF4LLMTextLoader):
15
  """
16
  `ImageLoader` is a class that extends the `PyMuPDF4LLMTextLoader` class to handle the extraction and
17
  loading of pages from a PDF file as images.
medrag_multi_modal/document_loader/load_text_image.py DELETED
@@ -1,137 +0,0 @@
1
- import asyncio
2
- import os
3
- from glob import glob
4
- from typing import Optional
5
-
6
- import pymupdf4llm
7
- import rich
8
- import weave
9
- from PIL import Image
10
-
11
- from medrag_multi_modal.document_loader.load_text import TextLoader
12
-
13
-
14
- class TextImageLoader(TextLoader):
15
- """
16
- A class for loading and processing text and images from a document.
17
-
18
- The TextImageLoader class extends the TextLoader class to provide
19
- functionality for extracting both text and images from a document
20
- specified by a URL, document name, and file path. It processes the
21
- document asynchronously, allowing for efficient handling of large
22
- documents.
23
-
24
- !!! example "Example Usage"
25
- ```python
26
- import asyncio
27
-
28
- import weave
29
-
30
- from medrag_multi_modal.document_loader import TextImageLoader
31
-
32
- weave.init(project_name="ml-colabs/medrag-multi-modal")
33
- url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
34
- loader = TextImageLoader(
35
- url=url,
36
- document_name="Gray's Anatomy",
37
- document_file_path="grays_anatomy.pdf",
38
- )
39
- asyncio.run(
40
- loader.load_data(
41
- start_page=20,
42
- end_page=25,
43
- weave_dataset_name="grays-anatomy-text",
44
- )
45
- )
46
- ```
47
-
48
- Args:
49
- url (str): The URL of the document to be processed.
50
- document_name (str): The name of the document.
51
- document_file_path (str): The file path where the document is stored.
52
- """
53
-
54
- def __init__(self, url: str, document_name: str, document_file_path: str):
55
- super().__init__(url, document_name, document_file_path)
56
-
57
- async def load_data(
58
- self,
59
- start_page: Optional[int] = None,
60
- end_page: Optional[int] = None,
61
- weave_dataset_name: Optional[str] = None,
62
- image_path: Optional[str] = "./images",
63
- dpi: int = 300,
64
- ):
65
- """
66
- Asynchronously loads and processes text and images from a specified range of pages
67
- in a document. This function extracts text in markdown format and images in PNG
68
- format from the document, storing them in a list of dictionaries, each representing
69
- a page. Optionally, the processed data can be published to a Weave dataset.
70
-
71
- The function first determines the page indices to process using the
72
- `get_page_indices` method. It then defines an asynchronous inner function,
73
- `process_page`, which handles the extraction of text and images for a single page.
74
- The text is extracted using the `pymupdf4llm.to_markdown` function, and images are
75
- retrieved from the specified image path. The processed data is appended to the
76
- `pages` list.
77
-
78
- The function creates a list of tasks for processing each page asynchronously and
79
- awaits their completion. If a `weave_dataset_name` is provided, the processed data
80
- is published to a Weave dataset. Finally, the function returns the list of processed
81
- pages.
82
-
83
- Args:
84
- start_page (Optional[int]): The starting page index for processing. If None,
85
- defaults to the first page of the document.
86
- end_page (Optional[int]): The ending page index for processing. If None,
87
- defaults to the last page of the document.
88
- weave_dataset_name (Optional[str]): The name of the Weave dataset to publish
89
- the processed data to. If None, the data is not published.
90
- image_path (Optional[str]): The directory path where extracted images are
91
- stored. Defaults to "./images".
92
- dpi (int): The resolution in dots per inch for image extraction. Defaults to 300.
93
-
94
- Returns:
95
- List[Dict]: A list of dictionaries, each containing the extracted text, page
96
- index, document name, file path, file URL, and a list of images for each page
97
- processed.
98
- """
99
- start_page, end_page = self.get_page_indices(start_page, end_page)
100
- pages = []
101
- processed_pages_counter: int = 1
102
- total_pages = end_page - start_page
103
-
104
- async def process_page(page_idx):
105
- nonlocal processed_pages_counter
106
- text = pymupdf4llm.to_markdown(
107
- doc=self.document_file_path,
108
- pages=[page_idx],
109
- show_progress=False,
110
- write_images=True,
111
- image_format="png",
112
- dpi=dpi,
113
- image_path=image_path,
114
- )
115
- image_paths = glob(
116
- os.path.join(image_path, f"{self.document_file_path}-{page_idx}-*.png")
117
- )
118
- print(image_paths)
119
- pages.append(
120
- {
121
- "text": text,
122
- "page_idx": page_idx,
123
- "document_name": self.document_name,
124
- "file_path": self.document_file_path,
125
- "file_url": self.url,
126
- "images": [Image.open(image) for image in image_paths],
127
- }
128
- )
129
- rich.print(f"Processed pages {processed_pages_counter}/{total_pages}")
130
- processed_pages_counter += 1
131
-
132
- tasks = [process_page(page_idx) for page_idx in range(start_page, end_page)]
133
- for task in asyncio.as_completed(tasks):
134
- await task
135
- if weave_dataset_name:
136
- weave.publish(weave.Dataset(name=weave_dataset_name, rows=pages))
137
- return pages
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
medrag_multi_modal/document_loader/text_loader/__init__.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .marker_text_loader import MarkerTextLoader
2
+ from .pdfplumber_text_loader import PDFPlumberTextLoader
3
+ from .pymupdf4llm_text_loader import PyMuPDF4LLMTextLoader
4
+ from .pypdf2_text_loader import PyPDF2TextLoader
5
+
6
+ __all__ = [
7
+ "PyMuPDF4LLMTextLoader",
8
+ "PyPDF2TextLoader",
9
+ "PDFPlumberTextLoader",
10
+ "MarkerTextLoader",
11
+ ]
medrag_multi_modal/document_loader/{load_text.py → text_loader/base_text_loader.py} RENAMED
@@ -1,41 +1,22 @@
1
  import asyncio
2
  import os
3
- from typing import Optional
 
4
 
5
- import pymupdf4llm
6
  import PyPDF2
7
  import rich
8
  import weave
9
  from firerequests import FireRequests
10
 
11
 
12
- class TextLoader:
13
  """
14
- A class for loading text from a PDF file, processing it into markdown, and optionally publishing it to a Weave dataset.
15
 
16
  This class handles the downloading of a PDF file from a given URL if it does not already exist locally.
17
- It uses PyPDF2 to read the PDF and pymupdf4llm to convert pages to markdown. The processed pages are stored in a list
18
- of Page objects, which can be optionally published to a Weave dataset.
19
 
20
- !!! example "Example Usage"
21
- ```python
22
- import asyncio
23
-
24
- import weave
25
-
26
- from medrag_multi_modal.document_loader import TextLoader
27
-
28
- weave.init(project_name="ml-colabs/medrag-multi-modal")
29
- url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
30
- loader = TextLoader(
31
- url=url,
32
- document_name="Gray's Anatomy",
33
- document_file_path="grays_anatomy.pdf",
34
- )
35
- asyncio.run(
36
- loader.load_data(start_page=9, end_page=15, weave_dataset_name="grays-anatomy-text")
37
- )
38
- ```
39
 
40
  Args:
41
  url (str): The URL of the PDF file to download if not present locally.
@@ -55,7 +36,18 @@ class TextLoader:
55
 
56
  def get_page_indices(
57
  self, start_page: Optional[int] = None, end_page: Optional[int] = None
58
- ):
 
 
 
 
 
 
 
 
 
 
 
59
  if start_page:
60
  if start_page > self.page_count:
61
  raise ValueError(
@@ -72,30 +64,61 @@ class TextLoader:
72
  end_page = self.page_count - 1
73
  return start_page, end_page
74
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  async def load_data(
76
  self,
77
  start_page: Optional[int] = None,
78
  end_page: Optional[int] = None,
79
  weave_dataset_name: Optional[str] = None,
80
- ):
 
81
  """
82
- Asynchronously loads text from a PDF file specified by a URL or local file path,
83
- processes the text into markdown format, and optionally publishes it to a Weave dataset.
 
84
 
85
  This function downloads a PDF from a given URL if it does not already exist locally,
86
  reads the specified range of pages, converts each page's content to markdown, and
87
- returns a list of Page objects containing the text and metadata. It uses PyPDF2 to read
88
- the PDF and pymupdf4llm to convert pages to markdown. It processes pages concurrently using
89
- `asyncio` for efficiency. If a weave_dataset_name is provided, the processed pages are published
90
- to a Weave dataset.
 
 
 
 
91
 
92
  Args:
93
  start_page (Optional[int]): The starting page index (0-based) to process. Defaults to the first page.
94
  end_page (Optional[int]): The ending page index (0-based) to process. Defaults to the last page.
95
  weave_dataset_name (Optional[str]): The name of the Weave dataset to publish the pages to, if provided.
 
96
 
97
  Returns:
98
- list[Page]: A list of Page objects, each containing the text and metadata for a processed page.
 
 
 
 
 
 
 
99
 
100
  Raises:
101
  ValueError: If the specified start_page or end_page is out of bounds of the document's page count.
@@ -107,24 +130,17 @@ class TextLoader:
107
 
108
  async def process_page(page_idx):
109
  nonlocal processed_pages_counter
110
- text = pymupdf4llm.to_markdown(
111
- doc=self.document_file_path, pages=[page_idx], show_progress=False
112
- )
113
- pages.append(
114
- {
115
- "text": text,
116
- "page_idx": page_idx,
117
- "document_name": self.document_name,
118
- "file_path": self.document_file_path,
119
- "file_url": self.url,
120
- }
121
  )
122
- rich.print(f"Processed pages {processed_pages_counter}/{total_pages}")
123
  processed_pages_counter += 1
124
 
125
  tasks = [process_page(page_idx) for page_idx in range(start_page, end_page)]
126
  for task in asyncio.as_completed(tasks):
127
  await task
 
128
  if weave_dataset_name:
129
  weave.publish(weave.Dataset(name=weave_dataset_name, rows=pages))
130
  return pages
 
1
  import asyncio
2
  import os
3
+ from abc import ABC, abstractmethod
4
+ from typing import Dict, List, Optional
5
 
 
6
  import PyPDF2
7
  import rich
8
  import weave
9
  from firerequests import FireRequests
10
 
11
 
12
+ class BaseTextLoader(ABC):
13
  """
14
+ An abstract base class for loading text from a PDF file, processing it into markdown, and optionally publishing it to a Weave dataset.
15
 
16
  This class handles the downloading of a PDF file from a given URL if it does not already exist locally.
17
+ Subclasses should implement the specific PDF reading, text extraction, and markdown conversion methods.
 
18
 
19
+ The processed pages are finally stored in a list of dictionaries (one per page), which can be optionally published to a Weave dataset.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
  Args:
22
  url (str): The URL of the PDF file to download if not present locally.
 
36
 
37
  def get_page_indices(
38
  self, start_page: Optional[int] = None, end_page: Optional[int] = None
39
+ ) -> tuple[int, int]:
40
+ """
41
+ Get the start and end page indices for processing.
42
+
43
+ Args:
44
+ start_page (Optional[int]): The starting page index (0-based) to process. Defaults to the first page.
45
+ end_page (Optional[int]): The ending page index (0-based) to process. Defaults to the last page.
46
+
47
+ Returns:
48
+ tuple[int, int]: A tuple containing the start and end page indices.
49
+ """
50
+
51
  if start_page:
52
  if start_page > self.page_count:
53
  raise ValueError(
 
64
  end_page = self.page_count - 1
65
  return start_page, end_page
66
 
67
+ @abstractmethod
68
+ async def extract_page_data(self, page_idx: int, **kwargs) -> Dict[str, str]:
69
+ """
70
+ Abstract method to process a single page of the PDF and extract the text data.
71
+
72
+ Overwrite this method in the subclass to provide the actual implementation and
73
+ processing logic for each page of the PDF using various PDF processing libraries.
74
+
75
+ Args:
76
+ page_idx (int): The index of the page to process.
77
+ **kwargs: Additional keyword arguments that may be used by underlying libraries.
78
+
79
+ Returns:
80
+ Dict[str, str]: A dictionary containing the processed page data.
81
+ """
82
+ pass
83
+
84
  async def load_data(
85
  self,
86
  start_page: Optional[int] = None,
87
  end_page: Optional[int] = None,
88
  weave_dataset_name: Optional[str] = None,
89
+ **kwargs,
90
+ ) -> List[Dict[str, str]]:
91
  """
92
+ Asynchronously loads text from a PDF file specified by a URL or local file path.
93
+ The overridden `extract_page_data` method then processes the text into markdown format,
94
+ and optionally publishes it to a Weave dataset.
95
 
96
  This function downloads a PDF from a given URL if it does not already exist locally,
97
  reads the specified range of pages, converts each page's content to markdown, and
98
+ returns a list of dictionaries containing the text and metadata.
99
+
100
+ It uses `PyPDF2` to calculate the number of pages in the PDF and the
101
+ overriden `extract_page_data` method provides the actual implementation to process
102
+ each page, extract the text from the PDF, and convert it to markdown.
103
+ It processes pages concurrently using `asyncio` for efficiency.
104
+
105
+ If a weave_dataset_name is provided, the processed pages are published to a Weave dataset.
106
 
107
  Args:
108
  start_page (Optional[int]): The starting page index (0-based) to process. Defaults to the first page.
109
  end_page (Optional[int]): The ending page index (0-based) to process. Defaults to the last page.
110
  weave_dataset_name (Optional[str]): The name of the Weave dataset to publish the pages to, if provided.
111
+ **kwargs: Additional keyword arguments that will be passed to extract_page_data method and the underlying library.
112
 
113
  Returns:
114
+ List[Dict[str, str]]: A list of dictionaries, each containing the text and metadata for a processed page.
115
+ Each dictionary will have the following keys and values:
116
+
117
+ - "text": (str) the processed page data in markdown format.
118
+ - "page_idx": (int) the index of the page.
119
+ - "document_name": (str) the name of the document.
120
+ - "file_path": (str) the local file path where the PDF is stored.
121
+ - "file_url": (str) the URL of the PDF file.
122
 
123
  Raises:
124
  ValueError: If the specified start_page or end_page is out of bounds of the document's page count.
 
130
 
131
  async def process_page(page_idx):
132
  nonlocal processed_pages_counter
133
+ page_data = await self.extract_page_data(page_idx, **kwargs)
134
+ pages.append(page_data)
135
+ rich.print(
136
+ f"Processed page idx: {page_idx}, progress: {processed_pages_counter}/{total_pages}"
 
 
 
 
 
 
 
137
  )
 
138
  processed_pages_counter += 1
139
 
140
  tasks = [process_page(page_idx) for page_idx in range(start_page, end_page)]
141
  for task in asyncio.as_completed(tasks):
142
  await task
143
+
144
  if weave_dataset_name:
145
  weave.publish(weave.Dataset(name=weave_dataset_name, rows=pages))
146
  return pages
medrag_multi_modal/document_loader/text_loader/marker_text_loader.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict
2
+
3
+ from marker.convert import convert_single_pdf
4
+ from marker.models import load_all_models
5
+
6
+ from .base_text_loader import BaseTextLoader
7
+
8
+
9
+ class MarkerTextLoader(BaseTextLoader):
10
+ """
11
+ A concrete implementation of the BaseTextLoader for loading text from a PDF file
12
+ using `marker-pdf`, processing it into a structured text format, and optionally publishing
13
+ it to a Weave dataset.
14
+
15
+ This class extends the BaseTextLoader and implements the abstract methods to
16
+ load and process pages from a PDF file using marker-pdf, which is a pipeline of deep learning models.
17
+
18
+ This class will handle the downloading of a PDF file from a given URL if it does not already exist locally.
19
+ It uses marker-pdf to read the PDF and extract structured text from each page. The processed pages are stored
20
+ in a list of Page objects, which can be optionally published to a Weave dataset.
21
+
22
+ !!! example "Example Usage"
23
+ ```python
24
+ import asyncio
25
+
26
+ import weave
27
+
28
+ from medrag_multi_modal.document_loader.text_loader import MarkerTextLoader
29
+
30
+ weave.init(project_name="ml-colabs/medrag-multi-modal")
31
+ url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
32
+ loader = MarkerTextLoader(
33
+ url=url,
34
+ document_name="Gray's Anatomy",
35
+ document_file_path="grays_anatomy.pdf",
36
+ )
37
+ asyncio.run(
38
+ loader.load_data(
39
+ start_page=31,
40
+ end_page=36,
41
+ weave_dataset_name="grays-anatomy-text",
42
+ )
43
+ )
44
+ ```
45
+
46
+ Args:
47
+ url (str): The URL of the PDF file to download if not present locally.
48
+ document_name (str): The name of the document for metadata purposes.
49
+ document_file_path (str): The local file path where the PDF is stored or will be downloaded.
50
+ """
51
+
52
+ async def extract_page_data(self, page_idx: int, **kwargs) -> Dict[str, str]:
53
+ """
54
+ Process a single page of the PDF and extract its structured text using marker-pdf.
55
+
56
+ Returns a dictionary with the processed page data.
57
+ The dictionary will have the following keys and values:
58
+
59
+ - "text": (str) the extracted structured text from the page.
60
+ - "page_idx": (int) the index of the page.
61
+ - "document_name": (str) the name of the document.
62
+ - "file_path": (str) the local file path where the PDF is stored.
63
+ - "file_url": (str) the URL of the PDF file.
64
+ - "meta": (dict) the metadata extracted from the page by marker-pdf.
65
+
66
+ Args:
67
+ page_idx (int): The index of the page to process.
68
+ **kwargs: Additional keyword arguments to be passed to `marker.convert.convert_single_pdf`.
69
+
70
+ Returns:
71
+ Dict[str, str]: A dictionary containing the processed page data.
72
+ """
73
+ model_lst = load_all_models()
74
+
75
+ text, _, out_meta = convert_single_pdf(
76
+ self.document_file_path,
77
+ model_lst,
78
+ max_pages=1,
79
+ batch_multiplier=1,
80
+ start_page=page_idx,
81
+ ocr_all_pages=True,
82
+ **kwargs,
83
+ )
84
+
85
+ return {
86
+ "text": text,
87
+ "page_idx": page_idx,
88
+ "document_name": self.document_name,
89
+ "file_path": self.document_file_path,
90
+ "file_url": self.url,
91
+ "meta": out_meta,
92
+ }
medrag_multi_modal/document_loader/text_loader/pdfplumber_text_loader.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict
2
+
3
+ import pdfplumber
4
+
5
+ from .base_text_loader import BaseTextLoader
6
+
7
+
8
+ class PDFPlumberTextLoader(BaseTextLoader):
9
+ """
10
+ A concrete implementation of the BaseTextLoader for loading text from a PDF file
11
+ using `pdfplumber`, processing it into a simple text format, and optionally publishing
12
+ it to a Weave dataset.
13
+
14
+ This class extends the BaseTextLoader and implements the abstract methods to
15
+ load and process pages from a PDF file.
16
+
17
+ This class will handle the downloading of a PDF file from a given URL if it does not already exist locally.
18
+ It uses pdfplumber to read the PDF and extract text from each page. The processed pages are stored in a list
19
+ of Page objects, which can be optionally published to a Weave dataset.
20
+
21
+ !!! example "Example Usage"
22
+ ```python
23
+ import asyncio
24
+
25
+ import weave
26
+
27
+ from medrag_multi_modal.document_loader.text_loader import PDFPlumberTextLoader
28
+
29
+ weave.init(project_name="ml-colabs/medrag-multi-modal")
30
+ url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
31
+ loader = PDFPlumberTextLoader(
32
+ url=url,
33
+ document_name="Gray's Anatomy",
34
+ document_file_path="grays_anatomy.pdf",
35
+ )
36
+ asyncio.run(
37
+ loader.load_data(
38
+ start_page=31,
39
+ end_page=36,
40
+ weave_dataset_name="grays-anatomy-text",
41
+ )
42
+ )
43
+ ```
44
+
45
+ Args:
46
+ url (str): The URL of the PDF file to download if not present locally.
47
+ document_name (str): The name of the document for metadata purposes.
48
+ document_file_path (str): The local file path where the PDF is stored or will be downloaded.
49
+ """
50
+
51
+ async def extract_page_data(self, page_idx: int, **kwargs) -> Dict[str, str]:
52
+ """
53
+ Process a single page of the PDF and extract its text using pdfplumber.
54
+
55
+ Returns a dictionary with the processed page data.
56
+ The dictionary will have the following keys and values:
57
+
58
+ - "text": (str) the extracted text from the page.
59
+ - "page_idx": (int) the index of the page.
60
+ - "document_name": (str) the name of the document.
61
+ - "file_path": (str) the local file path where the PDF is stored.
62
+ - "file_url": (str) the URL of the PDF file.
63
+
64
+ Args:
65
+ page_idx (int): The index of the page to process.
66
+ **kwargs: Additional keyword arguments to be passed to `pdfplumber.Page.extract_text`.
67
+
68
+ Returns:
69
+ Dict[str, str]: A dictionary containing the processed page data.
70
+ """
71
+ with pdfplumber.open(self.document_file_path) as pdf:
72
+ page = pdf.pages[page_idx]
73
+ text = page.extract_text(**kwargs)
74
+
75
+ return {
76
+ "text": text,
77
+ "page_idx": page_idx,
78
+ "document_name": self.document_name,
79
+ "file_path": self.document_file_path,
80
+ "file_url": self.url,
81
+ }
medrag_multi_modal/document_loader/text_loader/pymupdf4llm_text_loader.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict
2
+
3
+ import pymupdf4llm
4
+
5
+ from .base_text_loader import BaseTextLoader
6
+
7
+
8
+ class PyMuPDF4LLMTextLoader(BaseTextLoader):
9
+ """
10
+ A concrete implementation of the BaseTextLoader for loading text from a PDF file,
11
+ processing it into markdown using `pymupdf4llm`, and optionally publishing it to a Weave dataset.
12
+
13
+ This class extends the BaseTextLoader and implements the abstract methods to load and process pages from a PDF file.
14
+
15
+ This class will handle the downloading of a PDF file from a given URL if it does not already exist locally.
16
+ It uses PyPDF2 to read the PDF and pymupdf4llm to convert pages to markdown. The processed pages are stored in a list
17
+ of Page objects, which can be optionally published to a Weave dataset.
18
+
19
+ !!! example "Example Usage"
20
+ ```python
21
+ import asyncio
22
+
23
+ import weave
24
+
25
+ from medrag_multi_modal.document_loader.text_loader import (
26
+ PyMuPDF4LLMTextLoader
27
+ )
28
+
29
+ weave.init(project_name="ml-colabs/medrag-multi-modal")
30
+ url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
31
+ loader = PyMuPDF4LLMTextLoader(
32
+ url=url,
33
+ document_name="Gray's Anatomy",
34
+ document_file_path="grays_anatomy.pdf",
35
+ )
36
+ asyncio.run(
37
+ loader.load_data(
38
+ start_page=31,
39
+ end_page=36,
40
+ weave_dataset_name="grays-anatomy-text",
41
+ )
42
+ )
43
+ ```
44
+
45
+ Args:
46
+ url (str): The URL of the PDF file to download if not present locally.
47
+ document_name (str): The name of the document for metadata purposes.
48
+ document_file_path (str): The local file path where the PDF is stored or will be downloaded.
49
+ """
50
+
51
+ async def extract_page_data(self, page_idx: int, **kwargs) -> Dict[str, str]:
52
+ """
53
+ Process a single page of the PDF and convert it to markdown using `pymupdf4llm`.
54
+
55
+ Returns a dictionary with the processed page data.
56
+ The dictionary will have the following keys and values:
57
+
58
+ - "text": (str) the processed page data in markdown format.
59
+ - "page_idx": (int) the index of the page.
60
+ - "document_name": (str) the name of the document.
61
+ - "file_path": (str) the local file path where the PDF is stored.
62
+ - "file_url": (str) the URL of the PDF file.
63
+
64
+ Args:
65
+ page_idx (int): The index of the page to process.
66
+ **kwargs: Additional keyword arguments to be passed to `pymupdf4llm.to_markdown`.
67
+
68
+ Returns:
69
+ Dict[str, str]: A dictionary containing the processed page data.
70
+ """
71
+ text = pymupdf4llm.to_markdown(
72
+ doc=self.document_file_path, pages=[page_idx], show_progress=False, **kwargs
73
+ )
74
+ return {
75
+ "text": text,
76
+ "page_idx": page_idx,
77
+ "document_name": self.document_name,
78
+ "file_path": self.document_file_path,
79
+ "file_url": self.url,
80
+ }
medrag_multi_modal/document_loader/text_loader/pypdf2_text_loader.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict
2
+
3
+ import PyPDF2
4
+
5
+ from .base_text_loader import BaseTextLoader
6
+
7
+
8
class PyPDF2TextLoader(BaseTextLoader):
    """
    Text loader that reads a PDF with `PyPDF2` and returns the plain text of
    each page, optionally publishing the result to a Weave dataset.

    It is a concrete `BaseTextLoader`: the base class drives the loading of
    pages, and this class supplies the per-page extraction step.

    The PDF is downloaded from the given URL when it is not already present
    locally. Each page is read with PyPDF2 and its text extracted; the
    processed pages are collected into a list of Page objects, which can
    optionally be published to a Weave dataset.

    !!! example "Example Usage"
        ```python
        import asyncio

        import weave

        from medrag_multi_modal.document_loader.text_loader import PyPDF2TextLoader

        weave.init(project_name="ml-colabs/medrag-multi-modal")
        url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
        loader = PyPDF2TextLoader(
            url=url,
            document_name="Gray's Anatomy",
            document_file_path="grays_anatomy.pdf",
        )
        asyncio.run(
            loader.load_data(
                start_page=31,
                end_page=36,
                weave_dataset_name="grays-anatomy-text",
            )
        )
        ```

    Args:
        url (str): The URL of the PDF file to download if not present locally.
        document_name (str): The name of the document for metadata purposes.
        document_file_path (str): The local file path where the PDF is stored or will be downloaded.
    """

    async def extract_page_data(self, page_idx: int, **kwargs) -> Dict[str, str]:
        """
        Extract the text of one PDF page with PyPDF2 and wrap it together
        with the document metadata.

        The returned dictionary carries these entries:

        - "text": (str) the extracted text from the page.
        - "page_idx": (int) the index of the page.
        - "document_name": (str) the name of the document.
        - "file_path": (str) the local file path where the PDF is stored.
        - "file_url": (str) the URL of the PDF file.

        Args:
            page_idx (int): The index of the page to process.
            **kwargs: Extra keyword arguments forwarded unchanged to the
                PyPDF2 page's `extract_text` method.

        Returns:
            Dict[str, str]: The page text plus its metadata.
        """
        # Extraction happens while the file handle is still open, since the
        # reader pulls page content from the underlying stream.
        with open(self.document_file_path, "rb") as pdf_file:
            extracted_text = (
                PyPDF2.PdfReader(pdf_file).pages[page_idx].extract_text(**kwargs)
            )

        page_record = {
            "text": extracted_text,
            "page_idx": page_idx,
            "document_name": self.document_name,
            "file_path": self.document_file_path,
            "file_url": self.url,
        }
        return page_record
medrag_multi_modal/retrieval/multi_modal_retrieval.py CHANGED
@@ -1,23 +1,22 @@
1
  import os
2
  from typing import Any, Optional
3
 
 
4
  import weave
5
  from byaldi import RAGMultiModalModel
6
  from PIL import Image
7
 
8
- import wandb
9
-
10
  from ..utils import get_wandb_artifact
11
 
12
 
13
  class MultiModalRetriever(weave.Model):
14
  """
15
  MultiModalRetriever is a class that facilitates the retrieval of page images using ColPali.
16
-
17
  This class leverages the `byaldi.RAGMultiModalModel` to perform document retrieval tasks.
18
  It can be initialized with a pre-trained model or from a specified W&B artifact. The class
19
  also provides methods to index new data and to predict/retrieve documents based on a query.
20
-
21
  !!! example "Indexing Data"
22
  ```python
23
  import wandb
@@ -31,14 +30,14 @@ class MultiModalRetriever(weave.Model):
31
  index_name="grays-anatomy",
32
  )
33
  ```
34
-
35
  !!! example "Retrieving Documents"
36
  ```python
37
  import weave
38
 
39
  import wandb
40
  from medrag_multi_modal.retrieval import MultiModalRetriever
41
-
42
  weave.init(project_name="ml-colabs/medrag-multi-modal")
43
  retriever = MultiModalRetriever.from_artifact(
44
  index_artifact_name="ml-colabs/medrag-multi-modal/grays-anatomy:v0",
@@ -54,6 +53,7 @@ class MultiModalRetriever(weave.Model):
54
  Attributes:
55
  model_name (str): The name of the model to be used for retrieval.
56
  """
 
57
  model_name: str
58
  _docs_retrieval_model: Optional[RAGMultiModalModel] = None
59
  _metadata: Optional[dict] = None
 
1
  import os
2
  from typing import Any, Optional
3
 
4
+ import wandb
5
  import weave
6
  from byaldi import RAGMultiModalModel
7
  from PIL import Image
8
 
 
 
9
  from ..utils import get_wandb_artifact
10
 
11
 
12
  class MultiModalRetriever(weave.Model):
13
  """
14
  MultiModalRetriever is a class that facilitates the retrieval of page images using ColPali.
15
+
16
  This class leverages the `byaldi.RAGMultiModalModel` to perform document retrieval tasks.
17
  It can be initialized with a pre-trained model or from a specified W&B artifact. The class
18
  also provides methods to index new data and to predict/retrieve documents based on a query.
19
+
20
  !!! example "Indexing Data"
21
  ```python
22
  import wandb
 
30
  index_name="grays-anatomy",
31
  )
32
  ```
33
+
34
  !!! example "Retrieving Documents"
35
  ```python
36
  import weave
37
 
38
  import wandb
39
  from medrag_multi_modal.retrieval import MultiModalRetriever
40
+
41
  weave.init(project_name="ml-colabs/medrag-multi-modal")
42
  retriever = MultiModalRetriever.from_artifact(
43
  index_artifact_name="ml-colabs/medrag-multi-modal/grays-anatomy:v0",
 
53
  Attributes:
54
  model_name (str): The name of the model to be used for retrieval.
55
  """
56
+
57
  model_name: str
58
  _docs_retrieval_model: Optional[RAGMultiModalModel] = None
59
  _metadata: Optional[dict] = None
mkdocs.yml CHANGED
@@ -63,8 +63,12 @@ nav:
63
  - Installation: 'installation/install.md'
64
  - Development: 'installation/development.md'
65
  - Document Loader:
66
- - Text Loader: 'document_loader/load_text.md'
67
- - Text and Image Loader: 'document_loader/load_text_image.md'
 
 
 
 
68
  - Image Loader: 'document_loader/load_image.md'
69
  - Retrieval:
70
  - Multi-Modal Retrieval: 'retreival/multi_modal_retrieval.md'
 
63
  - Installation: 'installation/install.md'
64
  - Development: 'installation/development.md'
65
  - Document Loader:
66
+ - Text Loader:
67
+ - Base: 'document_loader/text_loader/base_text_loader.md'
68
+ - PyMuPDF4LLM: 'document_loader/text_loader/pymupdf4llm_text_loader.md'
69
+ - PyPDF2: 'document_loader/text_loader/pypdf2_text_loader.md'
70
+ - PDFPlumber: 'document_loader/text_loader/pdfplumber_text_loader.md'
71
+ - Marker: 'document_loader/text_loader/marker_text_loader.md'
72
  - Image Loader: 'document_loader/load_image.md'
73
  - Retrieval:
74
  - Multi-Modal Retrieval: 'retreival/multi_modal_retrieval.md'
pyproject.toml CHANGED
@@ -19,6 +19,7 @@ dependencies = [
19
  "isort>=5.13.2",
20
  "black>=24.10.0",
21
  "ruff>=0.6.9",
 
22
  "mkdocs>=1.6.1",
23
  "mkdocstrings>=0.26.1",
24
  "mkdocstrings-python>=1.11.1",
@@ -27,13 +28,17 @@ dependencies = [
27
  "mkdocs-glightbox>=0.4.0",
28
  "mkdocs-jupyter>=0.25.0",
29
  "jupyter>=1.1.1",
 
30
  ]
31
 
32
  [project.optional-dependencies]
33
  core = [
34
  "Byaldi>=0.0.5",
35
  "firerequests>=0.0.7",
 
36
  "pdf2image>=1.17.0",
 
 
37
  "python-dotenv>=1.0.1",
38
  "pymupdf4llm>=0.0.17",
39
  "torch>=2.4.1",
@@ -42,7 +47,6 @@ core = [
42
 
43
  dev = [
44
  "pytest>=8.3.3",
45
- "PyPDF2>=3.0.1",
46
  "isort>=5.13.2",
47
  "black>=24.10.0",
48
  "ruff>=0.6.9",
 
19
  "isort>=5.13.2",
20
  "black>=24.10.0",
21
  "ruff>=0.6.9",
22
+ "marker-pdf>=0.2.17",
23
  "mkdocs>=1.6.1",
24
  "mkdocstrings>=0.26.1",
25
  "mkdocstrings-python>=1.11.1",
 
28
  "mkdocs-glightbox>=0.4.0",
29
  "mkdocs-jupyter>=0.25.0",
30
  "jupyter>=1.1.1",
31
+ "pdfplumber>=0.11.4",
32
  ]
33
 
34
  [project.optional-dependencies]
35
  core = [
36
  "Byaldi>=0.0.5",
37
  "firerequests>=0.0.7",
38
+ "marker-pdf>=0.2.17",
39
  "pdf2image>=1.17.0",
40
+ "pdfplumber>=0.11.4",
41
+ "PyPDF2>=3.0.1",
42
  "python-dotenv>=1.0.1",
43
  "pymupdf4llm>=0.0.17",
44
  "torch>=2.4.1",
 
47
 
48
  dev = [
49
  "pytest>=8.3.3",
 
50
  "isort>=5.13.2",
51
  "black>=24.10.0",
52
  "ruff>=0.6.9",
uv.lock DELETED
The diff for this file is too large to render. See raw diff