Spaces:
Sleeping
Sleeping
Merge pull request #9 from soumik12345/feat/ensemble-of-text-loaders
Browse files- .gitignore +13 -5
- docs/document_loader/load_text.md +0 -3
- docs/document_loader/load_text_image.md +0 -3
- docs/document_loader/text_loader/base_text_loader.md +3 -0
- docs/document_loader/text_loader/marker_text_loader.md +23 -0
- docs/document_loader/text_loader/pdfplumber_text_loader.md +22 -0
- docs/document_loader/text_loader/pymupdf4llm_text_loader.md +23 -0
- docs/document_loader/text_loader/pypdf2_text_loader.md +23 -0
- medrag_multi_modal/document_loader/__init__.py +14 -2
- medrag_multi_modal/document_loader/load_image.py +3 -3
- medrag_multi_modal/document_loader/load_text_image.py +0 -137
- medrag_multi_modal/document_loader/text_loader/__init__.py +11 -0
- medrag_multi_modal/document_loader/{load_text.py → text_loader/base_text_loader.py} +62 -46
- medrag_multi_modal/document_loader/text_loader/marker_text_loader.py +92 -0
- medrag_multi_modal/document_loader/text_loader/pdfplumber_text_loader.py +81 -0
- medrag_multi_modal/document_loader/text_loader/pymupdf4llm_text_loader.py +80 -0
- medrag_multi_modal/document_loader/text_loader/pypdf2_text_loader.py +82 -0
- medrag_multi_modal/retrieval/multi_modal_retrieval.py +6 -6
- mkdocs.yml +6 -2
- pyproject.toml +5 -1
- uv.lock +0 -0
.gitignore
CHANGED
@@ -1,12 +1,20 @@
|
|
|
|
1 |
.venv/
|
2 |
.env
|
3 |
-
|
4 |
-
|
5 |
**pycache**
|
|
|
6 |
.ruff_cache/
|
7 |
-
|
8 |
-
|
|
|
9 |
images/
|
10 |
wandb/
|
|
|
|
|
|
|
11 |
.byaldi/
|
12 |
-
|
|
|
|
|
|
1 |
+
# Virtual environments and environment files
|
2 |
.venv/
|
3 |
.env
|
4 |
+
|
5 |
+
# Python-related
|
6 |
**pycache**
|
7 |
+
**egg-info/
|
8 |
.ruff_cache/
|
9 |
+
|
10 |
+
# Project-specific directories
|
11 |
+
artifacts/
|
12 |
images/
|
13 |
wandb/
|
14 |
+
|
15 |
+
# Temporary and generated files
|
16 |
+
**.pdf
|
17 |
.byaldi/
|
18 |
+
cursor_prompt.txt
|
19 |
+
test.py
|
20 |
+
uv.lock
|
docs/document_loader/load_text.md
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
## Load text from PDF files
|
2 |
-
|
3 |
-
::: medrag_multi_modal.document_loader.load_text
|
|
|
|
|
|
|
|
docs/document_loader/load_text_image.md
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
## Load text and images from PDF files
|
2 |
-
|
3 |
-
::: medrag_multi_modal.document_loader.load_text_image
|
|
|
|
|
|
|
|
docs/document_loader/text_loader/base_text_loader.md
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
## Load text from PDF files
|
2 |
+
|
3 |
+
::: medrag_multi_modal.document_loader.text_loader.base_text_loader
|
docs/document_loader/text_loader/marker_text_loader.md
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
## Load text from PDF files (using Marker)
|
2 |
+
|
3 |
+
??? note "Note"
|
4 |
+
**Underlying Library:** `marker-pdf`
|
5 |
+
|
6 |
+
Convert PDF to markdown quickly and accurately using a pipeline of deep learning models.
|
7 |
+
|
8 |
+
You can interact with the underlying library and fine-tune the outputs via `**kwargs`.
|
9 |
+
|
10 |
+
Use it in our library with:
|
11 |
+
```python
|
12 |
+
from medrag_multi_modal.document_loader.text_loader import MarkerTextLoader
|
13 |
+
```
|
14 |
+
|
15 |
+
For details and available `**kwargs`, please refer to the sources below.
|
16 |
+
|
17 |
+
**Sources:**
|
18 |
+
|
19 |
+
- [DataLab](https://www.datalab.to)
|
20 |
+
- [GitHub](https://github.com/VikParuchuri/marker)
|
21 |
+
- [PyPI](https://pypi.org/project/marker-pdf/)
|
22 |
+
|
23 |
+
::: medrag_multi_modal.document_loader.text_loader.marker_text_loader
|
docs/document_loader/text_loader/pdfplumber_text_loader.md
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
## Load text from PDF files (using PDFPlumber)
|
2 |
+
|
3 |
+
??? note "Note"
|
4 |
+
**Underlying Library:** `pdfplumber`
|
5 |
+
|
6 |
+
Plumb a PDF for detailed information about each char, rectangle, line, et cetera — and easily extract text and tables.
|
7 |
+
|
8 |
+
You can interact with the underlying library and fine-tune the outputs via `**kwargs`.
|
9 |
+
|
10 |
+
Use it in our library with:
|
11 |
+
```python
|
12 |
+
from medrag_multi_modal.document_loader.text_loader import PDFPlumberTextLoader
|
13 |
+
```
|
14 |
+
|
15 |
+
For details and available `**kwargs`, please refer to the sources below.
|
16 |
+
|
17 |
+
**Sources:**
|
18 |
+
|
19 |
+
- [GitHub](https://github.com/jsvine/pdfplumber)
|
20 |
+
- [PyPI](https://pypi.org/project/pdfplumber/)
|
21 |
+
|
22 |
+
::: medrag_multi_modal.document_loader.text_loader.pdfplumber_text_loader
|
docs/document_loader/text_loader/pymupdf4llm_text_loader.md
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
## Load text from PDF files (using PyMuPDF4LLM)
|
2 |
+
|
3 |
+
??? note "Note"
|
4 |
+
**Underlying Library:** `pymupdf4llm`
|
5 |
+
|
6 |
+
PyMuPDF is a high performance Python library for data extraction, analysis, conversion & manipulation of PDF (and other) documents.
|
7 |
+
|
8 |
+
You can interact with the underlying library and fine-tune the outputs via `**kwargs`.
|
9 |
+
|
10 |
+
Use it in our library with:
|
11 |
+
```python
|
12 |
+
from medrag_multi_modal.document_loader.text_loader import PyMuPDF4LLMTextLoader
|
13 |
+
```
|
14 |
+
|
15 |
+
For details and available `**kwargs`, please refer to the sources below.
|
16 |
+
|
17 |
+
**Sources:**
|
18 |
+
|
19 |
+
- [Docs](https://pymupdf.readthedocs.io/en/latest/pymupdf4llm/)
|
20 |
+
- [GitHub](https://github.com/pymupdf/PyMuPDF)
|
21 |
+
- [PyPI](https://pypi.org/project/pymupdf4llm/)
|
22 |
+
|
23 |
+
::: medrag_multi_modal.document_loader.text_loader.pymupdf4llm_text_loader
|
docs/document_loader/text_loader/pypdf2_text_loader.md
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
## Load text from PDF files (using PyPDF2)
|
2 |
+
|
3 |
+
??? note "Note"
|
4 |
+
**Underlying Library:** `pypdf2`
|
5 |
+
|
6 |
+
A pure-python PDF library capable of splitting, merging, cropping, and transforming the pages of PDF files.
|
7 |
+
|
8 |
+
You can interact with the underlying library and fine-tune the outputs via `**kwargs`.
|
9 |
+
|
10 |
+
Use it in our library with:
|
11 |
+
```python
|
12 |
+
from medrag_multi_modal.document_loader.text_loader import PyPDF2TextLoader
|
13 |
+
```
|
14 |
+
|
15 |
+
For details and available `**kwargs`, please refer to the sources below.
|
16 |
+
|
17 |
+
**Sources:**
|
18 |
+
|
19 |
+
- [Docs](https://pypdf2.readthedocs.io/en/3.x/)
|
20 |
+
- [GitHub](https://github.com/py-pdf/pypdf)
|
21 |
+
- [PyPI](https://pypi.org/project/PyPDF2/)
|
22 |
+
|
23 |
+
::: medrag_multi_modal.document_loader.text_loader.pypdf2_text_loader
|
medrag_multi_modal/document_loader/__init__.py
CHANGED
@@ -1,5 +1,17 @@
|
|
1 |
from .load_image import ImageLoader
|
2 |
-
from .load_text import TextLoader
|
3 |
from .load_text_image import TextImageLoader
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
|
5 |
-
__all__ = [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
from .load_image import ImageLoader
|
|
|
2 |
from .load_text_image import TextImageLoader
|
3 |
+
from .text_loader import (
|
4 |
+
MarkerTextLoader,
|
5 |
+
PDFPlumberTextLoader,
|
6 |
+
PyMuPDF4LLMTextLoader,
|
7 |
+
PyPDF2TextLoader,
|
8 |
+
)
|
9 |
|
10 |
+
__all__ = [
|
11 |
+
"PyMuPDF4LLMTextLoader",
|
12 |
+
"PyPDF2TextLoader",
|
13 |
+
"PDFPlumberTextLoader",
|
14 |
+
"MarkerTextLoader",
|
15 |
+
"ImageLoader",
|
16 |
+
"TextImageLoader",
|
17 |
+
]
|
medrag_multi_modal/document_loader/load_image.py
CHANGED
@@ -3,15 +3,15 @@ import os
|
|
3 |
from typing import Optional
|
4 |
|
5 |
import rich
|
|
|
6 |
import weave
|
7 |
from pdf2image.pdf2image import convert_from_path
|
8 |
from PIL import Image
|
9 |
|
10 |
-
import
|
11 |
-
from medrag_multi_modal.document_loader.load_text import TextLoader
|
12 |
|
13 |
|
14 |
-
class ImageLoader(
|
15 |
"""
|
16 |
`ImageLoader` is a class that extends the `TextLoader` class to handle the extraction and
|
17 |
loading of pages from a PDF file as images.
|
|
|
3 |
from typing import Optional
|
4 |
|
5 |
import rich
|
6 |
+
import wandb
|
7 |
import weave
|
8 |
from pdf2image.pdf2image import convert_from_path
|
9 |
from PIL import Image
|
10 |
|
11 |
+
from medrag_multi_modal.document_loader.text_loader import PyMuPDF4LLMTextLoader
|
|
|
12 |
|
13 |
|
14 |
+
class ImageLoader(PyMuPDF4LLMTextLoader):
|
15 |
"""
|
16 |
`ImageLoader` is a class that extends the `PyMuPDF4LLMTextLoader` class to handle the extraction and
|
17 |
loading of pages from a PDF file as images.
|
medrag_multi_modal/document_loader/load_text_image.py
DELETED
@@ -1,137 +0,0 @@
|
|
1 |
-
import asyncio
|
2 |
-
import os
|
3 |
-
from glob import glob
|
4 |
-
from typing import Optional
|
5 |
-
|
6 |
-
import pymupdf4llm
|
7 |
-
import rich
|
8 |
-
import weave
|
9 |
-
from PIL import Image
|
10 |
-
|
11 |
-
from medrag_multi_modal.document_loader.load_text import TextLoader
|
12 |
-
|
13 |
-
|
14 |
-
class TextImageLoader(TextLoader):
|
15 |
-
"""
|
16 |
-
A class for loading and processing text and images from a document.
|
17 |
-
|
18 |
-
The TextImageLoader class extends the TextLoader class to provide
|
19 |
-
functionality for extracting both text and images from a document
|
20 |
-
specified by a URL, document name, and file path. It processes the
|
21 |
-
document asynchronously, allowing for efficient handling of large
|
22 |
-
documents.
|
23 |
-
|
24 |
-
!!! example "Example Usage"
|
25 |
-
```python
|
26 |
-
import asyncio
|
27 |
-
|
28 |
-
import weave
|
29 |
-
|
30 |
-
from medrag_multi_modal.document_loader import TextImageLoader
|
31 |
-
|
32 |
-
weave.init(project_name="ml-colabs/medrag-multi-modal")
|
33 |
-
url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
|
34 |
-
loader = TextImageLoader(
|
35 |
-
url=url,
|
36 |
-
document_name="Gray's Anatomy",
|
37 |
-
document_file_path="grays_anatomy.pdf",
|
38 |
-
)
|
39 |
-
asyncio.run(
|
40 |
-
loader.load_data(
|
41 |
-
start_page=20,
|
42 |
-
end_page=25,
|
43 |
-
weave_dataset_name="grays-anatomy-text",
|
44 |
-
)
|
45 |
-
)
|
46 |
-
```
|
47 |
-
|
48 |
-
Args:
|
49 |
-
url (str): The URL of the document to be processed.
|
50 |
-
document_name (str): The name of the document.
|
51 |
-
document_file_path (str): The file path where the document is stored.
|
52 |
-
"""
|
53 |
-
|
54 |
-
def __init__(self, url: str, document_name: str, document_file_path: str):
|
55 |
-
super().__init__(url, document_name, document_file_path)
|
56 |
-
|
57 |
-
async def load_data(
|
58 |
-
self,
|
59 |
-
start_page: Optional[int] = None,
|
60 |
-
end_page: Optional[int] = None,
|
61 |
-
weave_dataset_name: Optional[str] = None,
|
62 |
-
image_path: Optional[str] = "./images",
|
63 |
-
dpi: int = 300,
|
64 |
-
):
|
65 |
-
"""
|
66 |
-
Asynchronously loads and processes text and images from a specified range of pages
|
67 |
-
in a document. This function extracts text in markdown format and images in PNG
|
68 |
-
format from the document, storing them in a list of dictionaries, each representing
|
69 |
-
a page. Optionally, the processed data can be published to a Weave dataset.
|
70 |
-
|
71 |
-
The function first determines the page indices to process using the
|
72 |
-
`get_page_indices` method. It then defines an asynchronous inner function,
|
73 |
-
`process_page`, which handles the extraction of text and images for a single page.
|
74 |
-
The text is extracted using the `pymupdf4llm.to_markdown` function, and images are
|
75 |
-
retrieved from the specified image path. The processed data is appended to the
|
76 |
-
`pages` list.
|
77 |
-
|
78 |
-
The function creates a list of tasks for processing each page asynchronously and
|
79 |
-
awaits their completion. If a `weave_dataset_name` is provided, the processed data
|
80 |
-
is published to a Weave dataset. Finally, the function returns the list of processed
|
81 |
-
pages.
|
82 |
-
|
83 |
-
Args:
|
84 |
-
start_page (Optional[int]): The starting page index for processing. If None,
|
85 |
-
defaults to the first page of the document.
|
86 |
-
end_page (Optional[int]): The ending page index for processing. If None,
|
87 |
-
defaults to the last page of the document.
|
88 |
-
weave_dataset_name (Optional[str]): The name of the Weave dataset to publish
|
89 |
-
the processed data to. If None, the data is not published.
|
90 |
-
image_path (Optional[str]): The directory path where extracted images are
|
91 |
-
stored. Defaults to "./images".
|
92 |
-
dpi (int): The resolution in dots per inch for image extraction. Defaults to 300.
|
93 |
-
|
94 |
-
Returns:
|
95 |
-
List[Dict]: A list of dictionaries, each containing the extracted text, page
|
96 |
-
index, document name, file path, file URL, and a list of images for each page
|
97 |
-
processed.
|
98 |
-
"""
|
99 |
-
start_page, end_page = self.get_page_indices(start_page, end_page)
|
100 |
-
pages = []
|
101 |
-
processed_pages_counter: int = 1
|
102 |
-
total_pages = end_page - start_page
|
103 |
-
|
104 |
-
async def process_page(page_idx):
|
105 |
-
nonlocal processed_pages_counter
|
106 |
-
text = pymupdf4llm.to_markdown(
|
107 |
-
doc=self.document_file_path,
|
108 |
-
pages=[page_idx],
|
109 |
-
show_progress=False,
|
110 |
-
write_images=True,
|
111 |
-
image_format="png",
|
112 |
-
dpi=dpi,
|
113 |
-
image_path=image_path,
|
114 |
-
)
|
115 |
-
image_paths = glob(
|
116 |
-
os.path.join(image_path, f"{self.document_file_path}-{page_idx}-*.png")
|
117 |
-
)
|
118 |
-
print(image_paths)
|
119 |
-
pages.append(
|
120 |
-
{
|
121 |
-
"text": text,
|
122 |
-
"page_idx": page_idx,
|
123 |
-
"document_name": self.document_name,
|
124 |
-
"file_path": self.document_file_path,
|
125 |
-
"file_url": self.url,
|
126 |
-
"images": [Image.open(image) for image in image_paths],
|
127 |
-
}
|
128 |
-
)
|
129 |
-
rich.print(f"Processed pages {processed_pages_counter}/{total_pages}")
|
130 |
-
processed_pages_counter += 1
|
131 |
-
|
132 |
-
tasks = [process_page(page_idx) for page_idx in range(start_page, end_page)]
|
133 |
-
for task in asyncio.as_completed(tasks):
|
134 |
-
await task
|
135 |
-
if weave_dataset_name:
|
136 |
-
weave.publish(weave.Dataset(name=weave_dataset_name, rows=pages))
|
137 |
-
return pages
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
medrag_multi_modal/document_loader/text_loader/__init__.py
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from .marker_text_loader import MarkerTextLoader
|
2 |
+
from .pdfplumber_text_loader import PDFPlumberTextLoader
|
3 |
+
from .pymupdf4llm_text_loader import PyMuPDF4LLMTextLoader
|
4 |
+
from .pypdf2_text_loader import PyPDF2TextLoader
|
5 |
+
|
6 |
+
__all__ = [
|
7 |
+
"PyMuPDF4LLMTextLoader",
|
8 |
+
"PyPDF2TextLoader",
|
9 |
+
"PDFPlumberTextLoader",
|
10 |
+
"MarkerTextLoader",
|
11 |
+
]
|
medrag_multi_modal/document_loader/{load_text.py → text_loader/base_text_loader.py}
RENAMED
@@ -1,41 +1,22 @@
|
|
1 |
import asyncio
|
2 |
import os
|
3 |
-
from
|
|
|
4 |
|
5 |
-
import pymupdf4llm
|
6 |
import PyPDF2
|
7 |
import rich
|
8 |
import weave
|
9 |
from firerequests import FireRequests
|
10 |
|
11 |
|
12 |
-
class
|
13 |
"""
|
14 |
-
|
15 |
|
16 |
This class handles the downloading of a PDF file from a given URL if it does not already exist locally.
|
17 |
-
|
18 |
-
of Page objects, which can be optionally published to a Weave dataset.
|
19 |
|
20 |
-
|
21 |
-
```python
|
22 |
-
import asyncio
|
23 |
-
|
24 |
-
import weave
|
25 |
-
|
26 |
-
from medrag_multi_modal.document_loader import TextLoader
|
27 |
-
|
28 |
-
weave.init(project_name="ml-colabs/medrag-multi-modal")
|
29 |
-
url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
|
30 |
-
loader = TextLoader(
|
31 |
-
url=url,
|
32 |
-
document_name="Gray's Anatomy",
|
33 |
-
document_file_path="grays_anatomy.pdf",
|
34 |
-
)
|
35 |
-
asyncio.run(
|
36 |
-
loader.load_data(start_page=9, end_page=15, weave_dataset_name="grays-anatomy-text")
|
37 |
-
)
|
38 |
-
```
|
39 |
|
40 |
Args:
|
41 |
url (str): The URL of the PDF file to download if not present locally.
|
@@ -55,7 +36,18 @@ class TextLoader:
|
|
55 |
|
56 |
def get_page_indices(
|
57 |
self, start_page: Optional[int] = None, end_page: Optional[int] = None
|
58 |
-
):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
59 |
if start_page:
|
60 |
if start_page > self.page_count:
|
61 |
raise ValueError(
|
@@ -72,30 +64,61 @@ class TextLoader:
|
|
72 |
end_page = self.page_count - 1
|
73 |
return start_page, end_page
|
74 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
async def load_data(
|
76 |
self,
|
77 |
start_page: Optional[int] = None,
|
78 |
end_page: Optional[int] = None,
|
79 |
weave_dataset_name: Optional[str] = None,
|
80 |
-
|
|
|
81 |
"""
|
82 |
-
Asynchronously loads text from a PDF file specified by a URL or local file path
|
83 |
-
processes the text into markdown format,
|
|
|
84 |
|
85 |
This function downloads a PDF from a given URL if it does not already exist locally,
|
86 |
reads the specified range of pages, converts each page's content to markdown, and
|
87 |
-
returns a list of Page objects containing the text and metadata.
|
88 |
-
|
89 |
-
`
|
90 |
-
|
|
|
|
|
|
|
|
|
91 |
|
92 |
Args:
|
93 |
start_page (Optional[int]): The starting page index (0-based) to process. Defaults to the first page.
|
94 |
end_page (Optional[int]): The ending page index (0-based) to process. Defaults to the last page.
|
95 |
weave_dataset_name (Optional[str]): The name of the Weave dataset to publish the pages to, if provided.
|
|
|
96 |
|
97 |
Returns:
|
98 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
99 |
|
100 |
Raises:
|
101 |
ValueError: If the specified start_page or end_page is out of bounds of the document's page count.
|
@@ -107,24 +130,17 @@ class TextLoader:
|
|
107 |
|
108 |
async def process_page(page_idx):
|
109 |
nonlocal processed_pages_counter
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
{
|
115 |
-
"text": text,
|
116 |
-
"page_idx": page_idx,
|
117 |
-
"document_name": self.document_name,
|
118 |
-
"file_path": self.document_file_path,
|
119 |
-
"file_url": self.url,
|
120 |
-
}
|
121 |
)
|
122 |
-
rich.print(f"Processed pages {processed_pages_counter}/{total_pages}")
|
123 |
processed_pages_counter += 1
|
124 |
|
125 |
tasks = [process_page(page_idx) for page_idx in range(start_page, end_page)]
|
126 |
for task in asyncio.as_completed(tasks):
|
127 |
await task
|
|
|
128 |
if weave_dataset_name:
|
129 |
weave.publish(weave.Dataset(name=weave_dataset_name, rows=pages))
|
130 |
return pages
|
|
|
1 |
import asyncio
|
2 |
import os
|
3 |
+
from abc import ABC, abstractmethod
|
4 |
+
from typing import Dict, List, Optional
|
5 |
|
|
|
6 |
import PyPDF2
|
7 |
import rich
|
8 |
import weave
|
9 |
from firerequests import FireRequests
|
10 |
|
11 |
|
12 |
+
class BaseTextLoader(ABC):
|
13 |
"""
|
14 |
+
An abstract base class for loading text from a PDF file, processing it into markdown, and optionally publishing it to a Weave dataset.
|
15 |
|
16 |
This class handles the downloading of a PDF file from a given URL if it does not already exist locally.
|
17 |
+
Subclasses should implement the specific PDF reading, text extraction, and markdown conversion methods.
|
|
|
18 |
|
19 |
+
The processed pages are finally stored in a list of Page objects, which can be optionally published to a Weave dataset.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
|
21 |
Args:
|
22 |
url (str): The URL of the PDF file to download if not present locally.
|
|
|
36 |
|
37 |
def get_page_indices(
|
38 |
self, start_page: Optional[int] = None, end_page: Optional[int] = None
|
39 |
+
) -> tuple[int, int]:
|
40 |
+
"""
|
41 |
+
Get the start and end page indices for processing.
|
42 |
+
|
43 |
+
Args:
|
44 |
+
start_page (Optional[int]): The starting page index (0-based) to process. Defaults to the first page.
|
45 |
+
end_page (Optional[int]): The ending page index (0-based) to process. Defaults to the last page.
|
46 |
+
|
47 |
+
Returns:
|
48 |
+
tuple[int, int]: A tuple containing the start and end page indices.
|
49 |
+
"""
|
50 |
+
|
51 |
if start_page:
|
52 |
if start_page > self.page_count:
|
53 |
raise ValueError(
|
|
|
64 |
end_page = self.page_count - 1
|
65 |
return start_page, end_page
|
66 |
|
67 |
+
@abstractmethod
|
68 |
+
async def extract_page_data(self, page_idx: int, **kwargs) -> Dict[str, str]:
|
69 |
+
"""
|
70 |
+
Abstract method to process a single page of the PDF and extract the text data.
|
71 |
+
|
72 |
+
Override this method in the subclass to provide the actual implementation and
|
73 |
+
processing logic for each page of the PDF using various PDF processing libraries.
|
74 |
+
|
75 |
+
Args:
|
76 |
+
page_idx (int): The index of the page to process.
|
77 |
+
**kwargs: Additional keyword arguments that may be used by underlying libraries.
|
78 |
+
|
79 |
+
Returns:
|
80 |
+
Dict[str, str]: A dictionary containing the processed page data.
|
81 |
+
"""
|
82 |
+
pass
|
83 |
+
|
84 |
async def load_data(
|
85 |
self,
|
86 |
start_page: Optional[int] = None,
|
87 |
end_page: Optional[int] = None,
|
88 |
weave_dataset_name: Optional[str] = None,
|
89 |
+
**kwargs,
|
90 |
+
) -> List[Dict[str, str]]:
|
91 |
"""
|
92 |
+
Asynchronously loads text from a PDF file specified by a URL or local file path.
|
93 |
+
The overridden abstract processing method then processes the text into markdown format,
|
94 |
+
and optionally publishes it to a Weave dataset.
|
95 |
|
96 |
This function downloads a PDF from a given URL if it does not already exist locally,
|
97 |
reads the specified range of pages, converts each page's content to markdown, and
|
98 |
+
returns a list of Page objects containing the text and metadata.
|
99 |
+
|
100 |
+
It uses `PyPDF2` to calculate the number of pages in the PDF and the
|
101 |
+
overridden `extract_page_data` method provides the actual implementation to process
|
102 |
+
each page, extract the text from the PDF, and convert it to markdown.
|
103 |
+
It processes pages concurrently using `asyncio` for efficiency.
|
104 |
+
|
105 |
+
If a weave_dataset_name is provided, the processed pages are published to a Weave dataset.
|
106 |
|
107 |
Args:
|
108 |
start_page (Optional[int]): The starting page index (0-based) to process. Defaults to the first page.
|
109 |
end_page (Optional[int]): The ending page index (0-based) to process. Defaults to the last page.
|
110 |
weave_dataset_name (Optional[str]): The name of the Weave dataset to publish the pages to, if provided.
|
111 |
+
**kwargs: Additional keyword arguments that will be passed to extract_page_data method and the underlying library.
|
112 |
|
113 |
Returns:
|
114 |
+
List[Dict[str, str]]: A list of dictionaries, each containing the text and metadata for a processed page.
|
115 |
+
Each dictionary will have the following keys and values:
|
116 |
+
|
117 |
+
- "text": (str) the processed page data in markdown format.
|
118 |
+
- "page_idx": (int) the index of the page.
|
119 |
+
- "document_name": (str) the name of the document.
|
120 |
+
- "file_path": (str) the local file path where the PDF is stored.
|
121 |
+
- "file_url": (str) the URL of the PDF file.
|
122 |
|
123 |
Raises:
|
124 |
ValueError: If the specified start_page or end_page is out of bounds of the document's page count.
|
|
|
130 |
|
131 |
async def process_page(page_idx):
|
132 |
nonlocal processed_pages_counter
|
133 |
+
page_data = await self.extract_page_data(page_idx, **kwargs)
|
134 |
+
pages.append(page_data)
|
135 |
+
rich.print(
|
136 |
+
f"Processed page idx: {page_idx}, progress: {processed_pages_counter}/{total_pages}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
137 |
)
|
|
|
138 |
processed_pages_counter += 1
|
139 |
|
140 |
tasks = [process_page(page_idx) for page_idx in range(start_page, end_page)]
|
141 |
for task in asyncio.as_completed(tasks):
|
142 |
await task
|
143 |
+
|
144 |
if weave_dataset_name:
|
145 |
weave.publish(weave.Dataset(name=weave_dataset_name, rows=pages))
|
146 |
return pages
|
medrag_multi_modal/document_loader/text_loader/marker_text_loader.py
ADDED
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Dict
|
2 |
+
|
3 |
+
from marker.convert import convert_single_pdf
|
4 |
+
from marker.models import load_all_models
|
5 |
+
|
6 |
+
from .base_text_loader import BaseTextLoader
|
7 |
+
|
8 |
+
|
9 |
+
class MarkerTextLoader(BaseTextLoader):
|
10 |
+
"""
|
11 |
+
A concrete implementation of the BaseTextLoader for loading text from a PDF file
|
12 |
+
using `marker-pdf`, processing it into a structured text format, and optionally publishing
|
13 |
+
it to a Weave dataset.
|
14 |
+
|
15 |
+
This class extends the BaseTextLoader and implements the abstract methods to
|
16 |
+
load and process pages from a PDF file using marker-pdf, which is a pipeline of deep learning models.
|
17 |
+
|
18 |
+
This class will handle the downloading of a PDF file from a given URL if it does not already exist locally.
|
19 |
+
It uses marker-pdf to read the PDF and extract structured text from each page. The processed pages are stored
|
20 |
+
in a list of Page objects, which can be optionally published to a Weave dataset.
|
21 |
+
|
22 |
+
!!! example "Example Usage"
|
23 |
+
```python
|
24 |
+
import asyncio
|
25 |
+
|
26 |
+
import weave
|
27 |
+
|
28 |
+
from medrag_multi_modal.document_loader.text_loader import MarkerTextLoader
|
29 |
+
|
30 |
+
weave.init(project_name="ml-colabs/medrag-multi-modal")
|
31 |
+
url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
|
32 |
+
loader = MarkerTextLoader(
|
33 |
+
url=url,
|
34 |
+
document_name="Gray's Anatomy",
|
35 |
+
document_file_path="grays_anatomy.pdf",
|
36 |
+
)
|
37 |
+
asyncio.run(
|
38 |
+
loader.load_data(
|
39 |
+
start_page=31,
|
40 |
+
end_page=36,
|
41 |
+
weave_dataset_name="grays-anatomy-text",
|
42 |
+
)
|
43 |
+
)
|
44 |
+
```
|
45 |
+
|
46 |
+
Args:
|
47 |
+
url (str): The URL of the PDF file to download if not present locally.
|
48 |
+
document_name (str): The name of the document for metadata purposes.
|
49 |
+
document_file_path (str): The local file path where the PDF is stored or will be downloaded.
|
50 |
+
"""
|
51 |
+
|
52 |
+
async def extract_page_data(self, page_idx: int, **kwargs) -> Dict[str, str]:
|
53 |
+
"""
|
54 |
+
Process a single page of the PDF and extract its structured text using marker-pdf.
|
55 |
+
|
56 |
+
Returns a dictionary with the processed page data.
|
57 |
+
The dictionary will have the following keys and values:
|
58 |
+
|
59 |
+
- "text": (str) the extracted structured text from the page.
|
60 |
+
- "page_idx": (int) the index of the page.
|
61 |
+
- "document_name": (str) the name of the document.
|
62 |
+
- "file_path": (str) the local file path where the PDF is stored.
|
63 |
+
- "file_url": (str) the URL of the PDF file.
|
64 |
+
- "meta": (dict) the metadata extracted from the page by marker-pdf.
|
65 |
+
|
66 |
+
Args:
|
67 |
+
page_idx (int): The index of the page to process.
|
68 |
+
**kwargs: Additional keyword arguments to be passed to `marker.convert.convert_single_pdf`.
|
69 |
+
|
70 |
+
Returns:
|
71 |
+
Dict[str, str]: A dictionary containing the processed page data.
|
72 |
+
"""
|
73 |
+
model_lst = load_all_models()
|
74 |
+
|
75 |
+
text, _, out_meta = convert_single_pdf(
|
76 |
+
self.document_file_path,
|
77 |
+
model_lst,
|
78 |
+
max_pages=1,
|
79 |
+
batch_multiplier=1,
|
80 |
+
start_page=page_idx,
|
81 |
+
ocr_all_pages=True,
|
82 |
+
**kwargs,
|
83 |
+
)
|
84 |
+
|
85 |
+
return {
|
86 |
+
"text": text,
|
87 |
+
"page_idx": page_idx,
|
88 |
+
"document_name": self.document_name,
|
89 |
+
"file_path": self.document_file_path,
|
90 |
+
"file_url": self.url,
|
91 |
+
"meta": out_meta,
|
92 |
+
}
|
medrag_multi_modal/document_loader/text_loader/pdfplumber_text_loader.py
ADDED
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Dict
|
2 |
+
|
3 |
+
import pdfplumber
|
4 |
+
|
5 |
+
from .base_text_loader import BaseTextLoader
|
6 |
+
|
7 |
+
|
8 |
+
class PDFPlumberTextLoader(BaseTextLoader):
    """
    Text loader that pulls plain text out of PDF pages with `pdfplumber`.

    Only the per-page extraction step is implemented here. Everything else the
    loader offers (fetching the PDF from `url` when it is not available locally,
    collecting the processed pages, and optionally publishing them to a Weave
    dataset) comes from `BaseTextLoader`.

    !!! example "Example Usage"
        ```python
        import asyncio

        import weave

        from medrag_multi_modal.document_loader.text_loader import PDFPlumberTextLoader

        weave.init(project_name="ml-colabs/medrag-multi-modal")
        url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
        loader = PDFPlumberTextLoader(
            url=url,
            document_name="Gray's Anatomy",
            document_file_path="grays_anatomy.pdf",
        )
        asyncio.run(
            loader.load_data(
                start_page=31,
                end_page=36,
                weave_dataset_name="grays-anatomy-text",
            )
        )
        ```

    Args:
        url (str): Where to download the PDF from when it is missing locally.
        document_name (str): Human-readable document name stored as metadata.
        document_file_path (str): Local path of the PDF (existing or to be downloaded).
    """

    async def extract_page_data(self, page_idx: int, **kwargs) -> Dict[str, str]:
        """
        Extract the text of one PDF page with `pdfplumber` and wrap it in a record.

        The PDF at `self.document_file_path` is opened for this call only and
        closed again before the record is built.

        The returned record has these entries:

        - "text": (str) the text extracted from the page.
        - "page_idx": (int) the index of the page.
        - "document_name": (str) the name of the document.
        - "file_path": (str) the local file path where the PDF is stored.
        - "file_url": (str) the URL of the PDF file.

        Args:
            page_idx (int): Index into `pdf.pages` of the page to process.
            **kwargs: Forwarded unchanged to `pdfplumber.Page.extract_text`.

        Returns:
            Dict[str, str]: The page text together with its identifying metadata.
        """
        with pdfplumber.open(self.document_file_path) as document:
            page_text = document.pages[page_idx].extract_text(**kwargs)

        page_record = dict(
            text=page_text,
            page_idx=page_idx,
            document_name=self.document_name,
            file_path=self.document_file_path,
            file_url=self.url,
        )
        return page_record
|
medrag_multi_modal/document_loader/text_loader/pymupdf4llm_text_loader.py
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Dict
|
2 |
+
|
3 |
+
import pymupdf4llm
|
4 |
+
|
5 |
+
from .base_text_loader import BaseTextLoader
|
6 |
+
|
7 |
+
|
8 |
+
class PyMuPDF4LLMTextLoader(BaseTextLoader):
    """
    Text loader that converts PDF pages to markdown with `pymupdf4llm`.

    Only the per-page conversion step is implemented here. Everything else the
    loader offers (fetching the PDF from `url` when it is not available locally,
    collecting the processed pages, and optionally publishing them to a Weave
    dataset) comes from `BaseTextLoader`.

    !!! example "Example Usage"
        ```python
        import asyncio

        import weave

        from medrag_multi_modal.document_loader.text_loader import (
            PyMuPDF4LLMTextLoader
        )

        weave.init(project_name="ml-colabs/medrag-multi-modal")
        url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
        loader = PyMuPDF4LLMTextLoader(
            url=url,
            document_name="Gray's Anatomy",
            document_file_path="grays_anatomy.pdf",
        )
        asyncio.run(
            loader.load_data(
                start_page=31,
                end_page=36,
                weave_dataset_name="grays-anatomy-text",
            )
        )
        ```

    Args:
        url (str): Where to download the PDF from when it is missing locally.
        document_name (str): Human-readable document name stored as metadata.
        document_file_path (str): Local path of the PDF (existing or to be downloaded).
    """

    async def extract_page_data(self, page_idx: int, **kwargs) -> Dict[str, str]:
        """
        Convert one PDF page to markdown with `pymupdf4llm` and wrap it in a record.

        The conversion is restricted to the requested page and runs with the
        progress display switched off.

        The returned record has these entries:

        - "text": (str) the page content in markdown format.
        - "page_idx": (int) the index of the page.
        - "document_name": (str) the name of the document.
        - "file_path": (str) the local file path where the PDF is stored.
        - "file_url": (str) the URL of the PDF file.

        Args:
            page_idx (int): Index of the page to convert.
            **kwargs: Forwarded unchanged to `pymupdf4llm.to_markdown`.

        Returns:
            Dict[str, str]: The page markdown together with its identifying metadata.
        """
        page_markdown = pymupdf4llm.to_markdown(
            doc=self.document_file_path,
            pages=[page_idx],
            show_progress=False,
            **kwargs,
        )
        page_record = dict(
            text=page_markdown,
            page_idx=page_idx,
            document_name=self.document_name,
            file_path=self.document_file_path,
            file_url=self.url,
        )
        return page_record
|
medrag_multi_modal/document_loader/text_loader/pypdf2_text_loader.py
ADDED
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Dict
|
2 |
+
|
3 |
+
import PyPDF2
|
4 |
+
|
5 |
+
from .base_text_loader import BaseTextLoader
|
6 |
+
|
7 |
+
|
8 |
+
class PyPDF2TextLoader(BaseTextLoader):
    """
    Text loader that pulls plain text out of PDF pages with `PyPDF2`.

    Only the per-page extraction step is implemented here. Everything else the
    loader offers (fetching the PDF from `url` when it is not available locally,
    collecting the processed pages, and optionally publishing them to a Weave
    dataset) comes from `BaseTextLoader`.

    !!! example "Example Usage"
        ```python
        import asyncio

        import weave

        from medrag_multi_modal.document_loader.text_loader import PyPDF2TextLoader

        weave.init(project_name="ml-colabs/medrag-multi-modal")
        url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
        loader = PyPDF2TextLoader(
            url=url,
            document_name="Gray's Anatomy",
            document_file_path="grays_anatomy.pdf",
        )
        asyncio.run(
            loader.load_data(
                start_page=31,
                end_page=36,
                weave_dataset_name="grays-anatomy-text",
            )
        )
        ```

    Args:
        url (str): Where to download the PDF from when it is missing locally.
        document_name (str): Human-readable document name stored as metadata.
        document_file_path (str): Local path of the PDF (existing or to be downloaded).
    """

    async def extract_page_data(self, page_idx: int, **kwargs) -> Dict[str, str]:
        """
        Extract the text of one PDF page with `PyPDF2` and wrap it in a record.

        The PDF at `self.document_file_path` is opened in binary mode for this
        call only and closed again before the record is built.

        The returned record has these entries:

        - "text": (str) the text extracted from the page.
        - "page_idx": (int) the index of the page.
        - "document_name": (str) the name of the document.
        - "file_path": (str) the local file path where the PDF is stored.
        - "file_url": (str) the URL of the PDF file.

        Args:
            page_idx (int): Index into `PdfReader.pages` of the page to process.
            **kwargs: Forwarded unchanged to the page object's `extract_text` method.

        Returns:
            Dict[str, str]: The page text together with its identifying metadata.
        """
        with open(self.document_file_path, "rb") as pdf_stream:
            reader = PyPDF2.PdfReader(pdf_stream)
            page_text = reader.pages[page_idx].extract_text(**kwargs)

        page_record = dict(
            text=page_text,
            page_idx=page_idx,
            document_name=self.document_name,
            file_path=self.document_file_path,
            file_url=self.url,
        )
        return page_record
|
medrag_multi_modal/retrieval/multi_modal_retrieval.py
CHANGED
@@ -1,23 +1,22 @@
|
|
1 |
import os
|
2 |
from typing import Any, Optional
|
3 |
|
|
|
4 |
import weave
|
5 |
from byaldi import RAGMultiModalModel
|
6 |
from PIL import Image
|
7 |
|
8 |
-
import wandb
|
9 |
-
|
10 |
from ..utils import get_wandb_artifact
|
11 |
|
12 |
|
13 |
class MultiModalRetriever(weave.Model):
|
14 |
"""
|
15 |
MultiModalRetriever is a class that facilitates the retrieval of page images using ColPali.
|
16 |
-
|
17 |
This class leverages the `byaldi.RAGMultiModalModel` to perform document retrieval tasks.
|
18 |
It can be initialized with a pre-trained model or from a specified W&B artifact. The class
|
19 |
also provides methods to index new data and to predict/retrieve documents based on a query.
|
20 |
-
|
21 |
!!! example "Indexing Data"
|
22 |
```python
|
23 |
import wandb
|
@@ -31,14 +30,14 @@ class MultiModalRetriever(weave.Model):
|
|
31 |
index_name="grays-anatomy",
|
32 |
)
|
33 |
```
|
34 |
-
|
35 |
!!! example "Retrieving Documents"
|
36 |
```python
|
37 |
import weave
|
38 |
|
39 |
import wandb
|
40 |
from medrag_multi_modal.retrieval import MultiModalRetriever
|
41 |
-
|
42 |
weave.init(project_name="ml-colabs/medrag-multi-modal")
|
43 |
retriever = MultiModalRetriever.from_artifact(
|
44 |
index_artifact_name="ml-colabs/medrag-multi-modal/grays-anatomy:v0",
|
@@ -54,6 +53,7 @@ class MultiModalRetriever(weave.Model):
|
|
54 |
Attributes:
|
55 |
model_name (str): The name of the model to be used for retrieval.
|
56 |
"""
|
|
|
57 |
model_name: str
|
58 |
_docs_retrieval_model: Optional[RAGMultiModalModel] = None
|
59 |
_metadata: Optional[dict] = None
|
|
|
1 |
import os
|
2 |
from typing import Any, Optional
|
3 |
|
4 |
+
import wandb
|
5 |
import weave
|
6 |
from byaldi import RAGMultiModalModel
|
7 |
from PIL import Image
|
8 |
|
|
|
|
|
9 |
from ..utils import get_wandb_artifact
|
10 |
|
11 |
|
12 |
class MultiModalRetriever(weave.Model):
|
13 |
"""
|
14 |
MultiModalRetriever is a class that facilitates the retrieval of page images using ColPali.
|
15 |
+
|
16 |
This class leverages the `byaldi.RAGMultiModalModel` to perform document retrieval tasks.
|
17 |
It can be initialized with a pre-trained model or from a specified W&B artifact. The class
|
18 |
also provides methods to index new data and to predict/retrieve documents based on a query.
|
19 |
+
|
20 |
!!! example "Indexing Data"
|
21 |
```python
|
22 |
import wandb
|
|
|
30 |
index_name="grays-anatomy",
|
31 |
)
|
32 |
```
|
33 |
+
|
34 |
!!! example "Retrieving Documents"
|
35 |
```python
|
36 |
import weave
|
37 |
|
38 |
import wandb
|
39 |
from medrag_multi_modal.retrieval import MultiModalRetriever
|
40 |
+
|
41 |
weave.init(project_name="ml-colabs/medrag-multi-modal")
|
42 |
retriever = MultiModalRetriever.from_artifact(
|
43 |
index_artifact_name="ml-colabs/medrag-multi-modal/grays-anatomy:v0",
|
|
|
53 |
Attributes:
|
54 |
model_name (str): The name of the model to be used for retrieval.
|
55 |
"""
|
56 |
+
|
57 |
model_name: str
|
58 |
_docs_retrieval_model: Optional[RAGMultiModalModel] = None
|
59 |
_metadata: Optional[dict] = None
|
mkdocs.yml
CHANGED
@@ -63,8 +63,12 @@ nav:
|
|
63 |
- Installation: 'installation/install.md'
|
64 |
- Development: 'installation/development.md'
|
65 |
- Document Loader:
|
66 |
-
- Text Loader:
|
67 |
-
|
|
|
|
|
|
|
|
|
68 |
- Image Loader: 'document_loader/load_image.md'
|
69 |
- Retrieval:
|
70 |
- Multi-Modal Retrieval: 'retreival/multi_modal_retrieval.md'
|
|
|
63 |
- Installation: 'installation/install.md'
|
64 |
- Development: 'installation/development.md'
|
65 |
- Document Loader:
|
66 |
+
- Text Loader:
|
67 |
+
- Base: 'document_loader/text_loader/base_text_loader.md'
|
68 |
+
- PyMuPDF4LLM: 'document_loader/text_loader/pymupdf4llm_text_loader.md'
|
69 |
+
- PyPDF2: 'document_loader/text_loader/pypdf2_text_loader.md'
|
70 |
+
- PDFPlumber: 'document_loader/text_loader/pdfplumber_text_loader.md'
|
71 |
+
- Marker: 'document_loader/text_loader/marker_text_loader.md'
|
72 |
- Image Loader: 'document_loader/load_image.md'
|
73 |
- Retrieval:
|
74 |
- Multi-Modal Retrieval: 'retreival/multi_modal_retrieval.md'
|
pyproject.toml
CHANGED
@@ -19,6 +19,7 @@ dependencies = [
|
|
19 |
"isort>=5.13.2",
|
20 |
"black>=24.10.0",
|
21 |
"ruff>=0.6.9",
|
|
|
22 |
"mkdocs>=1.6.1",
|
23 |
"mkdocstrings>=0.26.1",
|
24 |
"mkdocstrings-python>=1.11.1",
|
@@ -27,13 +28,17 @@ dependencies = [
|
|
27 |
"mkdocs-glightbox>=0.4.0",
|
28 |
"mkdocs-jupyter>=0.25.0",
|
29 |
"jupyter>=1.1.1",
|
|
|
30 |
]
|
31 |
|
32 |
[project.optional-dependencies]
|
33 |
core = [
|
34 |
"Byaldi>=0.0.5",
|
35 |
"firerequests>=0.0.7",
|
|
|
36 |
"pdf2image>=1.17.0",
|
|
|
|
|
37 |
"python-dotenv>=1.0.1",
|
38 |
"pymupdf4llm>=0.0.17",
|
39 |
"torch>=2.4.1",
|
@@ -42,7 +47,6 @@ core = [
|
|
42 |
|
43 |
dev = [
|
44 |
"pytest>=8.3.3",
|
45 |
-
"PyPDF2>=3.0.1",
|
46 |
"isort>=5.13.2",
|
47 |
"black>=24.10.0",
|
48 |
"ruff>=0.6.9",
|
|
|
19 |
"isort>=5.13.2",
|
20 |
"black>=24.10.0",
|
21 |
"ruff>=0.6.9",
|
22 |
+
"marker-pdf>=0.2.17",
|
23 |
"mkdocs>=1.6.1",
|
24 |
"mkdocstrings>=0.26.1",
|
25 |
"mkdocstrings-python>=1.11.1",
|
|
|
28 |
"mkdocs-glightbox>=0.4.0",
|
29 |
"mkdocs-jupyter>=0.25.0",
|
30 |
"jupyter>=1.1.1",
|
31 |
+
"pdfplumber>=0.11.4",
|
32 |
]
|
33 |
|
34 |
[project.optional-dependencies]
|
35 |
core = [
|
36 |
"Byaldi>=0.0.5",
|
37 |
"firerequests>=0.0.7",
|
38 |
+
"marker-pdf>=0.2.17",
|
39 |
"pdf2image>=1.17.0",
|
40 |
+
"pdfplumber>=0.11.4",
|
41 |
+
"PyPDF2>=3.0.1",
|
42 |
"python-dotenv>=1.0.1",
|
43 |
"pymupdf4llm>=0.0.17",
|
44 |
"torch>=2.4.1",
|
|
|
47 |
|
48 |
dev = [
|
49 |
"pytest>=8.3.3",
|
|
|
50 |
"isort>=5.13.2",
|
51 |
"black>=24.10.0",
|
52 |
"ruff>=0.6.9",
|
uv.lock
DELETED
The diff for this file is too large to render.
See raw diff
|
|