Spaces:
Sleeping
Sleeping
Commit
·
331f289
1
Parent(s):
f37090a
add: marker image loader + docs + corrections
Browse files- docs/document_loader/image_loader/marker_img_loader.md +4 -0
- docs/document_loader/image_loader/pdf2image_img_loader.md +1 -1
- medrag_multi_modal/document_loader/__init__.py +2 -1
- medrag_multi_modal/document_loader/image_loader/__init__.py +2 -2
- medrag_multi_modal/document_loader/image_loader/base_img_loader.py +3 -3
- medrag_multi_modal/document_loader/image_loader/marker_img_loader.py +74 -0
- medrag_multi_modal/document_loader/image_loader/pdf2image_img_loader.py +1 -1
- mkdocs.yml +1 -0
docs/document_loader/image_loader/marker_img_loader.md
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Load images from PDF files (using Marker)
|
2 |
+
|
3 |
+
::: medrag_multi_modal.document_loader.image_loader.marker_img_loader
|
4 |
+
|
docs/document_loader/image_loader/pdf2image_img_loader.md
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
-
# Load images from PDF files (using
|
2 |
|
3 |
::: medrag_multi_modal.document_loader.image_loader.pdf2image_img_loader
|
|
|
1 |
+
# Load images from PDF files (using PDF2Image)
|
2 |
|
3 |
::: medrag_multi_modal.document_loader.image_loader.pdf2image_img_loader
|
medrag_multi_modal/document_loader/__init__.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
from .image_loader import PDF2ImageLoader
|
2 |
from .text_loader import (
|
3 |
MarkerTextLoader,
|
4 |
PDFPlumberTextLoader,
|
@@ -12,4 +12,5 @@ __all__ = [
|
|
12 |
"PDFPlumberTextLoader",
|
13 |
"MarkerTextLoader",
|
14 |
"PDF2ImageLoader",
|
|
|
15 |
]
|
|
|
1 |
+
from .image_loader import MarkerImageLoader, PDF2ImageLoader
|
2 |
from .text_loader import (
|
3 |
MarkerTextLoader,
|
4 |
PDFPlumberTextLoader,
|
|
|
12 |
"PDFPlumberTextLoader",
|
13 |
"MarkerTextLoader",
|
14 |
"PDF2ImageLoader",
|
15 |
+
"MarkerImageLoader",
|
16 |
]
|
medrag_multi_modal/document_loader/image_loader/__init__.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
from .
|
2 |
from .pdf2image_img_loader import PDF2ImageLoader
|
3 |
|
4 |
-
__all__ = ["PDF2ImageLoader", "
|
|
|
1 |
+
from .marker_img_loader import MarkerImageLoader
|
2 |
from .pdf2image_img_loader import PDF2ImageLoader
|
3 |
|
4 |
+
__all__ = ["PDF2ImageLoader", "MarkerImageLoader"]
|
medrag_multi_modal/document_loader/image_loader/base_img_loader.py
CHANGED
@@ -47,7 +47,7 @@ class BaseImageLoader(BaseTextLoader):
|
|
47 |
"""
|
48 |
Asynchronously loads images from a PDF file specified by a URL or local file path.
|
49 |
The overridden processing abstract method then processes the images,
|
50 |
-
and optionally publishes it to a
|
51 |
|
52 |
This function downloads a PDF from a given URL if it does not already exist locally,
|
53 |
reads the specified range of pages, scans each page's content to extract images, and
|
@@ -58,12 +58,12 @@ class BaseImageLoader(BaseTextLoader):
|
|
58 |
each page, extract the image content from the PDF, and convert it to png format.
|
59 |
It processes pages concurrently using `asyncio` for efficiency.
|
60 |
|
61 |
-
If a wandb_artifact_name is provided, the processed pages are published to a
|
62 |
|
63 |
Args:
|
64 |
start_page (Optional[int]): The starting page index (0-based) to process. Defaults to the first page.
|
65 |
end_page (Optional[int]): The ending page index (0-based) to process. Defaults to the last page.
|
66 |
-
wandb_artifact_name (Optional[str]): The name of the
|
67 |
image_save_dir (str): The directory to save the extracted images.
|
68 |
cleanup (bool): Whether to remove extracted images from `image_save_dir`, if uploading to wandb artifact.
|
69 |
**kwargs: Additional keyword arguments that will be passed to extract_page_data method and the underlying library.
|
|
|
47 |
"""
|
48 |
Asynchronously loads images from a PDF file specified by a URL or local file path.
|
49 |
The overridden processing abstract method then processes the images,
|
50 |
+
and optionally publishes it to a WandB artifact.
|
51 |
|
52 |
This function downloads a PDF from a given URL if it does not already exist locally,
|
53 |
reads the specified range of pages, scans each page's content to extract images, and
|
|
|
58 |
each page, extract the image content from the PDF, and convert it to png format.
|
59 |
It processes pages concurrently using `asyncio` for efficiency.
|
60 |
|
61 |
+
If a wandb_artifact_name is provided, the processed pages are published to a WandB artifact.
|
62 |
|
63 |
Args:
|
64 |
start_page (Optional[int]): The starting page index (0-based) to process. Defaults to the first page.
|
65 |
end_page (Optional[int]): The ending page index (0-based) to process. Defaults to the last page.
|
66 |
+
wandb_artifact_name (Optional[str]): The name of the WandB artifact to publish the pages to, if provided.
|
67 |
image_save_dir (str): The directory to save the extracted images.
|
68 |
cleanup (bool): Whether to remove extracted images from `image_save_dir`, if uploading to wandb artifact.
|
69 |
**kwargs: Additional keyword arguments that will be passed to extract_page_data method and the underlying library.
|
medrag_multi_modal/document_loader/image_loader/marker_img_loader.py
ADDED
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from typing import Any, Dict
|
3 |
+
|
4 |
+
from marker.convert import convert_single_pdf
|
5 |
+
from marker.models import load_all_models
|
6 |
+
|
7 |
+
from .base_img_loader import BaseImageLoader
|
8 |
+
|
9 |
+
|
10 |
+
class MarkerImageLoader(BaseImageLoader):
    """
    `MarkerImageLoader` is a class that extends the `BaseImageLoader` class to handle the extraction and
    loading of images from the pages of a PDF file using the marker library.

    This class provides functionality to extract images from a PDF file using marker library,
    and optionally publish these images to a WandB artifact.

    The marker models are loaded lazily on the first call to `extract_page_data` and are then
    reused for every subsequent page, instead of being reloaded once per page.

    Args:
        url (str): The URL of the PDF document.
        document_name (str): The name of the document.
        document_file_path (str): The path to the PDF file.
    """

    def __init__(self, url: str, document_name: str, document_file_path: str):
        super().__init__(url, document_name, document_file_path)
        # Populated on first use by `_get_models`; kept as None here so that
        # constructing a loader stays cheap.
        self._model_lst = None

    def _get_models(self):
        """Return the marker models, calling `load_all_models` only on first use."""
        # `getattr` keeps this safe even if a subclass skipped `__init__`.
        if getattr(self, "_model_lst", None) is None:
            self._model_lst = load_all_models()
        return self._model_lst

    async def extract_page_data(
        self, page_idx: int, image_save_dir: str, **kwargs
    ) -> Dict[str, Any]:
        """
        Extracts the images of a single PDF page using marker library and saves them as png files.

        `convert_single_pdf` is invoked for exactly one page (`max_pages=1`,
        `start_page=page_idx`). Every image it returns is written to `image_save_dir`
        as `page{page_idx}_fig{img_idx}.png`.

        Args:
            page_idx (int): The index of the page to process.
            image_save_dir (str): The directory to save the extracted images.
            **kwargs: Additional keyword arguments forwarded to `convert_single_pdf`.
                They must not repeat the arguments already set here (`max_pages`,
                `batch_multiplier`, `start_page`, `ocr_all_pages`), otherwise Python
                raises a `TypeError` for the duplicated keyword.

        Returns:
            Dict[str, Any]: A dictionary containing the processed page data.
            The dictionary will have the following keys and values:

            - "page_idx": (int) the index of the page.
            - "document_name": (str) the name of the document.
            - "file_path": (str) the local file path where the PDF is stored.
            - "file_url": (str) the URL of the PDF file.
            - "image_file_paths": (list of str) the local file paths where the
                extracted images are stored; empty if no images were returned.
            - "meta": the metadata object returned by `convert_single_pdf`.
        """
        _, images, out_meta = convert_single_pdf(
            self.document_file_path,
            self._get_models(),
            max_pages=1,
            batch_multiplier=1,
            start_page=page_idx,
            ocr_all_pages=True,
            **kwargs,
        )

        image_file_paths = []
        # Only the image objects are needed; marker's own keys are ignored and the
        # files are named by their position within the page instead.
        for img_idx, image in enumerate(images.values()):
            image_file_name = f"page{page_idx}_fig{img_idx}.png"
            image_file_path = os.path.join(image_save_dir, image_file_name)
            image.save(image_file_path, "png")
            image_file_paths.append(image_file_path)

        return {
            "page_idx": page_idx,
            "document_name": self.document_name,
            "file_path": self.document_file_path,
            "file_url": self.url,
            "image_file_paths": image_file_paths,
            "meta": out_meta,
        }
|
medrag_multi_modal/document_loader/image_loader/pdf2image_img_loader.py
CHANGED
@@ -12,7 +12,7 @@ class PDF2ImageLoader(BaseImageLoader):
|
|
12 |
loading of pages from a PDF file as images using the pdf2image library.
|
13 |
|
14 |
This class provides functionality to convert specific pages of a PDF document into images
|
15 |
-
and optionally publish these images to a
|
16 |
It is like a snapshot image version of each of the pages from the PDF.
|
17 |
|
18 |
Args:
|
|
|
12 |
loading of pages from a PDF file as images using the pdf2image library.
|
13 |
|
14 |
This class provides functionality to convert specific pages of a PDF document into images
|
15 |
+
and optionally publish these images to a WandB artifact.
|
16 |
It is like a snapshot image version of each of the pages from the PDF.
|
17 |
|
18 |
Args:
|
mkdocs.yml
CHANGED
@@ -72,6 +72,7 @@ nav:
|
|
72 |
- Image Loader:
|
73 |
- Base: 'document_loader/image_loader/base_img_loader.md'
|
74 |
- PDF2Image: 'document_loader/image_loader/pdf2image_img_loader.md'
|
|
|
75 |
- Chunking: 'chunking.md'
|
76 |
- Retrieval:
|
77 |
- Multi-Modal Retrieval: 'retreival/multi_modal_retrieval.md'
|
|
|
72 |
- Image Loader:
|
73 |
- Base: 'document_loader/image_loader/base_img_loader.md'
|
74 |
- PDF2Image: 'document_loader/image_loader/pdf2image_img_loader.md'
|
75 |
+
- Marker: 'document_loader/image_loader/marker_img_loader.md'
|
76 |
- Chunking: 'chunking.md'
|
77 |
- Retrieval:
|
78 |
- Multi-Modal Retrieval: 'retreival/multi_modal_retrieval.md'
|