mratanusarkar committed on
Commit
331f289
·
1 Parent(s): f37090a

add: marker image loader + docs + corrections

Browse files
docs/document_loader/image_loader/marker_img_loader.md ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # Load images from PDF files (using Marker)
2
+
3
+ ::: medrag_multi_modal.document_loader.image_loader.marker_img_loader
4
+
docs/document_loader/image_loader/pdf2image_img_loader.md CHANGED
@@ -1,3 +1,3 @@
1
- # Load images from PDF files (using pdf2image)
2
 
3
  ::: medrag_multi_modal.document_loader.image_loader.pdf2image_img_loader
 
1
+ # Load images from PDF files (using PDF2Image)
2
 
3
  ::: medrag_multi_modal.document_loader.image_loader.pdf2image_img_loader
medrag_multi_modal/document_loader/__init__.py CHANGED
@@ -1,4 +1,4 @@
1
- from .image_loader import PDF2ImageLoader
2
  from .text_loader import (
3
  MarkerTextLoader,
4
  PDFPlumberTextLoader,
@@ -12,4 +12,5 @@ __all__ = [
12
  "PDFPlumberTextLoader",
13
  "MarkerTextLoader",
14
  "PDF2ImageLoader",
 
15
  ]
 
1
+ from .image_loader import MarkerImageLoader, PDF2ImageLoader
2
  from .text_loader import (
3
  MarkerTextLoader,
4
  PDFPlumberTextLoader,
 
12
  "PDFPlumberTextLoader",
13
  "MarkerTextLoader",
14
  "PDF2ImageLoader",
15
+ "MarkerImageLoader",
16
  ]
medrag_multi_modal/document_loader/image_loader/__init__.py CHANGED
@@ -1,4 +1,4 @@
1
- from .base_img_loader import BaseImageLoader
2
  from .pdf2image_img_loader import PDF2ImageLoader
3
 
4
- __all__ = ["PDF2ImageLoader", "BaseImageLoader"]
 
1
+ from .marker_img_loader import MarkerImageLoader
2
  from .pdf2image_img_loader import PDF2ImageLoader
3
 
4
+ __all__ = ["PDF2ImageLoader", "MarkerImageLoader"]
medrag_multi_modal/document_loader/image_loader/base_img_loader.py CHANGED
@@ -47,7 +47,7 @@ class BaseImageLoader(BaseTextLoader):
47
  """
48
  Asynchronously loads images from a PDF file specified by a URL or local file path.
49
  The overridden abstract processing method then processes the images,
50
- and optionally publishes it to a Weave artifact.
51
 
52
  This function downloads a PDF from a given URL if it does not already exist locally,
53
  reads the specified range of pages, scans each page's content to extract images, and
@@ -58,12 +58,12 @@ class BaseImageLoader(BaseTextLoader):
58
  each page, extract the image content from the PDF, and convert it to png format.
59
  It processes pages concurrently using `asyncio` for efficiency.
60
 
61
- If a wandb_artifact_name is provided, the processed pages are published to a Weave artifact.
62
 
63
  Args:
64
  start_page (Optional[int]): The starting page index (0-based) to process. Defaults to the first page.
65
  end_page (Optional[int]): The ending page index (0-based) to process. Defaults to the last page.
66
- wandb_artifact_name (Optional[str]): The name of the Weave artifact to publish the pages to, if provided.
67
  image_save_dir (str): The directory to save the extracted images.
68
  cleanup (bool): Whether to remove extracted images from `image_save_dir`, if uploading to wandb artifact.
69
  **kwargs: Additional keyword arguments that will be passed to extract_page_data method and the underlying library.
 
47
  """
48
  Asynchronously loads images from a PDF file specified by a URL or local file path.
49
  The overridden abstract processing method then processes the images,
50
+ and optionally publishes it to a WandB artifact.
51
 
52
  This function downloads a PDF from a given URL if it does not already exist locally,
53
  reads the specified range of pages, scans each page's content to extract images, and
 
58
  each page, extract the image content from the PDF, and convert it to png format.
59
  It processes pages concurrently using `asyncio` for efficiency.
60
 
61
+ If a wandb_artifact_name is provided, the processed pages are published to a WandB artifact.
62
 
63
  Args:
64
  start_page (Optional[int]): The starting page index (0-based) to process. Defaults to the first page.
65
  end_page (Optional[int]): The ending page index (0-based) to process. Defaults to the last page.
66
+ wandb_artifact_name (Optional[str]): The name of the WandB artifact to publish the pages to, if provided.
67
  image_save_dir (str): The directory to save the extracted images.
68
  cleanup (bool): Whether to remove extracted images from `image_save_dir`, if uploading to wandb artifact.
69
  **kwargs: Additional keyword arguments that will be passed to extract_page_data method and the underlying library.
medrag_multi_modal/document_loader/image_loader/marker_img_loader.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Any, Dict
3
+
4
+ from marker.convert import convert_single_pdf
5
+ from marker.models import load_all_models
6
+
7
+ from .base_img_loader import BaseImageLoader
8
+
9
+
10
class MarkerImageLoader(BaseImageLoader):
    """
    `MarkerImageLoader` is a class that extends the `BaseImageLoader` class to handle the extraction and
    loading of images from a PDF file using the marker library.

    This class provides functionality to extract the images found on each page of a PDF file
    using the marker library, and optionally publish these images to a WandB artifact.

    The marker models are loaded lazily on the first call to `extract_page_data` and are
    then reused for every subsequent page processed by the same loader instance.

    Args:
        url (str): The URL of the PDF document.
        document_name (str): The name of the document.
        document_file_path (str): The path to the PDF file.
    """

    def __init__(self, url: str, document_name: str, document_file_path: str):
        super().__init__(url, document_name, document_file_path)
        # Filled in by `_get_model_lst()` on first use so that constructing a
        # loader stays cheap and the models are loaded at most once per instance.
        self._model_lst = None

    def _get_model_lst(self):
        """Return the marker models, loading them on the first call only."""
        # There is no `await` between the check and the assignment, so concurrent
        # `extract_page_data` coroutines on one event loop cannot both trigger a load.
        if getattr(self, "_model_lst", None) is None:
            self._model_lst = load_all_models()
        return self._model_lst

    async def extract_page_data(
        self, page_idx: int, image_save_dir: str, **kwargs
    ) -> Dict[str, Any]:
        """
        Extracts the images of a single PDF page using the marker library and saves them as PNG files.

        Args:
            page_idx (int): The index of the page to process.
            image_save_dir (str): The directory to save the extracted images.
            **kwargs: Additional keyword arguments forwarded to marker's `convert_single_pdf`.

        Returns:
            Dict[str, Any]: A dictionary containing the processed page data.
            The dictionary will have the following keys and values:

            - "page_idx": (int) the index of the page.
            - "document_name": (str) the name of the document.
            - "file_path": (str) the local file path where the PDF is stored.
            - "file_url": (str) the URL of the PDF file.
            - "image_file_paths": (list of str) the local file paths where the images
              extracted from this page are stored; empty if marker returned no images.
            - "meta": the metadata object returned by `convert_single_pdf` for this page.
        """
        # NOTE(review): `convert_single_pdf` is a synchronous call, so it blocks the
        # event loop while a page is being converted -- confirm this is acceptable.
        _, images, out_meta = convert_single_pdf(
            self.document_file_path,
            self._get_model_lst(),
            max_pages=1,
            batch_multiplier=1,
            start_page=page_idx,
            ocr_all_pages=True,
            **kwargs,
        )

        image_file_paths = []
        # Only the image objects are needed; marker's own keys are replaced by a
        # deterministic `page<idx>_fig<n>.png` naming scheme.
        for img_idx, image in enumerate(images.values()):
            image_file_name = f"page{page_idx}_fig{img_idx}.png"
            image_file_path = os.path.join(image_save_dir, image_file_name)
            image.save(image_file_path, "png")
            image_file_paths.append(image_file_path)

        return {
            "page_idx": page_idx,
            "document_name": self.document_name,
            "file_path": self.document_file_path,
            "file_url": self.url,
            "image_file_paths": image_file_paths,
            "meta": out_meta,
        }
medrag_multi_modal/document_loader/image_loader/pdf2image_img_loader.py CHANGED
@@ -12,7 +12,7 @@ class PDF2ImageLoader(BaseImageLoader):
12
  loading of pages from a PDF file as images using the pdf2image library.
13
 
14
  This class provides functionality to convert specific pages of a PDF document into images
15
- and optionally publish these images to a Weave artifact.
16
  It is like a snapshot image version of each of the pages from the PDF.
17
 
18
  Args:
 
12
  loading of pages from a PDF file as images using the pdf2image library.
13
 
14
  This class provides functionality to convert specific pages of a PDF document into images
15
+ and optionally publish these images to a WandB artifact.
16
  It is like a snapshot image version of each of the pages from the PDF.
17
 
18
  Args:
mkdocs.yml CHANGED
@@ -72,6 +72,7 @@ nav:
72
  - Image Loader:
73
  - Base: 'document_loader/image_loader/base_img_loader.md'
74
  - PDF2Image: 'document_loader/image_loader/pdf2image_img_loader.md'
 
75
  - Chunking: 'chunking.md'
76
  - Retrieval:
77
  - Multi-Modal Retrieval: 'retreival/multi_modal_retrieval.md'
 
72
  - Image Loader:
73
  - Base: 'document_loader/image_loader/base_img_loader.md'
74
  - PDF2Image: 'document_loader/image_loader/pdf2image_img_loader.md'
75
+ - Marker: 'document_loader/image_loader/marker_img_loader.md'
76
  - Chunking: 'chunking.md'
77
  - Retrieval:
78
  - Multi-Modal Retrieval: 'retreival/multi_modal_retrieval.md'