mratanusarkar committed on
Commit
4fd52cf
·
1 Parent(s): bf0f2e5

add: hacky impl of img extraction with pdfplumber

Browse files
medrag_multi_modal/document_loader/__init__.py CHANGED
@@ -1,4 +1,4 @@
1
- from .image_loader import MarkerImageLoader, PDF2ImageLoader
2
  from .text_loader import (
3
  MarkerTextLoader,
4
  PDFPlumberTextLoader,
@@ -13,4 +13,5 @@ __all__ = [
13
  "MarkerTextLoader",
14
  "PDF2ImageLoader",
15
  "MarkerImageLoader",
 
16
  ]
 
1
+ from .image_loader import MarkerImageLoader, PDF2ImageLoader, PDFPlumberImageLoader
2
  from .text_loader import (
3
  MarkerTextLoader,
4
  PDFPlumberTextLoader,
 
13
  "MarkerTextLoader",
14
  "PDF2ImageLoader",
15
  "MarkerImageLoader",
16
+ "PDFPlumberImageLoader",
17
  ]
medrag_multi_modal/document_loader/image_loader/__init__.py CHANGED
@@ -1,4 +1,5 @@
1
  from .marker_img_loader import MarkerImageLoader
2
  from .pdf2image_img_loader import PDF2ImageLoader
 
3
 
4
- __all__ = ["PDF2ImageLoader", "MarkerImageLoader"]
 
1
  from .marker_img_loader import MarkerImageLoader
2
  from .pdf2image_img_loader import PDF2ImageLoader
3
+ from .pdfplumber_img_loader import PDFPlumberImageLoader
4
 
5
+ __all__ = ["PDF2ImageLoader", "MarkerImageLoader", "PDFPlumberImageLoader"]
medrag_multi_modal/document_loader/image_loader/pdfplumber_img_loader.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Any, Dict
3
+
4
+ import pdfplumber
5
+
6
+ from .base_img_loader import BaseImageLoader
7
+
8
+
9
class PDFPlumberImageLoader(BaseImageLoader):
    """
    `PDFPlumberImageLoader` is a class that extends the `BaseImageLoader` class to extract the
    images embedded in the pages of a PDF file using the pdfplumber library.

    For every image that pdfplumber reports on a page, the image's bounding box is cropped out of
    the page, rendered at 300 DPI and written to disk as a PNG file. The example below passes
    `wandb_artifact_name` to `load_data`, which is inherited from `BaseImageLoader`.

    !!! example "Example Usage"
        ```python
        import asyncio

        import weave

        import wandb
        from medrag_multi_modal.document_loader.image_loader import PDFPlumberImageLoader

        weave.init(project_name="ml-colabs/medrag-multi-modal")
        wandb.init(project="medrag-multi-modal", entity="ml-colabs")
        url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
        loader = PDFPlumberImageLoader(
            url=url,
            document_name="Gray's Anatomy",
            document_file_path="grays_anatomy.pdf",
        )
        asyncio.run(
            loader.load_data(
                start_page=32,
                end_page=37,
                wandb_artifact_name="grays-anatomy-images",
                cleanup=False,
            )
        )
        ```

    Args:
        url (str): The URL of the PDF document.
        document_name (str): The name of the document.
        document_file_path (str): The path to the PDF file.
    """

    def __init__(self, url: str, document_name: str, document_file_path: str):
        super().__init__(url, document_name, document_file_path)

    async def extract_page_data(
        self, page_idx: int, image_save_dir: str, **kwargs
    ) -> Dict[str, Any]:
        """
        Extracts the images embedded in a single PDF page using the pdfplumber library.

        Each image region reported by pdfplumber is cropped from the page, rendered at
        300 DPI and saved as `page{page_idx}_fig{img_idx}.png` inside `image_save_dir`.
        Image bounding boxes are clipped to the page bounds before cropping, and regions
        that have no visible area on the page are skipped.

        Args:
            page_idx (int): The index of the page to process.
            image_save_dir (str): The directory to save the extracted images.
            **kwargs: Accepted for interface compatibility; currently not forwarded
                to pdfplumber.

        Returns:
            Dict[str, Any]: A dictionary containing the processed page data.
            The dictionary will have the following keys and values:

            - "page_idx": (int) the index of the page.
            - "document_name": (str) the name of the document.
            - "file_path": (str) the local file path where the PDF is stored.
            - "file_url": (str) the URL of the PDF file.
            - "image_file_paths": (List[str]) the local file paths of the images
                extracted from the page (empty if none were extracted).
        """
        image_file_paths = []

        with pdfplumber.open(self.document_file_path) as pdf:
            page = pdf.pages[page_idx]
            page_x0, page_top, page_x1, page_bottom = page.bbox

            for img_idx, image in enumerate(page.images):
                # pdfplumber rejects crop boxes that are not fully inside the page,
                # and embedded images may bleed past the page edge, so clip first.
                x0 = max(image["x0"], page_x0)
                top = max(image["top"], page_top)
                x1 = min(image["x1"], page_x1)
                bottom = min(image["bottom"], page_bottom)

                # Nothing of this image is visible on the page; a zero-area crop
                # would raise, so skip it instead of failing the whole page.
                if x1 <= x0 or bottom <= top:
                    continue

                extracted_image = page.crop((x0, top, x1, bottom)).to_image(
                    resolution=300
                )

                # img_idx is the image's position in page.images, so file names stay
                # stable even when an earlier image on the page is skipped.
                image_file_name = f"page{page_idx}_fig{img_idx}.png"
                image_file_path = os.path.join(image_save_dir, image_file_name)

                extracted_image.save(image_file_path, "png")
                image_file_paths.append(image_file_path)

        return {
            "page_idx": page_idx,
            "document_name": self.document_name,
            "file_path": self.document_file_path,
            "file_url": self.url,
            "image_file_paths": image_file_paths,
        }