mratanusarkar committed on
Commit
5406446
·
1 Parent(s): e19286a

temp: attempt - all format img extraction from pdf

Browse files
medrag_multi_modal/document_loader/image_loader/pymupdf_img_loader.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Any, Dict
3
+
4
+ import fitz
5
+ from PIL import Image
6
+ from io import BytesIO
7
+
8
+ from .base_img_loader import BaseImageLoader
9
+
10
+
11
class PyMuPDFImageLoader(BaseImageLoader):
    """
    `PyMuPDFImageLoader` is a class that extends the `BaseImageLoader` class to handle the
    extraction of the images embedded in the pages of a PDF file using the pymupdf library.

    For each requested page, every embedded image is written to disk. JBIG2 (`jb2`) and
    JPEG2000 (`jpx`) images are re-encoded to PNG and JPEG respectively; all other formats
    are written out unchanged with the extension reported by pymupdf.

    Publishing the extracted images to a WandB artifact is presumably handled by
    `BaseImageLoader.load_data` (see the `wandb_artifact_name` argument in the example
    below); it is not implemented in this class.

    !!! example "Example Usage"
        ```python
        import asyncio

        import weave

        import wandb
        from medrag_multi_modal.document_loader.image_loader import PyMuPDFImageLoader

        weave.init(project_name="ml-colabs/medrag-multi-modal")
        wandb.init(project="medrag-multi-modal", entity="ml-colabs")
        url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
        loader = PyMuPDFImageLoader(
            url=url,
            document_name="Gray's Anatomy",
            document_file_path="grays_anatomy.pdf",
        )
        asyncio.run(
            loader.load_data(
                start_page=32,
                end_page=37,
                wandb_artifact_name="grays-anatomy-images",
                cleanup=False,
            )
        )
        ```

    Args:
        url (str): The URL of the PDF document.
        document_name (str): The name of the document.
        document_file_path (str): The path to the PDF file.
    """

    # Source formats that are re-encoded instead of being written verbatim,
    # mapped to the file extension used for the converted output.
    _CONVERTED_EXTENSIONS = {"jb2": "png", "jpx": "jpg"}

    def __init__(self, url: str, document_name: str, document_file_path: str):
        super().__init__(url, document_name, document_file_path)

    @staticmethod
    def _convert_and_save(image_bytes: bytes, image_file_path: str) -> bool:
        """
        Re-encodes raw image bytes and writes them to `image_file_path`.

        pymupdf's `Pixmap` is tried first; if that fails, PIL is used as a fallback.
        The output format is chosen by both libraries from the file extension.

        Args:
            image_bytes (bytes): The raw image data extracted from the PDF.
            image_file_path (str): The destination path, including the target extension.

        Returns:
            bool: True if either method wrote the image, False if both failed.
        """
        try:
            fitz.Pixmap(image_bytes).save(image_file_path)
            return True
        except Exception as pixmap_error:
            # Best-effort extraction: report and fall through to the PIL fallback.
            print(f"Error processing image: {pixmap_error}")

        # Fallback to using PIL for image conversion
        try:
            with Image.open(BytesIO(image_bytes)) as img:
                # The JPEG writer cannot store alpha or palette modes, so drop
                # them before saving to a .jpg target.
                if image_file_path.endswith(".jpg") and img.mode in ("RGBA", "LA", "P"):
                    img = img.convert("RGB")
                img.save(image_file_path)
            return True
        except Exception as pil_error:
            print(f"Failed to process image with PIL: {pil_error}")
            return False

    async def extract_page_data(
        self, page_idx: int, image_save_dir: str, **kwargs
    ) -> Dict[str, Any]:
        """
        Extracts all images embedded in a single page of the PDF using pymupdf library.

        Images are saved as `page{page_idx}_fig{img_idx}.{ext}` inside `image_save_dir`.
        An image that cannot be converted by either pymupdf or PIL is skipped and is
        not listed in the result.

        Args:
            page_idx (int): The index of the page to process.
            image_save_dir (str): The directory to save the extracted images.
            **kwargs: Additional keyword arguments; accepted for interface
                compatibility and currently unused by this implementation.

        Returns:
            Dict[str, Any]: A dictionary containing the processed page data.
            The dictionary will have the following keys and values:

            - "page_idx": (int) the index of the page.
            - "document_name": (str) the name of the document.
            - "file_path": (str) the local file path where the PDF is stored.
            - "file_url": (str) the URL of the PDF file.
            - "image_file_paths": (list) the local file paths where the images are stored.
        """
        image_file_paths = []

        # Make sure the target directory exists before any image is written.
        os.makedirs(image_save_dir, exist_ok=True)

        # The context manager closes the document even if extraction raises.
        with fitz.open(self.document_file_path) as pdf_document:
            page = pdf_document[page_idx]

            for img_idx, image in enumerate(page.get_images(full=True)):
                xref = image[0]  # first entry of each image record is its xref
                base_image = pdf_document.extract_image(xref)
                image_bytes = base_image["image"]
                source_ext = base_image["ext"]

                # Converted formats get the extension of their re-encoded output.
                image_ext = self._CONVERTED_EXTENSIONS.get(source_ext, source_ext)
                image_file_name = f"page{page_idx}_fig{img_idx}.{image_ext}"
                image_file_path = os.path.join(image_save_dir, image_file_name)

                if source_ext in self._CONVERTED_EXTENSIONS:
                    # For JBIG2 and JPEG2000, we need to convert the image
                    if not self._convert_and_save(image_bytes, image_file_path):
                        continue  # Skip this image if both methods fail
                else:
                    with open(image_file_path, "wb") as image_file:
                        image_file.write(image_bytes)

                image_file_paths.append(image_file_path)

        return {
            "page_idx": page_idx,
            "document_name": self.document_name,
            "file_path": self.document_file_path,
            "file_url": self.url,
            "image_file_paths": image_file_paths,
        }