geekyrakshit committed on
Commit
475fb67
·
1 Parent(s): 4344d0d

remove: load_image.py

Browse files
medrag_multi_modal/document_loader/load_image.py DELETED
@@ -1,131 +0,0 @@
1
- import asyncio
2
- import os
3
- from typing import Optional
4
-
5
- import rich
6
- import weave
7
- from pdf2image.pdf2image import convert_from_path
8
- from PIL import Image
9
-
10
- import wandb
11
- from medrag_multi_modal.document_loader.text_loader import PyMuPDF4LLMTextLoader
12
-
13
-
14
class ImageLoader(PyMuPDF4LLMTextLoader):
    """
    `ImageLoader` is a class that extends `PyMuPDF4LLMTextLoader` to handle the extraction
    and loading of pages from a PDF file as images.

    This class provides functionality to convert specific pages of a PDF document into images
    and optionally publish these images to a Weights & Biases artifact, with the per-page
    metadata published to a Weave dataset.

    !!! example "Example Usage"
        ```python
        import asyncio

        import wandb
        from dotenv import load_dotenv

        from medrag_multi_modal.document_loader import ImageLoader

        load_dotenv()
        wandb.init(project="medrag-multi-modal", entity="ml-colabs")
        url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
        loader = ImageLoader(
            url=url,
            document_name="Gray's Anatomy",
            document_file_path="grays_anatomy.pdf",
        )
        asyncio.run(
            loader.load_data(
                start_page=31,
                end_page=33,
                dataset_name="grays-anatomy-images",
            )
        )
        ```

    Args:
        url (str): The URL of the PDF document.
        document_name (str): The name of the document.
        document_file_path (str): The path to the PDF file.
    """

    def __init__(self, url: str, document_name: str, document_file_path: str):
        super().__init__(url, document_name, document_file_path)

    def extract_data_from_pdf_file(
        self, pdf_file: str, page_number: int
    ) -> Image.Image:
        """
        Render a single page of a PDF file as an image.

        Args:
            pdf_file (str): Path to the PDF file.
            page_number (int): 0-based index of the page to render.

        Returns:
            Image.Image: The rendered page.
        """
        # `page_number` is 0-based; `page_number + 1` is what gets passed to
        # `convert_from_path` as both the first and the last page, so exactly
        # one image is produced.
        image = convert_from_path(
            pdf_file, first_page=page_number + 1, last_page=page_number + 1
        )[0]
        return image

    async def load_data(
        self,
        start_page: Optional[int] = None,
        end_page: Optional[int] = None,
        image_save_dir: str = "./images",
        dataset_name: Optional[str] = None,
    ):
        """
        Asynchronously converts a range of pages of the PDF at `self.document_file_path`
        to images, saves them to disk, and optionally publishes them.

        Each page in the resolved range is rendered to an image using the `pdf2image`
        library and saved as `<page_idx>.png` inside `image_save_dir`. A metadata
        dictionary is collected for every processed page. If a `dataset_name` is provided,
        the contents of `image_save_dir` are published as a Weights & Biases artifact and
        the metadata is published to a Weave dataset with the specified name.

        Note that the page conversion itself is a blocking call without any await point,
        so pages are effectively rendered one after another inside the event loop.

        Args:
            start_page (Optional[int]): The starting page index (0-based) to process.
            end_page (Optional[int]): The ending page index (0-based); after being resolved
                by `get_page_indices`, pages in `range(start_page, end_page)` are processed.
            image_save_dir (str): Directory in which the page images are saved. It is
                created if it does not exist. Defaults to "./images".
            dataset_name (Optional[str]): The name of the Weights & Biases artifact and the
                Weave dataset to publish to. Nothing is published if it is None or empty.
                Defaults to None.

        Returns:
            list[dict]: A list of metadata dictionaries, ordered by page index, each with
                the keys `page_idx`, `document_name`, `file_path` and `file_url`.

        Raises:
            ValueError: If the specified start_page or end_page is out of bounds of the
                document's page count (presumably raised by `get_page_indices`, which is
                defined on the parent class).
        """
        os.makedirs(image_save_dir, exist_ok=True)
        start_page, end_page = self.get_page_indices(start_page, end_page)
        processed_pages_counter: int = 1
        total_pages = end_page - start_page

        async def process_page(page_idx):
            nonlocal processed_pages_counter
            # Reuse the single-page helper instead of duplicating the
            # `convert_from_path` call.
            image = self.extract_data_from_pdf_file(self.document_file_path, page_idx)
            image.save(os.path.join(image_save_dir, f"{page_idx}.png"))
            rich.print(f"Processed pages {processed_pages_counter}/{total_pages}")
            processed_pages_counter += 1
            return {
                "page_idx": page_idx,
                "document_name": self.document_name,
                "file_path": self.document_file_path,
                "file_url": self.url,
            }

        # `asyncio.gather` returns results in the order of its arguments, so the
        # metadata rows are always sorted by page index regardless of the order
        # in which the event loop happens to schedule the coroutines.
        pages = list(
            await asyncio.gather(
                *(process_page(page_idx) for page_idx in range(start_page, end_page))
            )
        )

        if dataset_name:
            artifact = wandb.Artifact(name=dataset_name, type="dataset")
            artifact.add_dir(local_path=image_save_dir)
            artifact.save()
            weave.publish(weave.Dataset(name=dataset_name, rows=pages))
        return pages