mratanusarkar committed on
Commit
a24da3d
·
1 Parent(s): d822059

update: codebase addressing review comments

Browse files
docs/document_loader/load_text_image.md DELETED
@@ -1,3 +0,0 @@
1
- ## Load text and images from PDF files
2
-
3
- ::: medrag_multi_modal.document_loader.load_text_image
 
 
 
 
medrag_multi_modal/__init__.py CHANGED
@@ -1,19 +0,0 @@
1
- from .document_loader import (
2
- ImageLoader,
3
- MarkerTextLoader,
4
- PDFPlumberTextLoader,
5
- PyMuPDF4LLMTextLoader,
6
- PyPDF2TextLoader,
7
- TextImageLoader,
8
- )
9
- from .retrieval import MultiModalRetriever
10
-
11
- __all__ = [
12
- "PyMuPDF4LLMTextLoader",
13
- "PyPDF2TextLoader",
14
- "PDFPlumberTextLoader",
15
- "MarkerTextLoader",
16
- "ImageLoader",
17
- "TextImageLoader",
18
- "MultiModalRetriever",
19
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
medrag_multi_modal/document_loader/load_text_image.py DELETED
@@ -1,137 +0,0 @@
1
- import asyncio
2
- import os
3
- from glob import glob
4
- from typing import Optional
5
-
6
- import pymupdf4llm
7
- import rich
8
- import weave
9
- from PIL import Image
10
-
11
- from medrag_multi_modal.document_loader.text_loader import PyMuPDF4LLMTextLoader
12
-
13
-
14
- class TextImageLoader(PyMuPDF4LLMTextLoader):
15
- """
16
- A class for loading and processing text and images from a document.
17
-
18
- The TextImageLoader class extends the TextLoader class to provide
19
- functionality for extracting both text and images from a document
20
- specified by a URL, document name, and file path. It processes the
21
- document asynchronously, allowing for efficient handling of large
22
- documents.
23
-
24
- !!! example "Example Usage"
25
- ```python
26
- import asyncio
27
-
28
- import weave
29
-
30
- from medrag_multi_modal.document_loader import TextImageLoader
31
-
32
- weave.init(project_name="ml-colabs/medrag-multi-modal")
33
- url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
34
- loader = TextImageLoader(
35
- url=url,
36
- document_name="Gray's Anatomy",
37
- document_file_path="grays_anatomy.pdf",
38
- )
39
- asyncio.run(
40
- loader.load_data(
41
- start_page=20,
42
- end_page=25,
43
- weave_dataset_name="grays-anatomy-text",
44
- )
45
- )
46
- ```
47
-
48
- Args:
49
- url (str): The URL of the document to be processed.
50
- document_name (str): The name of the document.
51
- document_file_path (str): The file path where the document is stored.
52
- """
53
-
54
- def __init__(self, url: str, document_name: str, document_file_path: str):
55
- super().__init__(url, document_name, document_file_path)
56
-
57
- async def load_data(
58
- self,
59
- start_page: Optional[int] = None,
60
- end_page: Optional[int] = None,
61
- weave_dataset_name: Optional[str] = None,
62
- image_path: Optional[str] = "./images",
63
- dpi: int = 300,
64
- ):
65
- """
66
- Asynchronously loads and processes text and images from a specified range of pages
67
- in a document. This function extracts text in markdown format and images in PNG
68
- format from the document, storing them in a list of dictionaries, each representing
69
- a page. Optionally, the processed data can be published to a Weave dataset.
70
-
71
- The function first determines the page indices to process using the
72
- `get_page_indices` method. It then defines an asynchronous inner function,
73
- `process_page`, which handles the extraction of text and images for a single page.
74
- The text is extracted using the `pymupdf4llm.to_markdown` function, and images are
75
- retrieved from the specified image path. The processed data is appended to the
76
- `pages` list.
77
-
78
- The function creates a list of tasks for processing each page asynchronously and
79
- awaits their completion. If a `weave_dataset_name` is provided, the processed data
80
- is published to a Weave dataset. Finally, the function returns the list of processed
81
- pages.
82
-
83
- Args:
84
- start_page (Optional[int]): The starting page index for processing. If None,
85
- defaults to the first page of the document.
86
- end_page (Optional[int]): The ending page index for processing. If None,
87
- defaults to the last page of the document.
88
- weave_dataset_name (Optional[str]): The name of the Weave dataset to publish
89
- the processed data to. If None, the data is not published.
90
- image_path (Optional[str]): The directory path where extracted images are
91
- stored. Defaults to "./images".
92
- dpi (int): The resolution in dots per inch for image extraction. Defaults to 300.
93
-
94
- Returns:
95
- List[Dict]: A list of dictionaries, each containing the extracted text, page
96
- index, document name, file path, file URL, and a list of images for each page
97
- processed.
98
- """
99
- start_page, end_page = self.get_page_indices(start_page, end_page)
100
- pages = []
101
- processed_pages_counter: int = 1
102
- total_pages = end_page - start_page
103
-
104
- async def process_page(page_idx):
105
- nonlocal processed_pages_counter
106
- text = pymupdf4llm.to_markdown(
107
- doc=self.document_file_path,
108
- pages=[page_idx],
109
- show_progress=False,
110
- write_images=True,
111
- image_format="png",
112
- dpi=dpi,
113
- image_path=image_path,
114
- )
115
- image_paths = glob(
116
- os.path.join(image_path, f"{self.document_file_path}-{page_idx}-*.png")
117
- )
118
- print(image_paths)
119
- pages.append(
120
- {
121
- "text": text,
122
- "page_idx": page_idx,
123
- "document_name": self.document_name,
124
- "file_path": self.document_file_path,
125
- "file_url": self.url,
126
- "images": [Image.open(image) for image in image_paths],
127
- }
128
- )
129
- rich.print(f"Processed pages {processed_pages_counter}/{total_pages}")
130
- processed_pages_counter += 1
131
-
132
- tasks = [process_page(page_idx) for page_idx in range(start_page, end_page)]
133
- for task in asyncio.as_completed(tasks):
134
- await task
135
- if weave_dataset_name:
136
- weave.publish(weave.Dataset(name=weave_dataset_name, rows=pages))
137
- return pages
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
mkdocs.yml CHANGED
@@ -69,7 +69,6 @@ nav:
69
  - PyPDF2: 'document_loader/text_loader/pypdf2_text_loader.md'
70
  - PDFPlumber: 'document_loader/text_loader/pdfplumber_text_loader.md'
71
  - Marker: 'document_loader/text_loader/marker_text_loader.md'
72
- - Text and Image Loader: 'document_loader/load_text_image.md'
73
  - Image Loader: 'document_loader/load_image.md'
74
  - Retrieval:
75
  - Multi-Modal Retrieval: 'retreival/multi_modal_retrieval.md'
 
69
  - PyPDF2: 'document_loader/text_loader/pypdf2_text_loader.md'
70
  - PDFPlumber: 'document_loader/text_loader/pdfplumber_text_loader.md'
71
  - Marker: 'document_loader/text_loader/marker_text_loader.md'
 
72
  - Image Loader: 'document_loader/load_image.md'
73
  - Retrieval:
74
  - Multi-Modal Retrieval: 'retreival/multi_modal_retrieval.md'
uv.lock DELETED
The diff for this file is too large to render. See raw diff