geekyrakshit committed on
Commit
d889dc6
·
unverified ·
2 Parent(s): 59ae053 7b862ff

Merge pull request #2 from soumik12345/feat/text-loading

Browse files
.gitignore CHANGED
@@ -5,4 +5,5 @@ cursor_prompt.txt
5
  **pycache**
6
  .ruff_cache/
7
  test.py
8
- **.pdf
 
 
5
  **pycache**
6
  .ruff_cache/
7
  test.py
8
+ **.pdf
9
+ images/
docs/document_loader/load_text.md CHANGED
@@ -1,3 +1,3 @@
1
  ## Load text from PDF files
2
 
3
- ::: medrag_multi_modal.document_loader
 
1
  ## Load text from PDF files
2
 
3
+ ::: medrag_multi_modal.document_loader.load_text
docs/document_loader/load_text_image.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ ## Load text and images from PDF files
2
+
3
+ ::: medrag_multi_modal.document_loader.load_text_image
medrag_multi_modal/document_loader/__init__.py CHANGED
@@ -1,3 +1,4 @@
1
- from .load_text import load_text_from_pdf
 
2
 
3
- __all__ = ["load_text_from_pdf"]
 
1
+ from .load_text import TextLoader
2
+ from .load_text_image import TextImageLoader
3
 
4
+ __all__ = ["TextLoader", "TextImageLoader"]
medrag_multi_modal/document_loader/load_text.py CHANGED
@@ -7,54 +7,33 @@ import PyPDF2
7
  import rich
8
  import weave
9
  from firerequests import FireRequests
10
- from pydantic import BaseModel
11
 
12
 
13
- class Page(BaseModel):
14
- text: str
15
- page_idx: int
16
- document_name: str
17
- file_path: str
18
- file_url: str
19
-
20
-
21
- async def load_text_from_pdf(
22
- url: str,
23
- document_name: str,
24
- document_file_path: str,
25
- start_page: Optional[int] = None,
26
- end_page: Optional[int] = None,
27
- weave_dataset_name: Optional[str] = None,
28
- ) -> list[Page]:
29
  """
30
- Asynchronously loads text from a PDF file specified by a URL or local file path,
31
- processes the text into markdown format, and optionally publishes it to a Weave dataset.
32
 
33
- This function downloads a PDF from a given URL if it does not already exist locally,
34
- reads the specified range of pages, converts each page's content to markdown, and
35
- returns a list of Page objects containing the text and metadata. It uses PyPDF2 to read
36
- the PDF and pymupdf4llm to convert pages to markdown. It processes pages concurrently using
37
- `asyncio` for efficiency. If a weave_dataset_name is provided, the processed pages are published
38
- to a Weave dataset.
39
 
40
- !!! example "Example usage"
41
  ```python
42
  import asyncio
43
 
44
  import weave
45
 
46
- from medrag_multi_modal.document_loader import load_text_from_pdf
47
 
48
  weave.init(project_name="ml-colabs/medrag-multi-modal")
49
  url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
 
 
 
 
 
50
  asyncio.run(
51
- load_text_from_pdf(
52
- url=url,
53
- document_name="Gray's Anatomy",
54
- start_page=9,
55
- end_page=15,
56
- document_file_path="grays_anatomy.pdf",
57
- )
58
  )
59
  ```
60
 
@@ -62,61 +41,90 @@ async def load_text_from_pdf(
62
  url (str): The URL of the PDF file to download if not present locally.
63
  document_name (str): The name of the document for metadata purposes.
64
  document_file_path (str): The local file path where the PDF is stored or will be downloaded.
65
- start_page (Optional[int]): The starting page index (0-based) to process. Defaults to the first page.
66
- end_page (Optional[int]): The ending page index (0-based) to process. Defaults to the last page.
67
- weave_dataset_name (Optional[str]): The name of the Weave dataset to publish the pages to, if provided.
68
-
69
- Returns:
70
- list[Page]: A list of Page objects, each containing the text and metadata for a processed page.
71
-
72
- Raises:
73
- ValueError: If the specified start_page or end_page is out of bounds of the document's page count.
74
  """
75
- if not os.path.exists(document_file_path):
76
- FireRequests().download(url, filename=document_file_path)
77
- with open(document_file_path, "rb") as file:
78
- pdf_reader = PyPDF2.PdfReader(file)
79
- page_count = len(pdf_reader.pages)
80
- print(f"Page count: {page_count}")
81
- if start_page:
82
- if start_page > page_count:
83
- raise ValueError(
84
- f"Start page {start_page} is greater than the total page count {page_count}"
85
- )
86
- else:
87
- start_page = 0
88
- if end_page:
89
- if end_page > page_count:
90
- raise ValueError(
91
- f"End page {end_page} is greater than the total page count {page_count}"
92
- )
93
- else:
94
- end_page = page_count - 1
95
-
96
- pages: list[Page] = []
97
- processed_pages_counter: int = 1
98
- total_pages = end_page - start_page
99
 
100
- async def process_page(page_idx):
101
- nonlocal processed_pages_counter
102
- text = pymupdf4llm.to_markdown(
103
- doc=document_file_path, pages=[page_idx], show_progress=False
104
- )
105
- pages.append(
106
- Page(
107
- text=text,
108
- page_idx=page_idx,
109
- document_name=document_name,
110
- file_path=document_file_path,
111
- file_url=url,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  )
113
- )
114
- rich.print(f"Processed pages {processed_pages_counter}/{total_pages}")
115
- processed_pages_counter += 1
116
-
117
- tasks = [process_page(page_idx) for page_idx in range(start_page, end_page)]
118
- for task in asyncio.as_completed(tasks):
119
- await task
120
- if weave_dataset_name:
121
- weave.publish(weave.Dataset(name=weave_dataset_name, rows=pages))
122
- return pages
 
 
 
 
 
 
 
 
 
7
  import rich
8
  import weave
9
  from firerequests import FireRequests
 
10
 
11
 
12
+ class TextLoader:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  """
14
+ A class for loading text from a PDF file, processing it into markdown, and optionally publishing it to a Weave dataset.
 
15
 
16
+ This class handles the downloading of a PDF file from a given URL if it does not already exist locally.
17
+ It uses PyPDF2 to read the PDF and pymupdf4llm to convert pages to markdown. The processed pages are stored in a list
18
+ of dictionaries (one per page), which can be optionally published to a Weave dataset.
 
 
 
19
 
20
+ !!! example "Example Usage"
21
  ```python
22
  import asyncio
23
 
24
  import weave
25
 
26
+ from medrag_multi_modal.document_loader import TextLoader
27
 
28
  weave.init(project_name="ml-colabs/medrag-multi-modal")
29
  url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
30
+ loader = TextLoader(
31
+ url=url,
32
+ document_name="Gray's Anatomy",
33
+ document_file_path="grays_anatomy.pdf",
34
+ )
35
  asyncio.run(
36
+ loader.load_data(start_page=9, end_page=15, weave_dataset_name="grays-anatomy-text")
 
 
 
 
 
 
37
  )
38
  ```
39
 
 
41
  url (str): The URL of the PDF file to download if not present locally.
42
  document_name (str): The name of the document for metadata purposes.
43
  document_file_path (str): The local file path where the PDF is stored or will be downloaded.
 
 
 
 
 
 
 
 
 
44
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
+ def __init__(self, url: str, document_name: str, document_file_path: str):
47
+ self.url = url
48
+ self.document_name = document_name
49
+ self.document_file_path = document_file_path
50
+ if not os.path.exists(self.document_file_path):
51
+ FireRequests().download(url, filename=self.document_file_path)
52
+ with open(self.document_file_path, "rb") as file:
53
+ pdf_reader = PyPDF2.PdfReader(file)
54
+ self.page_count = len(pdf_reader.pages)
55
+
56
+ def get_page_indices(
57
+ self, start_page: Optional[int] = None, end_page: Optional[int] = None
58
+ ):
59
+ if start_page:
60
+ if start_page > self.page_count:
61
+ raise ValueError(
62
+ f"Start page {start_page} is greater than the total page count {self.page_count}"
63
+ )
64
+ else:
65
+ start_page = 0
66
+ if end_page:
67
+ if end_page > self.page_count:
68
+ raise ValueError(
69
+ f"End page {end_page} is greater than the total page count {self.page_count}"
70
+ )
71
+ else:
72
+ end_page = self.page_count - 1
73
+ return start_page, end_page
74
+
75
+ async def load_data(
76
+ self,
77
+ start_page: Optional[int] = None,
78
+ end_page: Optional[int] = None,
79
+ weave_dataset_name: Optional[str] = None,
80
+ ):
81
+ """
82
+ Asynchronously loads text from a PDF file specified by a URL or local file path,
83
+ processes the text into markdown format, and optionally publishes it to a Weave dataset.
84
+
85
+ The PDF is downloaded by the constructor if it does not already exist locally; this method
86
+ reads the specified range of pages, converts each page's content to markdown, and
87
+ returns a list of dictionaries containing the text and metadata. It uses PyPDF2 to read
88
+ the PDF and pymupdf4llm to convert pages to markdown. It processes pages concurrently using
89
+ `asyncio` for efficiency. If a weave_dataset_name is provided, the processed pages are published
90
+ to a Weave dataset.
91
+
92
+ Args:
93
+ start_page (Optional[int]): The starting page index (0-based) to process. Defaults to the first page.
94
+ end_page (Optional[int]): The ending page index (0-based, exclusive) to process. Defaults to `page_count - 1`.
95
+ weave_dataset_name (Optional[str]): The name of the Weave dataset to publish the pages to, if provided.
96
+
97
+ Returns:
98
+ list[dict]: A list of dictionaries, each containing the text and metadata for a processed page.
99
+
100
+ Raises:
101
+ ValueError: If the specified start_page or end_page is out of bounds of the document's page count.
102
+ """
103
+ start_page, end_page = self.get_page_indices(start_page, end_page)
104
+ pages = []
105
+ processed_pages_counter: int = 1
106
+ total_pages = end_page - start_page
107
+
108
+ async def process_page(page_idx):
109
+ nonlocal processed_pages_counter
110
+ text = pymupdf4llm.to_markdown(
111
+ doc=self.document_file_path, pages=[page_idx], show_progress=False
112
  )
113
+ pages.append(
114
+ {
115
+ "text": text,
116
+ "page_idx": page_idx,
117
+ "document_name": self.document_name,
118
+ "file_path": self.document_file_path,
119
+ "file_url": self.url,
120
+ }
121
+ )
122
+ rich.print(f"Processed pages {processed_pages_counter}/{total_pages}")
123
+ processed_pages_counter += 1
124
+
125
+ tasks = [process_page(page_idx) for page_idx in range(start_page, end_page)]
126
+ for task in asyncio.as_completed(tasks):
127
+ await task
128
+ if weave_dataset_name:
129
+ weave.publish(weave.Dataset(name=weave_dataset_name, rows=pages))
130
+ return pages
medrag_multi_modal/document_loader/load_text_image.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import os
3
+ from glob import glob
4
+ from typing import Optional
5
+
6
+ import pymupdf4llm
7
+ import rich
8
+ import weave
9
+ from PIL import Image
10
+
11
+ from medrag_multi_modal.document_loader.load_text import TextLoader
12
+
13
+
14
+ class TextImageLoader(TextLoader):
15
+ """
16
+ A class for loading and processing text and images from a document.
17
+
18
+ The TextImageLoader class extends the TextLoader class to provide
19
+ functionality for extracting both text and images from a document
20
+ specified by a URL, document name, and file path. It processes the
21
+ document asynchronously, allowing for efficient handling of large
22
+ documents.
23
+
24
+ !!! example "Example Usage"
25
+ ```python
26
+ import asyncio
27
+
28
+ import weave
29
+
30
+ from medrag_multi_modal.document_loader import TextImageLoader
31
+
32
+ weave.init(project_name="ml-colabs/medrag-multi-modal")
33
+ url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
34
+ loader = TextImageLoader(
35
+ url=url,
36
+ document_name="Gray's Anatomy",
37
+ document_file_path="grays_anatomy.pdf",
38
+ )
39
+ asyncio.run(
40
+ loader.load_data(
41
+ start_page=20,
42
+ end_page=25,
43
+ weave_dataset_name="grays-anatomy-text",
44
+ )
45
+ )
46
+ ```
47
+
48
+ Args:
49
+ url (str): The URL of the document to be processed.
50
+ document_name (str): The name of the document.
51
+ document_file_path (str): The file path where the document is stored.
52
+ """
53
+
54
+ def __init__(self, url: str, document_name: str, document_file_path: str):
55
+ super().__init__(url, document_name, document_file_path)
56
+
57
+ async def load_data(
58
+ self,
59
+ start_page: Optional[int] = None,
60
+ end_page: Optional[int] = None,
61
+ weave_dataset_name: Optional[str] = None,
62
+ image_path: Optional[str] = "./images",
63
+ dpi: int = 300,
64
+ ):
65
+ """
66
+ Asynchronously loads and processes text and images from a specified range of pages
67
+ in a document. This function extracts text in markdown format and images in PNG
68
+ format from the document, storing them in a list of dictionaries, each representing
69
+ a page. Optionally, the processed data can be published to a Weave dataset.
70
+
71
+ The function first determines the page indices to process using the
72
+ `get_page_indices` method. It then defines an asynchronous inner function,
73
+ `process_page`, which handles the extraction of text and images for a single page.
74
+ The text is extracted using the `pymupdf4llm.to_markdown` function, and images are
75
+ retrieved from the specified image path. The processed data is appended to the
76
+ `pages` list.
77
+
78
+ The function creates a list of tasks for processing each page asynchronously and
79
+ awaits their completion. If a `weave_dataset_name` is provided, the processed data
80
+ is published to a Weave dataset. Finally, the function returns the list of processed
81
+ pages.
82
+
83
+ Args:
84
+ start_page (Optional[int]): The starting page index for processing. If None,
85
+ defaults to the first page of the document.
86
+ end_page (Optional[int]): The ending page index for processing. If None,
87
+ defaults to the last page of the document.
88
+ weave_dataset_name (Optional[str]): The name of the Weave dataset to publish
89
+ the processed data to. If None, the data is not published.
90
+ image_path (Optional[str]): The directory path where extracted images are
91
+ stored. Defaults to "./images".
92
+ dpi (int): The resolution in dots per inch for image extraction. Defaults to 300.
93
+
94
+ Returns:
95
+ List[Dict]: A list of dictionaries, each containing the extracted text, page
96
+ index, document name, file path, file URL, and a list of images for each page
97
+ processed.
98
+ """
99
+ start_page, end_page = self.get_page_indices(start_page, end_page)
100
+ pages = []
101
+ processed_pages_counter: int = 1
102
+ total_pages = end_page - start_page
103
+
104
+ async def process_page(page_idx):
105
+ nonlocal processed_pages_counter
106
+ text = pymupdf4llm.to_markdown(
107
+ doc=self.document_file_path,
108
+ pages=[page_idx],
109
+ show_progress=False,
110
+ write_images=True,
111
+ image_format="png",
112
+ dpi=dpi,
113
+ image_path=image_path,
114
+ )
115
+ image_paths = glob(
116
+ os.path.join(image_path, f"{self.document_file_path}-{page_idx}-*.png")
117
+ )
118
+ print(image_paths)
119
+ pages.append(
120
+ {
121
+ "text": text,
122
+ "page_idx": page_idx,
123
+ "document_name": self.document_name,
124
+ "file_path": self.document_file_path,
125
+ "file_url": self.url,
126
+ "images": [Image.open(image) for image in image_paths],
127
+ }
128
+ )
129
+ rich.print(f"Processed pages {processed_pages_counter}/{total_pages}")
130
+ processed_pages_counter += 1
131
+
132
+ tasks = [process_page(page_idx) for page_idx in range(start_page, end_page)]
133
+ for task in asyncio.as_completed(tasks):
134
+ await task
135
+ if weave_dataset_name:
136
+ weave.publish(weave.Dataset(name=weave_dataset_name, rows=pages))
137
+ return pages
mkdocs.yml CHANGED
@@ -14,8 +14,6 @@ theme:
14
  toggle:
15
  icon: material/brightness-4
16
  name: Switch to light mode
17
- logo: assets/logomark.svg
18
- favicon: assets/logomark.svg
19
  features:
20
  - content.code.annotate
21
  - content.code.copy
@@ -63,5 +61,6 @@ nav:
63
  - Home: 'index.md'
64
  - Document Loader:
65
  - Text Loader: 'document_loader/load_text.md'
 
66
 
67
  repo_url: https://github.com/soumik12345/medrag-multi-modal
 
14
  toggle:
15
  icon: material/brightness-4
16
  name: Switch to light mode
 
 
17
  features:
18
  - content.code.annotate
19
  - content.code.copy
 
61
  - Home: 'index.md'
62
  - Document Loader:
63
  - Text Loader: 'document_loader/load_text.md'
64
+ - Text and Image Loader: 'document_loader/load_text_image.md'
65
 
66
  repo_url: https://github.com/soumik12345/medrag-multi-modal