mratanusarkar committed on
Commit
78dd8e8
·
1 Parent(s): 4304db6

chore: remove old load_text

Browse files
docs/document_loader/load_text.md DELETED
@@ -1,3 +0,0 @@
1
- ## Load text from PDF files
2
-
3
- ::: medrag_multi_modal.document_loader.load_text
 
 
 
 
medrag_multi_modal/document_loader/load_text.py DELETED
@@ -1,130 +0,0 @@
1
- import asyncio
2
- import os
3
- from typing import Optional
4
-
5
- import pymupdf4llm
6
- import PyPDF2
7
- import rich
8
- import weave
9
- from firerequests import FireRequests
10
-
11
-
12
- class TextLoader:
13
- """
14
- A class for loading text from a PDF file, processing it into markdown, and optionally publishing it to a Weave dataset.
15
-
16
- This class handles the downloading of a PDF file from a given URL if it does not already exist locally.
17
- It uses PyPDF2 to read the PDF and pymupdf4llm to convert pages to markdown. The processed pages are stored in a list
18
- of Page objects, which can be optionally published to a Weave dataset.
19
-
20
- !!! example "Example Usage"
21
- ```python
22
- import asyncio
23
-
24
- import weave
25
-
26
- from medrag_multi_modal.document_loader import TextLoader
27
-
28
- weave.init(project_name="ml-colabs/medrag-multi-modal")
29
- url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
30
- loader = TextLoader(
31
- url=url,
32
- document_name="Gray's Anatomy",
33
- document_file_path="grays_anatomy.pdf",
34
- )
35
- asyncio.run(
36
- loader.load_data(start_page=9, end_page=15, weave_dataset_name="grays-anatomy-text")
37
- )
38
- ```
39
-
40
- Args:
41
- url (str): The URL of the PDF file to download if not present locally.
42
- document_name (str): The name of the document for metadata purposes.
43
- document_file_path (str): The local file path where the PDF is stored or will be downloaded.
44
- """
45
-
46
- def __init__(self, url: str, document_name: str, document_file_path: str):
47
- self.url = url
48
- self.document_name = document_name
49
- self.document_file_path = document_file_path
50
- if not os.path.exists(self.document_file_path):
51
- FireRequests().download(url, filename=self.document_file_path)
52
- with open(self.document_file_path, "rb") as file:
53
- pdf_reader = PyPDF2.PdfReader(file)
54
- self.page_count = len(pdf_reader.pages)
55
-
56
- def get_page_indices(
57
- self, start_page: Optional[int] = None, end_page: Optional[int] = None
58
- ):
59
- if start_page:
60
- if start_page > self.page_count:
61
- raise ValueError(
62
- f"Start page {start_page} is greater than the total page count {self.page_count}"
63
- )
64
- else:
65
- start_page = 0
66
- if end_page:
67
- if end_page > self.page_count:
68
- raise ValueError(
69
- f"End page {end_page} is greater than the total page count {self.page_count}"
70
- )
71
- else:
72
- end_page = self.page_count - 1
73
- return start_page, end_page
74
-
75
- async def load_data(
76
- self,
77
- start_page: Optional[int] = None,
78
- end_page: Optional[int] = None,
79
- weave_dataset_name: Optional[str] = None,
80
- ):
81
- """
82
- Asynchronously loads text from a PDF file specified by a URL or local file path,
83
- processes the text into markdown format, and optionally publishes it to a Weave dataset.
84
-
85
- This function downloads a PDF from a given URL if it does not already exist locally,
86
- reads the specified range of pages, converts each page's content to markdown, and
87
- returns a list of Page objects containing the text and metadata. It uses PyPDF2 to read
88
- the PDF and pymupdf4llm to convert pages to markdown. It processes pages concurrently using
89
- `asyncio` for efficiency. If a weave_dataset_name is provided, the processed pages are published
90
- to a Weave dataset.
91
-
92
- Args:
93
- start_page (Optional[int]): The starting page index (0-based) to process. Defaults to the first page.
94
- end_page (Optional[int]): The ending page index (0-based) to process. Defaults to the last page.
95
- weave_dataset_name (Optional[str]): The name of the Weave dataset to publish the pages to, if provided.
96
-
97
- Returns:
98
- list[Page]: A list of Page objects, each containing the text and metadata for a processed page.
99
-
100
- Raises:
101
- ValueError: If the specified start_page or end_page is out of bounds of the document's page count.
102
- """
103
- start_page, end_page = self.get_page_indices(start_page, end_page)
104
- pages = []
105
- processed_pages_counter: int = 1
106
- total_pages = end_page - start_page
107
-
108
- async def process_page(page_idx):
109
- nonlocal processed_pages_counter
110
- text = pymupdf4llm.to_markdown(
111
- doc=self.document_file_path, pages=[page_idx], show_progress=False
112
- )
113
- pages.append(
114
- {
115
- "text": text,
116
- "page_idx": page_idx,
117
- "document_name": self.document_name,
118
- "file_path": self.document_file_path,
119
- "file_url": self.url,
120
- }
121
- )
122
- rich.print(f"Processed pages {processed_pages_counter}/{total_pages}")
123
- processed_pages_counter += 1
124
-
125
- tasks = [process_page(page_idx) for page_idx in range(start_page, end_page)]
126
- for task in asyncio.as_completed(tasks):
127
- await task
128
- if weave_dataset_name:
129
- weave.publish(weave.Dataset(name=weave_dataset_name, rows=pages))
130
- return pages