Spaces:
Sleeping
Sleeping
File size: 5,338 Bytes
b9d8094 c675904 b9d8094 c675904 b9d8094 c675904 7b862ff c675904 b9d8094 c675904 b9d8094 c675904 b9d8094 c675904 b9d8094 c675904 7b862ff c675904 b9d8094 c675904 7b862ff c675904 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 |
import asyncio
import os
from typing import Optional
import pymupdf4llm
import PyPDF2
import rich
import weave
from firerequests import FireRequests
class TextLoader:
    """
    Load text from a PDF file, convert it to markdown page by page, and optionally
    publish the result to a Weave dataset.

    On construction the PDF is downloaded from `url` if `document_file_path` does not
    already exist locally, and the number of pages is read with PyPDF2. Pages are
    converted to markdown with pymupdf4llm; each processed page is represented as a
    dictionary holding the markdown text and some metadata.

    !!! example "Example Usage"
        ```python
        import asyncio

        import weave

        from medrag_multi_modal.document_loader import TextLoader

        weave.init(project_name="ml-colabs/medrag-multi-modal")
        url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
        loader = TextLoader(
            url=url,
            document_name="Gray's Anatomy",
            document_file_path="grays_anatomy.pdf",
        )
        asyncio.run(
            loader.load_data(start_page=9, end_page=15, weave_dataset_name="grays-anatomy-text")
        )
        ```

    Args:
        url (str): The URL of the PDF file to download if not present locally.
        document_name (str): The name of the document for metadata purposes.
        document_file_path (str): The local file path where the PDF is stored or will be downloaded.
    """

    def __init__(self, url: str, document_name: str, document_file_path: str):
        self.url = url
        self.document_name = document_name
        self.document_file_path = document_file_path
        # Only hit the network when there is no local copy of the document yet.
        if not os.path.exists(self.document_file_path):
            FireRequests().download(url, filename=self.document_file_path)
        with open(self.document_file_path, "rb") as file:
            pdf_reader = PyPDF2.PdfReader(file)
            self.page_count = len(pdf_reader.pages)

    def get_page_indices(
        self, start_page: Optional[int] = None, end_page: Optional[int] = None
    ) -> tuple:
        """
        Resolve and validate the page range to process.

        The returned pair is a half-open range `[start_page, end_page)`: `start_page`
        is the first 0-based page index to process and `end_page` is one past the
        last index, which is how `load_data` iterates over it.

        Args:
            start_page (Optional[int]): First page index (0-based). Defaults to 0.
            end_page (Optional[int]): Exclusive end page index. Defaults to the page
                count, so that the last page of the document is included.

        Returns:
            tuple: The resolved `(start_page, end_page)` pair.

        Raises:
            ValueError: If `start_page` or `end_page` is greater than the page count.
        """
        # Compare against None rather than relying on truthiness so that an explicit
        # 0 is honoured instead of being silently replaced by the default.
        if start_page is None:
            start_page = 0
        elif start_page > self.page_count:
            raise ValueError(
                f"Start page {start_page} is greater than the total page count {self.page_count}"
            )

        if end_page is None:
            # The range is exclusive at the end, so the default must be the page
            # count itself; `page_count - 1` would drop the last page.
            end_page = self.page_count
        elif end_page > self.page_count:
            raise ValueError(
                f"End page {end_page} is greater than the total page count {self.page_count}"
            )

        return start_page, end_page

    async def load_data(
        self,
        start_page: Optional[int] = None,
        end_page: Optional[int] = None,
        weave_dataset_name: Optional[str] = None,
    ) -> list:
        """
        Convert a range of pages of the PDF to markdown and optionally publish them
        to a Weave dataset.

        Every page in `[start_page, end_page)` is converted with pymupdf4llm inside
        its own coroutine, and the coroutines are awaited through
        `asyncio.as_completed`. The resulting rows are sorted by page index before
        being published and returned.

        Args:
            start_page (Optional[int]): The first page index (0-based) to process.
                Defaults to the first page.
            end_page (Optional[int]): The exclusive end page index; the page at this
                index is not processed. Defaults to the page count, i.e. processing
                runs through the last page.
            weave_dataset_name (Optional[str]): The name of the Weave dataset to
                publish the pages to, if provided.

        Returns:
            list: One dictionary per processed page with the keys `text`, `page_idx`,
                `document_name`, `file_path` and `file_url`, ordered by `page_idx`.

        Raises:
            ValueError: If the specified start_page or end_page is greater than the
                document's page count.
        """
        start_page, end_page = self.get_page_indices(start_page, end_page)
        pages = []
        processed_pages_counter: int = 1
        total_pages = end_page - start_page

        async def process_page(page_idx):
            nonlocal processed_pages_counter
            # NOTE: this call is synchronous (nothing is awaited in this coroutine),
            # so a page is converted start-to-finish before control returns to the
            # event loop.
            text = pymupdf4llm.to_markdown(
                doc=self.document_file_path, pages=[page_idx], show_progress=False
            )
            pages.append(
                {
                    "text": text,
                    "page_idx": page_idx,
                    "document_name": self.document_name,
                    "file_path": self.document_file_path,
                    "file_url": self.url,
                }
            )
            rich.print(f"Processed pages {processed_pages_counter}/{total_pages}")
            processed_pages_counter += 1

        tasks = [process_page(page_idx) for page_idx in range(start_page, end_page)]
        for task in asyncio.as_completed(tasks):
            await task

        # Rows were appended in completion order, which is not guaranteed to match
        # the page order; sort so callers and the dataset see pages in sequence.
        pages.sort(key=lambda page: page["page_idx"])

        if weave_dataset_name:
            weave.publish(weave.Dataset(name=weave_dataset_name, rows=pages))
        return pages
|