# Provenance: commit 7b862ff by geekyrakshit ("add: TextImageLoader").
import asyncio
import os
from glob import glob
from typing import Optional
import pymupdf4llm
import rich
import weave
from PIL import Image
from medrag_multi_modal.document_loader.load_text import TextLoader
class TextImageLoader(TextLoader):
    """
    A class for loading and processing text and images from a document.

    The TextImageLoader class extends the TextLoader class to provide
    functionality for extracting both text and images from a document
    specified by a URL, document name, and file path. Extraction is exposed
    through the `load_data` coroutine.

    !!! example "Example Usage"
        ```python
        import asyncio

        import weave

        from medrag_multi_modal.document_loader import TextImageLoader

        weave.init(project_name="ml-colabs/medrag-multi-modal")
        url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
        loader = TextImageLoader(
            url=url,
            document_name="Gray's Anatomy",
            document_file_path="grays_anatomy.pdf",
        )
        asyncio.run(
            loader.load_data(
                start_page=20,
                end_page=25,
                weave_dataset_name="grays-anatomy-text",
            )
        )
        ```

    Args:
        url (str): The URL of the document to be processed.
        document_name (str): The name of the document.
        document_file_path (str): The file path where the document is stored.
    """

    def __init__(self, url: str, document_name: str, document_file_path: str):
        # All three arguments are forwarded unchanged to TextLoader.
        super().__init__(url, document_name, document_file_path)

    async def load_data(
        self,
        start_page: Optional[int] = None,
        end_page: Optional[int] = None,
        weave_dataset_name: Optional[str] = None,
        image_path: Optional[str] = "./images",
        dpi: int = 300,
    ) -> list:
        """
        Extract text and images from a range of pages of the document.

        The page range is first resolved with the inherited `get_page_indices`
        method; every page index in `range(start_page, end_page)` of the
        resolved values is then processed. For each page, the text is extracted
        as markdown with `pymupdf4llm.to_markdown`, which is also asked to write
        the page's images as PNG files into `image_path`. The PNG files matching
        the page are then opened with PIL and attached to the page record.

        Each page is handled by an inner coroutine, but the extraction call
        itself is a regular blocking call, so pages are effectively processed
        one after another. The returned list is ordered by page index.

        If a `weave_dataset_name` is provided, the page records are published
        as a Weave dataset before being returned.

        Args:
            start_page (Optional[int]): The starting page index for processing.
                Resolved by `get_page_indices` (presumably to the first page of
                the document when None -- confirm against TextLoader).
            end_page (Optional[int]): The ending page index for processing.
                Resolved by `get_page_indices` (presumably to the end of the
                document when None -- confirm against TextLoader). The resolved
                value is treated as exclusive.
            weave_dataset_name (Optional[str]): The name of the Weave dataset to
                publish the processed data to. If None or empty, the data is
                not published.
            image_path (Optional[str]): The directory path where extracted
                images are stored. Defaults to "./images"; None falls back to
                the same default.
            dpi (int): The resolution in dots per inch for image extraction.
                Defaults to 300.

        Returns:
            list: A list of dictionaries, one per processed page and in page
            order, each containing the keys "text", "page_idx",
            "document_name", "file_path", "file_url" and "images" (a list of
            opened PIL images).
        """
        # `image_path` is annotated Optional, but os.path.join() cannot take
        # None; fall back to the documented default instead of crashing.
        if image_path is None:
            image_path = "./images"

        start_page, end_page = self.get_page_indices(start_page, end_page)
        processed_pages_counter: int = 1
        total_pages = end_page - start_page

        async def process_page(page_idx: int) -> dict:
            nonlocal processed_pages_counter
            text = pymupdf4llm.to_markdown(
                doc=self.document_file_path,
                pages=[page_idx],
                show_progress=False,
                write_images=True,
                image_format="png",
                dpi=dpi,
                image_path=image_path,
            )
            # NOTE(review): this pattern assumes pymupdf4llm names the files
            # "<document_file_path>-<page_idx>-<n>.png" inside `image_path`;
            # verify against the installed pymupdf4llm version, especially when
            # `document_file_path` contains directories or spaces.
            # glob() returns matches in arbitrary order, so sort them to keep
            # the image list deterministic.
            image_paths = sorted(
                glob(
                    os.path.join(
                        image_path, f"{self.document_file_path}-{page_idx}-*.png"
                    )
                )
            )
            page = {
                "text": text,
                "page_idx": page_idx,
                "document_name": self.document_name,
                "file_path": self.document_file_path,
                "file_url": self.url,
                "images": [Image.open(image) for image in image_paths],
            }
            rich.print(f"Processed pages {processed_pages_counter}/{total_pages}")
            processed_pages_counter += 1
            return page

        # gather() returns results in the order the awaitables were passed, so
        # the pages come back sorted by page index. (Appending from inside
        # coroutines driven by as_completed() gave no such guarantee.)
        pages = list(
            await asyncio.gather(
                *(process_page(page_idx) for page_idx in range(start_page, end_page))
            )
        )

        if weave_dataset_name:
            weave.publish(weave.Dataset(name=weave_dataset_name, rows=pages))
        return pages