import asyncio
import os
from typing import Any, Dict, List, Optional

import rich
import wandb
import weave
from pdf2image.pdf2image import convert_from_path
from PIL import Image

from medrag_multi_modal.document_loader.text_loader import PyMuPDF4LLMTextLoader


class ImageLoader(PyMuPDF4LLMTextLoader):
    """
    `ImageLoader` is a class that extends the `PyMuPDF4LLMTextLoader` class to
    handle the extraction and loading of pages from a PDF file as images.

    This class provides functionality to convert specific pages of a PDF
    document into images and optionally publish these images to a Weave dataset.

    !!! example "Example Usage"
        ```python
        import asyncio

        import wandb
        from dotenv import load_dotenv

        from medrag_multi_modal.document_loader import ImageLoader

        load_dotenv()
        wandb.init(project="medrag-multi-modal", entity="ml-colabs")
        url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
        loader = ImageLoader(
            url=url,
            document_name="Gray's Anatomy",
            document_file_path="grays_anatomy.pdf",
        )
        asyncio.run(
            loader.load_data(
                start_page=31,
                end_page=33,
                dataset_name="grays-anatomy-images",
            )
        )
        ```

    Args:
        url (str): The URL of the PDF document.
        document_name (str): The name of the document.
        document_file_path (str): The path to the PDF file.
    """

    def __init__(self, url: str, document_name: str, document_file_path: str):
        super().__init__(url, document_name, document_file_path)

    def extract_data_from_pdf_file(
        self, pdf_file: str, page_number: int
    ) -> Image.Image:
        """
        Render a single page of a PDF file as an image.

        Args:
            pdf_file (str): Path to the PDF file on disk.
            page_number (int): Index of the page to render. One is added
                before it is passed to `pdf2image` as both `first_page` and
                `last_page`, so exactly one page is rendered.

        Returns:
            Image.Image: The first (and only) image returned by
                `convert_from_path` for the requested page.
        """
        # Requesting the same first and last page yields a one-element list.
        pdf2image_page = page_number + 1
        rendered = convert_from_path(
            pdf_file, first_page=pdf2image_page, last_page=pdf2image_page
        )
        return rendered[0]

    async def load_data(
        self,
        start_page: Optional[int] = None,
        end_page: Optional[int] = None,
        image_save_dir: str = "./images",
        dataset_name: Optional[str] = None,
    ) -> List[Dict[str, Any]]:
        """
        Render a range of pages of the PDF document to PNG files and return
        the metadata of every rendered page.

        Each page in `range(start_page, end_page)` is converted to an image
        with the `pdf2image` library and saved as `<page_idx>.png` inside
        `image_save_dir`. The returned metadata is always ordered by page
        index. Note that the rendering call itself is synchronous, so pages
        are rendered one after another even though this method is a coroutine.

        If a `dataset_name` is provided, the contents of `image_save_dir` are
        added to a Weights & Biases artifact and the page metadata is published
        as a Weave dataset, both using the specified name.

        Args:
            start_page (Optional[int]): The starting page index to process.
                It is resolved by `get_page_indices` before use.
            end_page (Optional[int]): The page index at which to stop; the
                page at this index is not processed. It is resolved by
                `get_page_indices` before use.
            image_save_dir (str): Directory in which the page images are
                saved. It is created if it does not exist. Defaults to
                "./images".
            dataset_name (Optional[str]): The name of the W&B artifact and
                Weave dataset to publish to. Nothing is published when it is
                None or empty. Defaults to None.

        Returns:
            list[dict]: One dictionary per processed page, ordered by page
                index, with the keys `page_idx`, `document_name`, `file_path`
                and `file_url`. The images themselves are not part of the
                returned data; they are only written to `image_save_dir`.

        Raises:
            ValueError: If the specified start_page or end_page is out of
                bounds of the document's page count (presumably raised by the
                inherited `get_page_indices`; confirm against the base class).
        """
        os.makedirs(image_save_dir, exist_ok=True)
        start_page, end_page = self.get_page_indices(start_page, end_page)
        processed_pages_counter: int = 1
        total_pages = end_page - start_page

        async def process_page(page_idx: int) -> Dict[str, Any]:
            nonlocal processed_pages_counter
            # Reuse the class's own rendering hook so that subclasses which
            # override it are respected here as well.
            image = self.extract_data_from_pdf_file(
                self.document_file_path, page_idx
            )
            image.save(os.path.join(image_save_dir, f"{page_idx}.png"))
            rich.print(f"Processed pages {processed_pages_counter}/{total_pages}")
            processed_pages_counter += 1
            return {
                "page_idx": page_idx,
                "document_name": self.document_name,
                "file_path": self.document_file_path,
                "file_url": self.url,
            }

        # `asyncio.gather` returns results in the order of its arguments, so
        # the rows are always sorted by page index. `asyncio.as_completed`
        # gives no such guarantee for the order in which tasks are started.
        pages = list(
            await asyncio.gather(
                *(process_page(page_idx) for page_idx in range(start_page, end_page))
            )
        )

        if dataset_name:
            artifact = wandb.Artifact(name=dataset_name, type="dataset")
            artifact.add_dir(local_path=image_save_dir)
            artifact.save()
            weave.publish(weave.Dataset(name=dataset_name, rows=pages))
        return pages