Spaces:
Running
Running
File size: 4,923 Bytes
8992b40 bd0ff68 8992b40 e0aff18 8992b40 e0aff18 8992b40 e0aff18 8992b40 24e7c59 8992b40 bd0ff68 8992b40 bd0ff68 8992b40 bd0ff68 8992b40 bd0ff68 a7ff122 bd0ff68 8992b40 bd0ff68 8992b40 a7ff122 bd0ff68 8992b40 bd0ff68 8992b40 bd0ff68 8992b40 bd0ff68 8992b40 bd0ff68 8992b40 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 |
import asyncio
import os
from typing import Optional
import rich
import wandb
import weave
from pdf2image.pdf2image import convert_from_path
from PIL import Image
from medrag_multi_modal.document_loader.text_loader import PyMuPDF4LLMTextLoader
class ImageLoader(PyMuPDF4LLMTextLoader):
"""
`ImageLoader` is a class that extends the `TextLoader` class to handle the extraction and
loading of pages from a PDF file as images.
This class provides functionality to convert specific pages of a PDF document into images
and optionally publish these images to a Weave dataset.
!!! example "Example Usage"
```python
import asyncio
import wandb
from dotenv import load_dotenv
from medrag_multi_modal.document_loader import ImageLoader
load_dotenv()
wandb.init(project="medrag-multi-modal", entity="ml-colabs")
url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
loader = ImageLoader(
url=url,
document_name="Gray's Anatomy",
document_file_path="grays_anatomy.pdf",
)
asyncio.run(
loader.load_data(
start_page=31,
end_page=33,
dataset_name="grays-anatomy-images",
)
)
```
Args:
url (str): The URL of the PDF document.
document_name (str): The name of the document.
document_file_path (str): The path to the PDF file.
"""
def __init__(self, url: str, document_name: str, document_file_path: str):
super().__init__(url, document_name, document_file_path)
def extract_data_from_pdf_file(
self, pdf_file: str, page_number: int
) -> Image.Image:
image = convert_from_path(
pdf_file, first_page=page_number + 1, last_page=page_number + 1
)[0]
return image
async def load_data(
self,
start_page: Optional[int] = None,
end_page: Optional[int] = None,
image_save_dir: str = "./images",
dataset_name: Optional[str] = None,
):
"""
Asynchronously loads images from a PDF file specified by a URL or local file path,
processes the images for the specified range of pages, and optionally publishes them
to a Weave dataset.
This function reads the specified range of pages from a PDF document, converts each page
to an image using the `pdf2image` library, and returns a list of dictionaries containing
the image and metadata for each processed page. It processes pages concurrently using
`asyncio` for efficiency. If a `dataset_name` is provided, the processed page images are
published to Weights & Biases artifact and the corresponding metadata to a Weave dataset
with the specified name.
Args:
start_page (Optional[int]): The starting page index (0-based) to process.
end_page (Optional[int]): The ending page index (0-based) to process.
dataset_name (Optional[str]): The name of the Weave dataset to publish the
processed images to. Defaults to None.
Returns:
list[dict]: A list of dictionaries, each containing the image and metadata for a
processed page.
Raises:
ValueError: If the specified start_page or end_page is out of bounds of the document's
page count.
"""
os.makedirs(image_save_dir, exist_ok=True)
start_page, end_page = self.get_page_indices(start_page, end_page)
pages = []
processed_pages_counter: int = 1
total_pages = end_page - start_page
async def process_page(page_idx):
nonlocal processed_pages_counter
image = convert_from_path(
self.document_file_path,
first_page=page_idx + 1,
last_page=page_idx + 1,
)[0]
pages.append(
{
"page_idx": page_idx,
"document_name": self.document_name,
"file_path": self.document_file_path,
"file_url": self.url,
}
)
image.save(os.path.join(image_save_dir, f"{page_idx}.png"))
rich.print(f"Processed pages {processed_pages_counter}/{total_pages}")
processed_pages_counter += 1
tasks = [process_page(page_idx) for page_idx in range(start_page, end_page)]
for task in asyncio.as_completed(tasks):
await task
if dataset_name:
artifact = wandb.Artifact(name=dataset_name, type="dataset")
artifact.add_dir(local_path=image_save_dir)
artifact.save()
weave.publish(weave.Dataset(name=dataset_name, rows=pages))
return pages
|