Spaces:

geekyrakshit
/

medrag

Sleeping

App Files Files Community

medrag / medrag_multi_modal /document_loader /load_image.py

mratanusarkar

chore: format & linting + __init__ + fix: imports

e0aff18 6 months ago

raw

history blame

4.92 kB

	import asyncio
	import os
	from typing import Optional

	import rich
	import wandb
	import weave
	from pdf2image.pdf2image import convert_from_path
	from PIL import Image

	from medrag_multi_modal.document_loader.text_loader import PyMuPDF4LLMTextLoader


	class ImageLoader(PyMuPDF4LLMTextLoader):
	    """
	    `ImageLoader` is a class that extends the `PyMuPDF4LLMTextLoader` class to handle the
	    extraction and loading of pages from a PDF file as images.

	    This class provides functionality to convert specific pages of a PDF document into images
	    and optionally publish these images to a Weave dataset.

	    !!! example "Example Usage"
	        ```python
	        import asyncio

	        import wandb
	        from dotenv import load_dotenv

	        from medrag_multi_modal.document_loader import ImageLoader

	        load_dotenv()
	        wandb.init(project="medrag-multi-modal", entity="ml-colabs")
	        url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
	        loader = ImageLoader(
	            url=url,
	            document_name="Gray's Anatomy",
	            document_file_path="grays_anatomy.pdf",
	        )
	        asyncio.run(
	            loader.load_data(
	                start_page=31,
	                end_page=33,
	                dataset_name="grays-anatomy-images",
	            )
	        )
	        ```

	    Args:
	        url (str): The URL of the PDF document.
	        document_name (str): The name of the document.
	        document_file_path (str): The path to the PDF file.
	    """

	    def __init__(self, url: str, document_name: str, document_file_path: str):
	        super().__init__(url, document_name, document_file_path)

	    def extract_data_from_pdf_file(
	        self, pdf_file: str, page_number: int
	    ) -> Image.Image:
	        """
	        Renders a single page of a PDF file as an image.

	        Args:
	            pdf_file (str): The path to the PDF file.
	            page_number (int): The 0-based index of the page to render.

	        Returns:
	            Image.Image: The rendered page.
	        """
	        # `convert_from_path` takes 1-based page numbers, hence the `+ 1`; requesting a
	        # single-page range yields a one-element list.
	        image = convert_from_path(
	            pdf_file, first_page=page_number + 1, last_page=page_number + 1
	        )[0]
	        return image

	    async def load_data(
	        self,
	        start_page: Optional[int] = None,
	        end_page: Optional[int] = None,
	        image_save_dir: str = "./images",
	        dataset_name: Optional[str] = None,
	    ):
	        """
	        Asynchronously loads pages from the PDF file as images, saves them to disk, and
	        optionally publishes them to a Weave dataset.

	        This function converts each page in the requested range to an image using the
	        `pdf2image` library, saves it as `<page_idx>.png` inside `image_save_dir`, and
	        returns a list of dictionaries containing the metadata for each processed page,
	        ordered by page index. Each page is handled by its own coroutine, but the
	        conversion call itself is synchronous. If a `dataset_name` is provided, the
	        contents of `image_save_dir` are published as a Weights & Biases artifact and the
	        corresponding metadata as a Weave dataset with the specified name.

	        Args:
	            start_page (Optional[int]): The starting page index (0-based) to process.
	            end_page (Optional[int]): The ending page index (0-based). After both indices
	                are normalized by `get_page_indices`, the pages in
	                `range(start_page, end_page)` are processed.
	            image_save_dir (str): The directory the page images are written to. It is
	                created if it does not exist. Defaults to "./images".
	            dataset_name (Optional[str]): The name of the Weave dataset to publish the
	                processed images to. Defaults to None.

	        Returns:
	            list[dict]: A list of dictionaries sorted by `page_idx`, each containing the
	                keys `page_idx`, `document_name`, `file_path` and `file_url` for a
	                processed page. The images themselves are not part of the returned rows.

	        Raises:
	            ValueError: Propagated from `get_page_indices` if the specified start_page or
	                end_page is out of bounds of the document's page count.
	        """
	        os.makedirs(image_save_dir, exist_ok=True)
	        start_page, end_page = self.get_page_indices(start_page, end_page)
	        pages = []
	        processed_pages_counter: int = 1
	        total_pages = end_page - start_page

	        async def process_page(page_idx):
	            nonlocal processed_pages_counter
	            # Reuse the single-page renderer instead of duplicating the conversion call.
	            image = self.extract_data_from_pdf_file(
	                self.document_file_path, page_idx
	            )
	            pages.append(
	                {
	                    "page_idx": page_idx,
	                    "document_name": self.document_name,
	                    "file_path": self.document_file_path,
	                    "file_url": self.url,
	                }
	            )
	            image.save(os.path.join(image_save_dir, f"{page_idx}.png"))
	            rich.print(f"Processed pages {processed_pages_counter}/{total_pages}")
	            processed_pages_counter += 1

	        tasks = [process_page(page_idx) for page_idx in range(start_page, end_page)]
	        for task in asyncio.as_completed(tasks):
	            await task

	        # `asyncio.as_completed` gives no ordering guarantee, so rows were appended in
	        # completion order; restore page order before publishing and returning.
	        pages.sort(key=lambda page: page["page_idx"])

	        if dataset_name:
	            artifact = wandb.Artifact(name=dataset_name, type="dataset")
	            artifact.add_dir(local_path=image_save_dir)
	            artifact.save()
	            weave.publish(weave.Dataset(name=dataset_name, rows=pages))
	        return pages