File size: 4,923 Bytes
8992b40
bd0ff68
 
8992b40
 
e0aff18
8992b40
 
 
 
e0aff18
8992b40
 
e0aff18
8992b40
24e7c59
 
8992b40
 
 
 
 
 
 
 
bd0ff68
8992b40
 
 
 
 
bd0ff68
8992b40
 
 
 
 
 
 
 
 
 
bd0ff68
8992b40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bd0ff68
 
a7ff122
 
bd0ff68
 
 
8992b40
 
 
 
 
 
 
 
bd0ff68
 
 
8992b40
 
a7ff122
 
bd0ff68
 
8992b40
 
 
 
 
 
 
 
 
bd0ff68
8992b40
 
 
 
 
 
 
bd0ff68
 
 
 
 
8992b40
 
 
 
 
 
 
 
bd0ff68
8992b40
 
 
 
 
 
bd0ff68
 
 
 
 
8992b40
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import asyncio
import os
from typing import Optional

import rich
import wandb
import weave
from pdf2image.pdf2image import convert_from_path
from PIL import Image

from medrag_multi_modal.document_loader.text_loader import PyMuPDF4LLMTextLoader


class ImageLoader(PyMuPDF4LLMTextLoader):
    """
    `ImageLoader` is a class that extends the `PyMuPDF4LLMTextLoader` class to handle the
    extraction and loading of pages from a PDF file as images.

    This class provides functionality to convert specific pages of a PDF document into images
    and optionally publish these images to a Weave dataset.

    !!! example "Example Usage"
        ```python
        import asyncio

        import wandb
        from dotenv import load_dotenv

        from medrag_multi_modal.document_loader import ImageLoader

        load_dotenv()
        wandb.init(project="medrag-multi-modal", entity="ml-colabs")
        url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
        loader = ImageLoader(
            url=url,
            document_name="Gray's Anatomy",
            document_file_path="grays_anatomy.pdf",
        )
        asyncio.run(
            loader.load_data(
                start_page=31,
                end_page=33,
                dataset_name="grays-anatomy-images",
            )
        )
        ```

    Args:
        url (str): The URL of the PDF document.
        document_name (str): The name of the document.
        document_file_path (str): The path to the PDF file.
    """

    def __init__(self, url: str, document_name: str, document_file_path: str):
        super().__init__(url, document_name, document_file_path)

    def extract_data_from_pdf_file(
        self, pdf_file: str, page_number: int
    ) -> Image.Image:
        """
        Renders a single page of a PDF file as an image using `pdf2image`.

        Args:
            pdf_file (str): The path to the PDF file.
            page_number (int): The 0-based index of the page to render. It is shifted by one
                before being passed as `first_page`/`last_page` to `convert_from_path`.

        Returns:
            Image.Image: The first (and only requested) image returned by `convert_from_path`.
        """
        # Requesting first_page == last_page restricts the conversion to one page.
        pdf_page = page_number + 1
        return convert_from_path(pdf_file, first_page=pdf_page, last_page=pdf_page)[0]

    async def load_data(
        self,
        start_page: Optional[int] = None,
        end_page: Optional[int] = None,
        image_save_dir: str = "./images",
        dataset_name: Optional[str] = None,
    ):
        """
        Asynchronously renders a range of pages of the PDF document as PNG images, saves them
        to `image_save_dir`, and optionally publishes them to a Weave dataset.

        The page range is resolved with `get_page_indices`, and every page index in
        `range(start_page, end_page)` of the resolved values is processed. Each page is
        converted to an image using the `pdf2image` library and written to
        `<image_save_dir>/<page_idx>.png`. Rendering and saving are blocking operations, so
        they are executed in the event loop's default executor, which lets pages be processed
        concurrently without stalling the event loop. If a `dataset_name` is provided, the
        contents of `image_save_dir` are logged as a Weights & Biases artifact and the page
        metadata is published as a Weave dataset with the specified name.

        Args:
            start_page (Optional[int]): The starting page index (0-based) to process.
            end_page (Optional[int]): The ending page index (0-based) to process.
            image_save_dir (str): The directory the page images are written to. It is created
                if it does not exist. Defaults to "./images".
            dataset_name (Optional[str]): The name of the Weave dataset to publish the
                processed images to. Defaults to None.

        Returns:
            list[dict]: A list of dictionaries ordered by page index, each containing the
                metadata (`page_idx`, `document_name`, `file_path`, `file_url`) of a processed
                page. The images themselves are not part of the returned data; they are stored
                in `image_save_dir`.

        Raises:
            ValueError: If the specified start_page or end_page is out of bounds of the document's
                page count (presumably raised by `get_page_indices` -- defined in the parent
                class, not visible here).
        """
        os.makedirs(image_save_dir, exist_ok=True)
        start_page, end_page = self.get_page_indices(start_page, end_page)
        total_pages = end_page - start_page
        processed_pages_counter: int = 0
        loop = asyncio.get_running_loop()

        def render_and_save(page_idx: int) -> None:
            # Runs in a worker thread. The image is saved here as well so that it can be
            # released as soon as it is written instead of waiting for the event loop.
            image = self.extract_data_from_pdf_file(self.document_file_path, page_idx)
            image.save(os.path.join(image_save_dir, f"{page_idx}.png"))

        async def process_page(page_idx: int) -> dict:
            nonlocal processed_pages_counter
            await loop.run_in_executor(None, render_and_save, page_idx)
            # The counter is only touched from the event loop thread, so no locking is needed.
            processed_pages_counter += 1
            rich.print(f"Processed pages {processed_pages_counter}/{total_pages}")
            return {
                "page_idx": page_idx,
                "document_name": self.document_name,
                "file_path": self.document_file_path,
                "file_url": self.url,
            }

        # `gather` returns results in the order the awaitables were passed, so the metadata
        # rows stay sorted by page index regardless of which page finishes first.
        pages = list(
            await asyncio.gather(
                *(process_page(page_idx) for page_idx in range(start_page, end_page))
            )
        )

        if dataset_name:
            artifact = wandb.Artifact(name=dataset_name, type="dataset")
            artifact.add_dir(local_path=image_save_dir)
            artifact.save()
            weave.publish(weave.Dataset(name=dataset_name, rows=pages))
        return pages