File size: 4,348 Bytes
8992b40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import asyncio

import rich
import weave
from pdf2image.pdf2image import convert_from_path
from PIL import Image

from medrag_multi_modal.document_loader.load_text import TextLoader


class ImageLoader(TextLoader):
    """
    ImageLoader is a class that extends the TextLoader class to handle the extraction and
    loading of images from a PDF file.

    This class provides functionality to convert specific pages of a PDF document into images
    and optionally publish these images to a Weave dataset.

    !!! example "Example Usage"
        ```python
        import asyncio

        import weave
        from dotenv import load_dotenv

        from medrag_multi_modal.document_loader import ImageLoader

        load_dotenv()
        weave.init(project_name="ml-colabs/medrag-multi-modal")
        url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
        loader = ImageLoader(
            url=url,
            document_name="Gray's Anatomy",
            document_file_path="grays_anatomy.pdf",
        )
        asyncio.run(
            loader.load_data(
                start_page=31,
                end_page=33,
                weave_dataset_name="grays-anatomy-text",
            )
        )
        ```

    Args:
        url (str): The URL of the PDF document.
        document_name (str): The name of the document.
        document_file_path (str): The path to the PDF file.
    """

    def __init__(self, url: str, document_name: str, document_file_path: str):
        super().__init__(url, document_name, document_file_path)

    def extract_data_from_pdf_file(
        self, pdf_file: str, page_number: int
    ) -> Image.Image:
        image = convert_from_path(
            pdf_file, first_page=page_number + 1, last_page=page_number + 1
        )[0]
        return image

    async def load_data(self, start_page: int, end_page: int, weave_dataset_name: str):
        """
        Asynchronously loads images from a PDF file specified by a URL or local file path,
        processes the images for the specified range of pages, and optionally publishes them
        to a Weave dataset.

        This function reads the specified range of pages from a PDF document, converts each page
        to an image using the `pdf2image` library, and returns a list of dictionaries containing
        the image and metadata for each processed page. It processes pages concurrently using
        `asyncio` for efficiency. If a weave_dataset_name is provided, the processed pages are
        published to a Weave dataset.

        Args:
            start_page (int): The starting page index (0-based) to process.
            end_page (int): The ending page index (0-based) to process.
            weave_dataset_name (str): The name of the Weave dataset to publish the pages to,
                if provided.

        Returns:
            list[dict]: A list of dictionaries, each containing the image and metadata for a
                processed page.

        Raises:
            ValueError: If the specified start_page or end_page is out of bounds of the document's
                page count.
        """
        start_page, end_page = self.get_page_indices(start_page, end_page)
        pages = []
        processed_pages_counter: int = 1
        total_pages = end_page - start_page

        async def process_page(page_idx):
            nonlocal processed_pages_counter
            pages.append(
                {
                    "image": convert_from_path(
                        self.document_file_path,
                        first_page=page_idx + 1,
                        last_page=page_idx + 1,
                    )[0],
                    "page_idx": page_idx,
                    "document_name": self.document_name,
                    "file_path": self.document_file_path,
                    "file_url": self.url,
                }
            )
            rich.print(f"Processed pages {processed_pages_counter}/{total_pages}")
            processed_pages_counter += 1

        tasks = [process_page(page_idx) for page_idx in range(start_page, end_page)]
        for task in asyncio.as_completed(tasks):
            await task
        if weave_dataset_name:
            weave.publish(weave.Dataset(name=weave_dataset_name, rows=pages))
        return pages