File size: 5,552 Bytes
7b862ff
 
 
 
 
 
 
 
 
 
e0aff18
7b862ff
 
e0aff18
7b862ff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import asyncio
import os
from glob import glob
from typing import Optional

import pymupdf4llm
import rich
import weave
from PIL import Image

from medrag_multi_modal.document_loader.text_loader import PyMuPDF4LLMTextLoader


class TextImageLoader(PyMuPDF4LLMTextLoader):
    """
    A class for loading and processing text and images from a document.

    The TextImageLoader class extends the PyMuPDF4LLMTextLoader class to
    provide functionality for extracting both text and images from a
    document specified by a URL, document name, and file path. Pages are
    processed through an asynchronous `load_data` coroutine.

    !!! example "Example Usage"
        ```python
        import asyncio

        import weave

        from medrag_multi_modal.document_loader import TextImageLoader

        weave.init(project_name="ml-colabs/medrag-multi-modal")
        url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
        loader = TextImageLoader(
            url=url,
            document_name="Gray's Anatomy",
            document_file_path="grays_anatomy.pdf",
        )
        asyncio.run(
            loader.load_data(
                start_page=20,
                end_page=25,
                weave_dataset_name="grays-anatomy-text",
            )
        )
        ```

    Args:
        url (str): The URL of the document to be processed.
        document_name (str): The name of the document.
        document_file_path (str): The file path where the document is stored.
    """

    def __init__(self, url: str, document_name: str, document_file_path: str):
        super().__init__(url, document_name, document_file_path)

    async def load_data(
        self,
        start_page: Optional[int] = None,
        end_page: Optional[int] = None,
        weave_dataset_name: Optional[str] = None,
        image_path: Optional[str] = "./images",
        dpi: int = 300,
    ):
        """
        Extract text and images from a range of pages of the document.

        For every page index in `range(start_page, end_page)` (after both bounds
        have been resolved by `get_page_indices`), the page text is extracted as
        markdown with `pymupdf4llm.to_markdown`, which is also asked to write the
        page's images as PNG files into `image_path`. The PNG files found for that
        page are then opened with PIL and attached to the page's row.

        Rows are returned in ascending page order. Optionally, the rows are
        published as a Weave dataset.

        Note:
            `pymupdf4llm.to_markdown` is a synchronous call, so although this
            method is a coroutine the pages are effectively extracted one after
            another.

        Args:
            start_page (Optional[int]): The starting page index for processing.
                Passed to `get_page_indices`, which resolves a None value.
            end_page (Optional[int]): The ending page index for processing
                (exclusive, as it is used as the stop of a `range`). Passed to
                `get_page_indices`, which resolves a None value.
            weave_dataset_name (Optional[str]): The name of the Weave dataset to
                publish the processed data to. If None or empty, the data is not
                published.
            image_path (Optional[str]): The directory path where extracted images
                are stored. Defaults to "./images"; None falls back to the same
                default.
            dpi (int): The resolution in dots per inch for image extraction.
                Defaults to 300.

        Returns:
            List[Dict]: One dictionary per processed page with the keys `text`,
            `page_idx`, `document_name`, `file_path`, `file_url` and `images`
            (a list of PIL images opened from the extracted PNG files).
        """
        # The parameter is typed Optional; fall back to the documented default
        # instead of failing later inside os.path.join.
        if image_path is None:
            image_path = "./images"

        start_page, end_page = self.get_page_indices(start_page, end_page)
        total_pages = end_page - start_page
        processed_pages_counter: int = 0

        # NOTE(review): pymupdf4llm is expected to name written images
        # "<basename of the document>-<page>-<index>.png" inside `image_path`,
        # so the directory part of the document path must not leak into the
        # glob pattern -- confirm against the installed pymupdf4llm version.
        image_prefix = os.path.basename(self.document_file_path)

        async def process_page(page_idx):
            """Extract one page and return its row dictionary."""
            nonlocal processed_pages_counter
            text = pymupdf4llm.to_markdown(
                doc=self.document_file_path,
                pages=[page_idx],
                show_progress=False,
                write_images=True,
                image_format="png",
                dpi=dpi,
                image_path=image_path,
            )
            # Sorted so the image order within a page does not depend on the
            # order in which the filesystem happens to list the files.
            image_paths = sorted(
                glob(os.path.join(image_path, f"{image_prefix}-{page_idx}-*.png"))
            )
            processed_pages_counter += 1
            rich.print(f"Processed pages {processed_pages_counter}/{total_pages}")
            return {
                "text": text,
                "page_idx": page_idx,
                "document_name": self.document_name,
                "file_path": self.document_file_path,
                "file_url": self.url,
                "images": [Image.open(image) for image in image_paths],
            }

        tasks = [process_page(page_idx) for page_idx in range(start_page, end_page)]
        # gather() returns results in the order the awaitables were passed, so
        # the rows come back sorted by page index (as_completed gives no such
        # ordering guarantee).
        pages = list(await asyncio.gather(*tasks))

        if weave_dataset_name:
            weave.publish(weave.Dataset(name=weave_dataset_name, rows=pages))
        return pages