File size: 10,201 Bytes
0767396
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
import asyncio
import os
import shutil
import tempfile
from io import BufferedIOBase
from pathlib import Path
from typing import List, Optional, Union
from urllib.parse import quote

from boto3.session import Session
from botocore.config import Config
from botocore.exceptions import ClientError
from dotenv import load_dotenv
from llama_index.core.async_utils import run_jobs
from llama_index.core.schema import Document
from llama_parse import LlamaParse
from llama_parse.utils import (
    nest_asyncio_err,
    nest_asyncio_msg,
)
from pydantic import PrivateAttr


load_dotenv()

FileInput = Union[str, bytes, BufferedIOBase]


class S3ImageSaver:
    """Uploads local image files to an S3 bucket and returns their public HTTPS URLs."""

    def __init__(self, bucket_name, access_key=None, secret_key=None, region_name=None):
        """Create a boto3 S3 client for *bucket_name*.

        Args:
            bucket_name: Target S3 bucket.
            access_key: AWS access key id; when None boto3 falls back to its
                normal credential chain (env vars, shared config, instance role).
            secret_key: AWS secret key; same fallback behavior as access_key.
            region_name: AWS region, used both for the client and for building
                the public URL (a None region produces a malformed URL — callers
                should supply one).
        """
        self.bucket_name = bucket_name
        self.region_name = region_name
        self.session = Session(
            aws_access_key_id=access_key,
            aws_secret_access_key=secret_key,
            region_name=self.region_name,
        )
        # SigV4 is required by newer AWS regions and KMS-encrypted buckets.
        self.s3_client = self.session.client(
            "s3", config=Config(signature_version="s3v4", region_name=self.region_name)
        )

    def save_image(self, image_path, title):
        """Upload *image_path* under ``images/<title>/`` and return its HTTPS URL.

        Returns:
            The virtual-hosted-style URL of the uploaded object, or None (after
            logging) when the upload fails with a ClientError.
        """
        try:
            print("---Saving Images---")
            file_name = os.path.basename(image_path)
            # Percent-encode both URL path segments. The raw values remain in
            # the object key: S3 decodes the encoded URL path back to the same
            # key, so the link stays valid even for names with spaces etc.
            # (The original code encoded only the title, yielding broken URLs
            # for filenames containing special characters.)
            title_encoded = quote(title)
            file_name_encoded = quote(file_name)
            s3_key = f"images/{title}/{file_name}"
            with open(image_path, "rb") as file:
                self.s3_client.upload_fileobj(file, self.bucket_name, s3_key)

            s3_url = (
                f"https://{self.bucket_name}.s3.{self.region_name}.amazonaws.com"
                f"/images/{title_encoded}/{file_name_encoded}"
            )
            print(f"Image saved to S3 bucket: {s3_url}")
            return s3_url
        except ClientError as e:
            print(f"Error saving image to S3: {e}")
            return None


class LlamaParseWithS3(LlamaParse):
    """LlamaParse subclass that mirrors downloaded page images into an S3 bucket.

    Images referenced by a parse result are fetched from the LlamaParse API,
    uploaded via an :class:`S3ImageSaver`, and each image dict is augmented
    with an ``image_link`` pointing at the S3 copy.
    """

    # Injected uploader; kept private so pydantic does not treat it as a field.
    _s3_image_saver: S3ImageSaver = PrivateAttr()

    def __init__(self, *args, s3_image_saver=None, **kwargs):
        """Initialize LlamaParse and attach an S3 uploader.

        Args:
            s3_image_saver: Pre-configured S3ImageSaver. When omitted, one is
                built from the S3_BUCKET_NAME / AWS_ACCESS_KEY_ID /
                AWS_SECRET_ACCESS_KEY environment variables with a hard-coded
                ``us-west-2`` region.
        """
        super().__init__(*args, **kwargs)
        self._s3_image_saver = s3_image_saver or S3ImageSaver(
            bucket_name=os.getenv("S3_BUCKET_NAME"),
            access_key=os.getenv("AWS_ACCESS_KEY_ID"),
            secret_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
            region_name="us-west-2",
        )

    async def aget_images(
        self, json_result: List[dict], download_path: str
    ) -> List[dict]:
        """Download every image referenced in *json_result* into *download_path*.

        Args:
            json_result: Parse results; each dict carries ``job_id`` and ``pages``.
            download_path: Directory to write image files into (created if needed).

        Returns:
            The image dicts, each augmented with ``path``, ``job_id``,
            ``original_file_path`` and ``page_number``. Returns ``[]`` on error
            when ``self.ignore_errors`` is set, otherwise re-raises.
        """
        headers = {"Authorization": f"Bearer {self.api_key}"}

        os.makedirs(download_path, exist_ok=True)

        try:
            images = []
            for result in json_result:
                job_id = result["job_id"]
                for page in result["pages"]:
                    if self.verbose:
                        print(f"> Image for page {page['page']}: {page['images']}")
                    for image in page["images"]:
                        image_name = image["name"]

                        image_path = os.path.join(download_path, f"{image_name}")

                        # Default to .png when the server-side name carries no
                        # recognized image extension.
                        if not image_path.endswith((".png", ".jpg")):
                            image_path += ".png"

                        image["path"] = image_path
                        image["job_id"] = job_id
                        image["original_file_path"] = result.get("file_path", None)
                        image["page_number"] = page["page"]

                        image_url = (
                            f"{self.base_url}/api/parsing/job/{job_id}"
                            f"/result/image/{image_name}"
                        )
                        async with self.client_context() as client:
                            res = await client.get(
                                image_url, headers=headers, timeout=self.max_timeout
                            )
                            res.raise_for_status()
                        # Write only after a successful fetch so a failed request
                        # cannot leave an empty/truncated file behind (the
                        # original opened the file before issuing the request).
                        with open(image_path, "wb") as f:
                            f.write(res.content)
                        images.append(image)
            return images
        except Exception as e:
            print("Error while downloading images from the parsed result:", e)
            if self.ignore_errors:
                return []
            raise

    async def aget_images_s3(self, json_result: List[dict], title) -> List[dict]:
        """Download parsed images, upload each to S3, and attach ``image_link``.

        Args:
            json_result: Parse results as returned by ``aget_json``.
            title: Folder name used for the ``images/<title>/...`` S3 prefix.

        Returns:
            The image dicts; those uploaded successfully gain an ``image_link``.
        """
        # Use a unique temporary directory rather than a fixed "tmp/" so that
        # concurrent runs cannot clobber each other and we never rmtree a
        # pre-existing user folder of the same name.
        download_path = tempfile.mkdtemp(prefix="llamaparse_images_")
        try:
            images = await self.aget_images(json_result, download_path=download_path)

            for image in images:
                image_path = image["path"]
                try:
                    s3_url = self._s3_image_saver.save_image(image_path, title)
                    if s3_url:
                        image["image_link"] = s3_url
                except Exception as e:
                    # Best-effort per image: log and continue with the rest.
                    print(f"Error saving image to S3: {image_path} - {e}")
        finally:
            # Always clean up the scratch directory, even when downloads fail.
            shutil.rmtree(download_path, ignore_errors=True)

        return images

    def get_images(self, json_result: List[dict], title) -> List[dict]:
        """Synchronous wrapper: download images and save them to S3."""
        try:
            return asyncio.run(self.aget_images_s3(json_result, title))
        except RuntimeError as e:
            # Translate the "event loop already running" error into the
            # actionable nest_asyncio hint, as LlamaParse does elsewhere.
            if nest_asyncio_err in str(e):
                raise RuntimeError(nest_asyncio_msg)
            else:
                raise e

    @staticmethod
    def get_single_job_id(json_result: List[dict]) -> Optional[str]:
        """Return the ``job_id`` of the first result, or None for an empty list.

        Declared as a staticmethod: the original definition lacked ``self``,
        so any instance or class call mis-bound its argument and failed.
        """
        if json_result:
            return json_result[0].get("job_id")
        return None

    async def aget_json(
        self,
        file_path: Union[List[FileInput], FileInput],
        extra_info: Optional[dict] = None,
    ) -> List[dict]:
        """Parse one input or a list of inputs and return raw JSON results.

        Args:
            file_path: A path (str/Path), raw bytes, a buffer, or a list of any
                of these.
            extra_info: Optional metadata forwarded to the parser.

        Returns:
            A flat list of per-job result dicts.
        """
        if isinstance(file_path, (str, Path, bytes, BufferedIOBase)):
            return await self._aget_json(file_path, extra_info=extra_info)
        elif isinstance(file_path, list):
            jobs = [self._aget_json(f, extra_info=extra_info) for f in file_path]
            try:
                results = await run_jobs(
                    jobs,
                    workers=self.num_workers,
                    desc="Parsing files",
                    show_progress=self.show_progress,
                )

                # Each job yields a list; flatten into a single result list.
                return [item for sublist in results for item in sublist]
            except RuntimeError as e:
                if nest_asyncio_err in str(e):
                    raise RuntimeError(nest_asyncio_msg)
                else:
                    raise e
        else:
            # Message fixed: bytes/Path/buffers are accepted too, not just str.
            raise ValueError(
                "The input file_path must be a string, Path, bytes, buffer, "
                "or a list of these."
            )

    async def _aload_data(
        self,
        job_id,
        extra_info: Optional[dict] = None,
        verbose: bool = False,
    ) -> List[Document]:
        """Fetch the result of an existing parse job and wrap it as Document(s).

        Args:
            job_id: Identifier of an already-submitted parse job.
            extra_info: Metadata attached to the produced Document(s).
            verbose: Forwarded to the job-result fetch.

        Returns:
            One Document, or per-page sub-documents when ``self.split_by_page``.
            Returns ``[]`` on error when ``self.ignore_errors`` is set.
        """
        try:
            result = await self._get_job_result(
                job_id, self.result_type.value, verbose=verbose
            )

            docs = [
                Document(
                    text=result[self.result_type.value],
                    metadata=extra_info or {},
                )
            ]
            if self.split_by_page:
                return self._get_sub_docs(docs)
            else:
                return docs

        except Exception as e:
            print("Error while parsing the file:", e)
            if self.ignore_errors:
                return []
            raise

    async def aload_data(
        self,
        job_id,
        extra_info: Optional[dict] = None,
    ) -> List[Document]:
        """Async entry point: load Documents for an existing *job_id*."""
        try:
            return await self._aload_data(
                job_id, extra_info=extra_info, verbose=self.verbose
            )

        except RuntimeError as e:
            if nest_asyncio_err in str(e):
                raise RuntimeError(nest_asyncio_msg)
            else:
                raise e

    def load_data(
        self,
        job_id,
        extra_info: Optional[dict] = None,
    ) -> List[Document]:
        """Synchronous entry point: load Documents for an existing *job_id*."""
        try:
            return asyncio.run(self.aload_data(job_id, extra_info))
        except RuntimeError as e:
            if nest_asyncio_err in str(e):
                raise RuntimeError(nest_asyncio_msg)
            else:
                raise e