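"""LlamaParse extensions that mirror extracted page images into Amazon S3.

S3ImageSaver wraps a boto3 client for uploading images to a bucket, and
LlamaParseWithS3 overrides the LlamaParse image/data loaders so parsed
images are downloaded to a temporary folder, pushed to S3, and annotated
with their public URLs.
"""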
import os
import shutil
import asyncio
from urllib.parse import quote
from dotenv import load_dotenv
from io import BufferedIOBase
from typing import List, Optional, Union
from pathlib import Path
from botocore.exceptions import ClientError
from botocore.config import Config
from boto3.session import Session
from pydantic import PrivateAttr
from llama_index.core.async_utils import run_jobs
from llama_parse import LlamaParse
from llama_parse.utils import (
nest_asyncio_err,
nest_asyncio_msg,
)
from llama_index.core.schema import Document
load_dotenv()
FileInput = Union[str, bytes, BufferedIOBase]
class S3ImageSaver:
def __init__(self, bucket_name, access_key=None, secret_key=None, region_name=None):
self.bucket_name = bucket_name
self.region_name = region_name
self.session = Session(
aws_access_key_id=access_key,
aws_secret_access_key=secret_key,
region_name=self.region_name,
)
self.s3_client = self.session.client(
"s3", config=Config(signature_version="s3v4", region_name=self.region_name)
)
    def save_image(self, image_path, title):
        """Upload an image to the S3 bucket and return its public URL."""
        try:
            print("---Saving Images---")
            s3_key = f"images/{title}/{os.path.basename(image_path)}"
            with open(image_path, "rb") as file:
                self.s3_client.upload_fileobj(file, self.bucket_name, s3_key)
            # Percent-encode the key for the URL (the title or filename may
            # contain spaces); quote() leaves "/" intact by default.
            s3_url = (
                f"https://{self.bucket_name}.s3.{self.region_name}.amazonaws.com/"
                f"{quote(s3_key)}"
            )
            print(f"Image saved to S3 bucket: {s3_url}")
            return s3_url
        except ClientError as e:
            print(f"Error saving image to S3: {e}")
            return None
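# Example (hypothetical bucket and file names):
#   saver = S3ImageSaver(bucket_name="my-bucket", region_name="us-west-2")
#   url = saver.save_image("tmp/page_1.png", "Quarterly Report")
#   -> "https://my-bucket.s3.us-west-2.amazonaws.com/images/Quarterly%20Report/page_1.png"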
class LlamaParseWithS3(LlamaParse):
_s3_image_saver: S3ImageSaver = PrivateAttr()
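    # If no saver is passed in, one is built from environment variables
    # (S3_BUCKET_NAME, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY); the
    # region is currently fixed to us-west-2.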
def __init__(self, *args, s3_image_saver=None, **kwargs):
super().__init__(*args, **kwargs)
self._s3_image_saver = s3_image_saver or S3ImageSaver(
bucket_name=os.getenv("S3_BUCKET_NAME"),
access_key=os.getenv("AWS_ACCESS_KEY_ID"),
secret_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
region_name="us-west-2",
)
async def aget_images(
self, json_result: List[dict], download_path: str
) -> List[dict]:
"""Download images from the parsed result."""
headers = {"Authorization": f"Bearer {self.api_key}"}
        # Create the download directory if it does not already exist.
        os.makedirs(download_path, exist_ok=True)
try:
images = []
for result in json_result:
job_id = result["job_id"]
for page in result["pages"]:
if self.verbose:
print(f"> Image for page {page['page']}: {page['images']}")
for image in page["images"]:
                        image_name = image["name"]
                        # Build the local path; default to .png when the name
                        # has no recognized image extension.
                        image_path = os.path.join(download_path, image_name)
                        if not image_path.endswith((".png", ".jpg")):
                            image_path += ".png"
image["path"] = image_path
image["job_id"] = job_id
image["original_file_path"] = result.get("file_path", None)
image["page_number"] = page["page"]
                        # Fetch the image bytes first, then write them to disk,
                        # so a failed request does not leave an empty file.
                        image_url = f"{self.base_url}/api/parsing/job/{job_id}/result/image/{image_name}"
                        async with self.client_context() as client:
                            res = await client.get(
                                image_url, headers=headers, timeout=self.max_timeout
                            )
                            res.raise_for_status()
                        with open(image_path, "wb") as f:
                            f.write(res.content)
images.append(image)
return images
except Exception as e:
print("Error while downloading images from the parsed result:", e)
if self.ignore_errors:
return []
else:
raise e
    async def aget_images_s3(self, json_result: List[dict], title) -> List[dict]:
        """Download parsed images to a temporary folder, then upload them to S3."""
        # Download to a temporary location first.
        images = await self.aget_images(json_result, download_path="tmp/")
# Process each image and upload to S3
for image in images:
image_path = image["path"]
try:
s3_url = self._s3_image_saver.save_image(image_path, title)
if s3_url:
image["image_link"] = s3_url
except Exception as e:
print(f"Error saving image to S3: {image_path} - {e}")
# After processing all images, delete the tmp folder
folder_path = "tmp/"
try:
shutil.rmtree(folder_path) # Deletes the folder and all its contents
print(f"Folder {folder_path} and all its contents were deleted successfully.")
except Exception as e:
print(f"Error deleting folder {folder_path}: {e}")
return images
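    # Each returned image dict carries: name, path (local, now deleted),
    # job_id, original_file_path, page_number, and image_link (the S3 URL,
    # when the upload succeeded).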
def get_images(self, json_result: List[dict], title) -> List[dict]:
"""Download images from the parsed result and save them to S3."""
try:
return asyncio.run(self.aget_images_s3(json_result, title))
except RuntimeError as e:
if nest_asyncio_err in str(e):
raise RuntimeError(nest_asyncio_msg)
else:
raise e
    @staticmethod
    def get_single_job_id(json_result):
        """Return the job_id of the first result, or None if the list is empty."""
        if json_result:
            return json_result[0].get("job_id")
        return None
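    # e.g. get_single_job_id([{"job_id": "abc123", "pages": [...]}]) -> "abc123"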
    # Variant of _aget_json for use when the job id is already known:
# async def _aget_json(
# self, job_id, file_path: FileInput, extra_info: Optional[dict] = None
# ) -> List[dict]:
# """Load data from the input path."""
# try:
# if self.verbose:
# print("Started parsing the file under job_id %s" % job_id)
# result = await self._get_job_result(job_id, "json")
# result["job_id"] = job_id
# if not isinstance(file_path, (bytes, BufferedIOBase)):
# result["file_path"] = str(file_path)
# return [result]
# except Exception as e:
# file_repr = file_path if isinstance(file_path, str) else "<bytes/buffer>"
# print(f"Error while parsing the file '{file_repr}':", e)
# if self.ignore_errors:
# return []
# else:
# raise e
async def aget_json(
self,
file_path: Union[List[FileInput], FileInput],
extra_info: Optional[dict] = None,
) -> List[dict]:
"""Load data from the input path."""
if isinstance(file_path, (str, Path, bytes, BufferedIOBase)):
return await self._aget_json(file_path, extra_info=extra_info)
            # Variant used when the job id is already known:
# return await self._aget_json(
# job_id="cda0870a-b896-4140-84ea-1565e1aa1565",
# file_path=file_path,
# extra_info=extra_info,
# )
elif isinstance(file_path, list):
jobs = [self._aget_json(f, extra_info=extra_info) for f in file_path]
try:
results = await run_jobs(
jobs,
workers=self.num_workers,
desc="Parsing files",
show_progress=self.show_progress,
)
# return flattened results
return [item for sublist in results for item in sublist]
except RuntimeError as e:
if nest_asyncio_err in str(e):
raise RuntimeError(nest_asyncio_msg)
else:
raise e
        else:
            raise ValueError(
                "The input file_path must be a string, Path, bytes, buffer, "
                "or a list of these."
            )
async def _aload_data(
self,
job_id,
extra_info: Optional[dict] = None,
verbose: bool = False,
) -> List[Document]:
"""Load data from the input path."""
try:
result = await self._get_job_result(
job_id, self.result_type.value, verbose=verbose
)
docs = [
Document(
text=result[self.result_type.value],
metadata=extra_info or {},
)
]
if self.split_by_page:
return self._get_sub_docs(docs)
else:
return docs
except Exception as e:
print(f"Error while parsing the file :", e)
if self.ignore_errors:
return []
else:
raise e
async def aload_data(
self,
job_id,
extra_info: Optional[dict] = None,
) -> List[Document]:
"""Load data from the input path."""
try:
return await self._aload_data(
job_id, extra_info=extra_info, verbose=self.verbose
)
except RuntimeError as e:
if nest_asyncio_err in str(e):
raise RuntimeError(nest_asyncio_msg)
else:
raise e
def load_data(
self,
job_id,
extra_info: Optional[dict] = None,
) -> List[Document]:
"""Load data from the input path."""
try:
return asyncio.run(self.aload_data(job_id, extra_info))
except RuntimeError as e:
if nest_asyncio_err in str(e):
raise RuntimeError(nest_asyncio_msg)
else:
raise e
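

# Minimal end-to-end sketch (assumptions: LLAMA_CLOUD_API_KEY, S3_BUCKET_NAME,
# AWS_ACCESS_KEY_ID, and AWS_SECRET_ACCESS_KEY are set in the environment, and
# "report.pdf" is a hypothetical local file).
if __name__ == "__main__":
    parser = LlamaParseWithS3(result_type="markdown", verbose=True)
    # Parse the file and collect the raw JSON results.
    json_result = asyncio.run(parser.aget_json("report.pdf"))
    # Upload the extracted images to S3 and print their public URLs.
    for image in parser.get_images(json_result, title="report"):
        print(image.get("image_link"))
    # Load the parsed text as Documents for the same job.
    job_id = LlamaParseWithS3.get_single_job_id(json_result)
    if job_id:
        docs = parser.load_data(job_id)
        print(f"Loaded {len(docs)} document(s) for job {job_id}")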