import os
from io import BytesIO
from typing import List

import nest_asyncio
from dotenv import load_dotenv
from fastapi import UploadFile
from fastapi.responses import JSONResponse
from llama_index.core.schema import Document

from core.prompt import PARSER_INSTRUCTION
from script.get_metadata import Metadata
from service.llamaparse import LlamaParseWithS3, S3ImageSaver
from utils.error_handlers import handle_error, handle_exception
load_dotenv()
nest_asyncio.apply()
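
# Environment variables this module reads (loaded from .env above):
#   S3_BUCKET_NAME, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY - S3 image storage
#   LLAMA_PARSE_API_KEY - LlamaParse API access (LLAMA_CLOUD_API_KEY also works)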

def get_documents(json_list: List[dict]):
    """Convert LlamaParse page dicts into Document objects with page metadata."""
    text_documents = []
    try:
        for page in json_list:
            text_document = Document(text=page["md"], metadata={"page": page["page"]})
            text_documents.append(text_document)
        return text_documents
    except Exception as e:
        return handle_error(
            e, "Error processing file in get_documents", status_code=400
        )

def parse_journal(title, content: bytes, file_name: str, lang: str = "en"):
    """Parse the journal using LlamaParse and save extracted images to S3."""
    try:
        # Initialize the S3 image saver and the parser
        s3_image_saver = S3ImageSaver(
            bucket_name=os.getenv("S3_BUCKET_NAME"),
            access_key=os.getenv("AWS_ACCESS_KEY_ID"),
            secret_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
            region_name="us-west-2",
        )
        print("s3 image saver", s3_image_saver)
        s3_parser = LlamaParseWithS3(
            api_key=os.getenv(
                "LLAMA_PARSE_API_KEY"
            ),  # can also be set in your env as LLAMA_CLOUD_API_KEY
            parsing_instruction=PARSER_INSTRUCTION,
            result_type="markdown",  # "markdown" and "text" are available
            verbose=True,
            language=lang,  # optionally define a language, default="en"
            s3_image_saver=s3_image_saver,
        )
        md_json_objs = s3_parser.get_json_result(
            content, extra_info={"file_name": file_name}
        )
        json_list = md_json_objs[0]["pages"]
        image_dicts = s3_parser.get_images(md_json_objs, title)
        if isinstance(image_dicts, JSONResponse):
            # Propagate the error response; the caller checks for it
            image_urls = image_dicts
        else:
            image_urls = [
                {"page_number": img["page_number"], "image_link": img["image_link"]}
                for img in image_dicts
                if img["image_link"] is not None
            ]
        return json_list, image_urls
    except Exception as e:
        return handle_error(
            e, "Error processing file in parse_journal", status_code=400
        )

async def upload_file(reference, file: UploadFile, lang: str = "en"):
    try:
        # Read the binary content of the uploaded file once
        content = await file.read()
        # Store the file content in a BytesIO stream for reuse later
        file_stream = BytesIO(content)
        # Parse the journal
        title = reference["title"]
        result = parse_journal(title, content, file.filename, lang)
        if isinstance(result, JSONResponse):
            return result  # parse_journal failed; return the error response directly
        json_list, image_urls = result
        if isinstance(image_urls, JSONResponse):
            return image_urls  # image extraction failed; return the error response
        parsed_documents = get_parsed_documents(json_list, image_urls)
        metadata_gen = Metadata(reference)
        documents_with_metadata = metadata_gen.apply_metadata(parsed_documents)
        print("Number of documents:\n", len(documents_with_metadata))
        # Return both the parsed documents and the reusable file stream
        return documents_with_metadata, file_stream
    except Exception as e:
        print("error:", e)
        return handle_exception(e)

def get_parsed_documents(json_dicts=None, image_links=None):
    """Build one Document per page, attaching any S3 image links as metadata."""
    try:
        parsed_documents = []
        # Extract the markdown text for each page
        md_texts = [d["md"] for d in json_dicts] if json_dicts is not None else []
        # Map each page number to its list of image links
        image_link_dict = {}
        if image_links:
            for item in image_links:
                page_number = item["page_number"]
                image_link = item["image_link"]
                if page_number in image_link_dict:
                    image_link_dict[page_number].append(image_link)
                else:
                    image_link_dict[page_number] = [image_link]
        for idx, md_text in enumerate(md_texts):
            page_number = idx + 1
            chunk_metadata = {"page_number": page_number}
            # Attach the page's image links if any exist; otherwise an empty list
            chunk_metadata["image_links"] = image_link_dict.get(page_number, [])
            # Add parsed text and create the Document object
            parsed_document = Document(
                text=md_text,
                metadata=chunk_metadata,
            )
            parsed_documents.append(parsed_document)
        return parsed_documents
    except Exception as e:
        return handle_error(
            e, "Error processing documents in get_parsed_documents", status_code=400
        )
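

# --- Illustrative usage: a minimal sketch, not part of this module ---
# How `upload_file` might be wired into a FastAPI route. The router, endpoint
# path, and `reference` payload shape below are assumptions for illustration.
#
# from fastapi import APIRouter
#
# router = APIRouter()
#
# @router.post("/journals/upload")
# async def upload_journal(file: UploadFile):
#     reference = {"title": file.filename}  # hypothetical reference payload
#     result = await upload_file(reference, file)
#     if isinstance(result, JSONResponse):
#         return result  # an error response from the parsing pipeline
#     documents, file_stream = result
#     return {"num_documents": len(documents)}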