import os
import nest_asyncio
from io import BytesIO
from typing import List

from dotenv import load_dotenv
from fastapi import UploadFile

from llama_index.core.schema import Document
from script.get_metadata import Metadata
from core.prompt import PARSER_INSTRUCTION
from service.llamaparse import LlamaParseWithS3, S3ImageSaver
from utils.error_handlers import handle_error, handle_exception
from fastapi.responses import JSONResponse

load_dotenv()


nest_asyncio.apply()


def get_documents(json_list: List[dict]):
    """Build one Document per parsed page, keeping the page number as metadata."""
    text_documents = []
    try:
        for page in json_list:
            text_document = Document(text=page["md"], metadata={"page": page["page"]})
            text_documents.append(text_document)
        return text_documents
    except Exception as e:
        return handle_error(
            e, "Error processing file in get_documents", status_code=400
        )


def parse_journal(title, content: bytes, file_name: str, lang: str = "en"):
    """Parse the journal using LlamaParse."""
    try:
        # Initialize the parser
        s3_image_saver = S3ImageSaver(
            bucket_name=os.getenv("S3_BUCKET_NAME"),
            access_key=os.getenv("AWS_ACCESS_KEY_ID"),
            secret_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
            region_name="us-west-2",
        )
        print("s3 image saver",s3_image_saver)

        s3_parser = LlamaParseWithS3(
            api_key=os.getenv(
                "LLAMA_PARSE_API_KEY"
            ),  # can also be set in your env as LLAMA_CLOUD_API_KEY
            parsing_instruction=PARSER_INSTRUCTION,
            result_type="markdown",  # "markdown" and "text" are available
            verbose=True,
            language=lang,  # Optionally you can define a language, default=en
            s3_image_saver=s3_image_saver,
        )
        
        md_json_objs = s3_parser.get_json_result(
            content, extra_info={"file_name": file_name}
        )
        
        json_list = md_json_objs[0]["pages"]

        image_dicts = s3_parser.get_images(md_json_objs, title)
        
        if isinstance(image_dicts, JSONResponse):
            image_urls = image_dicts  # Propagate the error response to the caller
        else:
            image_urls = [
                {"page_number": img["page_number"], "image_link": img["image_link"]}
                for img in image_dicts
                if img["image_link"] is not None
            ]

        return json_list, image_urls

    except Exception as e:
        return handle_error(
            e, "Error processing file in parse_journal", status_code=400
        )


async def upload_file(reference, file: UploadFile, lang: str = "en"):
    try:
        # Read the binary content of the uploaded file once
        content = await file.read()

        # Store the file content in a BytesIO stream for reuse later
        file_stream = BytesIO(content)

        # Parse the journal
        title = reference["title"]

        parsed = parse_journal(title, content, file.filename, lang)
        if isinstance(parsed, JSONResponse):
            return parsed  # parse_journal itself failed; return the error response

        json_list, image_urls = parsed
        if isinstance(image_urls, JSONResponse):
            return image_urls  # Image extraction failed; return the error response

        parsed_documents = get_parsed_documents(json_list, image_urls)

        metadata_gen = Metadata(reference)
        documents_with_metadata = metadata_gen.apply_metadata(parsed_documents)

        print("Number of documents:", len(documents_with_metadata))

        # Return the documents with metadata and the reusable file stream
        return documents_with_metadata, file_stream

    except Exception as e:
        print("Error in upload_file:", e)
        return handle_exception(e)

def get_parsed_documents(json_dicts=None, image_links=None):
    """Build one Document per page, attaching any image links for that page as metadata."""
    try:
        parsed_documents = []

        # Group image links by page number so each page carries all of its images
        image_link_dict = {}
        if image_links:
            for item in image_links:
                page_number = item["page_number"]
                image_link = item["image_link"]
                if page_number in image_link_dict:
                    image_link_dict[page_number].append(image_link)
                else:
                    image_link_dict[page_number] = [image_link]

        # Extract the markdown text of each page
        md_texts = [d["md"] for d in json_dicts] if json_dicts is not None else []

        for idx, md_text in enumerate(md_texts):
            page_number = idx + 1
            chunk_metadata = {"page_number": page_number}

            # Attach the page's image links if any exist; otherwise an empty list
            chunk_metadata["image_links"] = image_link_dict.get(page_number, [])

            # Add parsed text and create the Document object
            parsed_document = Document(
                text=md_text,
                metadata=chunk_metadata,
            )

            parsed_documents.append(parsed_document)

        return parsed_documents
    except Exception as e:
        return handle_error(
            e, "Error processing documents in get_parsed_documents", status_code=400
        )
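

# ---------------------------------------------------------------------------
# Minimal local usage sketch (illustrative only). It assumes the required
# environment variables (S3_BUCKET_NAME, AWS_ACCESS_KEY_ID,
# AWS_SECRET_ACCESS_KEY, LLAMA_PARSE_API_KEY) are set and that "journal.pdf"
# is a hypothetical local file; it is not part of the FastAPI request flow
# above.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    with open("journal.pdf", "rb") as f:
        pdf_bytes = f.read()

    result = parse_journal("Sample Journal", pdf_bytes, "journal.pdf")
    if isinstance(result, JSONResponse):
        print("Parsing failed:", result.body)
    else:
        json_list, image_urls = result
        if isinstance(image_urls, JSONResponse):
            print("Image extraction failed:", image_urls.body)
        else:
            documents = get_parsed_documents(json_list, image_urls)
            print("Parsed", len(documents), "pages")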