from llama_index.core.ingestion import IngestionPipeline
from llama_index.embeddings.openai import OpenAIEmbedding
from config import PINECONE_CONFIG
from pinecone.grpc import PineconeGRPC as Pinecone
# from service.reader import Reader
from script.get_metadata import Metadata
from fastapi import UploadFile, status
from fastapi.responses import JSONResponse

from llama_index.core.node_parser import (
    SentenceSplitter,
    SemanticSplitterNodeParser,
)
from service.reader_v3 import upload_file

# from script.get_topic import extract_topic

import logging
import random


class Uploader:
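    """Ingests an uploaded file: reads its contents, attaches reference metadata,
    and splits it into nodes ready for vector indexing."""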
    # def __init__(self, reference, file: UploadFile, content_table: UploadFile):
    def __init__(self, reference, file: UploadFile):
        self.file = file
        # self.content_table = content_table
        # self.reader = Reader()
        self.reference = reference
        self.metadata = Metadata(reference)

    # async def ingest_documents(self, file: UploadFile):
    #     """Load documents from the storage path."""
    #     documents = await self.reader.read_from_uploadfile(file)
    #     print("Banyak document : ", len(documents))
    #     print("document successfully ingested")

    #     return documents

    def check_existing_metadata(self, pinecone_index, title, random_vector):
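        """Query the Pinecone index for a document whose title matches `title`.

        Returns the list of matches (empty if no document with that title exists),
        or a JSONResponse with a 500 status if the query fails.
        """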
        try:
            result = pinecone_index.query(
                vector=random_vector,
                top_k=1,
                filter={
                    "title": {"$eq": title},
                },
            )
            return result["matches"]
        except Exception as e:
            return JSONResponse(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                content=f"Error check existing metadata {str(e)}",
            )

    async def process_documents(self):
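        """Read the uploaded file, attach metadata, and split it into nodes.

        Tries a semantic splitter first and falls back to a fixed-size
        SentenceSplitter if that fails; returns the nodes, or a JSONResponse
        if both attempts raise.
        """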
        # Ingest documents
        # documents = await self.ingest_documents(self.file)

        # Get metadata
        # documents_with_metadata = self.metadata.apply_metadata(documents)
        documents_with_metadata = await upload_file(self.reference, self.file)

        # Get Topic
        # topic_extractor = extract_topic(self.reference, self.content_table)
        # document_filtered = self.filter_document(documents_with_metadata)

        embed_model = OpenAIEmbedding()
        # Set up the ingestion pipeline
        pipeline = IngestionPipeline(
            transformations=[
                SemanticSplitterNodeParser(
                    buffer_size=1,
                    breakpoint_percentile_threshold=95,
                    embed_model=embed_model,
                ),
                # topic_extractor,
            ]
        )

        # splitter = SemanticSplitterNodeParser(
        #     buffer_size=1, breakpoint_percentile_threshold=95, embed_model=embed_model
        # )

        # Run the pipeline
        try:
            nodes_with_metadata = pipeline.run(documents=documents_with_metadata)
            # nodes_with_metadata = splitter.get_nodes_from_documents(documents_with_metadata)
            return nodes_with_metadata

        except Exception as e:
            # Log the first failure, then fall back to a fixed-size sentence splitter.
            logging.warning(
                f"Semantic splitting failed: {e}; falling back to SentenceSplitter."
            )
            try:
                sentence_splitter = SentenceSplitter(chunk_size=512)
                nodes_with_metadata = sentence_splitter.get_nodes_from_documents(
                    documents_with_metadata
                )
                print("Pipeline processing completed with SentenceSplitter fallback.")
                return nodes_with_metadata
            except Exception as fallback_error:
                # Log the second error and return JSONResponse for FastAPI
                logging.error(f"Error with SentenceSplitter fallback: {fallback_error}")
                return JSONResponse(
                    status_code=500,
                    content="An internal server error occurred during pipeline processing.",
                )

    def filter_document(self, documents):
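        """Return only the documents whose title is not already in the Pinecone index."""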
        api_key = PINECONE_CONFIG.PINECONE_API_KEY
        client = Pinecone(api_key=api_key)
        pinecone_index = client.Index("test")

        random_vector = [random.uniform(0, 1) for _ in range(1536)]
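        # Pinecone queries need a vector even for metadata-only lookups, so a
        # throwaway random vector with the OpenAI embedding dimension (1536) is used.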

        filtered_documents = []
        for doc in documents:
            result = self.check_existing_metadata(
                pinecone_index, doc.metadata["title"], random_vector
            )

            # check_existing_metadata returns a JSONResponse on failure; only an
            # empty match list means the title has not been indexed yet.
            if not isinstance(result, JSONResponse) and len(result) == 0:
                filtered_documents.append(doc)

        return filtered_documents
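

# Minimal usage sketch (not part of this module): how Uploader might be wired
# into a FastAPI endpoint. The route path and form/file parameter names below
# are illustrative assumptions, not defined elsewhere in this codebase.
#
# from fastapi import FastAPI, File, Form, UploadFile
#
# app = FastAPI()
#
# @app.post("/upload")
# async def upload(reference: str = Form(...), file: UploadFile = File(...)):
#     uploader = Uploader(reference, file)
#     nodes = await uploader.process_documents()
#     if isinstance(nodes, JSONResponse):
#         # The pipeline reported an error as a response; pass it straight through.
#         return nodes
#     return {"nodes_created": len(nodes)}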