from llama_index.core.ingestion import IngestionPipeline
from llama_index.embeddings.openai import OpenAIEmbedding
from config import PINECONE_CONFIG
from pinecone.grpc import PineconeGRPC as Pinecone
from service.reader import Reader
from script.get_metadata import Metadata
from fastapi import UploadFile, status
from fastapi.responses import JSONResponse

from llama_index.core.node_parser import (
    SemanticSplitterNodeParser,
)

# from script.get_topic import extract_topic

import logging
import random


class Uploader:
    """Ingests an uploaded file, applies metadata, and splits it into nodes."""

    # def __init__(self, reference, file: UploadFile, content_table: UploadFile):
    def __init__(self, reference, file: UploadFile):
        self.file = file
        # self.content_table = content_table
        self.reader = Reader()
        self.reference = reference
        self.metadata = Metadata(reference)

    async def ingest_documents(self, file: UploadFile):
        """Load documents from the uploaded file."""
        documents = await self.reader.read_from_uploadfile(file)
        print("Number of documents:", len(documents))
        print("Documents successfully ingested")

        return documents
    
    def check_existing_metadata(self, pinecone_index, title, random_vector):
        """Query the index for any vector whose metadata title matches `title`."""
        try:
            result = pinecone_index.query(
                vector=random_vector,
                top_k=1,
                filter={
                    "title": {"$eq": title},
                },
            )
            return result["matches"]
        except Exception as e:
            return JSONResponse(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                content=f"Error checking existing metadata: {str(e)}",
            )

    async def process_documents(self):
        """Read the uploaded file, attach metadata, and split it into nodes."""
        # Ingest documents
        documents = await self.ingest_documents(self.file)

        # topic_extractor = extract_topic(self.reference, self.content_table)

        embed_model = OpenAIEmbedding()

        # Apply metadata to the ingested documents
        documents_with_metadata = self.metadata.apply_metadata(documents)
        
        # document_filtered = self.filter_document(documents_with_metadata)

        # Set up the ingestion pipeline
        pipeline = IngestionPipeline(
            transformations=[
                SemanticSplitterNodeParser(
                    buffer_size=1,
                    breakpoint_percentile_threshold=95,
                    embed_model=embed_model,
                ),
                # topic_extractor,
            ]
        )
        
        # splitter = SemanticSplitterNodeParser(
        #     buffer_size=1, breakpoint_percentile_threshold=95, embed_model=embed_model
        # )

        # Run the pipeline
        try:
            nodes_with_metadata = pipeline.run(documents=documents_with_metadata)
            
            return nodes_with_metadata
        
        except Exception as e:
            # Log the error and return a JSONResponse for FastAPI
            logging.error(f"An error occurred while running the ingestion pipeline: {e}")
            return JSONResponse(
                status_code=500,
                content="An internal server error occurred while running the ingestion pipeline.",
            )
            
    def filter_document(self, documents):
        """Drop documents whose title already exists in the Pinecone index."""
        api_key = PINECONE_CONFIG.PINECONE_API_KEY
        client = Pinecone(api_key=api_key)
        pinecone_index = client.Index("test")

        # A random query vector is enough here; only the metadata filter matters
        # when checking whether a title is already present in the index.
        random_vector = [random.uniform(0, 1) for _ in range(1536)]

        filtered_documents = []
        for doc in documents:
            result = self.check_existing_metadata(
                pinecone_index, doc.metadata["title"], random_vector
            )

            # Propagate the error response if the metadata check itself failed
            if isinstance(result, JSONResponse):
                return result

            if len(result) == 0:
                filtered_documents.append(doc)

        return filtered_documents
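
# --- Illustrative usage sketch (not part of the original module) ---
# A minimal example of how Uploader might be wired into a FastAPI route.
# The app instance, route path, and response shape below are assumptions;
# only Uploader and JSONResponse come from this file.
#
# from fastapi import FastAPI, UploadFile
# from fastapi.responses import JSONResponse
#
# app = FastAPI()
#
# @app.post("/documents/{reference}")
# async def upload_document(reference: str, file: UploadFile):
#     uploader = Uploader(reference, file)
#     nodes = await uploader.process_documents()
#     if isinstance(nodes, JSONResponse):
#         return nodes  # pipeline failed; propagate the error response
#     return {"ingested_nodes": len(nodes)}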