File size: 4,263 Bytes
9002555
 
 
 
69beac6
9002555
69beac6
d57efd6
9002555
 
b39c0ba
9002555
 
0767396
 
 
9002555
 
 
 
 
 
 
 
 
0767396
9002555
 
69beac6
9002555
 
0767396
9002555
 
 
 
 
 
 
 
 
 
 
 
d57efd6
9002555
d57efd6
9002555
 
 
 
0767396
 
 
 
69beac6
9002555
0767396
 
 
9002555
d57efd6
 
 
 
 
 
 
 
 
9002555
69beac6
d57efd6
 
 
9002555
 
 
0767396
d57efd6
661a3cb
69beac6
9002555
b39c0ba
 
 
69beac6
 
 
b39c0ba
0767396
b39c0ba
 
 
 
 
 
 
69beac6
9002555
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69beac6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
from llama_index.core.ingestion import IngestionPipeline
from llama_index.embeddings.openai import OpenAIEmbedding
from config import PINECONE_CONFIG
from pinecone.grpc import PineconeGRPC as Pinecone
# from service.reader import Reader
from script.get_metadata import Metadata
from fastapi import UploadFile, status
from fastapi.responses import JSONResponse

from llama_index.core.node_parser import (
    SentenceSplitter,
    SemanticSplitterNodeParser,
)
from llama_index.core import Settings
# from service.reader_v3 import upload_file
from service.reader_v4 import upload_file

# from script.get_topic import extract_topic

import logging
import random


class Uploader:
    """Ingests an uploaded file: reads it, splits it into nodes, and can
    filter out documents whose title already exists in the Pinecone index.

    Error convention (kept from the original API): methods return a
    FastAPI ``JSONResponse`` on failure instead of raising, so callers
    must type-check results.
    """

    def __init__(self, reference, file: UploadFile, lang: str = "en"):
        """Store the upload context.

        Args:
            reference: Document reference passed through to metadata
                extraction and the file reader.
            file: The uploaded file to process.
            lang: Language code forwarded to the reader (default "en").
        """
        self.file = file
        self.reference = reference
        self.metadata = Metadata(reference)
        self.lang = lang

    def check_existing_metadata(self, pinecone_index, title, random_vector):
        """Query Pinecone for an existing document with the given title.

        Returns:
            The list of matches (possibly empty) on success, or a
            ``JSONResponse`` (HTTP 500) on failure — callers must check
            the type before using the result.
        """
        try:
            result = pinecone_index.query(
                vector=random_vector,
                top_k=1,
                filter={
                    "title": {"$eq": title},
                },
            )
            return result["matches"]
        except Exception as e:
            # Log before returning so the failure is visible server-side,
            # not only in the HTTP response body.
            logging.error("Error checking existing metadata: %s", e)
            return JSONResponse(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                content=f"Error check existing metadata {str(e)}",
            )

    async def process_documents(self):
        """Read the uploaded file and split it into nodes.

        Tries a semantic splitter first; on any failure falls back to a
        plain sentence splitter.

        Returns:
            ``(nodes, file_stream)`` on success, or a ``JSONResponse``
            (propagated reader error, or HTTP 500 if both splitters fail).
        """
        documents_with_metadata, file_stream = await upload_file(
            self.reference, self.file, self.lang
        )

        if isinstance(documents_with_metadata, JSONResponse):
            return documents_with_metadata  # Propagate the reader's error directly.

        embed_model = OpenAIEmbedding(model="text-embedding-3-large")
        Settings.embed_model = embed_model

        # Semantic splitting: breaks on embedding-similarity drops rather
        # than fixed chunk sizes.
        pipeline = IngestionPipeline(
            transformations=[
                SemanticSplitterNodeParser(
                    buffer_size=1,
                    breakpoint_percentile_threshold=95,
                    embed_model=embed_model,
                ),
            ]
        )

        try:
            nodes_with_metadata = pipeline.run(documents=documents_with_metadata)
            # BUG FIX: the success message used to print *before* run(),
            # claiming completion even when it subsequently failed.
            print("Pipeline processing completed with Semantic Splitter.")
            return nodes_with_metadata, file_stream
        except Exception as semantic_error:
            # Previously this exception was silently discarded; log it so
            # fallback causes are diagnosable.
            logging.warning(
                "Semantic splitter failed, falling back to SentenceSplitter: %s",
                semantic_error,
            )
            try:
                sentence_splitter = SentenceSplitter(chunk_size=512)
                nodes_with_metadata = sentence_splitter.get_nodes_from_documents(
                    documents_with_metadata
                )
                print("Pipeline processing completed with SentenceSplitter fallback.")
                return nodes_with_metadata, file_stream
            except Exception as fallback_error:
                logging.error(f"Error with SentenceSplitter fallback: {fallback_error}")
                return JSONResponse(
                    status_code=500,
                    content="An internal server error occurred during pipeline processing.",
                )

    def filter_document(self, documents):
        """Return only the documents whose title is not already indexed.

        Returns:
            The filtered document list, or a ``JSONResponse`` if the
            metadata lookup fails.
        """
        api_key = PINECONE_CONFIG.PINECONE_API_KEY
        client = Pinecone(api_key=api_key)
        pinecone_index = client.Index("test")

        # A query vector is mandatory for Pinecone's filtered query; a
        # random one suffices since only the match count matters here.
        # NOTE(review): dimension 1536 must match the "test" index — the
        # embed model used elsewhere (text-embedding-3-large) defaults to
        # 3072 dims; confirm the index dimension.
        random_vector = [random.uniform(0, 1) for _ in range(1536)]

        filtered_documents = []
        for doc in documents:
            result = self.check_existing_metadata(
                pinecone_index, doc.metadata["title"], random_vector
            )

            # BUG FIX: check_existing_metadata returns a JSONResponse on
            # failure; the original fell through to len(result) and raised
            # TypeError. Propagate the error response instead.
            if isinstance(result, JSONResponse):
                return result

            if len(result) == 0:
                filtered_documents.append(doc)

        return filtered_documents