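"""Uploader service: reads an uploaded file, attaches metadata, splits it
into nodes (semantic splitting with a sentence-splitter fallback), and can
filter out documents already present in the Pinecone index."""
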
import logging
import random

from fastapi import UploadFile, status
from fastapi.responses import JSONResponse
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import (
    SentenceSplitter,
    SemanticSplitterNodeParser,
)
from llama_index.embeddings.openai import OpenAIEmbedding
from pinecone.grpc import PineconeGRPC as Pinecone

from config import PINECONE_CONFIG
from script.get_metadata import Metadata
# from service.reader import Reader
from service.reader_v3 import upload_file
# from script.get_topic import extract_topic

class Uploader:
    # def __init__(self, reference, file: UploadFile, content_table: UploadFile):
    def __init__(self, reference, file: UploadFile):
        self.file = file
        # self.content_table = content_table
        # self.reader = Reader()
        self.reference = reference
        self.metadata = Metadata(reference)

    # async def ingest_documents(self, file: UploadFile):
    #     """Load documents from the storage path."""
    #     documents = await self.reader.read_from_uploadfile(file)
    #     print("Number of documents:", len(documents))
    #     print("Documents successfully ingested")
    #     return documents
    def check_existing_metadata(self, pinecone_index, title, random_vector):
        """Return Pinecone matches whose metadata title equals `title`."""
        try:
            result = pinecone_index.query(
                vector=random_vector,
                top_k=1,
                filter={
                    "title": {"$eq": title},
                },
            )
            return result["matches"]
        except Exception as e:
            # Returning a JSONResponse here would break callers that expect a
            # list (filter_document calls len() on the result), so log the
            # error and treat the document as not yet indexed.
            logging.error(f"Error checking existing metadata: {e}")
            return []
    async def process_documents(self):
        # Ingest documents
        # documents = await self.ingest_documents(self.file)

        # Get metadata
        # documents_with_metadata = self.metadata.apply_metadata(documents)
        documents_with_metadata = await upload_file(self.reference, self.file)

        # Get topic
        # topic_extractor = extract_topic(self.reference, self.content_table)
        # document_filtered = self.filter_document(documents_with_metadata)

        embed_model = OpenAIEmbedding()

        # Set up the ingestion pipeline: split documents into nodes at points
        # where embedding similarity between adjacent sentences drops below
        # the 95th-percentile breakpoint.
        pipeline = IngestionPipeline(
            transformations=[
                SemanticSplitterNodeParser(
                    buffer_size=1,
                    breakpoint_percentile_threshold=95,
                    embed_model=embed_model,
                ),
                # topic_extractor,
            ]
        )
        # splitter = SemanticSplitterNodeParser(
        #     buffer_size=1, breakpoint_percentile_threshold=95, embed_model=embed_model
        # )

        # Run the pipeline
        try:
            nodes_with_metadata = pipeline.run(documents=documents_with_metadata)
            # nodes_with_metadata = splitter.get_nodes_from_documents(documents_with_metadata)
            return nodes_with_metadata
        except Exception as e:
            # If semantic splitting fails, fall back to a fixed-size sentence splitter.
            logging.warning(f"SemanticSplitterNodeParser failed, falling back: {e}")
            try:
                sentence_splitter = SentenceSplitter(chunk_size=512)
                nodes_with_metadata = sentence_splitter.get_nodes_from_documents(
                    documents_with_metadata
                )
                logging.info("Pipeline processing completed with SentenceSplitter fallback.")
                return nodes_with_metadata
            except Exception as fallback_error:
                # Log the second error and return a JSONResponse for FastAPI
                logging.error(f"Error with SentenceSplitter fallback: {fallback_error}")
                return JSONResponse(
                    status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                    content="An internal server error occurred during pipeline processing.",
                )
    def filter_document(self, documents):
        """Keep only documents whose title is not already in the Pinecone index."""
        api_key = PINECONE_CONFIG.PINECONE_API_KEY
        client = Pinecone(api_key=api_key)
        pinecone_index = client.Index("test")

        # Metadata filtering still requires a query vector; a random one
        # suffices because only the filter matches matter, not similarity.
        random_vector = [random.uniform(0, 1) for _ in range(1536)]

        filtered_documents = []
        for doc in documents:
            result = self.check_existing_metadata(
                pinecone_index, doc.metadata["title"], random_vector
            )
            if len(result) == 0:
                filtered_documents.append(doc)
        return filtered_documents
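
# --- Usage sketch (illustrative; not part of the original module) ---
# A minimal example of wiring Uploader into a FastAPI route. The app object,
# route path, `reference` type, and response shape below are assumptions.
#
# from fastapi import FastAPI
#
# app = FastAPI()
#
# @app.post("/upload")
# async def upload(reference: str, file: UploadFile):
#     uploader = Uploader(reference, file)
#     nodes = await uploader.process_documents()
#     if isinstance(nodes, JSONResponse):  # both splitters failed
#         return nodes
#     return {"node_count": len(nodes)}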