"""Uploader service: ingest an uploaded file, attach metadata, and split it
into semantic nodes via a llama-index ingestion pipeline, with optional
deduplication against an existing Pinecone index."""
from llama_index.core.ingestion import IngestionPipeline
from llama_index.embeddings.openai import OpenAIEmbedding
from config import PINECONE_CONFIG
from pinecone.grpc import PineconeGRPC as Pinecone
from service.reader import Reader
from script.get_metadata import Metadata
from fastapi import UploadFile, status
from fastapi.responses import JSONResponse
from llama_index.core.node_parser import (
    SemanticSplitterNodeParser,
)

# from script.get_topic import extract_topic
import logging
import random
class Uploader:
    """Ingest an uploaded file, attach metadata, and split it into
    semantically coherent nodes ready for indexing.

    Typical flow: ``process_documents()`` reads the upload, applies
    metadata, and runs a semantic-splitting ingestion pipeline.
    ``filter_document()`` can drop documents whose title already exists
    in the Pinecone index.
    """

    def __init__(self, reference, file: UploadFile):
        # reference: opaque metadata source passed through to Metadata
        # (its schema is defined in script.get_metadata — not visible here).
        self.file = file
        self.reader = Reader()
        self.reference = reference
        self.metadata = Metadata(reference)

    async def ingest_documents(self, file: UploadFile):
        """Load documents from the uploaded file via the project Reader.

        Returns whatever ``Reader.read_from_uploadfile`` produces
        (a sequence of documents).
        """
        documents = await self.reader.read_from_uploadfile(file)
        # Use the logging module (already imported) instead of debug prints.
        logging.info("Ingested %d documents from upload", len(documents))
        return documents

    def check_existing_metadata(self, pinecone_index, title, random_vector):
        """Return Pinecone matches whose ``title`` metadata equals *title*.

        The vector content is irrelevant to this existence check — only the
        metadata filter matters, with ``top_k=1``. On query failure the error
        is logged and an empty list is returned, so callers that call
        ``len()`` on the result (see ``filter_document``) keep working.
        """
        try:
            result = pinecone_index.query(
                vector=random_vector,
                top_k=1,
                filter={
                    "title": {"$eq": title},
                },
            )
            return result["matches"]
        except Exception as e:
            # BUG FIX: this previously returned a JSONResponse, which has no
            # len() and crashed filter_document. Log and report "no match"
            # so the document is treated as new (best-effort dedup).
            logging.error("Error checking existing metadata: %s", e)
            return []

    async def process_documents(self):
        """Read the upload, apply metadata, and run the semantic-splitting
        ingestion pipeline.

        Returns the list of nodes with metadata on success, or a 500
        ``JSONResponse`` if the pipeline fails.
        """
        documents = await self.ingest_documents(self.file)

        embed_model = OpenAIEmbedding()
        documents_with_metadata = self.metadata.apply_metadata(documents)

        pipeline = IngestionPipeline(
            transformations=[
                SemanticSplitterNodeParser(
                    buffer_size=1,
                    breakpoint_percentile_threshold=95,
                    embed_model=embed_model,
                ),
            ]
        )

        try:
            nodes_with_metadata = pipeline.run(documents=documents_with_metadata)
            return nodes_with_metadata
        except Exception as e:
            logging.error(f"An error occurred in making pipeline: {e}")
            # Consistent with check_existing_metadata: use the named constant
            # rather than a bare 500 literal.
            return JSONResponse(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                content="An internal server error occurred making pipeline.",
            )

    def filter_document(self, documents):
        """Return only the documents whose title is not already indexed.

        Queries the "test" Pinecone index once per document; a document is
        kept when no existing match shares its ``title`` metadata.
        """
        api_key = PINECONE_CONFIG.PINECONE_API_KEY
        client = Pinecone(api_key=api_key)
        # NOTE(review): index name "test" is hard-coded — confirm intended.
        pinecone_index = client.Index("test")

        # Built once and reused: the query above filters on metadata only,
        # so any vector of the right dimensionality (1536) will do.
        random_vector = [random.uniform(0, 1) for _ in range(1536)]

        filtered_documents = []
        for doc in documents:
            result = self.check_existing_metadata(
                pinecone_index, doc.metadata["title"], random_vector
            )
            if len(result) == 0:
                filtered_documents.append(doc)
        return filtered_documents