import logging
import random

from fastapi import UploadFile, status
from fastapi.responses import JSONResponse
from llama_index.core import Settings
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import (
    SentenceSplitter,
    SemanticSplitterNodeParser,
)
from llama_index.embeddings.openai import OpenAIEmbedding
from pinecone.grpc import PineconeGRPC as Pinecone

from config import PINECONE_CONFIG
from script.get_metadata import Metadata
from service.reader_v4 import upload_file
class Uploader:
    """Turns an uploaded file into embedded nodes and filters out duplicates."""

    def __init__(self, reference, file: UploadFile, lang: str = "en"):
        self.file = file
        self.reference = reference
        self.metadata = Metadata(reference)
        self.lang = lang
    def check_existing_metadata(self, pinecone_index, title, random_vector):
        """Query the index for an already-ingested document with the same title."""
        try:
            result = pinecone_index.query(
                vector=random_vector,
                top_k=1,
                filter={
                    "title": {"$eq": title},
                },
            )
            return result["matches"]
        except Exception as e:
            return JSONResponse(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                content=f"Error checking existing metadata: {str(e)}",
            )
    async def process_documents(self):
        # Read the uploaded file and attach metadata to each document
        documents_with_metadata, file_stream = await upload_file(
            self.reference, self.file, self.lang
        )
        if isinstance(documents_with_metadata, JSONResponse):
            return documents_with_metadata  # Propagate the error response directly

        embed_model = OpenAIEmbedding(model="text-embedding-3-large")
        Settings.embed_model = embed_model
        # Set up the ingestion pipeline with a semantic splitter
        pipeline = IngestionPipeline(
            transformations=[
                SemanticSplitterNodeParser(
                    buffer_size=1,
                    breakpoint_percentile_threshold=95,
                    embed_model=embed_model,
                ),
            ]
        )
        # Run the pipeline; fall back to a plain sentence splitter on failure
        try:
            nodes_with_metadata = pipeline.run(documents=documents_with_metadata)
            print("Pipeline processing completed with SemanticSplitterNodeParser.")
            return nodes_with_metadata, file_stream
        except Exception as e:
            logging.error(f"Semantic splitting failed, falling back to SentenceSplitter: {e}")
            try:
                sentence_splitter = SentenceSplitter(chunk_size=512)
                nodes_with_metadata = sentence_splitter.get_nodes_from_documents(
                    documents_with_metadata
                )
                print("Pipeline processing completed with SentenceSplitter fallback.")
                return nodes_with_metadata, file_stream
            except Exception as fallback_error:
                # Log the second error and return a JSONResponse for FastAPI
                logging.error(f"Error with SentenceSplitter fallback: {fallback_error}")
                return JSONResponse(
                    status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                    content="An internal server error occurred during pipeline processing.",
                )
    def filter_document(self, documents):
        """Drop documents whose title already exists in the Pinecone index."""
        api_key = PINECONE_CONFIG.PINECONE_API_KEY
        client = Pinecone(api_key=api_key)
        pinecone_index = client.Index("test")

        # Throwaway query vector; its length must match the index dimension
        random_vector = [random.uniform(0, 1) for _ in range(1536)]

        filtered_documents = []
        for doc in documents:
            result = self.check_existing_metadata(
                pinecone_index, doc.metadata["title"], random_vector
            )
            if isinstance(result, JSONResponse):
                return result  # Propagate the lookup error instead of crashing on len()
            if len(result) == 0:
                filtered_documents.append(doc)
        return filtered_documents
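

# --- Hypothetical usage sketch (not part of the original module) ---
# Shows one way to drive Uploader end to end. The reference payload, the
# sample file bytes, and this demo entry point are assumptions, not part of
# the original code; running it requires valid OpenAI and Pinecone credentials.
if __name__ == "__main__":
    import asyncio
    import io

    async def _demo():
        # Wrap in-memory bytes the way FastAPI would wrap a request upload
        sample = UploadFile(filename="example.txt", file=io.BytesIO(b"hello world"))
        uploader = Uploader(reference={"title": "Example"}, file=sample, lang="en")

        result = await uploader.process_documents()
        if isinstance(result, JSONResponse):
            print("Upload failed:", result.body)
        else:
            nodes, _file_stream = result
            print(f"Produced {len(nodes)} nodes")

    asyncio.run(_demo())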