# NOTE(review): the three lines below ("Spaces: Sleeping Sleeping") appear to be
# Hugging Face Spaces page-scrape residue, not source code; kept as a comment so
# the module remains importable.
from llama_index.core.ingestion import IngestionPipeline | |
from llama_index.core.extractors import PydanticProgramExtractor | |
from llama_index.embeddings.openai import OpenAIEmbedding | |
from config import PINECONE_CONFIG | |
from pinecone.grpc import PineconeGRPC as Pinecone | |
from service.reader import Reader | |
from script.get_metadata import Metadata | |
from fastapi import UploadFile, HTTPException,status | |
from llama_index.core.node_parser import ( | |
SentenceSplitter, | |
SemanticSplitterNodeParser, | |
) | |
# from script.get_topic import extract_topic | |
import logging | |
import random | |
class Uploader:
    """Ingest an uploaded file, attach metadata, and split it into
    semantically coherent nodes for indexing in a vector store.

    Attributes:
        file: The uploaded file to process.
        reader: Reader service used to extract documents from the upload.
        reference: Opaque reference object passed through to Metadata.
        metadata: Metadata helper built from *reference*.
    """

    def __init__(self, reference, file: UploadFile):
        self.file = file
        self.reader = Reader()
        self.reference = reference
        self.metadata = Metadata(reference)

    async def ingest_documents(self, file: UploadFile):
        """Load documents from the uploaded file via the reader service.

        Returns:
            The documents produced by the reader.
        """
        documents = await self.reader.read_from_uploadfile(file)
        logging.info("Documents successfully ingested.")
        return documents

    def check_existing_metadata(self, pinecone_index, title, random_vector):
        """Return Pinecone matches whose metadata title equals *title*.

        The query vector is random because only the metadata filter matters
        for an existence check, not vector similarity; ``top_k=1`` is enough
        to detect whether any match exists.

        Args:
            pinecone_index: Pinecone index handle supporting ``query``.
            title: Title to look up in the index metadata.
            random_vector: Query vector (dimensionality must match the index).

        Returns:
            The ``matches`` list from the Pinecone query response.

        Raises:
            HTTPException: 500 if the Pinecone query fails.
        """
        try:
            result = pinecone_index.query(
                vector=random_vector,
                top_k=1,
                filter={
                    "title": {"$eq": title},
                },
            )
            return result["matches"]
        except Exception as e:
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail=f"Error checking existing metadata: {e}",
            )

    async def process_documents(self):
        """Ingest the uploaded file, apply metadata, and split into nodes.

        Returns:
            Nodes (with metadata) produced by the semantic splitter.

        Raises:
            HTTPException: 500 if the splitting pipeline fails.
        """
        documents = await self.ingest_documents(self.file)

        # Attach reference metadata to every document before splitting so the
        # resulting nodes inherit it.
        documents_with_metadata = self.metadata.apply_metadata(documents)

        # Split on semantic boundaries (embedding-similarity breakpoints)
        # rather than fixed-size chunks.
        embed_model = OpenAIEmbedding()
        splitter = SemanticSplitterNodeParser(
            buffer_size=1, breakpoint_percentile_threshold=95, embed_model=embed_model
        )

        try:
            nodes_with_metadata = splitter.get_nodes_from_documents(
                documents_with_metadata
            )
            logging.info("Pipeline processing completed.")
            return nodes_with_metadata
        except Exception as e:
            # Log the error and surface a generic 500 to the API caller.
            logging.error("An error occurred in making pipeline: %s", e)
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail="An internal server error occurred making pipeline.",
            )

    def filter_document(self, documents):
        """Return only documents whose title is not already in the index.

        A single random query vector is reused for every existence check
        because only the metadata filter matters (see
        :meth:`check_existing_metadata`).

        Args:
            documents: Documents carrying a ``metadata["title"]`` entry.

        Returns:
            list: Documents with no existing match in the Pinecone index.
        """
        api_key = PINECONE_CONFIG.PINECONE_API_KEY
        client = Pinecone(api_key=api_key)
        pinecone_index = client.Index("test")
        # 1536 presumably matches the index's embedding dimensionality
        # (OpenAI text-embedding default) — TODO confirm against index config.
        random_vector = [random.uniform(0, 1) for _ in range(1536)]

        return [
            doc
            for doc in documents
            if not self.check_existing_metadata(
                pinecone_index, doc.metadata["title"], random_vector
            )
        ]