from langchain_mongodb import MongoDBAtlasVectorSearch
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from embed_with_db import embeddings, config, VECTORDB_STORE, client
from tqdm import tqdm


class VectorDataBase:
    """Load a file, split it into chunks, and store the chunk embeddings in MongoDB Atlas."""

    def __init__(self, file_path, db_collection, file_type='pdf'):
        self.file_path = file_path
        self.file_type = file_type
        # Split documents into 512-character chunks with a 32-character overlap.
        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=32)
        self.db_collection = client[config['DB_NAME']][db_collection]

    def load_docs_split(self):
        """Load the file with the appropriate loader and split it into chunks."""
        if str(self.file_type).lower() == 'pdf':
            loader = PyPDFLoader(self.file_path)
        elif str(self.file_type).lower() == 'text':
            loader = TextLoader(self.file_path)
        else:
            loader = None

        if loader:
            docs = loader.load()
            return self.text_splitter.split_documents(docs)
        # No loader matched: treat file_path itself as raw text and wrap it in documents.
        return self.text_splitter.create_documents([self.file_path])

    def docs_embeddings(self):
        """Embed all chunks and store them in Atlas via the LangChain vector store helper."""
        texts = self.load_docs_split()
        if texts:
            docsearch = MongoDBAtlasVectorSearch.from_documents(
                texts,
                embeddings,
                collection=self.db_collection,
                index_name=config['VECTOR_SEARCH_INDEX'],
            )
            print('done!')
            return docsearch
        print('documents were not embedded')
        return 'Some issues'

    def add_collection_database(self, doc):
        """Embed a single chunk and insert it into the MongoDB collection."""
        self.db_collection.insert_one(
            {
                'text': doc.page_content,
                'embedding': embeddings.embed_query(doc.page_content),
                'source': doc.metadata.get('source', 'Unknown'),
                'page': doc.metadata.get('page', 0),
            }
        )

    def embedding_with_loop(self):
        """Embed and insert chunks one at a time, showing a progress bar."""
        docs = self.load_docs_split()
        if docs:
            for doc in tqdm(docs):
                self.add_collection_database(doc)
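

# Minimal usage sketch (the file path 'data/sample.pdf' and the 'documents'
# collection name are hypothetical examples; config['DB_NAME'] and
# config['VECTOR_SEARCH_INDEX'] are assumed to be provided by embed_with_db).
if __name__ == '__main__':
    vdb = VectorDataBase('data/sample.pdf', 'documents', file_type='pdf')

    # Either bulk-embed and insert every chunk via the LangChain Atlas helper...
    vdb.docs_embeddings()

    # ...or insert chunk by chunk with a tqdm progress bar:
    # vdb.embedding_with_loop()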