""" Indexing with vector database """ from pathlib import Path import re import chromadb from unidecode import unidecode from langchain_community.document_loaders import PyPDFLoader, TextLoader from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain_chroma import Chroma from langchain_huggingface import HuggingFaceEmbeddings # Load PDF or TXT document and create doc splits def load_doc(list_file_path, chunk_size, chunk_overlap): """Load documents and create doc splits""" pages = [] full_text = "" for path in list_file_path: if path.endswith(".pdf"): loader = PyPDFLoader(path) elif path.endswith(".txt"): loader = TextLoader(path) else: continue doc_pages = loader.load() pages.extend(doc_pages) full_text += "\n".join([p.page_content for p in doc_pages]) + "\n" text_splitter = RecursiveCharacterTextSplitter( chunk_size=chunk_size, chunk_overlap=chunk_overlap, ) doc_splits = text_splitter.split_documents(pages) return doc_splits, full_text # Generate collection name for vector database def create_collection_name(filepath): """Create collection name for vector database""" collection_name = Path(filepath).stem collection_name = collection_name.replace(" ", "-") collection_name = unidecode(collection_name) collection_name = re.sub("[^A-Za-z0-9]+", "-", collection_name) collection_name = collection_name[:50] if len(collection_name) < 3: collection_name = collection_name + "xyz" if not collection_name[0].isalnum(): collection_name = "A" + collection_name[1:] if not collection_name[-1].isalnum(): collection_name = collection_name[:-1] + "Z" print("\n\nFilepath: ", filepath) print("Collection name: ", collection_name) return collection_name # Create vector database def create_db(splits, collection_name): """Create embeddings and vector database""" embedding = HuggingFaceEmbeddings( model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2", ) chromadb.api.client.SharedSystemClient.clear_system_cache() new_client = chromadb.EphemeralClient() vectordb = Chroma.from_documents( documents=splits, embedding=embedding, client=new_client, collection_name=collection_name, ) return vectordb