"""Ingest PDF documents into a persistent Chroma vector store.

Reads settings from ``config.yml``, loads the PDFs found under
``cfg.DATA_PATH``, splits them into chunks, embeds the chunks with a
sentence-transformer model and writes the result to ``./vectorestore/chroma``.
"""

import box
import yaml
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain.vectorstores import Chroma

# Import config vars.
# NOTE: this runs at import time, so importing this module requires a
# readable ``config.yml`` in the current working directory.
with open('config.yml', 'r', encoding='utf8') as ymlfile:
    cfg = box.Box(yaml.safe_load(ymlfile))


def run_ingest() -> None:
    """Build the Chroma index from the PDFs under ``cfg.DATA_PATH``.

    Steps:
        1. Load every file matching ``*.pdf`` in ``cfg.DATA_PATH`` with
           ``PyPDFLoader``.
        2. Split the loaded documents into chunks (``chunk_size=1000``,
           ``chunk_overlap=20``, measured with ``len``).
        3. Embed the chunks with the ``all-MiniLM-L6-v2`` model on CPU.
        4. Store the embedded chunks in Chroma, persisted under
           ``./vectorestore/chroma``.

    If loading and splitting produce no chunks, a message is printed and the
    function returns without loading the embedding model or touching the
    vector store.

    Returns:
        None. The result of the ingest is the on-disk vector store.
    """
    loader = DirectoryLoader(cfg.DATA_PATH, glob='*.pdf', loader_cls=PyPDFLoader)
    documents = loader.load()
    print("documents", documents)

    # NOTE(review): CharacterTextSplitter is used with its default separator;
    # RecursiveCharacterTextSplitter is imported above but unused. Confirm
    # which splitter is intended, since they can chunk the same text
    # differently.
    text_splitter = CharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=20,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)

    if not chunks:
        # Nothing to index (no PDFs matched, or none yielded any text).
        # Bail out before loading the embedding model and before handing the
        # vector store an empty batch.
        print(f"No content found in {cfg.DATA_PATH}; nothing to ingest.")
        return

    embedding_function = SentenceTransformerEmbeddings(
        model_name="all-MiniLM-L6-v2",
        model_kwargs={'device': 'cpu'},
    )

    # Load the chunks into Chroma and save to disk. The returned store handle
    # is not needed here, so it is not bound to a name.
    # NOTE(review): presumably persist_directory is enough for the data to be
    # written with the installed Chroma version; confirm no explicit persist
    # call is required.
    Chroma.from_documents(
        chunks,
        embedding_function,
        persist_directory="./vectorestore/chroma",
    )


if __name__ == "__main__":
    run_ingest()