from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredFileLoader
from langchain.vectorstores.faiss import FAISS
from langchain.embeddings import OpenAIEmbeddings
import pickle
import os
import nltk

# These downloads run at import time.
# NOTE(review): presumably the resources are needed by UnstructuredFileLoader's
# text partitioning -- confirm before moving or removing them.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Text file that is ingested when the caller does not name another one.
DEFAULT_SOURCE_PATH = "cleaned_site_contents2023-02-24.txt"


def ingest_data(vector_file_path, source_path=DEFAULT_SOURCE_PATH):
    """Build a FAISS vector store from a text file and pickle it to disk.

    The file at ``source_path`` is loaded with ``UnstructuredFileLoader``,
    split with a default-configured ``RecursiveCharacterTextSplitter``,
    embedded with ``OpenAIEmbeddings`` and indexed with
    ``FAISS.from_documents``. The resulting store is pickled to
    ``vector_file_path``.

    Args:
        vector_file_path: Destination path of the pickled vector store.
        source_path: Path of the text file to ingest. Defaults to
            ``DEFAULT_SOURCE_PATH``.

    Returns:
        ``vector_file_path``, unchanged.
    """
    # Load data
    loader = UnstructuredFileLoader(source_path)
    raw_documents = loader.load()

    # Split text
    text_splitter = RecursiveCharacterTextSplitter()
    documents = text_splitter.split_documents(raw_documents)

    # Load data into the vector store
    embeddings = OpenAIEmbeddings()
    vectorstore = FAISS.from_documents(documents, embeddings)

    # Save the vector store. It is written to a sibling temporary file and
    # moved into place only after pickling succeeds; otherwise a failed dump
    # would leave a truncated file that get_vectorstore() accepts as a valid
    # cache on every later call.
    # NOTE(review): whoever reads this file back has to unpickle it; only
    # unpickle files this process wrote itself.
    tmp_path = f"{vector_file_path}.tmp"
    try:
        with open(tmp_path, "wb") as f:
            pickle.dump(vectorstore, f)
        os.replace(tmp_path, vector_file_path)
    finally:
        # After a successful replace the temporary file no longer exists, so
        # this only removes the leftovers of a failed write.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    return vector_file_path


def get_vectorstore(vector_file_path, source_path=DEFAULT_SOURCE_PATH):
    """Return the path of the pickled vector store, building it if missing.

    Args:
        vector_file_path: Path where the pickled vector store is expected.
        source_path: Text file to ingest when the store has to be built.
            Ignored when ``vector_file_path`` already exists.

    Returns:
        ``vector_file_path``. Note that this is the path, not the loaded
        store object.
    """
    if os.path.isfile(vector_file_path):
        return vector_file_path
    return ingest_data(vector_file_path, source_path)