import os
import pickle

import nltk
from langchain.document_loaders import UnstructuredFileLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS

# Fetch the NLTK resources at import time.
# NOTE(review): presumably these are needed by the unstructured-file loader
# used below -- confirm before removing either one.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')


def ingest_data(vector_file_path, source_path="cleaned_site_contents2023-02-24.txt"):
    """Build a FAISS vectorstore from a text file and pickle it to disk.

    The file at ``source_path`` is loaded, split into chunks with a
    default-configured ``RecursiveCharacterTextSplitter``, embedded with
    ``OpenAIEmbeddings`` and indexed with FAISS. The resulting vectorstore
    is then pickled to ``vector_file_path``.

    Args:
        vector_file_path: Destination path for the pickled vectorstore.
        source_path: File to load and index. Defaults to the site-content
            dump that this function previously hard-coded.

    Returns:
        ``vector_file_path``, unchanged.
    """
    raw_documents = UnstructuredFileLoader(source_path).load()
    documents = RecursiveCharacterTextSplitter().split_documents(raw_documents)

    # NOTE(review): presumably this calls the OpenAI API and needs
    # credentials configured in the environment -- confirm.
    vectorstore = FAISS.from_documents(documents, OpenAIEmbeddings())

    # Dump to a sibling temp file and rename it into place, so an interrupted
    # write never leaves a truncated file at vector_file_path (which
    # get_vectorstore would otherwise accept as a valid cache).
    # The output is a pickle: only ever load it from a trusted location.
    tmp_path = f"{vector_file_path}.tmp"
    try:
        with open(tmp_path, "wb") as f:
            pickle.dump(vectorstore, f)
        os.replace(tmp_path, vector_file_path)
    finally:
        # After a successful replace the temp file is gone; this only
        # cleans up after a failed dump.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    return vector_file_path


def get_vectorstore(vector_file_path):
    """Return the path to the pickled vectorstore, building it if absent.

    Args:
        vector_file_path: Path where the pickled vectorstore is expected.

    Returns:
        ``vector_file_path`` when a regular file already exists there;
        otherwise whatever ``ingest_data(vector_file_path)`` returns.
    """
    # Despite the name, this hands back a path; no vectorstore is loaded here.
    if os.path.isfile(vector_file_path):
        return vector_file_path
    return ingest_data(vector_file_path)