|
import os |
|
from langchain.document_loaders import TextLoader, DirectoryLoader |
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
from langchain.embeddings import HuggingFaceEmbeddings |
|
from langchain.vectorstores import FAISS |
|
|
|
|
|
# Optional Hugging Face API token.
# NOTE(review): read from the environment but never referenced below — confirm
# whether a downstream call (e.g. downloading a gated model) relies on it
# being exported, otherwise it can be removed.
HF_TOKEN = os.environ.get("HF_TOKEN", None)

# Pipeline configuration, hoisted into named constants so they are easy to tune.
DATA_DIR = "data2/text/range/0-5000"          # corpus of plain-text files to index
CHUNK_SIZE = 5000                             # max characters per chunk
CHUNK_OVERLAP = 250                           # characters shared between adjacent chunks
EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"
INDEX_PATH = "faiss_index"                    # directory the FAISS index is saved to


def main() -> None:
    """Load text files, split them into chunks, embed them, and persist a FAISS index.

    Side effects: prints progress to stdout and writes the index to INDEX_PATH.
    """
    # Load every text file under DATA_DIR as a separate document.
    loader = DirectoryLoader(DATA_DIR, loader_cls=TextLoader)
    documents = loader.load()
    print('len of documents are', len(documents))

    # Split documents into overlapping chunks so each embedding stays within a
    # manageable size while the overlap preserves continuity at chunk boundaries.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
    )
    all_splits = text_splitter.split_documents(documents)
    print("Length_of all_splits:", len(all_splits))

    # Embed chunks with a sentence-transformers model.
    # NOTE(review): "device": "cuda" hard-requires a GPU; switch to "cpu" (or
    # detect availability) if this must run on CPU-only hosts.
    embeddings = HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL, model_kwargs={"device": "cuda"}
    )

    # Build the FAISS index from all chunks and persist it to disk.
    vectorstore = FAISS.from_documents(all_splits, embeddings)
    vectorstore.save_local(INDEX_PATH)

    print("Embeddings stored successfully!")


# Guard the entry point so importing this module does not trigger the GPU
# model load and index build; behavior when run as a script is unchanged.
if __name__ == "__main__":
    main()
|
|