Spaces:
Sleeping
Sleeping
File size: 1,072 Bytes
d90465c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 |
import os

import torch
from langchain.document_loaders import TextLoader, DirectoryLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
# Build a FAISS vector index from a directory of text files.
#
# Pipeline: load every file with TextLoader -> split into overlapping
# chunks -> embed each chunk with a sentence-transformers model -> save the
# resulting index to the local 'faiss_index' directory.

# Hugging Face token read from the environment (None when unset).
# NOTE(review): nothing in this script passes the token on explicitly; it is
# kept as a module-level name so existing references keep working.
HF_TOKEN = os.environ.get("HF_TOKEN", None)

# Directory whose files are indexed.
DATA_DIR = 'data2/text/range/0-5000'

# Load documents
loader = DirectoryLoader(DATA_DIR, loader_cls=TextLoader)
documents = loader.load()
print('len of documents are', len(documents))

# Split documents into chunks of up to 5000 characters, with a 250-character
# overlap so text cut at a boundary still appears whole in one chunk.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=250)
all_splits = text_splitter.split_documents(documents)
print("Length of all_splits:", len(all_splits))

# An empty corpus cannot produce a usable index; stop here with a clear
# message instead of failing later inside the vector store.
if not all_splits:
    raise SystemExit(
        f"No text chunks to index: check that {DATA_DIR!r} contains "
        "non-empty text files."
    )

# Generate embeddings
model_name = "sentence-transformers/all-mpnet-base-v2"
# Prefer the GPU when one is usable, otherwise fall back to the CPU so the
# script still runs on hosts without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_kwargs = {"device": device}
embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

# Store embeddings in the vector store and persist it to disk
vectorstore = FAISS.from_documents(all_splits, embeddings)
vectorstore.save_local('faiss_index')
print("Embeddings stored successfully!")
|