# Spaces: Sleeping
import os
import pickle

import streamlit as st
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
# Directory that holds the on-disk cache between runs.
CACHE_DIR = "./cache"
# Pickle file inside CACHE_DIR where the built vectorstore is stored.
CACHE_FILE = os.path.join(CACHE_DIR, "vectorstore_cache.pkl")
def load_or_create_vectorstore():
    """Return the vectorstore, loading it from the pickle cache when possible.

    If ``CACHE_FILE`` exists and unpickles without error, the unpickled object
    is returned as-is. Otherwise every ``.txt`` file in ``./rag_documents/`` is
    loaded, split into chunks, embedded and indexed with FAISS, and the result
    is pickled to ``CACHE_FILE`` on a best-effort basis (a caching failure only
    produces a warning; the freshly built vectorstore is still returned).

    Status, warnings and per-file load errors are reported through Streamlit
    messages (``st.success`` / ``st.warning`` / ``st.error``).

    Returns:
        The cached or newly built vectorstore.

    Raises:
        OSError: If ``./rag_documents/`` cannot be listed (propagated from
            ``os.listdir``). Errors raised while embedding or indexing are
            not caught here either.
    """
    # Try to load from cache first.
    if os.path.exists(CACHE_FILE):
        try:
            with open(CACHE_FILE, 'rb') as f:
                # NOTE(security): pickle.load can execute arbitrary code from
                # the file. This is only acceptable while CACHE_FILE is written
                # exclusively by this application.
                vectorstore = pickle.load(f)
            st.success("Successfully loaded vectorstore from cache")
            return vectorstore
        except Exception as e:
            # Any unreadable/corrupt cache falls through to a full rebuild.
            st.warning(f"Failed to load cache: {str(e)}. Creating new vectorstore...")

    # The embedding model is only used to build a fresh index, so it is
    # constructed after the cache attempt instead of on every call.
    embedder_model = "hiiamsid/sentence_similarity_spanish_es"
    embeddings = HuggingFaceEmbeddings(model_name=embedder_model)

    # Cache doesn't exist or loading failed: build the vectorstore from the
    # plain-text documents.
    txt_dir = "./rag_documents/"
    txt_files = [f for f in os.listdir(txt_dir) if f.endswith('.txt')]

    all_documents = []
    for txt_file in txt_files:
        file_path = os.path.join(txt_dir, txt_file)
        try:
            loader = TextLoader(file_path)
            documents = loader.load()
            all_documents.extend(documents)
        except Exception as e:
            # One unreadable file must not abort the whole build.
            st.error(f"Error loading {txt_file}: {str(e)}")
            continue

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2500, chunk_overlap=10)
    texts = text_splitter.split_documents(all_documents)
    vectorstore = FAISS.from_documents(texts, embeddings)

    # Save to cache (best effort). The pickle is written to a temporary file
    # and then moved into place, so a failed or interrupted dump never leaves
    # a truncated CACHE_FILE behind for the next run to trip over.
    tmp_file = CACHE_FILE + ".tmp"
    try:
        # Create cache directory if it doesn't exist.
        os.makedirs(CACHE_DIR, exist_ok=True)
        with open(tmp_file, 'wb') as f:
            pickle.dump(vectorstore, f)
        os.replace(tmp_file, CACHE_FILE)
        st.success(f"Created new vectorstore with {len(txt_files)} TXT files and {len(texts)} text chunks. Cached for future use.")
    except Exception as e:
        # Remove the partial temp file, if any; cleanup failure is not fatal.
        try:
            if os.path.exists(tmp_file):
                os.remove(tmp_file)
        except OSError:
            pass
        st.warning(f"Failed to cache vectorstore: {str(e)}")

    return vectorstore