"""Build or load a cached FAISS vectorstore over local Spanish .txt documents."""

import os
import pickle

import streamlit as st
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings

CACHE_DIR = "./cache"
CACHE_FILE = os.path.join(CACHE_DIR, "vectorstore_cache.pkl")
# Spanish sentence-similarity model used to embed the documents.
EMBEDDER_MODEL = "hiiamsid/sentence_similarity_spanish_es"
# Directory scanned for .txt source documents.
DOCS_DIR = "./rag_documents/"


def load_or_create_vectorstore():
    """Load vectorstore from cache if it exists, otherwise create and cache it.

    Returns:
        The FAISS vectorstore loaded from cache or freshly built, or ``None``
        when the documents directory is missing or no document could be
        loaded (reported to the user via ``st.error``).
    """
    # Check the cache FIRST so a cache hit never pays the (heavy) cost of
    # loading the HuggingFace embedding model — the original code built the
    # embeddings unconditionally and then never used them on the cached path.
    if os.path.exists(CACHE_FILE):
        try:
            # NOTE(security): pickle.load can execute arbitrary code if the
            # cache file is tampered with; acceptable here only because the
            # file is produced locally by this same app.
            with open(CACHE_FILE, 'rb') as f:
                vectorstore = pickle.load(f)
            st.success("Successfully loaded vectorstore from cache")
            return vectorstore
        except Exception as e:
            # Corrupt/stale cache: fall through and rebuild from source docs.
            st.warning(f"Failed to load cache: {str(e)}. Creating new vectorstore...")

    # Cache miss (or unreadable cache): rebuild from the .txt documents.
    try:
        txt_files = [f for f in os.listdir(DOCS_DIR) if f.endswith('.txt')]
    except FileNotFoundError:
        # Original code crashed with an uncaught FileNotFoundError here.
        st.error(f"Document directory not found: {DOCS_DIR}")
        return None

    all_documents = []
    for txt_file in txt_files:
        file_path = os.path.join(DOCS_DIR, txt_file)
        try:
            loader = TextLoader(file_path)
            all_documents.extend(loader.load())
        except Exception as e:
            # Best-effort per-file loading: report and keep going.
            st.error(f"Error loading {txt_file}: {str(e)}")
            continue

    if not all_documents:
        # FAISS.from_documents raises on an empty corpus; fail with a clear
        # message instead of an opaque traceback.
        st.error("No documents could be loaded; vectorstore was not created.")
        return None

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2500, chunk_overlap=10)
    texts = text_splitter.split_documents(all_documents)

    # Only now is the embedding model actually needed.
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDER_MODEL)
    vectorstore = FAISS.from_documents(texts, embeddings)

    # Persist for next run; a failed cache write is non-fatal.
    os.makedirs(CACHE_DIR, exist_ok=True)
    try:
        with open(CACHE_FILE, 'wb') as f:
            pickle.dump(vectorstore, f)
        st.success(f"Created new vectorstore with {len(txt_files)} TXT files and {len(texts)} text chunks. Cached for future use.")
    except Exception as e:
        st.warning(f"Failed to cache vectorstore: {str(e)}")

    return vectorstore