from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
import pickle
import streamlit as st
import os

CACHE_DIR = "./cache"
CACHE_FILE = os.path.join(CACHE_DIR, "vectorstore_cache.pkl")


def load_or_create_vectorstore():
    """Load the vectorstore from cache if it exists; otherwise create and cache it."""
    # Spanish sentence-similarity model used to embed both documents and queries.
    embedder_model = "hiiamsid/sentence_similarity_spanish_es"
    embeddings = HuggingFaceEmbeddings(model_name=embedder_model)

    # Fast path: reuse a previously built vectorstore from disk.
    if os.path.exists(CACHE_FILE):
        try:
            with open(CACHE_FILE, "rb") as f:
                vectorstore = pickle.load(f)
            st.success("Successfully loaded vectorstore from cache")
            return vectorstore
        except Exception as e:
            st.warning(f"Failed to load cache: {e}. Creating new vectorstore...")

    # Slow path: load every .txt document and build the index from scratch.
    txt_dir = "./rag_documents/"
    txt_files = [f for f in os.listdir(txt_dir) if f.endswith(".txt")]
    all_documents = []

    for txt_file in txt_files:
        file_path = os.path.join(txt_dir, txt_file)
        try:
            # Explicit encoding avoids platform-default decode errors on accented text.
            loader = TextLoader(file_path, encoding="utf-8")
            documents = loader.load()
            all_documents.extend(documents)
        except Exception as e:
            st.error(f"Error loading {txt_file}: {e}")
            continue

    # Split into ~2500-character chunks (10-character overlap), then embed into FAISS.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2500, chunk_overlap=10)
    texts = text_splitter.split_documents(all_documents)
    vectorstore = FAISS.from_documents(texts, embeddings)

    os.makedirs(CACHE_DIR, exist_ok=True)

    # Cache the freshly built vectorstore; a caching failure is non-fatal.
    try:
        with open(CACHE_FILE, "wb") as f:
            pickle.dump(vectorstore, f)
        st.success(
            f"Created new vectorstore with {len(txt_files)} TXT files and "
            f"{len(texts)} text chunks. Cached for future use."
        )
    except Exception as e:
        st.warning(f"Failed to cache vectorstore: {e}")

    return vectorstore
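

# --- Optional alternative persistence (a sketch, not part of the original code) ---
# Pickling the whole vectorstore can fail on some faiss builds, because the underlying
# index is a SWIG object that does not always support pickle. Recent langchain-community
# releases expose FAISS.save_local / FAISS.load_local for persistence instead.
# CACHE_INDEX_DIR is an illustrative path introduced here, not taken from the original.
CACHE_INDEX_DIR = os.path.join(CACHE_DIR, "faiss_index")


def save_vectorstore_native(vectorstore):
    """Persist the FAISS index with LangChain's built-in serializer."""
    vectorstore.save_local(CACHE_INDEX_DIR)


def load_vectorstore_native(embeddings):
    """Reload the FAISS index; the flag opts in to pickle-based docstore loading."""
    return FAISS.load_local(
        CACHE_INDEX_DIR, embeddings, allow_dangerous_deserialization=True
    )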
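

# --- Minimal usage sketch (assumed wiring, not shown in the original) ---
# Retrieves the top-k chunks for a user question inside the Streamlit app.
# The prompt text and k=4 are illustrative choices.
vectorstore = load_or_create_vectorstore()
query = st.text_input("Pregunta:")
if query:
    for doc in vectorstore.similarity_search(query, k=4):
        st.write(doc.page_content)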