# tb_tst_ai/utils/RAG_utils.py
# Author: Daniil — commit 17d7268 ("Pushing the repo")
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
import pickle
import streamlit as st
import os
# Location of the pickled FAISS vectorstore, persisted between app runs
# so the embeddings don't have to be recomputed on every start.
CACHE_DIR = "./cache"
CACHE_FILE = os.path.join(CACHE_DIR, "vectorstore_cache.pkl")
def load_or_create_vectorstore():
    """Load the FAISS vectorstore from the pickle cache if present,
    otherwise build it from the ``.txt`` files in ``./rag_documents/``
    and cache it for subsequent runs.

    Returns:
        FAISS: vectorstore indexing the split document chunks.

    Raises:
        ValueError: if no text chunks could be produced (missing/empty
            document directory, or every file failed to load).
    """
    embedder_model = "hiiamsid/sentence_similarity_spanish_es"
    embeddings = HuggingFaceEmbeddings(model_name=embedder_model)

    # Fast path: reuse the cached vectorstore when available.
    # NOTE(review): unpickling is unsafe on untrusted data — if this cache
    # file can be tampered with, prefer FAISS.save_local/load_local.
    if os.path.exists(CACHE_FILE):
        try:
            with open(CACHE_FILE, 'rb') as f:
                vectorstore = pickle.load(f)
            st.success("Successfully loaded vectorstore from cache")
            return vectorstore
        except Exception as e:
            st.warning(f"Failed to load cache: {str(e)}. Creating new vectorstore...")

    # Cache miss (or corrupt cache): rebuild from the source documents.
    txt_dir = "./rag_documents/"
    # Guard against a missing directory so we report a clear error below
    # instead of crashing with an unhandled FileNotFoundError from listdir.
    if os.path.isdir(txt_dir):
        txt_files = [f for f in os.listdir(txt_dir) if f.endswith('.txt')]
    else:
        txt_files = []

    all_documents = []
    for txt_file in txt_files:
        file_path = os.path.join(txt_dir, txt_file)
        try:
            loader = TextLoader(file_path)
            documents = loader.load()
            all_documents.extend(documents)
        except Exception as e:
            # Best-effort: skip unreadable files but surface the problem.
            st.error(f"Error loading {txt_file}: {str(e)}")
            continue

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2500, chunk_overlap=10)
    texts = text_splitter.split_documents(all_documents)

    # FAISS.from_documents fails with an opaque error on empty input;
    # raise something actionable instead.
    if not texts:
        raise ValueError(
            f"No text chunks produced from '{txt_dir}'; check that the "
            "directory exists and contains readable .txt files."
        )

    vectorstore = FAISS.from_documents(texts, embeddings)

    # Persist the freshly built vectorstore; a failed write is non-fatal —
    # the store is still returned, it just won't be cached for next time.
    os.makedirs(CACHE_DIR, exist_ok=True)
    try:
        with open(CACHE_FILE, 'wb') as f:
            pickle.dump(vectorstore, f)
        st.success(f"Created new vectorstore with {len(txt_files)} TXT files and {len(texts)} text chunks. Cached for future use.")
    except Exception as e:
        st.warning(f"Failed to cache vectorstore: {str(e)}")

    return vectorstore