# tb_tst_ai/utils/RAG_utils.py
# Author: Daniil — commit 17d7268 ("Pushing the repo")
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
import pickle
import streamlit as st
import os
# Location of the pickled FAISS vectorstore, persisted between app runs
# so the embeddings don't have to be recomputed on every start.
CACHE_DIR = "./cache"
CACHE_FILE = os.path.join(CACHE_DIR, "vectorstore_cache.pkl")
def load_or_create_vectorstore():
    """Load the FAISS vectorstore from the pickle cache if present,
    otherwise build it from the ``.txt`` files in ``./rag_documents/``
    and cache it for subsequent runs.

    Returns:
        FAISS: vectorstore indexing the split document chunks.

    Raises:
        ValueError: if no text chunks could be produced (missing/empty
            document directory, or every file failed to load).
    """
    embedder_model = "hiiamsid/sentence_similarity_spanish_es"
    embeddings = HuggingFaceEmbeddings(model_name=embedder_model)

    # Fast path: reuse the cached vectorstore when available.
    # NOTE(review): unpickling is unsafe on untrusted data — if this cache
    # file can be tampered with, prefer FAISS.save_local/load_local.
    if os.path.exists(CACHE_FILE):
        try:
            with open(CACHE_FILE, 'rb') as f:
                vectorstore = pickle.load(f)
            st.success("Successfully loaded vectorstore from cache")
            return vectorstore
        except Exception as e:
            st.warning(f"Failed to load cache: {str(e)}. Creating new vectorstore...")

    # Cache miss (or corrupt cache): rebuild from the source documents.
    txt_dir = "./rag_documents/"
    # Guard against a missing directory so we report a clear error below
    # instead of crashing with an unhandled FileNotFoundError from listdir.
    if os.path.isdir(txt_dir):
        txt_files = [f for f in os.listdir(txt_dir) if f.endswith('.txt')]
    else:
        txt_files = []

    all_documents = []
    for txt_file in txt_files:
        file_path = os.path.join(txt_dir, txt_file)
        try:
            loader = TextLoader(file_path)
            documents = loader.load()
            all_documents.extend(documents)
        except Exception as e:
            # Best-effort: skip unreadable files but surface the problem.
            st.error(f"Error loading {txt_file}: {str(e)}")
            continue

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2500, chunk_overlap=10)
    texts = text_splitter.split_documents(all_documents)

    # FAISS.from_documents fails with an opaque error on empty input;
    # raise something actionable instead.
    if not texts:
        raise ValueError(
            f"No text chunks produced from '{txt_dir}'; check that the "
            "directory exists and contains readable .txt files."
        )

    vectorstore = FAISS.from_documents(texts, embeddings)

    # Persist the freshly built vectorstore; a failed write is non-fatal —
    # the store is still returned, it just won't be cached for next time.
    os.makedirs(CACHE_DIR, exist_ok=True)
    try:
        with open(CACHE_FILE, 'wb') as f:
            pickle.dump(vectorstore, f)
        st.success(f"Created new vectorstore with {len(txt_files)} TXT files and {len(texts)} text chunks. Cached for future use.")
    except Exception as e:
        st.warning(f"Failed to cache vectorstore: {str(e)}")

    return vectorstore