import os
import pickle

import streamlit as st

from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

CACHE_DIR = "./cache"
CACHE_FILE = os.path.join(CACHE_DIR, "vectorstore_cache.pkl")
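
# NOTE: the pickle cache is never invalidated automatically; delete
# CACHE_FILE (or the whole ./cache directory) after changing the files in
# ./rag_documents/ so the vectorstore is rebuilt on the next run.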


def load_or_create_vectorstore():
    """Load the vectorstore from cache if it exists; otherwise create and cache it."""
    # Try to load from cache first
    if os.path.exists(CACHE_FILE):
        try:
            with open(CACHE_FILE, "rb") as f:
                vectorstore = pickle.load(f)
            st.success("Successfully loaded vectorstore from cache")
            return vectorstore
        except Exception as e:
            st.warning(f"Failed to load cache: {e}. Creating new vectorstore...")

    # Cache miss or failed load: build the vectorstore from scratch. The
    # Spanish sentence-similarity embedding model is only needed when
    # (re)building, so it is created after the cache check.
    embedder_model = "hiiamsid/sentence_similarity_spanish_es"
    embeddings = HuggingFaceEmbeddings(model_name=embedder_model)

    txt_dir = "./rag_documents/"
    if not os.path.isdir(txt_dir):
        st.error(f"Document directory {txt_dir} does not exist.")
        st.stop()
    txt_files = [f for f in os.listdir(txt_dir) if f.endswith(".txt")]
    all_documents = []

    for txt_file in txt_files:
        file_path = os.path.join(txt_dir, txt_file)
        try:
            # UTF-8 is assumed for the Spanish source documents; adjust if
            # the files use a different encoding.
            loader = TextLoader(file_path, encoding="utf-8")
            documents = loader.load()
            all_documents.extend(documents)
        except Exception as e:
            st.error(f"Error loading {txt_file}: {e}")
            continue
    
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2500, chunk_overlap=10)
    texts = text_splitter.split_documents(all_documents)
    if not texts:
        # FAISS.from_documents raises on an empty list; fail with a clear message.
        st.error(f"No text could be loaded from {txt_dir}; vectorstore not created.")
        st.stop()
    vectorstore = FAISS.from_documents(texts, embeddings)
    
    # Create cache directory if it doesn't exist
    os.makedirs(CACHE_DIR, exist_ok=True)
    
    # Save to cache
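    # NOTE: pickling the full vectorstore round-trips only when the installed
    # faiss build supports pickling its index objects; FAISS.save_local() and
    # FAISS.load_local() are the more portable persistence APIs.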
    try:
        with open(CACHE_FILE, "wb") as f:
            pickle.dump(vectorstore, f)
        st.success(
            f"Created new vectorstore from {len(txt_files)} TXT files "
            f"({len(texts)} text chunks). Cached for future use."
        )
    except Exception as e:
        st.warning(f"Failed to cache vectorstore: {e}")
    
    return vectorstore
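

# --- Usage sketch ---
# A minimal example (an assumption, not part of the original app) of how
# load_or_create_vectorstore() might be wired into a Streamlit page.
# st.cache_resource keeps the store alive across script reruns; the widget
# label and k=4 are illustrative placeholders.

@st.cache_resource
def get_vectorstore():
    return load_or_create_vectorstore()


def render_search_demo():
    vectorstore = get_vectorstore()
    query = st.text_input("Consulta:")
    if query:
        # Retrieve the four most similar chunks and display their text.
        for doc in vectorstore.similarity_search(query, k=4):
            st.write(doc.page_content)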