JairoDanielMT committed on
Commit
47687a2
·
verified ·
1 Parent(s): f942f85

Update core/vectorstore/vectorstore_manager.py

Browse files
Files changed (1) hide show
  1. core/vectorstore/vectorstore_manager.py +136 -136
core/vectorstore/vectorstore_manager.py CHANGED
@@ -1,136 +1,136 @@
1
- # core/vectorstore/vectorstore_manager.py
2
- import os
3
- import faiss
4
- from langchain.text_splitter import RecursiveCharacterTextSplitter
5
- from langchain_community.docstore.in_memory import InMemoryDocstore
6
- from langchain_community.vectorstores import FAISS as FAISS_STORE
7
- from vectorstore.document_processor import DocumentProcessor
8
- from vectorstore.embeddings import EmbeddingManager
9
- from vectorstore.distance_strategy import DistanceStrategyManager
10
- from loguru import logger
11
-
12
-
13
- class VectorStoreManager:
14
- """
15
- Gestión minimalista de FAISS para EDULLM:
16
- - Indexa documentos
17
- - Carga/guarda el índice
18
- - Expone retriever para RAG
19
- """
20
-
21
- def __init__(self, path: str, name: str):
22
- self.path = path
23
- self.store_path = os.path.join("database", name)
24
- self.embeddings = EmbeddingManager.get_embeddings()
25
- self.strategy = DistanceStrategyManager().strategy
26
- self.vectorstore = None
27
- logger.info(f"🔹 Inicializando VectorStoreManager en ruta: {self.store_path}")
28
- self._initialize()
29
-
30
- def _initialize(self):
31
- if self.exist_vectorstore():
32
- logger.info("✅ Índice FAISS encontrado. Cargando desde disco...")
33
- self.vectorstore = self.load_vectorstore()
34
- else:
35
- logger.warning("⚠️ No existe índice previo. Creando índice vacío...")
36
- dummy = self.embeddings.embed_query("init")
37
- index = faiss.IndexFlatL2(len(dummy))
38
- self.vectorstore = FAISS_STORE(
39
- embedding_function=self.embeddings,
40
- index=index,
41
- docstore=InMemoryDocstore(),
42
- index_to_docstore_id={},
43
- distance_strategy=self.strategy,
44
- )
45
-
46
- def create_vectorstore(self) -> None:
47
- logger.info(f"🚀 Procesando documentos en '{self.path}' para indexar...")
48
- docs = DocumentProcessor(self.path).files_to_texts()
49
- splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=400)
50
- chunks = splitter.split_documents(docs)
51
- self.vectorstore.add_documents(chunks)
52
- self.save_vectorstore()
53
- logger.success("🎯 Vectorstore creado y guardado correctamente.")
54
-
55
- def save_vectorstore(self) -> None:
56
- try:
57
- os.makedirs(self.store_path, exist_ok=True)
58
- self.vectorstore.save_local(self.store_path)
59
- logger.info(f"💾 Índice guardado en '{self.store_path}'.")
60
- except Exception as e:
61
- logger.error(f"❌ Error al guardar el vectorstore: {e}")
62
-
63
- def load_vectorstore(self):
64
- try:
65
- logger.info(f"📂 Cargando vectorstore desde '{self.store_path}'.")
66
- return FAISS_STORE.load_local(
67
- folder_path=self.store_path,
68
- embeddings=self.embeddings,
69
- allow_dangerous_deserialization=True,
70
- distance_strategy=self.strategy,
71
- )
72
- except Exception as e:
73
- logger.error(f"❌ Error al cargar el vectorstore: {e}")
74
- raise
75
-
76
- def exist_vectorstore(self) -> bool:
77
- """Verifica si el vectorstore existe, creando la carpeta base si es necesario."""
78
- base_dir = "database"
79
-
80
- if not os.path.isdir(base_dir):
81
- logger.warning(f"📂 Directorio base '{base_dir}' no encontrado. Creando...")
82
- os.makedirs(base_dir, exist_ok=True)
83
- return False
84
-
85
- if os.path.isdir(self.store_path):
86
- logger.info(f"✅ Vectorstore encontrado en '{self.store_path}'.")
87
- return True
88
- else:
89
- logger.info(f"ℹ️ Vectorstore no existe aún en '{self.store_path}'.")
90
- return False
91
-
92
- def as_retriever(
93
- self,
94
- search_type: str = "similarity_score_threshold",
95
- k: int = 4,
96
- score_threshold: float = 0.75,
97
- fallback_to_similarity: bool = True,
98
- **kwargs,
99
- ):
100
- if not self.vectorstore:
101
- self.vectorstore = self.load_vectorstore()
102
-
103
- logger.debug(
104
- f"🔍 Configurando retriever: type={search_type}, k={k}, threshold={score_threshold}"
105
- )
106
- search_kwargs = {"k": k, "score_threshold": score_threshold}
107
- retriever = self.vectorstore.as_retriever(
108
- search_type=search_type, search_kwargs=search_kwargs
109
- )
110
-
111
- if fallback_to_similarity:
112
- logger.info(
113
- "🛡️ Fallback activado: Si no hay resultados, se usará búsqueda por similarity."
114
- )
115
-
116
- class SafeRetriever:
117
- def __init__(self, primary, fallback):
118
- self.primary = primary
119
- self.fallback = fallback
120
-
121
- def invoke(self, query):
122
- docs = self.primary.invoke(query)
123
- if not docs:
124
- logger.warning(
125
- "⚠️ Sin resultados en threshold. Aplicando fallback a similarity."
126
- )
127
- return self.fallback.invoke(query)
128
- return docs
129
-
130
- fallback_retriever = self.vectorstore.as_retriever(
131
- search_type="similarity", search_kwargs={"k": k}
132
- )
133
-
134
- return SafeRetriever(retriever, fallback_retriever)
135
-
136
- return retriever
 
1
+ # core/vectorstore/vectorstore_manager.py
2
+ import os
3
+ import faiss
4
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
5
+ from langchain_community.docstore.in_memory import InMemoryDocstore
6
+ from langchain_community.vectorstores import FAISS as FAISS_STORE
7
+ from core.vectorstore.document_processor import DocumentProcessor
8
+ from core.vectorstore.embeddings import EmbeddingManager
9
+ from core.vectorstore.distance_strategy import DistanceStrategyManager
10
+ from loguru import logger
11
+
12
+
13
class _SafeRetriever:
    """Retriever wrapper that retries with a fallback when the primary finds nothing.

    Only ``invoke`` is provided; the wrapped objects are expected to expose
    ``invoke(query)`` as well.
    """

    def __init__(self, primary, fallback):
        self.primary = primary
        self.fallback = fallback

    def invoke(self, query):
        """Return the primary retriever's documents, or the fallback's when empty."""
        docs = self.primary.invoke(query)
        if docs:
            return docs
        logger.warning(
            "⚠️ Sin resultados en threshold. Aplicando fallback a similarity."
        )
        return self.fallback.invoke(query)


class VectorStoreManager:
    """
    Minimal FAISS management for EDULLM:
    - Indexes documents
    - Loads/saves the index
    - Exposes a retriever for RAG
    """

    # Directory (relative to the working directory) under which stores live.
    _BASE_DIR = "database"
    # Files written by FAISS ``save_local`` with its default index name.
    _INDEX_FILES = ("index.faiss", "index.pkl")
    # Search type for which ``score_threshold`` is a meaningful search kwarg.
    _THRESHOLD_SEARCH = "similarity_score_threshold"

    def __init__(self, path: str, name: str):
        """Set up embeddings and load (or create) the store.

        Args:
            path: Location of the source documents handed to DocumentProcessor.
            name: Store name; the index is persisted under ``database/<name>``.
        """
        self.path = path
        self.store_path = os.path.join(self._BASE_DIR, name)
        self.embeddings = EmbeddingManager.get_embeddings()
        self.strategy = DistanceStrategyManager().strategy
        self.vectorstore = None
        logger.info(f"🔹 Inicializando VectorStoreManager en ruta: {self.store_path}")
        self._initialize()

    def _initialize(self):
        """Load the persisted index if one exists, otherwise build an empty one."""
        if self.exist_vectorstore():
            logger.info("✅ Índice FAISS encontrado. Cargando desde disco...")
            self.vectorstore = self.load_vectorstore()
            return

        logger.warning("⚠️ No existe índice previo. Creando índice vacío...")
        # Embed a throwaway query only to discover the embedding dimension.
        dummy = self.embeddings.embed_query("init")
        # NOTE(review): the index is always L2 while ``distance_strategy`` comes
        # from DistanceStrategyManager; if that can be inner-product based, the
        # index type should probably follow it - confirm.
        index = faiss.IndexFlatL2(len(dummy))
        self.vectorstore = FAISS_STORE(
            embedding_function=self.embeddings,
            index=index,
            docstore=InMemoryDocstore(),
            index_to_docstore_id={},
            distance_strategy=self.strategy,
        )

    def create_vectorstore(self) -> None:
        """Split the documents under ``self.path`` into chunks, index and save them.

        Nothing is added or saved when no chunks are produced. The success
        message is logged only when the index was actually persisted.
        """
        logger.info(f"🚀 Procesando documentos en '{self.path}' para indexar...")
        docs = DocumentProcessor(self.path).files_to_texts()
        splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=400)
        chunks = splitter.split_documents(docs)
        if not chunks:
            # An empty batch gives the index nothing to add; skip instead of
            # handing it to the vector store.
            logger.warning(
                f"⚠️ No se encontraron fragmentos para indexar en '{self.path}'. "
                "No se modifica el índice."
            )
            return
        self.vectorstore.add_documents(chunks)
        if self.save_vectorstore():
            logger.success("🎯 Vectorstore creado y guardado correctamente.")

    def save_vectorstore(self) -> bool:
        """Persist the index to ``self.store_path``.

        Saving is best-effort: failures are logged rather than raised.

        Returns:
            True when the index was written, False when saving failed.
        """
        try:
            os.makedirs(self.store_path, exist_ok=True)
            self.vectorstore.save_local(self.store_path)
        except Exception as e:
            logger.error(f"❌ Error al guardar el vectorstore: {e}")
            return False
        logger.info(f"💾 Índice guardado en '{self.store_path}'.")
        return True

    def load_vectorstore(self):
        """Load and return the FAISS store saved at ``self.store_path``.

        Raises:
            Exception: any loading error is logged and re-raised.
        """
        try:
            logger.info(f"📂 Cargando vectorstore desde '{self.store_path}'.")
            # SECURITY: this enables pickle-based deserialization; only load
            # index folders that this application wrote itself.
            return FAISS_STORE.load_local(
                folder_path=self.store_path,
                embeddings=self.embeddings,
                allow_dangerous_deserialization=True,
                distance_strategy=self.strategy,
            )
        except Exception as e:
            logger.error(f"❌ Error al cargar el vectorstore: {e}")
            raise

    @staticmethod
    def _index_files_present(folder: str) -> bool:
        """Return True when ``folder`` contains every expected index file."""
        return all(
            os.path.isfile(os.path.join(folder, filename))
            for filename in VectorStoreManager._INDEX_FILES
        )

    def exist_vectorstore(self) -> bool:
        """Check whether a saved vectorstore exists, creating the base folder if needed.

        A store directory that lacks the index files (for example one left
        behind by a failed save) is reported as missing so that it is rebuilt
        instead of failing at load time.
        """
        base_dir = self._BASE_DIR

        if not os.path.isdir(base_dir):
            logger.warning(f"📂 Directorio base '{base_dir}' no encontrado. Creando...")
            os.makedirs(base_dir, exist_ok=True)
            return False

        if not os.path.isdir(self.store_path):
            logger.info(f"ℹ️ Vectorstore no existe aún en '{self.store_path}'.")
            return False

        if not self._index_files_present(self.store_path):
            logger.warning(
                f"⚠️ El directorio '{self.store_path}' existe pero no contiene "
                "un índice FAISS completo."
            )
            return False

        logger.info(f"✅ Vectorstore encontrado en '{self.store_path}'.")
        return True

    @staticmethod
    def _build_search_kwargs(
        search_type: str, k: int, score_threshold: float, extra: dict
    ) -> dict:
        """Assemble ``search_kwargs`` for a retriever of the given search type.

        ``score_threshold`` is included only for the threshold search type;
        for other types the vector store would presumably treat it as a raw
        score filter, which is not what the relevance threshold means.
        """
        search_kwargs = {**extra, "k": k}
        if search_type == VectorStoreManager._THRESHOLD_SEARCH:
            search_kwargs["score_threshold"] = score_threshold
        return search_kwargs

    def as_retriever(
        self,
        search_type: str = "similarity_score_threshold",
        k: int = 4,
        score_threshold: float = 0.75,
        fallback_to_similarity: bool = True,
        **kwargs,
    ):
        """Build a retriever over the store.

        Args:
            search_type: Search type forwarded to the vector store retriever.
            k: Number of documents to request.
            score_threshold: Threshold used when ``search_type`` is
                ``"similarity_score_threshold"``.
            fallback_to_similarity: When True, wrap the retriever so that an
                empty result triggers a plain ``"similarity"`` search.
            **kwargs: Extra entries forwarded as search kwargs to both the
                primary and the fallback retriever.

        Returns:
            The vector store retriever, or a wrapper exposing ``invoke(query)``
            when the fallback is enabled.
        """
        if self.vectorstore is None:
            self.vectorstore = self.load_vectorstore()

        logger.debug(
            f"🔍 Configurando retriever: type={search_type}, k={k}, threshold={score_threshold}"
        )
        retriever = self.vectorstore.as_retriever(
            search_type=search_type,
            search_kwargs=self._build_search_kwargs(
                search_type, k, score_threshold, kwargs
            ),
        )

        if not fallback_to_similarity:
            return retriever

        logger.info(
            "🛡️ Fallback activado: Si no hay resultados, se usará búsqueda por similarity."
        )
        fallback_retriever = self.vectorstore.as_retriever(
            search_type="similarity",
            search_kwargs=self._build_search_kwargs(
                "similarity", k, score_threshold, kwargs
            ),
        )
        return _SafeRetriever(retriever, fallback_retriever)