Spaces:
Running
Running
anekameni
committed on
Commit
·
f6d49e1
1
Parent(s):
9f75075
Refactor RAG system and vector store management; remove unused document loading methods and streamline initialization process
Browse files
- requirements.txt +0 -5
- src/rag_pipeline/rag_system.py +1 -10
- src/vector_store/vector_store.py +0 -46
requirements.txt
CHANGED
@@ -6,10 +6,5 @@ langchain
|
|
6 |
langchain-huggingface
|
7 |
ollama==0.4.5
|
8 |
chromadb==0.5.23
|
9 |
-
pdf2image==1.17.0
|
10 |
-
Pillow==11.1.0
|
11 |
-
easyocr==1.7.2
|
12 |
-
PyMuPDF==1.25.1
|
13 |
tqdm==4.67.1
|
14 |
-
keybert==0.8.5
|
15 |
gradio==5.9.1
|
|
|
6 |
langchain-huggingface
|
7 |
ollama==0.4.5
|
8 |
chromadb==0.5.23
|
|
|
|
|
|
|
|
|
9 |
tqdm==4.67.1
|
|
|
10 |
gradio==5.9.1
|
src/rag_pipeline/rag_system.py
CHANGED
@@ -33,10 +33,6 @@ class RAGSystem:
|
|
33 |
"""Initialize embeddings based on environment configuration"""
|
34 |
return get_llm_model_embedding()
|
35 |
|
36 |
-
def load_documents(self) -> List:
|
37 |
-
"""Load and split documents from the specified directory"""
|
38 |
-
return self.vector_store_management.load_documents()
|
39 |
-
|
40 |
def initialize_vector_store(self, documents: List = None):
|
41 |
"""Initialize or load the vector store"""
|
42 |
self.vector_store_management.initialize_vector_store(documents)
|
@@ -117,11 +113,6 @@ if __name__ == "__main__":
|
|
117 |
# Initialize RAG system
|
118 |
rag = RAGSystem(docs_dir, persist_directory_dir, batch_size)
|
119 |
|
120 |
-
|
121 |
-
rag.initialize_vector_store() # vector store initialized
|
122 |
-
else:
|
123 |
-
# Load and index documents
|
124 |
-
documents = rag.load_documents()
|
125 |
-
rag.initialize_vector_store(documents) # documents
|
126 |
|
127 |
print(rag.query("Quand a eu lieu la traite négrière ?"))
|
|
|
33 |
"""Initialize embeddings based on environment configuration"""
|
34 |
return get_llm_model_embedding()
|
35 |
|
|
|
|
|
|
|
|
|
36 |
def initialize_vector_store(self, documents: List = None):
|
37 |
"""Initialize or load the vector store"""
|
38 |
self.vector_store_management.initialize_vector_store(documents)
|
|
|
113 |
# Initialize RAG system
|
114 |
rag = RAGSystem(docs_dir, persist_directory_dir, batch_size)
|
115 |
|
116 |
+
rag.initialize_vector_store() # vector store initialized
|
|
|
|
|
|
|
|
|
|
|
117 |
|
118 |
print(rag.query("Quand a eu lieu la traite négrière ?"))
|
src/vector_store/vector_store.py
CHANGED
@@ -66,49 +66,3 @@ class VectorStoreManager:
|
|
66 |
persist_directory=self.persist_directory_dir,
|
67 |
embedding_function=self.embeddings,
|
68 |
)
|
69 |
-
|
70 |
-
def _load_text_documents(self) -> List:
|
71 |
-
"""*
|
72 |
-
Load and split documents from the specified directory
|
73 |
-
@TODO Move this function to chunking
|
74 |
-
"""
|
75 |
-
loader = DirectoryLoader(self.docs_dir, glob="**/*.txt", loader_cls=TextLoader)
|
76 |
-
documents = loader.load()
|
77 |
-
|
78 |
-
splitter = RecursiveCharacterTextSplitter(
|
79 |
-
chunk_size=1000,
|
80 |
-
chunk_overlap=200,
|
81 |
-
length_function=len,
|
82 |
-
)
|
83 |
-
return splitter.split_documents(documents)
|
84 |
-
|
85 |
-
def _load_json_documents(self) -> List:
|
86 |
-
"""*
|
87 |
-
Load and split documents from the specified directory
|
88 |
-
@TODO Move this function to chunking
|
89 |
-
"""
|
90 |
-
files = glob(os.path.join(self.docs_dir, "*.json"))
|
91 |
-
|
92 |
-
def load_json_file(file_path):
|
93 |
-
with open(file_path, "r") as f:
|
94 |
-
data = json.load(f)["kwargs"]
|
95 |
-
return Document.model_validate(
|
96 |
-
{**data, "metadata": sanitize_metadata(data["metadata"])}
|
97 |
-
)
|
98 |
-
|
99 |
-
with ThreadPoolExecutor() as executor:
|
100 |
-
documents = list(
|
101 |
-
tqdm(
|
102 |
-
executor.map(load_json_file, files),
|
103 |
-
total=len(files),
|
104 |
-
desc="Loading JSON documents",
|
105 |
-
)
|
106 |
-
)
|
107 |
-
|
108 |
-
return documents
|
109 |
-
|
110 |
-
def load_documents(self) -> List:
|
111 |
-
files = glob(os.path.join(self.docs_dir, "*.json"))
|
112 |
-
if len(files):
|
113 |
-
return self._load_json_documents()
|
114 |
-
return self._load_text_documents()
|
|
|
66 |
persist_directory=self.persist_directory_dir,
|
67 |
embedding_function=self.embeddings,
|
68 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|