anekameni committed on
Commit
f6d49e1
·
1 Parent(s): 9f75075

Refactor RAG system and vector store management; remove unused document loading methods and streamline initialization process

Browse files
requirements.txt CHANGED
@@ -6,10 +6,5 @@ langchain
6
  langchain-huggingface
7
  ollama==0.4.5
8
  chromadb==0.5.23
9
- pdf2image==1.17.0
10
- Pillow==11.1.0
11
- easyocr==1.7.2
12
- PyMuPDF==1.25.1
13
  tqdm==4.67.1
14
- keybert==0.8.5
15
  gradio==5.9.1
 
6
  langchain-huggingface
7
  ollama==0.4.5
8
  chromadb==0.5.23
 
 
 
 
9
  tqdm==4.67.1
 
10
  gradio==5.9.1
src/rag_pipeline/rag_system.py CHANGED
@@ -33,10 +33,6 @@ class RAGSystem:
33
  """Initialize embeddings based on environment configuration"""
34
  return get_llm_model_embedding()
35
 
36
- def load_documents(self) -> List:
37
- """Load and split documents from the specified directory"""
38
- return self.vector_store_management.load_documents()
39
-
40
  def initialize_vector_store(self, documents: List = None):
41
  """Initialize or load the vector store"""
42
  self.vector_store_management.initialize_vector_store(documents)
@@ -117,11 +113,6 @@ if __name__ == "__main__":
117
  # Initialize RAG system
118
  rag = RAGSystem(docs_dir, persist_directory_dir, batch_size)
119
 
120
- if len(glob(os.path.join(persist_directory_dir, "*/*.bin"))):
121
- rag.initialize_vector_store() # vector store initialized
122
- else:
123
- # Load and index documents
124
- documents = rag.load_documents()
125
- rag.initialize_vector_store(documents) # documents
126
 
127
  print(rag.query("Quand a eu lieu la traite négrière ?"))
 
33
  """Initialize embeddings based on environment configuration"""
34
  return get_llm_model_embedding()
35
 
 
 
 
 
36
  def initialize_vector_store(self, documents: List = None):
37
  """Initialize or load the vector store"""
38
  self.vector_store_management.initialize_vector_store(documents)
 
113
  # Initialize RAG system
114
  rag = RAGSystem(docs_dir, persist_directory_dir, batch_size)
115
 
116
+ rag.initialize_vector_store() # vector store initialized
 
 
 
 
 
117
 
118
  print(rag.query("Quand a eu lieu la traite négrière ?"))
src/vector_store/vector_store.py CHANGED
@@ -66,49 +66,3 @@ class VectorStoreManager:
66
  persist_directory=self.persist_directory_dir,
67
  embedding_function=self.embeddings,
68
  )
69
-
70
- def _load_text_documents(self) -> List:
71
- """*
72
- Load and split documents from the specified directory
73
- @TODO Move this function to chunking
74
- """
75
- loader = DirectoryLoader(self.docs_dir, glob="**/*.txt", loader_cls=TextLoader)
76
- documents = loader.load()
77
-
78
- splitter = RecursiveCharacterTextSplitter(
79
- chunk_size=1000,
80
- chunk_overlap=200,
81
- length_function=len,
82
- )
83
- return splitter.split_documents(documents)
84
-
85
- def _load_json_documents(self) -> List:
86
- """*
87
- Load and split documents from the specified directory
88
- @TODO Move this function to chunking
89
- """
90
- files = glob(os.path.join(self.docs_dir, "*.json"))
91
-
92
- def load_json_file(file_path):
93
- with open(file_path, "r") as f:
94
- data = json.load(f)["kwargs"]
95
- return Document.model_validate(
96
- {**data, "metadata": sanitize_metadata(data["metadata"])}
97
- )
98
-
99
- with ThreadPoolExecutor() as executor:
100
- documents = list(
101
- tqdm(
102
- executor.map(load_json_file, files),
103
- total=len(files),
104
- desc="Loading JSON documents",
105
- )
106
- )
107
-
108
- return documents
109
-
110
- def load_documents(self) -> List:
111
- files = glob(os.path.join(self.docs_dir, "*.json"))
112
- if len(files):
113
- return self._load_json_documents()
114
- return self._load_text_documents()
 
66
  persist_directory=self.persist_directory_dir,
67
  embedding_function=self.embeddings,
68
  )