Spaces:

alexneakameni
/

medivocate

Running

App Files Community

anekameni committed 29 days ago

Commit

f6d49e1

1 Parent(s): 9f75075

Refactor RAG system and vector store management; remove unused document loading methods and streamline initialization process

Browse files

Files changed (3) hide show

requirements.txt +0 -5
src/rag_pipeline/rag_system.py +1 -10
src/vector_store/vector_store.py +0 -46

requirements.txt CHANGED Viewed

@@ -6,10 +6,5 @@ langchain
 langchain-huggingface
 ollama==0.4.5
 chromadb==0.5.23
-pdf2image==1.17.0
-Pillow==11.1.0
-easyocr==1.7.2
-PyMuPDF==1.25.1
 tqdm==4.67.1
-keybert==0.8.5
 gradio==5.9.1

 langchain-huggingface
 ollama==0.4.5
 chromadb==0.5.23
 tqdm==4.67.1
 gradio==5.9.1

src/rag_pipeline/rag_system.py CHANGED Viewed

@@ -33,10 +33,6 @@ class RAGSystem:
         """Initialize embeddings based on environment configuration"""
         return get_llm_model_embedding()
-    def load_documents(self) -> List:
-        """Load and split documents from the specified directory"""
-        return self.vector_store_management.load_documents()
     def initialize_vector_store(self, documents: List = None):
         """Initialize or load the vector store"""
         self.vector_store_management.initialize_vector_store(documents)
@@ -117,11 +113,6 @@ if __name__ == "__main__":
     # Initialize RAG system
     rag = RAGSystem(docs_dir, persist_directory_dir, batch_size)
-    if len(glob(os.path.join(persist_directory_dir, "*/*.bin"))):
-        rag.initialize_vector_store()  # vector store initialized
-    else:
-        # Load and index documents
-        documents = rag.load_documents()
-        rag.initialize_vector_store(documents)  # documents
     print(rag.query("Quand a eu lieu la traite négrière ?"))

         """Initialize embeddings based on environment configuration"""
         return get_llm_model_embedding()
     def initialize_vector_store(self, documents: List = None):
         """Initialize or load the vector store"""
         self.vector_store_management.initialize_vector_store(documents)
     # Initialize RAG system
     rag = RAGSystem(docs_dir, persist_directory_dir, batch_size)
+    rag.initialize_vector_store()  # vector store initialized
     print(rag.query("Quand a eu lieu la traite négrière ?"))

src/vector_store/vector_store.py CHANGED Viewed

@@ -66,49 +66,3 @@ class VectorStoreManager:
                 persist_directory=self.persist_directory_dir,
                 embedding_function=self.embeddings,
             )
-    def _load_text_documents(self) -> List:
-        """*
-        Load and split documents from the specified directory
-        @TODO Move this function to chunking
-        """
-        loader = DirectoryLoader(self.docs_dir, glob="**/*.txt", loader_cls=TextLoader)
-        documents = loader.load()
-        splitter = RecursiveCharacterTextSplitter(
-            chunk_size=1000,
-            chunk_overlap=200,
-            length_function=len,
-        )
-        return splitter.split_documents(documents)
-    def _load_json_documents(self) -> List:
-        """*
-        Load and split documents from the specified directory
-        @TODO Move this function to chunking
-        """
-        files = glob(os.path.join(self.docs_dir, "*.json"))
-        def load_json_file(file_path):
-            with open(file_path, "r") as f:
-                data = json.load(f)["kwargs"]
-            return Document.model_validate(
-                {**data, "metadata": sanitize_metadata(data["metadata"])}
-            )
-        with ThreadPoolExecutor() as executor:
-            documents = list(
-                tqdm(
-                    executor.map(load_json_file, files),
-                    total=len(files),
-                    desc="Loading JSON documents",
-                )
-            )
-        return documents
-    def load_documents(self) -> List:
-        files = glob(os.path.join(self.docs_dir, "*.json"))
-        if len(files):
-            return self._load_json_documents()
-        return self._load_text_documents()

                 persist_directory=self.persist_directory_dir,
                 embedding_function=self.embeddings,
             )