medivocate / src /vector_store /vector_store.py
anekameni
Refactor RAG system query methods; update descriptions and improve logging for better clarity
56d99ec
raw
history blame
1.74 kB
import os
from typing import List, Optional

from langchain_chroma import Chroma
from tqdm import tqdm

from ..utilities.llm_models import get_llm_model_embedding
class VectorStoreManager:
    """Create or load a persistent Chroma vector store of embedded documents.

    Documents are embedded and inserted in batches of ``batch_size``. The
    Chroma collection name is derived from the ``OLLAM_EMB`` environment
    variable (the text before the first ``:``, e.g. ``"nomic-embed-text"``
    from ``"nomic-embed-text:latest"``).
    """

    def __init__(self, docs_dir: str, persist_directory_dir: str, batch_size: int = 64):
        """Set up embeddings and configuration; does not touch disk yet.

        Args:
            docs_dir: Directory containing the source documents.
            persist_directory_dir: Directory where Chroma persists the store.
            batch_size: Number of documents embedded per insertion batch.

        Raises:
            KeyError: If the ``OLLAM_EMB`` environment variable is unset.
        """
        self.embeddings = get_llm_model_embedding()
        self.vector_store: Optional[Chroma] = None
        self.docs_dir = docs_dir
        self.persist_directory_dir = persist_directory_dir
        self.batch_size = batch_size
        # os.environ raises a clear KeyError naming the missing variable,
        # instead of the opaque AttributeError that os.getenv(...) .split()
        # produced when OLLAM_EMB was unset.
        self.collection_name = os.environ["OLLAM_EMB"].split(":")[0]

    def _batch_process_documents(self, documents: List) -> None:
        """Embed *documents* into the vector store in batches.

        The first batch creates (and persists) the collection via
        ``Chroma.from_documents``; subsequent batches are appended with
        ``add_documents``.
        """
        for start in tqdm(
            range(0, len(documents), self.batch_size), desc="Processing documents"
        ):
            batch = documents[start : start + self.batch_size]
            # Explicit None check: don't rely on the truthiness of a Chroma
            # instance to decide whether the store has been created yet.
            if self.vector_store is None:
                self.vector_store = Chroma.from_documents(
                    collection_name=self.collection_name,
                    documents=batch,
                    embedding=self.embeddings,
                    persist_directory=self.persist_directory_dir,
                )
            else:
                self.vector_store.add_documents(batch)

    def initialize_vector_store(self, documents: Optional[List] = None) -> None:
        """Build the store from *documents*, or load the persisted one.

        Args:
            documents: Documents to embed. When ``None`` (or empty), the
                existing persisted collection is opened instead.
        """
        if documents:
            self._batch_process_documents(documents)
        else:
            self.vector_store = Chroma(
                collection_name=self.collection_name,
                persist_directory=self.persist_directory_dir,
                embedding_function=self.embeddings,
            )