# FastAPI / chroma_utils.py
# VishnuRamDebyez's picture
# Update chroma_utils.py
# 1f694ab verified
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, UnstructuredHTMLLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_chroma import Chroma
from typing import List
from langchain_core.documents import Document
import os
# Fail fast with a clear message when the API key is missing: assigning None
# into os.environ would otherwise raise an opaque
# "TypeError: str expected, not NoneType" at import time.
_google_api_key = os.getenv("GOOGLE_API_KEY")
if _google_api_key is None:
    raise RuntimeError("GOOGLE_API_KEY environment variable is not set")
os.environ["GOOGLE_API_KEY"] = _google_api_key

# Splitter shared by every document: chunks of up to 1000 units with a
# 200-unit overlap, where length is measured with len().
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)

# Embedding model used for everything written to or queried from the store.
embedding_function = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

# Module-wide Chroma store, configured with ./chroma_db as its persist directory.
vectorstore = Chroma(
    persist_directory="./chroma_db",
    embedding_function=embedding_function,
)
def load_and_split_document(file_path: str) -> List[Document]:
    """Load a PDF, DOCX, or HTML file and split it into text chunks.

    The loader is chosen from the file extension, which is compared
    case-insensitively so that names such as ``REPORT.PDF`` are accepted.

    Args:
        file_path: Path to the document to load.

    Returns:
        The chunks produced by the module-level ``text_splitter`` from the
        loaded document(s).

    Raises:
        ValueError: If the extension is not ``.pdf``, ``.docx`` or ``.html``.
    """
    # Match on a lower-cased copy only; the loader still receives the
    # original path so the file is opened under its real name.
    normalized = file_path.lower()
    if normalized.endswith('.pdf'):
        loader = PyPDFLoader(file_path)
    elif normalized.endswith('.docx'):
        loader = Docx2txtLoader(file_path)
    elif normalized.endswith('.html'):
        loader = UnstructuredHTMLLoader(file_path)
    else:
        raise ValueError(f"Unsupported file type: {file_path}")

    documents = loader.load()
    return text_splitter.split_documents(documents)
def index_document_to_chroma(file_path: str, file_id: int) -> bool:
    """Split a document and add its chunks to the vector store.

    Every chunk is tagged with ``file_id`` in its metadata before being
    added, so the chunks can later be looked up by that id.

    Args:
        file_path: Path of the document to index.
        file_id: Identifier stored under the ``file_id`` metadata key.

    Returns:
        True when the chunks were added; False if any step raised, in which
        case the error is printed rather than propagated.
    """
    try:
        chunks = load_and_split_document(file_path)
        # Tag each chunk with the id of the file it came from.
        for chunk in chunks:
            chunk.metadata['file_id'] = file_id
        vectorstore.add_documents(chunks)
    except Exception as e:
        print(f"Error indexing document: {e}")
        return False
    else:
        return True
def delete_doc_from_chroma(file_id: int):
    """Delete every stored chunk whose metadata ``file_id`` matches.

    The matching chunks are fetched first only to print how many were
    found; the deletion itself filters on the same ``file_id`` metadata.

    Args:
        file_id: Identifier of the file whose chunks should be removed.

    Returns:
        True when the delete call completed; False if any step raised, in
        which case the error is printed rather than propagated.
    """
    try:
        matches = vectorstore.get(where={"file_id": file_id})
        found = len(matches['ids'])
        print(f"Found {found} document chunks for file_id {file_id}")

        vectorstore._collection.delete(where={"file_id": file_id})
        print(f"Deleted all documents with file_id {file_id}")
    except Exception as e:
        print(f"Error deleting document with file_id {file_id} from Chroma: {str(e)}")
        return False
    else:
        return True