import os
from typing import List

from langchain_community.document_loaders import (
    PyPDFLoader,
    Docx2txtLoader,
    UnstructuredHTMLLoader,
)
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_chroma import Chroma
from langchain_core.documents import Document

# GoogleGenerativeAIEmbeddings reads GOOGLE_API_KEY from the environment.
# Fail fast with a clear error instead of a confusing auth failure later.
if not os.getenv("GOOGLE_API_KEY"):
    raise EnvironmentError("GOOGLE_API_KEY environment variable is not set")

# Split documents into overlapping chunks so retrieval returns focused
# passages that still carry enough surrounding context.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)

embedding_function = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

# Chroma persists automatically when persist_directory is set.
vectorstore = Chroma(
    persist_directory="./chroma_db",
    embedding_function=embedding_function,
)


def load_and_split_document(file_path: str) -> List[Document]:
    """Load a PDF, DOCX, or HTML file and split it into chunks."""
    if file_path.endswith(".pdf"):
        loader = PyPDFLoader(file_path)
    elif file_path.endswith(".docx"):
        loader = Docx2txtLoader(file_path)
    elif file_path.endswith(".html"):
        loader = UnstructuredHTMLLoader(file_path)
    else:
        raise ValueError(f"Unsupported file type: {file_path}")

    documents = loader.load()
    return text_splitter.split_documents(documents)


def index_document_to_chroma(file_path: str, file_id: int) -> bool:
    """Chunk a document and add it to Chroma, tagging each chunk with file_id."""
    try:
        splits = load_and_split_document(file_path)

        # Tag each chunk with its source file_id so the whole document
        # can be located and deleted later.
        for split in splits:
            split.metadata["file_id"] = file_id

        vectorstore.add_documents(splits)
        # No explicit persist() call is needed: langchain_chroma writes to
        # persist_directory automatically.
        return True
    except Exception as e:
        print(f"Error indexing document: {e}")
        return False


def delete_doc_from_chroma(file_id: int) -> bool:
    """Delete every chunk whose metadata matches the given file_id."""
    try:
        docs = vectorstore.get(where={"file_id": file_id})
        print(f"Found {len(docs['ids'])} document chunks for file_id {file_id}")

        # Drop down to the underlying chromadb collection, which supports
        # metadata-filtered deletes (the LangChain wrapper deletes by id only).
        vectorstore._collection.delete(where={"file_id": file_id})
        print(f"Deleted all documents with file_id {file_id}")
        return True
    except Exception as e:
        print(f"Error deleting document with file_id {file_id} from Chroma: {str(e)}")
        return False
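

# --- Usage sketch (illustrative only, not part of the module API) ---
# A minimal smoke test of the index/delete round trip, assuming a local
# document exists. The path "sample.pdf" and file_id 1 are hypothetical
# placeholders; in a real app the file_id would come from your database.
if __name__ == "__main__":
    demo_path = "sample.pdf"  # hypothetical: any local .pdf/.docx/.html file
    demo_file_id = 1          # hypothetical: unique id from your app's records

    if index_document_to_chroma(demo_path, demo_file_id):
        # Confirm the chunks landed by querying with a metadata filter,
        # then clean up. `filter` narrows the search to this document.
        results = vectorstore.similarity_search(
            "test query", k=2, filter={"file_id": demo_file_id}
        )
        print(f"Retrieved {len(results)} chunk(s) for file_id {demo_file_id}")
        delete_doc_from_chroma(demo_file_id)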