VishnuRamDebyez committed on
Commit
0de1017
·
verified ·
1 Parent(s): df5c110

Create chroma_utils.py

Browse files
Files changed (1) hide show
  1. chroma_utils.py +52 -0
chroma_utils.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, UnstructuredHTMLLoader
2
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
3
+ from langchain_openai import OpenAIEmbeddings
4
+ from langchain_chroma import Chroma
5
+ from typing import List
6
+ from langchain_core.documents import Document
7
+ import os
8
+
9
# Splitter used for every loaded document: chunks of up to 1000 units with
# a 200-unit overlap, where length is measured with the built-in ``len``.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)

# Embedding function and the Chroma vector store (backed by the
# "./chroma_db" directory) used by the indexing and deletion helpers in
# this module.
embedding_function = OpenAIEmbeddings()
vectorstore = Chroma(
    persist_directory="./chroma_db",
    embedding_function=embedding_function,
)
12
+
13
def load_and_split_document(file_path: str) -> List[Document]:
    """Load a document from disk and split it into chunks.

    The loader is selected from the file extension: ``.pdf`` uses
    ``PyPDFLoader``, ``.docx`` uses ``Docx2txtLoader`` and ``.html`` uses
    ``UnstructuredHTMLLoader``. The extension is compared
    case-insensitively, so ``REPORT.PDF`` is treated like ``report.pdf``.

    Args:
        file_path: Path of the file to load.

    Returns:
        The chunks produced by the module-level ``text_splitter`` from the
        loaded documents.

    Raises:
        ValueError: If the extension is not one of the supported types.
    """
    # Normalise case only for the comparison; the loader still receives the
    # path exactly as the caller supplied it.
    normalized_path = file_path.lower()

    if normalized_path.endswith('.pdf'):
        loader = PyPDFLoader(file_path)
    elif normalized_path.endswith('.docx'):
        loader = Docx2txtLoader(file_path)
    elif normalized_path.endswith('.html'):
        loader = UnstructuredHTMLLoader(file_path)
    else:
        raise ValueError(f"Unsupported file type: {file_path}")

    documents = loader.load()
    return text_splitter.split_documents(documents)
25
+
26
def index_document_to_chroma(file_path: str, file_id: int) -> bool:
    """Split a file into chunks and add them to the vector store.

    Each chunk gets ``file_id`` stored under the ``"file_id"`` metadata key
    before it is added, so the chunks can later be looked up by that id.

    Args:
        file_path: Path of the document to index.
        file_id: Identifier recorded in every chunk's metadata.

    Returns:
        True when the chunks were added; False if any exception was raised
        while loading, splitting or adding (the error is printed).
    """
    try:
        chunks = load_and_split_document(file_path)

        # Tag every chunk with the id of the file it came from.
        for chunk in chunks:
            chunk.metadata["file_id"] = file_id

        vectorstore.add_documents(chunks)
    except Exception as exc:
        print(f"Error indexing document: {exc}")
        return False
    return True
40
+
41
def delete_doc_from_chroma(file_id: int):
    """Remove every chunk tagged with ``file_id`` from the vector store.

    The matching chunks are fetched first only so that their count can be
    printed; the deletion itself is issued with the same metadata filter.

    Args:
        file_id: Value of the ``"file_id"`` metadata key to match.

    Returns:
        True when the delete call completed; False if any exception was
        raised (the error is printed).
    """
    file_filter = {"file_id": file_id}
    try:
        matches = vectorstore.get(where=file_filter)
        chunk_count = len(matches["ids"])
        print(f"Found {chunk_count} document chunks for file_id {file_id}")

        # Deletion goes through the underlying collection object, filtered
        # by metadata rather than by explicit ids.
        vectorstore._collection.delete(where=file_filter)
        print(f"Deleted all documents with file_id {file_id}")
    except Exception as exc:
        print(f"Error deleting document with file_id {file_id} from Chroma: {exc!s}")
        return False
    return True