raghuv-aditya committed on
Commit
999f447
·
verified ·
1 Parent(s): b877b0a

Create embeddings.py

Browse files
Files changed (1) hide show
  1. embeddings.py +33 -0
embeddings.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ embeddings.py
3
+
4
+ Module for processing and storing document embeddings using ChromaDB.
5
+ """
6
+
7
+ import os
8
+ from langchain_openai import OpenAIEmbeddings
9
+ from langchain_chroma import Chroma
10
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
11
+
12
+ PERSIST_DIRECTORY = "./chroma_db/courses"
13
+
14
def process_documents_with_chroma(documents):
    """Return a Chroma vector store for the given documents.

    When ``PERSIST_DIRECTORY`` already exists on disk, the store is opened
    from that directory and ``documents`` is not used. Otherwise the
    documents are split into chunks (2000 characters, 100 overlap), embedded
    with ``OpenAIEmbeddings`` and handed to ``Chroma.from_documents`` with
    ``PERSIST_DIRECTORY`` as the persist directory.

    Args:
        documents (list): Documents to split and embed when no persisted
            directory exists yet.

    Returns:
        Chroma: Vector store with document embeddings.
    """
    store_already_persisted = os.path.exists(PERSIST_DIRECTORY)

    # Guard clause: reuse what is on disk rather than re-embedding.
    if store_already_persisted:
        print("Loading existing embeddings from ChromaDB...")
        return Chroma(
            persist_directory=PERSIST_DIRECTORY,
            embedding_function=OpenAIEmbeddings(),
        )

    print("Creating new embeddings and saving to ChromaDB...")
    chunks = RecursiveCharacterTextSplitter(
        chunk_size=2000,
        chunk_overlap=100,
    ).split_documents(documents)

    # The embedding model is built only after splitting, as in the original flow.
    return Chroma.from_documents(
        chunks,
        OpenAIEmbeddings(),
        persist_directory=PERSIST_DIRECTORY,
    )