Spaces:
Sleeping
Sleeping
""" | |
embeddings.py
Module for processing and storing document embeddings using ChromaDB.
""" | |
import os | |
from langchain_openai import OpenAIEmbeddings | |
from langchain_chroma import Chroma | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
# Directory where the Chroma vector store is persisted on disk.
# process_documents_with_chroma inspects this path to decide whether to load
# existing embeddings or create new ones.
PERSIST_DIRECTORY = "./chroma_db/courses"
def process_documents_with_chroma(documents):
    """Load the persisted Chroma vector store, or build it from ``documents``.

    If ``PERSIST_DIRECTORY`` is a directory that already contains entries,
    the existing store is opened and ``documents`` is not used. Otherwise the
    documents are split into chunks (``chunk_size=2000``,
    ``chunk_overlap=100``), embedded with ``OpenAIEmbeddings`` and written to
    ``PERSIST_DIRECTORY``.

    Args:
        documents (list): List of documents to be embedded. Only consulted
            when no persisted store is found.

    Returns:
        Chroma: Vector store with document embeddings.
    """
    # A bare existence check would treat an empty directory (for example one
    # pre-created by a volume mount or left behind by an aborted run) as a
    # populated store, so the documents would never be embedded. Require at
    # least one entry inside the directory before taking the load path.
    has_persisted_store = os.path.isdir(PERSIST_DIRECTORY) and bool(
        os.listdir(PERSIST_DIRECTORY)
    )

    if has_persisted_store:
        print("Loading existing embeddings from ChromaDB...")
        return Chroma(
            persist_directory=PERSIST_DIRECTORY,
            embedding_function=OpenAIEmbeddings(),
        )

    print("Creating new embeddings and saving to ChromaDB...")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=2000,
        chunk_overlap=100,
    )
    texts = text_splitter.split_documents(documents)
    embeddings = OpenAIEmbeddings()
    return Chroma.from_documents(
        texts,
        embeddings,
        persist_directory=PERSIST_DIRECTORY,
    )