from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
import pandas as pd


def load_csv_data(
    file_path: str,
    *,
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
) -> list[Document]:
    """Load a CSV file and split its ``content`` column into documents.

    Every non-missing cell of the ``content`` column is concatenated, in row
    order, into one string. That string is split with a
    ``RecursiveCharacterTextSplitter`` and each resulting chunk is wrapped in
    a ``Document``.

    Args:
        file_path: Path of the CSV file, passed to ``pandas.read_csv``.
        chunk_size: ``chunk_size`` given to the text splitter.
        chunk_overlap: ``chunk_overlap`` given to the text splitter.

    Returns:
        One ``Document`` per chunk produced by the splitter.

    Raises:
        KeyError: If the CSV has no ``content`` column.
    """
    df = pd.read_csv(file_path)

    # Missing cells are read as NaN (a float); drop them and coerce the rest
    # to str so that joining cannot fail on a non-string value.
    cells = df["content"].dropna().astype(str)

    # Single join instead of repeated ``+=`` in a row-index loop.
    # NOTE(review): rows are joined with no separator, so the last word of one
    # row runs into the first word of the next - confirm this is intended.
    texts = "".join(cells)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    split_texts = text_splitter.split_text(texts)

    return [Document(page_content=text) for text in split_texts]


def create_vector_db(
    docs: list[Document],
    persist_directory: str = "vector_db",
) -> Chroma:
    """Build a Chroma vector store from ``docs``.

    The documents are embedded with a default-constructed ``OpenAIEmbeddings``
    instance.

    Args:
        docs: Documents to index.
        persist_directory: Value forwarded to ``Chroma.from_documents`` as
            ``persist_directory``.

    Returns:
        The ``Chroma`` instance returned by ``Chroma.from_documents``.
    """
    vector_db = Chroma.from_documents(
        docs,
        embedding=OpenAIEmbeddings(),
        persist_directory=persist_directory,
    )
    return vector_db


if __name__ == "__main__":
    docs = load_csv_data("doc_ai/pdf_data.csv")
    vector_db = create_vector_db(docs)