# ChatIGL / langchain_src / vector_db.py
# Source-page residue kept for provenance: uploaded by Koshti10,
# commit 9610b37 ("Upload 51 files", verified), file size 1.12 kB.
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
import pandas as pd
def load_csv_data(file_path: str):
    """Load a CSV file and split its ``content`` column into documents.

    The text of every non-empty ``content`` cell is concatenated in row
    order (with no separator between rows) and the combined text is split
    into chunks of up to 1000 characters with a 200-character overlap.

    Args:
        file_path: Path of the CSV file to read. It must contain a
            ``content`` column.

    Returns:
        A list of ``Document`` objects, one per text chunk.

    Raises:
        KeyError: If the CSV has no ``content`` column.
    """
    df = pd.read_csv(file_path)

    # Empty cells are read as NaN (a float) and would break string
    # concatenation, so drop them; astype(str) guards against cells that
    # pandas parsed as numbers.
    contents = df["content"].dropna().astype(str)

    # A single join avoids the quadratic cost of repeated ``+=`` and the
    # per-row Series construction of ``df.iloc[i]``.
    texts = "".join(contents)

    # Split the combined text into smaller, overlapping chunks.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
    )
    return [
        Document(page_content=chunk)
        for chunk in text_splitter.split_text(texts)
    ]
def create_vector_db(docs: list[Document], persist_directory: str = "vector_db"):
    """Embed documents with ``OpenAIEmbeddings`` and index them in Chroma.

    Args:
        docs: Documents to embed and add to the vector store.
        persist_directory: Directory handed to Chroma as its persistence
            location. Defaults to ``"vector_db"``, the value that was
            previously hard-coded, so existing callers are unaffected.

    Returns:
        The Chroma vector store returned by ``Chroma.from_documents``.
    """
    return Chroma.from_documents(
        docs,
        embedding=OpenAIEmbeddings(),
        persist_directory=persist_directory,
    )
if __name__ == "__main__":
    # Script entry point: load and chunk the CSV at doc_ai/pdf_data.csv,
    # then build the vector store from the resulting documents.
    vector_db = create_vector_db(load_csv_data("doc_ai/pdf_data.csv"))