pdfchat / ingest_data.py
fakezeta
switching to intfloat/e5-base embedding model
3536102
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings, SentenceTransformerEmbeddings
import os
import time
import streamlit as st
def _report_elapsed(label, started):
    """Write ``label`` and the seconds elapsed since ``started`` to the Streamlit page.

    ``started`` must be a value previously obtained from ``time.perf_counter()``.
    The elapsed time is rounded to one decimal place.
    """
    st.text(f"{label}: {round(time.perf_counter() - started, 1)}")


def embed_doc(filename):
    """Load a PDF, split it into chunks and index the chunks in a Chroma store.

    The duration of each stage is written to the Streamlit page via ``st.text``.

    Args:
        filename: Location of the PDF; passed unchanged to ``PyPDFLoader``.

    Returns:
        The vector store built by ``Chroma.from_documents``, or ``None`` when
        the current working directory contains no entries (see the guard below).
    """
    # NOTE(review): this guard inspects the current working directory, not
    # ``filename``. It is kept as-is for backward compatibility, but it does
    # not verify that the PDF itself exists -- confirm what it was meant to
    # protect against.
    if not os.listdir("."):
        return None

    loader = PyPDFLoader(filename)

    # perf_counter() is monotonic, so the reported intervals cannot be skewed
    # by system clock adjustments the way time.time() differences can.
    started = time.perf_counter()
    raw_documents = loader.load()
    # Split text into chunks of at most 1000 characters (length_function=len)
    # with no overlap between consecutive chunks.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=0,
        length_function=len,
    )
    documents = text_splitter.split_documents(raw_documents)
    _report_elapsed("Load and split text", started)

    # Only the construction of the embeddings object is timed here; the
    # documents are handed to it in the Chroma step below.
    # NOTE(review): the E5 model card describes "query: " / "passage: " input
    # prefixes; nothing in this function adds them -- confirm that is intended.
    started = time.perf_counter()
    embeddings = HuggingFaceEmbeddings(model_name="intfloat/e5-base")
    _report_elapsed("Embedding time", started)

    started = time.perf_counter()
    vectorstore = Chroma.from_documents(documents, embeddings)
    _report_elapsed("Vectorizing time", started)

    return vectorstore