PTE_Smart-AI / document_processor.py
JohnsonMLEngineer's picture
Update document_processor.py
4ac5387 verified
raw
history blame contribute delete
373 Bytes
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
def read_documents(directory):
    """Load PDF content into documents via ``PyPDFLoader``.

    Args:
        directory: Either a directory containing PDF files or anything
            ``PyPDFLoader`` itself accepts (the original behaviour).
            When it is an existing directory, every regular file directly
            inside it whose name ends in ``.pdf`` (case-insensitive) is
            loaded, in sorted filename order. Sub-directories are not
            searched.

    Returns:
        A list with the results of ``PyPDFLoader(...).load()``; for a
        directory, the per-file results are concatenated. An empty list
        is returned for a directory that holds no PDF files.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import section.
    import os

    # Anything that is not an existing directory is handed to PyPDFLoader
    # unchanged, so callers passing a single path keep the old behaviour.
    if not os.path.isdir(directory):
        return PyPDFLoader(directory).load()

    pdf_paths = sorted(
        os.path.join(directory, entry)
        for entry in os.listdir(directory)
        if entry.lower().endswith(".pdf")
    )

    documents = []
    for pdf_path in pdf_paths:
        # Skip directories that merely happen to be named "*.pdf".
        if os.path.isfile(pdf_path):
            documents.extend(PyPDFLoader(pdf_path).load())
    return documents
def chunk_data(docs, chunk_size=800, chunk_overlap=40):
    """Split documents with a ``RecursiveCharacterTextSplitter``.

    Args:
        docs: Documents to split; passed as-is to ``split_documents``.
        chunk_size: Forwarded to the splitter as ``chunk_size``
            (default 800).
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``
            (default 40).

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_documents``
        returns for ``docs``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = splitter.split_documents(docs)
    return chunks