InteractiveSurvey / src /demo /asg_splitter.py
technicolor's picture
update
0a8d09f
from .asg_loader import DocumentLoading
from langchain_text_splitters import RecursiveCharacterTextSplitter
class TextSplitting:
    """Load a PDF via ``DocumentLoading`` and cut the loaded text into chunks.

    Both public methods share the same splitter configuration and differ only
    in which loader they call and in the chunk size / overlap they use.
    """

    def _chunk(self, text, chunk_size, chunk_overlap):
        """Split ``text`` with a ``RecursiveCharacterTextSplitter``.

        Args:
            text: Loaded document content, passed to the splitter as a single
                input (presumably one string -- confirm against the loaders).
                ``None`` is treated as a failed load.
            chunk_size: Maximum chunk length, measured with ``len``.
            chunk_overlap: Overlap between consecutive chunks, measured with
                ``len``.

        Returns:
            The list produced by ``create_documents``, or an empty list when
            ``text`` is ``None``.
        """
        if text is None:
            # Loading failed: return an empty list; the caller must handle it.
            return []
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            is_separator_regex=False,
        )
        return splitter.create_documents([text])

    def mineru_recursive_splitter(self, file_path, survey_id, mode):
        """Load a PDF with ``DocumentLoading.load_pdf`` and split it.

        Uses a chunk size of 400 with an overlap of 30.

        Args:
            file_path: Forwarded unchanged to ``load_pdf``.
            survey_id: Forwarded unchanged to ``load_pdf``.
            mode: Forwarded unchanged to ``load_pdf``.

        Returns:
            The list of chunks, or an empty list when the loader returns
            ``None``.
        """
        docs = DocumentLoading().load_pdf(file_path, survey_id, mode)
        return self._chunk(docs, chunk_size=400, chunk_overlap=30)

    def pypdf_recursive_splitter(self, file_path, survey_id):
        """Load a PDF with ``DocumentLoading.pypdf_loader`` and split it.

        Uses a chunk size of 300 with an overlap of 20.

        Args:
            file_path: Forwarded unchanged to ``pypdf_loader``.
            survey_id: Forwarded unchanged to ``pypdf_loader``.

        Returns:
            The list of chunks, or an empty list when the loader returns
            ``None`` (same failure contract as ``mineru_recursive_splitter``).
        """
        docs = DocumentLoading().pypdf_loader(file_path, survey_id)
        return self._chunk(docs, chunk_size=300, chunk_overlap=20)