Spaces:
Sleeping
Sleeping
File size: 1,060 Bytes
a97d040 0a8d09f a97d040 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 |
from .asg_loader import DocumentLoading
from langchain_text_splitters import RecursiveCharacterTextSplitter
class TextSplitting:
    """Load a PDF through ``DocumentLoading`` and split its text into chunks.

    Both public methods delegate the chunking to LangChain's
    ``RecursiveCharacterTextSplitter``; they differ only in which loader
    they call and in the chunk size / overlap they use.
    """

    @staticmethod
    def _split(text, chunk_size, chunk_overlap):
        """Split ``text`` into chunks, measured in characters via ``len``.

        Args:
            text: The loaded document content, passed to the splitter as a
                single text (presumably one string; confirm against the
                loaders). ``None`` is treated as a failed load.
            chunk_size: Maximum chunk length handed to the splitter.
            chunk_overlap: Overlap between consecutive chunks.

        Returns:
            Whatever ``create_documents`` returns for the single input text,
            or an empty list when ``text`` is ``None``.
        """
        if text is None:
            # Loading failed: return an empty list; the caller must handle it.
            return []
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            is_separator_regex=False,
        )
        return splitter.create_documents([text])

    def mineru_recursive_splitter(self, file_path, survey_id, mode):
        """Load a PDF with ``DocumentLoading.load_pdf`` and split it.

        Uses a chunk size of 400 with an overlap of 30.

        Args:
            file_path: Forwarded unchanged to ``load_pdf``.
            survey_id: Forwarded unchanged to ``load_pdf``.
            mode: Forwarded unchanged to ``load_pdf``.

        Returns:
            The list of chunks, or ``[]`` if the loader returned ``None``.
        """
        docs = DocumentLoading().load_pdf(file_path, survey_id, mode)
        return self._split(docs, chunk_size=400, chunk_overlap=30)

    def pypdf_recursive_splitter(self, file_path, survey_id):
        """Load a PDF with ``DocumentLoading.pypdf_loader`` and split it.

        Uses a chunk size of 300 with an overlap of 20.

        Args:
            file_path: Forwarded unchanged to ``pypdf_loader``.
            survey_id: Forwarded unchanged to ``pypdf_loader``.

        Returns:
            The list of chunks, or ``[]`` if the loader returned ``None``
            (same contract as ``mineru_recursive_splitter``).
        """
        docs = DocumentLoading().pypdf_loader(file_path, survey_id)
        return self._split(docs, chunk_size=300, chunk_overlap=20)