File size: 1,060 Bytes
a97d040
 
 
 
 
 
0a8d09f
 
 
a97d040
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
from .asg_loader import DocumentLoading
from langchain_text_splitters import RecursiveCharacterTextSplitter

class TextSplitting:
    """Load a PDF via ``DocumentLoading`` and split its text into chunks.

    Each public method pairs one loader with a
    ``RecursiveCharacterTextSplitter`` configured with that loader's chunk
    size and overlap.
    """

    @staticmethod
    def _split_text(text, chunk_size, chunk_overlap):
        """Split *text* into chunked documents.

        Args:
            text: The loaded document content, or ``None`` when loading
                failed. Presumably a single string, since it is passed to
                ``create_documents`` as a one-element list -- confirm
                against ``DocumentLoading``.
            chunk_size: Maximum chunk length, measured with ``len``.
            chunk_overlap: Number of characters shared by adjacent chunks.

        Returns:
            The list produced by ``create_documents``, or an empty list when
            *text* is ``None``.
        """
        if text is None:
            # If loading failed, return an empty list; the caller must
            # handle the "no chunks" case.
            return []
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            is_separator_regex=False,
        )
        return splitter.create_documents([text])

    def mineru_recursive_splitter(self, file_path, survey_id, mode):
        """Load a PDF with ``DocumentLoading.load_pdf`` and split it.

        Uses 400-character chunks with a 30-character overlap.

        Args:
            file_path: Forwarded unchanged to ``load_pdf``.
            survey_id: Forwarded unchanged to ``load_pdf``.
            mode: Forwarded unchanged to ``load_pdf``.

        Returns:
            The list of chunked documents, or ``[]`` when ``load_pdf``
            returns ``None``.
        """
        docs = DocumentLoading().load_pdf(file_path, survey_id, mode)
        return self._split_text(docs, chunk_size=400, chunk_overlap=30)

    def pypdf_recursive_splitter(self, file_path, survey_id):
        """Load a PDF with ``DocumentLoading.pypdf_loader`` and split it.

        Uses 300-character chunks with a 20-character overlap.

        Args:
            file_path: Forwarded unchanged to ``pypdf_loader``.
            survey_id: Forwarded unchanged to ``pypdf_loader``.

        Returns:
            The list of chunked documents, or ``[]`` when ``pypdf_loader``
            returns ``None`` (same contract as ``mineru_recursive_splitter``).
        """
        docs = DocumentLoading().pypdf_loader(file_path, survey_id)
        return self._split_text(docs, chunk_size=300, chunk_overlap=20)