from dataclasses import dataclass
import re
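
# A simple two-stage text chunker:
#   1. _split: recursively break text into sentence / sub-sentence pieces that
#      each fit within chunk_size (approximate) tokens
#   2. _merge: greedily merge those pieces back into chunks, carrying up to
#      chunk_overlap tokens over from the end of the previous chunk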

# Data class for representing a text split
@dataclass
class Split:
    text: str  # the split text
    is_sentence: bool  # save whether this is a full sentence

# Data class for representing a document
@dataclass
class Document:
    doc_id: str
    text: str
    metadata: dict

# Class for splitting text into sentences
class SentenceSplitter:
    def __init__(self, chunk_size=100, chunk_overlap=50):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        # List of functions for splitting text
        self._split_fn_sentence = [self._split_by_sep('\n\n'), self._split_by_regex("[^,.;。？！]+[,.;。？！]?")]
        self._split_fn_subsentence = [self._split_by_sep(' ')]

    def _split_by_sep(self, sep):
        # Split text by separator and maintain the separator
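        # e.g. _split_by_sep('\n\n')('a\n\nb') -> ['a', '\n\nb']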
        def fun(text):
            parts = text.split(sep)
            result = [sep + s if i > 0 else s for i, s in enumerate(parts)]
            return [s for s in result if s]
        return fun

    def _split_by_regex(self, regex):
        # Split text using a regular expression
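        # e.g. with the sentence regex above:
        #   "Hello there. How are you" -> ['Hello there.', ' How are you']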
        return lambda text: re.findall(regex, text)

    def _splits_by_fns(self, text):
        # Try the sentence-level split functions first; if none produce more than
        # one piece, fall back to the sub-sentence (word-level) split functions
        for split_fn in self._split_fn_sentence:
            splits = split_fn(text)
            if len(splits) > 1:
                return splits, True

        for split_fn in self._split_fn_subsentence:
            splits = split_fn(text)
            if len(splits) > 1:
                break

        return splits, False

    def _token_size(self, text):
        # Approximate the token count as the number of whitespace-separated words
        return len(text.split(' '))

    def _split(self, text, chunk_size):
        # Break text into splits that are smaller than chunk size
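        # (first on '\n\n', then on sentence punctuation, then on spaces,
        #  recursing until every piece fits within chunk_size tokens)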
        if self._token_size(text) <= chunk_size:
            return [Split(text, is_sentence=True)]

        text_splits = []
        text_splits_by_fns, is_sentence = self._splits_by_fns(text)
        for text_split_by_fns in text_splits_by_fns:
            if self._token_size(text_split_by_fns) <= chunk_size:
                text_splits.append(Split(text_split_by_fns, is_sentence=is_sentence))
            else:
                recursive_text_splits = self._split(text_split_by_fns, chunk_size=chunk_size)
                text_splits.extend(recursive_text_splits)
        return text_splits

    def _merge(self, splits, chunk_size):
        # Merge splits into chunks
        chunks, cur_chunk, last_chunk = [], [], []
        cur_chunk_len = 0
        new_chunk = True

        def close_chunk():
            nonlocal chunks, cur_chunk, last_chunk, cur_chunk_len, new_chunk

            chunks.append("".join([text for text, length in cur_chunk]))
            last_chunk = cur_chunk
            cur_chunk = []
            cur_chunk_len = 0
            new_chunk = True
            # Add overlap to the new chunk from previous chunks
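            # (copy splits from the end of the just-closed chunk as long as the
            #  total stays within the chunk_overlap token budget)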
            if len(last_chunk) > 0:
                last_index = len(last_chunk) - 1
                while (
                    last_index >= 0
                    and cur_chunk_len + last_chunk[last_index][1] <= self.chunk_overlap
                ):
                    text, length = last_chunk[last_index]
                    cur_chunk_len += length
                    cur_chunk.insert(0, (text, length))
                    last_index -= 1

        while len(splits) > 0:
            cur_split = splits[0]
            cur_split_len = self._token_size(cur_split.text)

            # Close the chunk if it exceeds chunk_size
            if cur_chunk_len + cur_split_len > chunk_size and not new_chunk:
                close_chunk()
            else:
                if (
                    cur_split.is_sentence
                    or cur_chunk_len + cur_split_len <= chunk_size
                    or new_chunk  # new chunk, always add at least one split
                ):
                    # Add split to chunk
                    cur_chunk_len += cur_split_len
                    cur_chunk.append((cur_split.text, cur_split_len))
                    splits.pop(0)
                    new_chunk = False
                else:
                    # Close out the chunk
                    close_chunk()

        # Handle the last chunk
        if not new_chunk:
            chunk = "".join([text for text, length in cur_chunk])
            chunks.append(chunk)

        # Run post-processing to remove blank spaces
        new_chunks = [chunk.strip() for chunk in chunks if chunk.strip() != ""]
        return new_chunks

    def split_texts(self, documents):
        # Split a list of page-level documents (dicts with 'text' and 'metadata')
        # into overlapping chunks, returned as Document objects
        chunked_documents = []
        for page_no, document in enumerate(documents):
            text, metadata = document['text'], document['metadata']
            if text == "":
                continue
            splits = self._split(text, self.chunk_size)
            chunks = self._merge(splits, self.chunk_size)
            for chunk_no, chunk in enumerate(chunks):
                chunk_id = f"{metadata['file_name']}__{page_no}__{chunk_no}"
                chunk_metadata = {'file_name': metadata['file_name'], 'page_no': page_no, 'chunk_no': chunk_no}
                data = Document(chunk_id, chunk, chunk_metadata)
                chunked_documents.append(data)
        return chunked_documents

if __name__ == '__main__':
    document = {
        "text": "This is example texts",
        "metadata": {"file_name": "example.pdf", "page_no": 1}
    }
    documents = [document] * 10
    splitter = SentenceSplitter(chunk_size=100, chunk_overlap=30)
    split_documents = splitter.split_texts(documents)

    print(split_documents[0])
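    # Prints something like:
    # Document(doc_id='example.pdf__0__0', text='This is an example text.',
    #          metadata={'file_name': 'example.pdf', 'page_no': 0, 'chunk_no': 0})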