from dataclasses import dataclass
import re


# Data class for representing a text split
@dataclass
class Split:
    text: str           # the split text
    is_sentence: bool   # whether this split is a full sentence


# Data class for representing a document
@dataclass
class Document:
    doc_id: str
    text: str
    metadata: dict


# Class for splitting text into sentence-based chunks
class SentenceSplitter:
    def __init__(self, chunk_size=100, chunk_overlap=50):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        # Splitting functions, tried in order: paragraph breaks first, then
        # sentence-ending punctuation (including fullwidth CJK marks)
        self._split_fn_sentence = [
            self._split_by_sep('\n\n'),
            self._split_by_regex("[^,.;。？！]+[,.;。？！]?"),
        ]
        # Fallback splitter at sub-sentence granularity: single spaces
        self._split_fn_subsentence = [self._split_by_sep(' ')]

    def _split_by_sep(self, sep):
        # Split text by separator, keeping the separator attached to the part
        # that follows it so the pieces re-join to the original text
        def fun(text):
            parts = text.split(sep)
            result = [sep + s if i > 0 else s for i, s in enumerate(parts)]
            return [s for s in result if s]
        return fun

    def _split_by_regex(self, regex):
        # Split text using a regular expression
        return lambda text: re.findall(regex, text)

    def _splits_by_fns(self, text):
        # Try the sentence-level splitters first; the second return value
        # reports whether the resulting splits are full sentences
        for split_fn in self._split_fn_sentence:
            splits = split_fn(text)
            if len(splits) > 1:
                return splits, True
        # Fall back to sub-sentence splitting
        for split_fn in self._split_fn_subsentence:
            splits = split_fn(text)
            if len(splits) > 1:
                break
        return splits, False

    def _token_size(self, text):
        # Approximate the token size of text by its space-separated word count
        return len(text.split(' '))
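
    # _split recursively breaks text down: first at paragraph breaks and sentence
    # punctuation, then at single spaces, until every piece fits in chunk_size
    # tokens. For example (illustrative), with chunk_size=5 a single 12-word
    # sentence with no internal punctuation ends up cut at spaces, because
    # neither the paragraph nor the sentence splitter yields more than one piece.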

    def _split(self, text, chunk_size):
        # Break text into splits that are each no larger than chunk_size
        if self._token_size(text) <= chunk_size:
            return [Split(text, is_sentence=True)]
        text_splits = []
        text_splits_by_fns, is_sentence = self._splits_by_fns(text)
        for text_split_by_fns in text_splits_by_fns:
            if self._token_size(text_split_by_fns) <= chunk_size:
                text_splits.append(Split(text_split_by_fns, is_sentence=is_sentence))
            else:
                # Still too large: recurse, which falls through to finer-grained splitters
                recursive_text_splits = self._split(text_split_by_fns, chunk_size=chunk_size)
                text_splits.extend(recursive_text_splits)
        return text_splits
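
    # _merge greedily packs the splits back into chunks of at most chunk_size
    # tokens; when a chunk is closed, up to chunk_overlap trailing tokens from it
    # are copied into the start of the next chunk so neighbouring chunks overlap.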

    def _merge(self, splits, chunk_size):
        # Merge splits into chunks of at most chunk_size tokens
        chunks, cur_chunk, last_chunk = [], [], []
        cur_chunk_len = 0
        new_chunk = True

        def close_chunk():
            nonlocal chunks, cur_chunk, last_chunk, cur_chunk_len, new_chunk
            chunks.append("".join([text for text, length in cur_chunk]))
            last_chunk = cur_chunk
            cur_chunk = []
            cur_chunk_len = 0
            new_chunk = True
            # Seed the new chunk with overlap taken from the end of the previous chunk
            if len(last_chunk) > 0:
                last_index = len(last_chunk) - 1
                while (
                    last_index >= 0
                    and cur_chunk_len + last_chunk[last_index][1] <= self.chunk_overlap
                ):
                    text, length = last_chunk[last_index]
                    cur_chunk_len += length
                    cur_chunk.insert(0, (text, length))
                    last_index -= 1

        while len(splits) > 0:
            cur_split = splits[0]
            cur_split_len = self._token_size(cur_split.text)
            # Close the chunk if adding the next split would exceed chunk_size
            if cur_chunk_len + cur_split_len > chunk_size and not new_chunk:
                close_chunk()
            else:
                if (
                    cur_split.is_sentence
                    or cur_chunk_len + cur_split_len <= chunk_size
                    or new_chunk  # a new chunk always takes at least one split
                ):
                    # Add the split to the current chunk
                    cur_chunk_len += cur_split_len
                    cur_chunk.append((cur_split.text, cur_split_len))
                    splits.pop(0)
                    new_chunk = False
                else:
                    # Close out the chunk
                    close_chunk()

        # Handle the last, still-open chunk
        if not new_chunk:
            chunk = "".join([text for text, length in cur_chunk])
            chunks.append(chunk)

        # Post-process: strip whitespace and drop empty chunks
        new_chunks = [chunk.strip() for chunk in chunks if chunk.strip() != ""]
        return new_chunks

    def split_texts(self, documents):
        chunked_documents = []
        # page_no is the position of the document in the input list
        for page_no, document in enumerate(documents):
            text, metadata = document['text'], document['metadata']
            if text == "":
                continue
            splits = self._split(text, self.chunk_size)
            chunks = self._merge(splits, self.chunk_size)
            for chunk_no, chunk in enumerate(chunks):
                # Chunk id combines the file name, page number and chunk number
                chunk_id = f"{metadata['file_name']}__{page_no}__{chunk_no}"
                chunk_metadata = {
                    'file_name': metadata['file_name'],
                    'page_no': page_no,
                    'chunk_no': chunk_no,
                }
                data = Document(chunk_id, chunk, chunk_metadata)
                chunked_documents.append(data)
        return chunked_documents


if __name__ == '__main__':
    document = {
        "text": "This is an example text",
        "metadata": {"file_name": "example.pdf", "page_no": 1}
    }
    documents = [document] * 10
    splitter = SentenceSplitter(chunk_size=100, chunk_overlap=30)
    splitted_documents = splitter.split_texts(documents)
    print(splitted_documents[0])
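    # Illustrative extension (not part of the original script): inspect how many
    # chunks were produced and how their ids and metadata are composed.
    print(f"{len(splitted_documents)} chunks produced")
    for doc in splitted_documents[:3]:
        print(doc.doc_id, doc.metadata)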