import re
from typing import Callable
from tools.i18n.i18n import I18nAuto

i18n = I18nAuto()

METHODS = dict()

def get_method(name: str) -> Callable:
    method = METHODS.get(name, None)
    if method is None:
        raise ValueError(f"Method {name} not found")
    return method

def register_method(name):
    def decorator(func):
        METHODS[name] = func
        return func
    return decorator
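
# Illustrative usage (a hypothetical caller, not part of this module): once
# the module is imported, the @register_method decorators below have filled
# METHODS, so a splitter can be resolved by name:
#
#     cut = get_method("cut5")
#     cut("你好,世界。")  # -> "你好,\n世界。"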

splits = {",", "。", "?", "!", ",", ".", "?", "!", "~", ":", ":", "—", "…", }



def split_big_text(text, max_len=510):
    # Build the splitting character class from the full- and half-width punctuation marks
    punctuation = "".join(splits)

    # Split the text, keeping each punctuation mark as its own segment
    segments = re.split('([' + punctuation + '])', text)

    # Accumulate segments into chunks of at most max_len characters
    result = []
    current_segment = ''

    for segment in segments:
        # If appending the next segment would exceed max_len, flush the
        # current chunk into the result list and start a new one
        if len(current_segment + segment) > max_len:
            result.append(current_segment)
            current_segment = segment
        else:
            current_segment += segment

    # Append the final chunk, if any
    if current_segment:
        result.append(current_segment)

    return result
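
# Illustrative example: chunks grow up to max_len characters and are flushed
# at punctuation boundaries, e.g.
#     split_big_text("一。二。三。", max_len=2)  # -> ["一。", "二。", "三。"]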

def split(todo_text):
    todo_text = todo_text.replace("……", "。").replace("——", ",")
    if todo_text and todo_text[-1] not in splits:
        todo_text += "。"
    i_split_head = i_split_tail = 0
    len_text = len(todo_text)
    todo_texts = []
    while True:
        if i_split_head >= len_text:
            break  # the text is guaranteed to end with punctuation, so the last segment was already appended
        if todo_text[i_split_head] in splits:
            i_split_head += 1
            todo_texts.append(todo_text[i_split_tail:i_split_head])
            i_split_tail = i_split_head
        else:
            i_split_head += 1
    return todo_texts
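
# Illustrative example: a trailing 。 is appended when the input lacks final
# punctuation, e.g.
#     split("你好,世界")  # -> ["你好,", "世界。"]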

def cut_sentence_multilang(text, max_length=30):
    # Walk the text, counting ASCII words (whitespace-delimited) and
    # non-ASCII characters (one "word" each); cut once the budget is exceeded
    word_count = 0
    in_word = False

    for index, char in enumerate(text):
        if char.isspace():  # whitespace ends any ASCII word in progress
            in_word = False
        elif char.isascii() and not in_word:  # start of a new ASCII (English) word
            word_count += 1
            in_word = True
        elif not char.isascii():  # each non-ASCII (e.g. CJK) character counts as one word
            word_count += 1
            in_word = False  # so an ASCII letter right after it starts a new word
        if word_count > max_length:
            return text[:index], text[index:]

    return text, ""

# contributed by XTer
# Simple length-based splitting so that no overly long sentence survives
def split_long_sentence(text, max_length=510):
    opts = []
    sentences = text.split('\n')
    for sentence in sentences:
        prev_text, sentence = cut_sentence_multilang(sentence, max_length)
        while sentence.strip() != "":
            opts.append(prev_text)
            prev_text, sentence = cut_sentence_multilang(sentence, max_length)
        opts.append(prev_text)
    return "\n".join(opts)

# No splitting
@register_method("cut0")
def cut0(inp):
    return inp


# Cut after every four sentences
@register_method("cut1")
def cut1(inp):
    inp = inp.strip("\n")
    inps = split(inp)
    split_idx = list(range(0, len(inps), 4))
    if len(inps) % 4 != 0 and split_idx:
        # merge the short remainder (1-3 sentences) into the last full group
        split_idx[-1] = None
    else:
        # the sentence count divides evenly, so keep every group of four
        split_idx.append(None)
    if len(split_idx) > 1:
        opts = []
        for idx in range(len(split_idx) - 1):
            opts.append("".join(inps[split_idx[idx]: split_idx[idx + 1]]))
    else:
        opts = [inp]
    return "\n".join(opts)


# Cut roughly every 50 characters
@register_method("cut2")
def cut2(inp, max_length=50):
    inp = split_long_sentence(inp).strip("\n")
    inps = split(inp)
    if len(inps) < 2:
        return inp
    opts = []
    summ = 0
    tmp_str = ""
    for i in range(len(inps)):
        summ += len(inps[i])
        tmp_str += inps[i]
        if summ > max_length:
            summ = 0
            opts.append(tmp_str)
            tmp_str = ""
    if tmp_str != "":
        opts.append(tmp_str)
    if len(opts) > 1 and len(opts[-1]) < max_length:  # if the last chunk is too short, merge it into the previous one
        opts[-2] = opts[-2] + opts[-1]
        opts = opts[:-1]
    return "\n".join(opts)


# Cut on the Chinese full stop 。
@register_method("cut3")
def cut3(inp):
    inp = split_long_sentence(inp).strip("\n")
    return "\n".join(inp.strip("。").split("。"))


# Cut on the English period .
@register_method("cut4")
def cut4(inp):
    inp = inp.strip("\n")
    return "\n".join(inp.strip(".").split("."))

# Cut on any punctuation mark
# contributed by https://github.com/AI-Hobbyist/GPT-SoVITS/blob/main/GPT_SoVITS/inference_webui.py
@register_method("cut5")
def cut5(inp):
    inp = inp.strip("\n")
    punds = r'[,.;?!、,。?!;:…]'
    items = re.split(f'({punds})', inp)
    mergeitems = ["".join(group) for group in zip(items[::2], items[1::2])]
    # keep the trailing text when the input has no punctuation at all, or none at the end
    if len(items) % 2 == 1 and items[-1]:
        mergeitems.append(items[-1])
    opt = "\n".join(mergeitems)
    return opt
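
# Illustrative example:
#     cut5("你好,世界。")  # -> "你好,\n世界。"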

def count_words_multilang(text):
    # Count ASCII words (whitespace-delimited) plus one per non-ASCII character
    word_count = 0
    in_word = False

    for char in text:
        if char.isspace():  # whitespace ends any ASCII word in progress
            in_word = False
        elif char.isascii() and not in_word:  # start of a new ASCII (English) word
            word_count += 1
            in_word = True
        elif not char.isascii():  # each non-ASCII (e.g. CJK) character counts as one word
            word_count += 1
            in_word = False  # so an ASCII letter right after it starts a new word

    return word_count

# contributed by https://github.com/X-T-E-R/GPT-SoVITS-Inference/blob/main/GPT_SoVITS/TTS_infer_pack/text_segmentation_method.py
@register_method("auto_cut")
def auto_cut(inp, max_length=30):
    inp = inp.strip("\n")
    inp = inp.replace(". ", "。")
    # strip quotation marks and brackets, which carry no prosodic information
    erase_punds = r'[“”"‘’\'()()【】[\]{}<>《》〈〉〔〕〖〗〘〙〚〛〞〟]'
    inp = re.sub(erase_punds, '', inp)
    split_punds = r'[?!。?!~:]'
    if inp and inp[-1] not in split_punds:
        inp += "。"
    items = re.split(f'({split_punds})', inp)
    items = ["".join(group) for group in zip(items[::2], items[1::2])]

    def process_commas(text, max_length):
        # Define the separators and the regular expression used for splitting
        separators = [',', ',', '、', '——', '…']
        # the parentheses form a capturing group, so the separators are kept in the result
        regex_pattern = '(' + '|'.join(map(re.escape, separators)) + ')'
        # because of the capturing group, re.split also returns the separators
        sentences = re.split(regex_pattern, text)

        processed_text = ""
        current_line = ""

        final_sentences = []

        for sentence in sentences:
            if count_words_multilang(sentence) > max_length:
                final_sentences += split_long_sentence(sentence, max_length=max_length).split("\n")
            else:
                final_sentences.append(sentence)

        for sentence in final_sentences:
            if count_words_multilang(current_line + sentence) <= max_length:
                # If adding the next sentence does not exceed max_length, append it to the current line
                current_line += sentence
            else:
                # Otherwise flush the current line and start a new one
                processed_text += current_line.strip() + '\n'
                current_line = sentence + " "  # start the new line with the current sentence

        # Add any remaining text in current_line to processed_text
        processed_text += current_line.strip()

        return processed_text

    final_items = []
    for item in items:
        final_items += process_commas(item, max_length=max_length).split("\n")

    # drop empty pieces and pieces that consist of a single punctuation mark
    final_items = [item for item in final_items if item.strip() and not (len(item.strip()) == 1 and item.strip() in "?!,,。?!~:")]

    return "\n".join(final_items)


if __name__ == '__main__':
    str1 = """我 有i一个j k 1"""
    print(count_words_multilang(str1))
    print(cut_sentence_multilang(str1, 20))
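
    # Extra illustrative check: exercise every registered cut method on one sample
    sample = "你好,世界!这是测试。"
    for name in ("cut0", "cut1", "cut2", "cut3", "cut4", "cut5", "auto_cut"):
        print(name, repr(get_method(name)(sample)))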