File size: 4,354 Bytes
01e655b
d2b7e94
01e655b
 
f367757
bed01bd
 
01e655b
f367757
 
 
 
bed01bd
f367757
 
 
 
bed01bd
f367757
 
bed01bd
f367757
 
bed01bd
f367757
 
 
bed01bd
f367757
bed01bd
f367757
 
 
01e655b
f367757
 
 
 
 
 
 
01e655b
f367757
 
01e655b
 
 
f367757
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
01e655b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import re

import zhon

from modules.models import get_tokenizer
from modules.utils.detect_lang import guess_lang


# Parse text and split it into sentences at sentence-ending punctuation.
# A maximum threshold can be set: consecutive fragments are merged together
# for as long as their combined length stays below that threshold.
class SentenceSplitter:
    """Split text into sentences and merge short neighbours.

    Each input line is routed to an English or a Chinese splitter depending on
    what ``guess_lang`` reports for it. The resulting sentences are then
    greedily merged, joined by ``SEP_TOKEN``, while the combined length stays
    below the configured threshold.

    Note: the merge threshold is compared against character counts
    (``len``), not against ``count_tokens``.
    """

    SEP_TOKEN = " "

    # Whitespace that ends an English sentence: it must directly follow ".",
    # "?" or "!", must not follow a "<word char>.<word char><any char>"
    # sequence (e.g. "e.g."), and must not follow a capital + lowercase
    # letter + "." sequence (e.g. "Mr.").
    _EN_SENTENCE_BOUNDARY = re.compile(
        r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s"
    )

    def __init__(self, threshold=100):
        """Create a splitter.

        Args:
            threshold: Positive integer; merged chunks are grown only while
                the combined character length stays below this value.

        Raises:
            AssertionError: If ``threshold`` is not a positive ``int``.
        """
        # Kept as an assert so callers still see AssertionError; be aware the
        # check is skipped when Python runs with -O.
        assert (
            isinstance(threshold, int) and threshold > 0
        ), "Threshold must be greater than 0."

        self.sentence_threshold = threshold
        self.tokenizer = get_tokenizer()

    def count_tokens(self, text: str) -> int:
        """Return the number of tokens the tokenizer produces for ``text``."""
        return len(self.tokenizer.tokenize(text))

    def parse(self, text: str) -> list[str]:
        """Split ``text`` into sentences, then merge short ones by threshold."""
        sentences = self.split_paragraph(text)
        return self.merge_text_by_threshold(sentences)

    def merge_text_by_threshold(self, setences: list[str]) -> list[str]:
        """Greedily merge consecutive sentences into chunks.

        A sentence is appended to the current chunk (joined with
        ``SEP_TOKEN``) while ``len(chunk) + len(sentence)`` is below the
        threshold; otherwise the chunk is emitted and the sentence starts a
        new one. A single sentence longer than the threshold is emitted
        unchanged as its own chunk.

        Args:
            setences: Sentences in reading order. (The misspelt name is kept
                so keyword callers continue to work.)

        Returns:
            The merged chunks, without empty entries or a leading separator.
        """
        merged_sentences: list[str] = []
        current = ""
        for sentence in setences:
            if not current:
                # Start a chunk without a separator; this avoids both the
                # leading SEP_TOKEN and flushing an empty chunk.
                current = sentence
            elif len(current) + len(sentence) < self.sentence_threshold:
                current = f"{current}{SentenceSplitter.SEP_TOKEN}{sentence}"
            else:
                merged_sentences.append(current)
                current = sentence

        if current:
            merged_sentences.append(current)
        return merged_sentences

    def split_paragraph(self, text: str) -> list[str]:
        """Split ``text`` line by line into sentences.

        Lines are separated on ``"\\n"``. Blank lines are skipped before
        language detection because neither splitter can produce a sentence
        from them.
        """
        sentences: list[str] = []
        for line in text.split("\n"):
            if not line.strip():
                continue
            if self.is_eng_sentence(line):
                sentences.extend(self.split_en_sentence(line))
            else:
                sentences.extend(self.split_zhon_sentence(line))
        return sentences

    def is_eng_sentence(self, text: str) -> bool:
        """Return True when ``guess_lang`` reports ``"en"`` for ``text``."""
        return guess_lang(text) == "en"

    def split_en_sentence(self, text: str) -> list[str]:
        """Split English text into sentences.

        The text is cut at sentence-ending whitespace (see
        ``_EN_SENTENCE_BOUNDARY``); each piece is stripped and empty pieces
        are dropped.
        """
        pieces = self._EN_SENTENCE_BOUNDARY.split(text)
        return [piece.strip() for piece in pieces if piece.strip()]

    def split_zhon_sentence(self, text: str) -> list[str]:
        """Split Chinese text into sentences.

        The text is cut right after every match of ``zhon.hanzi.sentence``;
        whatever lies between matches is kept with the following sentence and
        any trailing remainder becomes a final piece. Whitespace-only pieces
        are dropped; kept pieces are not stripped.
        """
        # ``import zhon`` alone does not guarantee the ``hanzi`` submodule is
        # loaded, so import it explicitly before touching ``zhon.hanzi``.
        import zhon.hanzi

        sentences: list[str] = []
        pattern = re.compile(zhon.hanzi.sentence)
        start = 0
        for match in pattern.finditer(text):
            end = match.end()
            sentences.append(text[start:end])
            start = end

        # Keep text after the last recognised sentence.
        if start < len(text):
            sentences.append(text[start:])

        return [piece for piece in sentences if piece.strip()]


if __name__ == "__main__":
    # Manual smoke run: split a multi-paragraph Chinese sample with a
    # threshold of 100 and print every resulting chunk with a 1-based index.
    sample = """
中华美食,作为世界饮食文化的瑰宝,以其丰富的种类、独特的风味和精湛的烹饪技艺而闻名于世。中国地大物博,各地区的饮食习惯和烹饪方法各具特色,形成了独树一帜的美食体系。从北方的京鲁菜、东北菜,到南方的粤菜、闽菜,无不展现出中华美食的多样性。

在中华美食的世界里,五味调和,色香味俱全。无论是辣味浓郁的川菜,还是清淡鲜美的淮扬菜,都能够满足不同人的口味需求。除了味道上的独特,中华美食还注重色彩的搭配和形态的美感,让每一道菜品不仅是味觉的享受,更是一场视觉的盛宴。

中华美食不仅仅是食物,更是一种文化的传承。每一道菜背后都有着深厚的历史背景和文化故事。比如,北京的烤鸭,代表着皇家气派;而西安的羊肉泡馍,则体现了浓郁的地方风情。中华美食的精髓在于它追求的“天人合一”,讲究食材的自然性和烹饪过程中的和谐。

总之,中华美食博大精深,其丰富的口感和多样的烹饪技艺,构成了一个充满魅力和无限可能的美食世界。无论你来自哪里,都会被这独特的美食文化所吸引和感动。
    """
    splitter = SentenceSplitter(threshold=100)
    for number, chunk in enumerate(splitter.parse(sample), start=1):
        print(f"Sentence {number}: {chunk}")