Spaces:
Running
Running
Mahiruoshi
committed on
Update tools/sentence.py
Browse files- tools/sentence.py +4 -20
tools/sentence.py
CHANGED
@@ -127,36 +127,20 @@ def merge_adjacent_japanese(sentences):
|
|
127 |
def extrac(text):
|
128 |
text = replace_quotes(remove_numeric_annotations(text)) # 替换引号
|
129 |
text = re.sub("<[^>]*>", "", text) # 移除 HTML 标签
|
130 |
-
#
|
131 |
-
preliminary_sentences = re.split(r'([\n。;!?\.\?!])', text)
|
132 |
final_sentences = []
|
133 |
|
134 |
-
preliminary_sentences = re.split(r'([\n。;!?\.\?!])', text)
|
135 |
-
|
136 |
for piece in preliminary_sentences:
|
137 |
if is_single_language(piece):
|
138 |
final_sentences.append(piece)
|
139 |
else:
|
140 |
sub_sentences = split_mixed_language(piece)
|
141 |
final_sentences.extend(sub_sentences)
|
142 |
-
|
143 |
-
# 处理长句子,使用jieba进行分词
|
144 |
-
split_sentences = []
|
145 |
-
for sentence in final_sentences:
|
146 |
-
split_sentences.extend(split_long_sentences(sentence))
|
147 |
-
|
148 |
-
# 合并相邻的日语句子
|
149 |
-
merged_japanese_sentences = merge_adjacent_japanese(split_sentences)
|
150 |
-
|
151 |
-
# 剔除只包含标点符号的元素
|
152 |
-
clean_sentences = [s for s in merged_japanese_sentences if not is_only_punctuation(s)]
|
153 |
-
|
154 |
-
# 移除空字符串并去除多余引号
|
155 |
-
return [s.replace('"','').strip() for s in clean_sentences if s]
|
156 |
-
|
157 |
|
|
|
|
|
158 |
|
159 |
-
# 移除空字符串
|
160 |
|
161 |
def is_mixed_language(sentence):
|
162 |
contains_chinese = re.search(r'[\u4e00-\u9fff]', sentence) is not None
|
|
|
127 |
def extrac(text):
|
128 |
text = replace_quotes(remove_numeric_annotations(text)) # 替换引号
|
129 |
text = re.sub("<[^>]*>", "", text) # 移除 HTML 标签
|
130 |
+
# 使用换行符和标点符号进行初步分割,确保标点符号保留在句子末尾
|
131 |
+
preliminary_sentences = re.split(r'(?<=[\n。;!?\.\?!])', text)
|
132 |
final_sentences = []
|
133 |
|
|
|
|
|
134 |
for piece in preliminary_sentences:
|
135 |
if is_single_language(piece):
|
136 |
final_sentences.append(piece)
|
137 |
else:
|
138 |
sub_sentences = split_mixed_language(piece)
|
139 |
final_sentences.extend(sub_sentences)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
140 |
|
141 |
+
# 移除双引号和空白字符
|
142 |
+
return [s.replace('"', '').strip() for s in final_sentences if s]
|
143 |
|
|
|
144 |
|
145 |
def is_mixed_language(sentence):
|
146 |
contains_chinese = re.search(r'[\u4e00-\u9fff]', sentence) is not None
|