Mahiruoshi committed on
Commit
1a0a988
·
verified ·
1 Parent(s): a88d850

Update tools/sentence.py

Browse files
Files changed (1) hide show
  1. tools/sentence.py +4 -20
tools/sentence.py CHANGED
@@ -127,36 +127,20 @@ def merge_adjacent_japanese(sentences):
127
  def extrac(text):
128
  text = replace_quotes(remove_numeric_annotations(text)) # 替换引号
129
  text = re.sub("<[^>]*>", "", text) # 移除 HTML 标签
130
- # 使用换行符和标点符号进行初步分割
131
- preliminary_sentences = re.split(r'([\n。;!?\.\?!])', text)
132
  final_sentences = []
133
 
134
- preliminary_sentences = re.split(r'([\n。;!?\.\?!])', text)
135
-
136
  for piece in preliminary_sentences:
137
  if is_single_language(piece):
138
  final_sentences.append(piece)
139
  else:
140
  sub_sentences = split_mixed_language(piece)
141
  final_sentences.extend(sub_sentences)
142
-
143
- # 处理长句子,使用jieba进行分词
144
- split_sentences = []
145
- for sentence in final_sentences:
146
- split_sentences.extend(split_long_sentences(sentence))
147
-
148
- # 合并相邻的日语句子
149
- merged_japanese_sentences = merge_adjacent_japanese(split_sentences)
150
-
151
- # 剔除只包含标点符号的元素
152
- clean_sentences = [s for s in merged_japanese_sentences if not is_only_punctuation(s)]
153
-
154
- # 移除空字符串并去除多余引号
155
- return [s.replace('"','').strip() for s in clean_sentences if s]
156
-
157
 
 
 
158
 
159
- # 移除空字符串
160
 
161
  def is_mixed_language(sentence):
162
  contains_chinese = re.search(r'[\u4e00-\u9fff]', sentence) is not None
 
127
  def extrac(text):
128
  text = replace_quotes(remove_numeric_annotations(text)) # 替换引号
129
  text = re.sub("<[^>]*>", "", text) # 移除 HTML 标签
130
+ # 使用换行符和标点符号进行初步分割,确保标点符号保留在句子末尾
131
+ preliminary_sentences = re.split(r'(?<=[\n。;!?\.\?!])', text)
132
  final_sentences = []
133
 
 
 
134
  for piece in preliminary_sentences:
135
  if is_single_language(piece):
136
  final_sentences.append(piece)
137
  else:
138
  sub_sentences = split_mixed_language(piece)
139
  final_sentences.extend(sub_sentences)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
 
141
+ # 移除双引号和空白字符
142
+ return [s.replace('"', '').strip() for s in final_sentences if s]
143
 
 
144
 
145
  def is_mixed_language(sentence):
146
  contains_chinese = re.search(r'[\u4e00-\u9fff]', sentence) is not None