Mahiruoshi committed on
Commit
112320c
·
verified ·
1 Parent(s): 1e6af1e

Update tools/sentence.py

Browse files
Files changed (1) hide show
  1. tools/sentence.py +262 -164
tools/sentence.py CHANGED
@@ -1,173 +1,271 @@
1
- import logging
2
-
3
- import regex as re
4
-
5
- from tools.classify_language import classify_language, split_alpha_nonalpha
6
-
7
-
8
- def check_is_none(item) -> bool:
9
- """none -> True, not none -> False"""
10
- return (
11
- item is None
12
- or (isinstance(item, str) and str(item).isspace())
13
- or str(item) == ""
14
- )
15
-
16
-
17
- def markup_language(text: str, target_languages: list = None) -> str:
18
- pattern = (
19
- r"[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\>\=\?\@\[\]\{\}\\\\\^\_\`"
20
- r"\!?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」"
21
- r"『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘\'\‛\“\”\„\‟…‧﹏.]+"
22
- )
23
- sentences = re.split(pattern, text)
24
-
25
- pre_lang = ""
26
- p = 0
27
-
28
- if target_languages is not None:
29
- sorted_target_languages = sorted(target_languages)
30
- if sorted_target_languages in [["en", "zh"], ["en", "ja"], ["en", "ja", "zh"]]:
31
- new_sentences = []
32
- for sentence in sentences:
33
- new_sentences.extend(split_alpha_nonalpha(sentence))
34
- sentences = new_sentences
35
 
36
  for sentence in sentences:
37
- if check_is_none(sentence):
38
- continue
39
-
40
- lang = classify_language(sentence, target_languages)
41
-
42
- if pre_lang == "":
43
- text = text[:p] + text[p:].replace(
44
- sentence, f"[{lang.upper()}]{sentence}", 1
45
- )
46
- p += len(f"[{lang.upper()}]")
47
- elif pre_lang != lang:
48
- text = text[:p] + text[p:].replace(
49
- sentence, f"[{pre_lang.upper()}][{lang.upper()}]{sentence}", 1
50
- )
51
- p += len(f"[{pre_lang.upper()}][{lang.upper()}]")
52
- pre_lang = lang
53
- p += text[p:].index(sentence) + len(sentence)
54
- text += f"[{pre_lang.upper()}]"
55
-
56
- return text
57
-
58
-
59
- def split_by_language(text: str, target_languages: list = None) -> list:
60
- pattern = (
61
- r"[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\>\=\?\@\[\]\{\}\\\\\^\_\`"
62
- r"\!?\。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」"
63
- r"『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘\'\‛\“\”\„\‟…‧﹏.]+"
64
- )
65
- sentences = re.split(pattern, text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
 
67
- pre_lang = ""
68
- start = 0
69
- end = 0
70
- sentences_list = []
71
 
72
- if target_languages is not None:
73
- sorted_target_languages = sorted(target_languages)
74
- if sorted_target_languages in [["en", "zh"], ["en", "ja"], ["en", "ja", "zh"]]:
75
- new_sentences = []
76
- for sentence in sentences:
77
- new_sentences.extend(split_alpha_nonalpha(sentence))
78
- sentences = new_sentences
79
 
80
- for sentence in sentences:
81
- if check_is_none(sentence):
82
- continue
83
-
84
- lang = classify_language(sentence, target_languages)
85
-
86
- end += text[end:].index(sentence)
87
- if pre_lang != "" and pre_lang != lang:
88
- sentences_list.append((text[start:end], pre_lang))
89
- start = end
90
- end += len(sentence)
91
- pre_lang = lang
92
- sentences_list.append((text[start:], pre_lang))
93
-
94
- return sentences_list
95
-
96
-
97
- def sentence_split(text: str, max: int) -> list:
98
- pattern = r"[!(),—+\-.:;??。,、;:]+"
99
- sentences = re.split(pattern, text)
100
- discarded_chars = re.findall(pattern, text)
101
-
102
- sentences_list, count, p = [], 0, 0
103
-
104
- # 按被分割的符号遍历
105
- for i, discarded_chars in enumerate(discarded_chars):
106
- count += len(sentences[i]) + len(discarded_chars)
107
- if count >= max:
108
- sentences_list.append(text[p : p + count].strip())
109
- p += count
110
- count = 0
111
-
112
- # 加入最后剩余的文本
113
- if p < len(text):
114
- sentences_list.append(text[p:])
115
-
116
- return sentences_list
117
-
118
-
119
- def sentence_split_and_markup(text, max=50, lang="auto", speaker_lang=None):
120
- # 如果该speaker只支持一种语言
121
- if speaker_lang is not None and len(speaker_lang) == 1:
122
- if lang.upper() not in ["AUTO", "MIX"] and lang.lower() != speaker_lang[0]:
123
- logging.debug(
124
- f'lang "{lang}" is not in speaker_lang {speaker_lang},automatically set lang={speaker_lang[0]}'
125
- )
126
- lang = speaker_lang[0]
127
-
128
- sentences_list = []
129
- if lang.upper() != "MIX":
130
- if max <= 0:
131
- sentences_list.append(
132
- markup_language(text, speaker_lang)
133
- if lang.upper() == "AUTO"
134
- else f"[{lang.upper()}]{text}[{lang.upper()}]"
135
- )
136
  else:
137
- for i in sentence_split(text, max):
138
- if check_is_none(i):
139
- continue
140
- sentences_list.append(
141
- markup_language(i, speaker_lang)
142
- if lang.upper() == "AUTO"
143
- else f"[{lang.upper()}]{i}[{lang.upper()}]"
144
- )
145
- else:
146
- sentences_list.append(text)
147
-
148
- for i in sentences_list:
149
- logging.debug(i)
150
-
151
- return sentences_list
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
 
154
  if __name__ == "__main__":
155
- text = "这几天心里颇不宁静。今晚在院子里坐着乘凉,忽然想起日日走过的荷塘,在这满月的光里,总该另有一番样子吧。月亮渐渐地升高了,墙外马路上孩子们的欢笑,已经听不见了;妻在屋里拍着闰儿,迷迷糊糊地哼着眠歌。我悄悄地披了大衫,带上门出去。"
156
- print(markup_language(text, target_languages=None))
157
- print(sentence_split(text, max=50))
158
- print(sentence_split_and_markup(text, max=50, lang="auto", speaker_lang=None))
159
-
160
- text = "你好,这是一段用来测试自动标注的文本。こんにちは,これは自動ラベリングのテスト用テキストです.Hello, this is a piece of text to test autotagging.你好!今天我们要介绍VITS项目,其重点是使用了GAN Duration predictor和transformer flow,并且接入了Bert模型来提升韵律。Bert embedding会在稍后介绍。"
161
- print(split_by_language(text, ["zh", "ja", "en"]))
162
-
163
- text = "vits和Bert-VITS2是tts模型。花费3days.花费3天。Take 3 days"
164
-
165
- print(split_by_language(text, ["zh", "ja", "en"]))
166
- # output: [('vits', 'en'), ('和', 'ja'), ('Bert-VITS', 'en'), ('2是', 'zh'), ('tts', 'en'), ('模型。花费3', 'zh'), ('days.', 'en'), ('花费3天。', 'zh'), ('Take 3 days', 'en')]
167
-
168
- print(split_by_language(text, ["zh", "en"]))
169
- # output: [('vits', 'en'), ('和', 'zh'), ('Bert-VITS', 'en'), ('2是', 'zh'), ('tts', 'en'), ('模型。花费3', 'zh'), ('days.', 'en'), ('花费3天。', 'zh'), ('Take 3 days', 'en')]
170
-
171
- text = "vits 和 Bert-VITS2 是 tts 模型。花费 3 days. 花费 3天。Take 3 days"
172
- print(split_by_language(text, ["zh", "en"]))
173
- # output: [('vits ', 'en'), ('和 ', 'zh'), ('Bert-VITS2 ', 'en'), ('是 ', 'zh'), ('tts ', 'en'), ('模型。花费 ', 'zh'), ('3 days. ', 'en'), ('花费 3天。', 'zh'), ('Take 3 days', 'en')]
 
1
+ import re, os
2
+
3
+ from ebooklib import epub
4
+ import PyPDF2
5
+ from PyPDF2 import PdfReader
6
+ from bs4 import BeautifulSoup
7
+ import jieba
8
+ import romajitable
9
+
10
def is_japanese(string):
    """Return True if ``string`` contains at least one Japanese kana character.

    The check covers U+3040-U+30FF (Hiragana and Katakana) and U+31F0-U+31FF
    (Katakana Phonetic Extensions), bounds inclusive. These are the same
    ranges used by the regex-based language checks elsewhere in this module.
    Kanji (U+4E00-U+9FFF) are intentionally not treated as Japanese here.

    Args:
        string: Text to inspect.

    Returns:
        bool: True when any kana character is present; False otherwise,
        including for the empty string.
    """
    return any(
        '\u3040' <= ch <= '\u30ff' or '\u31f0' <= ch <= '\u31ff'
        for ch in string
    )
15
+
16
def is_chinese(string):
    """Tell whether ``string`` holds any character in U+4E00..U+9FFF.

    Args:
        string: Text to inspect.

    Returns:
        bool: True as soon as one character falls inside the range,
        False when none does (the empty string yields False).
    """
    return any('\u4e00' <= ch <= '\u9fff' for ch in string)
21
+
22
def is_single_language(sentence):
    """Check that exactly one of the three script classes occurs in ``sentence``.

    The classes are: CJK ideographs (U+4E00-U+9FFF), kana
    (U+3040-U+30FF, U+31F0-U+31FF) and ASCII letters. Text containing none
    of them (digits, punctuation, empty string) is *not* single-language.

    Args:
        sentence: Text to inspect.

    Returns:
        bool: True when exactly one script class is present.
    """
    script_patterns = (
        r'[\u4e00-\u9fff]',
        r'[\u3040-\u30ff\u31f0-\u31ff]',
        r'[a-zA-Z]',
    )
    hits = 0
    for pattern in script_patterns:
        if re.search(pattern, sentence) is not None:
            hits += 1
    return hits == 1
29
+
30
def merge_scattered_parts(sentences):
    """Glue scattered fragments together while keeping proper sentences intact.

    An item counts as a proper sentence when ``is_single_language`` accepts
    it and it is longer than one character. Everything else (mixed-script
    text, single characters, bare punctuation) is accumulated in a pending
    buffer. The buffer is flushed as one element right before the next
    proper sentence, and once more at the end.

    Args:
        sentences: Iterable of strings in reading order.

    Returns:
        list[str]: Proper sentences interleaved with the merged fragments.
    """
    result = []
    pending = ""

    for item in sentences:
        # Fragment: keep collecting until a proper sentence shows up.
        if not is_single_language(item) or len(item) <= 1:
            pending += item
            continue

        # Proper sentence: emit whatever was collected before it first.
        if pending:
            result.append(pending)
            pending = ""
        result.append(item)

    # Do not lose fragments trailing after the last proper sentence.
    if pending:
        result.append(pending)

    return result
52
+
53
def is_only_punctuation(s):
    """Tell whether ``s`` is non-empty and made solely of punctuation/whitespace.

    The accepted characters are those listed in the character class below
    (common Chinese, Japanese and English punctuation plus any whitespace).

    Args:
        s: String to test.

    Returns:
        bool: True if every character is in the class; False otherwise,
        including for the empty string.
    """
    matched = re.match(r'^[\s。*;,:“”()、!?《》\u3000\.,;:"\'?!()]+$', s)
    return matched is not None
58
+
59
def split_mixed_language(sentence):
    """Cut ``sentence`` wherever the script changes, scanning char by char.

    Each character is classified as Chinese, Japanese (kana) or English
    (ASCII letter). A new chunk starts whenever a classified character
    belongs to a different script than the current chunk. Unclassified
    characters (punctuation, digits, spaces, ...) never start a chunk; they
    are appended to whatever chunk is currently open.

    NOTE(review): this module defines ``split_mixed_language`` a second time
    further down; that later definition replaces this one at import time.

    Args:
        sentence: Text to split.

    Returns:
        list[str]: The chunks in order; concatenated they reproduce
        ``sentence``. An empty input yields an empty list.
    """
    script_patterns = (
        ('chinese', r'[\u4e00-\u9fff]'),
        ('japanese', r'[\u3040-\u30ff\u31f0-\u31ff]'),
        ('english', r'[a-zA-Z]'),
    )

    pieces = []
    active_script = None
    chunk = ""

    for char in sentence:
        char_script = next(
            (name for name, pattern in script_patterns if re.match(pattern, char)),
            None,
        )

        # Neutral characters and same-script characters extend the open chunk.
        if char_script is None or char_script == active_script:
            chunk += char
            continue

        # Script switch: close the open chunk (if any) and start a new one.
        if chunk:
            pieces.append(chunk)
        chunk = char
        active_script = char_script

    if chunk:
        pieces.append(chunk)

    return pieces
 
 
 
98
 
99
def replace_quotes(text):
    """Normalise quote-like characters in ``text`` to the ASCII double quote.

    Every character listed in the class below (curly quotes, corner
    brackets and parentheses) is replaced by ``"``.

    Args:
        text: Input string.

    Returns:
        str: ``text`` with each listed character replaced by ``"``.
    """
    return re.sub(r'[“”‘’『』「」()()]', '"', text)
 
 
 
103
 
104
def remove_numeric_annotations(text):
    """Delete numeric annotation markers from ``text``.

    A marker is one or more digits wrapped in curly double quotes, in
    lenticular brackets (【】) or in tortoise-shell brackets (〔〕).

    Args:
        text: Input string.

    Returns:
        str: ``text`` with all such markers removed.
    """
    return re.sub(r'“\d+”|【\d+】|〔\d+〕', '', text)
111
+
112
def merge_adjacent_japanese(sentences):
    """Concatenate runs of neighbouring items that ``is_japanese`` accepts.

    Starting from an item that passes ``is_japanese``, every directly
    following item that also passes is appended to it. All other items are
    copied through unchanged.

    Args:
        sentences: Sequence of strings in reading order.

    Returns:
        list[str]: The sequence with each Japanese run collapsed into one string.
    """
    result = []
    total = len(sentences)
    idx = 0

    while idx < total:
        merged = sentences[idx]
        if is_japanese(merged):
            # Absorb the following items for as long as they are Japanese too.
            while idx + 1 < total and is_japanese(sentences[idx + 1]):
                idx += 1
                merged += sentences[idx]
        result.append(merged)
        idx += 1

    return result
126
+
127
def extrac(text):
    """Clean ``text`` and break it into a list of short sentence fragments.

    Processing steps, in order:

    1. Remove numeric annotation markers, then normalise quote-like
       characters to ``"`` (the order matters: the annotation pattern
       relies on the original curly quotes).
    2. Remove anything that looks like an HTML tag.
    3. Split on newlines and sentence-ending punctuation. The pattern has a
       capture group, so the delimiters stay in the list as separate items.
    4. Pass pieces that are not single-language to ``split_mixed_language``.
    5. Break long pieces with ``split_long_sentences``.
    6. Merge neighbouring Japanese pieces.
    7. Drop punctuation-only and empty pieces, then strip ``"`` and
       surrounding whitespace from what remains.

    Args:
        text: Raw input text.

    Returns:
        list[str]: Cleaned fragments in reading order.
    """
    text = replace_quotes(remove_numeric_annotations(text))
    text = re.sub("<[^>]*>", "", text)

    # Preliminary split (computed once; the original ran the same split twice).
    preliminary_sentences = re.split(r'([\n。;!?\.\?!])', text)

    final_sentences = []
    for piece in preliminary_sentences:
        if is_single_language(piece):
            final_sentences.append(piece)
        else:
            final_sentences.extend(split_mixed_language(piece))

    # Break up long pieces.
    split_sentences = []
    for sentence in final_sentences:
        split_sentences.extend(split_long_sentences(sentence))

    # Merge adjacent Japanese pieces.
    merged_japanese_sentences = merge_adjacent_japanese(split_sentences)

    # Discard items made only of punctuation.
    clean_sentences = [
        s for s in merged_japanese_sentences if not is_only_punctuation(s)
    ]

    # Skip empty strings and remove leftover quotes.
    return [s.replace('"', '').strip() for s in clean_sentences if s]
156
+
157
+
158
+
159
+ # 移除空字符串
160
+
161
def is_mixed_language(sentence):
    """Tell whether ``sentence`` mixes at least two of the three script classes.

    The classes are CJK ideographs (U+4E00-U+9FFF), kana
    (U+3040-U+30FF, U+31F0-U+31FF) and ASCII letters.

    Args:
        sentence: Text to inspect.

    Returns:
        bool: True when two or more classes are present.
    """
    presence = [
        re.search(r'[\u4e00-\u9fff]', sentence) is not None,
        re.search(r'[\u3040-\u30ff\u31f0-\u31ff]', sentence) is not None,
        re.search(r'[a-zA-Z]', sentence) is not None,
    ]
    return presence.count(True) > 1
167
+
168
def split_mixed_language(sentence):
    """Split ``sentence`` at quote boundaries.

    Cuts are made at two kinds of zero-width positions:
    between sentence-ending punctuation and a following ``"``, and between
    a ``"`` and a following CJK ideograph, kana or ASCII letter.

    NOTE(review): this is the second ``split_mixed_language`` in the module,
    so this definition is the one in effect after import.

    Args:
        sentence: Text to split.

    Returns:
        list[str]: Whitespace-stripped, non-empty pieces in order.
    """
    raw_pieces = re.split(r'(?<=[。!?\.\?!])(?=")|(?<=")(?=[\u4e00-\u9fff\u3040-\u30ff\u31f0-\u31ff]|[a-zA-Z])', sentence)
    stripped = (piece.strip() for piece in raw_pieces)
    return [piece for piece in stripped if piece]
172
+
173
def seconds_to_ass_time(seconds):
    """Convert a duration in seconds to an ASS timestamp ``H:MM:SS.cc``.

    The original code rebound ``seconds`` to an integer before computing
    the fractional part, so the centisecond field was always ``00``. The
    fraction is now taken from the untouched input value.

    Args:
        seconds: Duration in seconds (int or float), expected to be >= 0.

    Returns:
        str: Timestamp with hours unpadded and minutes, seconds and
        centiseconds zero-padded to two digits.
    """
    whole_seconds = int(seconds)
    hours = int(seconds / 3600)
    minutes = int((seconds % 3600) / 60)
    secs = whole_seconds % 60

    # Round to whole milliseconds first so float noise such as
    # 0.28999999 does not truncate to the wrong centisecond.
    milliseconds = int(round((seconds - whole_seconds) * 1000))
    # Clamp so rounding up (e.g. .9996) cannot overflow the 2-digit field.
    centiseconds = max(0, min(milliseconds // 10, 99))

    return "{:01d}:{:02d}:{:02d}.{:02d}".format(hours, minutes, secs, centiseconds)
180
+
181
def extract_text_from_epub(file_path):
    """Read an EPUB file and return the text of its HTML documents.

    Every item of the book that is an ``epub.EpubHtml`` instance is parsed
    with BeautifulSoup's ``html.parser`` and reduced to its text.

    Args:
        file_path: Path of the EPUB file.

    Returns:
        str: The per-document texts joined with newlines.
    """
    book = epub.read_epub(file_path)
    return '\n'.join(
        BeautifulSoup(item.content, 'html.parser').get_text()
        for item in book.items
        if isinstance(item, epub.EpubHtml)
    )
189
+
190
def extract_text_from_pdf(file_path):
    """Read a PDF file and return the extracted text of all its pages.

    Args:
        file_path: Path of the PDF file.

    Returns:
        str: The per-page texts joined with newlines.
    """
    with open(file_path, 'rb') as pdf_file:
        pages = PdfReader(pdf_file).pages
        # Extract while the file is still open; the reader pulls from it.
        page_texts = [page.extract_text() for page in pages]
        return '\n'.join(page_texts)
195
+
196
def remove_annotations(text):
    """Strip bracketed annotations from ``text``.

    Removed, in this order: ``[...]`` spans, ``<...>`` spans (both
    non-greedy and limited to a single line, since ``.`` does not match a
    newline), and the literal citation marker given by the third pattern.

    Args:
        text: Input string.

    Returns:
        str: ``text`` without the matched spans.
    """
    for pattern in (
        r'\[.*?\]',
        r'\<.*?\>',
        r'&#8203;``【oaicite:1】``&#8203;',
    ):
        text = re.sub(pattern, '', text)
    return text
202
 
203
def extract_text_from_file(inputFile):
    """Return the text content of an ``.epub``, ``.pdf`` or ``.txt`` file.

    The format is chosen from the lower-cased file extension. Plain-text
    files are read as UTF-8.

    Args:
        inputFile: Path of the file to read.

    Returns:
        str: The extracted text.

    Raises:
        ValueError: If the extension is none of the supported ones.
    """
    _, suffix = os.path.splitext(inputFile)
    file_extension = suffix.lower()

    if file_extension == ".epub":
        return extract_text_from_epub(inputFile)
    if file_extension == ".pdf":
        return extract_text_from_pdf(inputFile)
    if file_extension == ".txt":
        with open(inputFile, 'r', encoding='utf-8') as f:
            return f.read()

    raise ValueError(f"Unsupported file format: {file_extension}")
214
+
215
def split_by_punctuation(sentence):
    """Split ``sentence`` at commas and semicolons, keeping each mark attached.

    The text is split on the delimiter characters; every delimiter is then
    appended to the segment preceding it so no segment consists of a bare
    punctuation mark. A delimiter with no preceding segment (at the very
    start of the input) is dropped.

    Args:
        sentence: Text to split.

    Returns:
        list[str]: Segments in order, each carrying its trailing delimiter(s).
    """
    delimiters = ',,;;'
    segments = []

    for token in re.split(r'([,,;;])', sentence):
        if token and token not in delimiters:
            # Real text: starts a new segment.
            segments.append(token)
            continue
        if segments:
            # Delimiter (or empty filler): attach to the previous segment.
            segments[-1] += token

    return segments
227
+
228
def split_long_sentences(sentence, max_length=30):
    """Break an over-long Chinese sentence into pieces of bounded length.

    Sentences that are at most ``max_length`` characters long, or that
    ``is_chinese`` rejects, are returned unchanged as a one-element list.
    Otherwise the sentence is first split with ``split_by_punctuation``;
    any part still longer than ``max_length`` is tokenised with
    ``jieba.lcut`` and the tokens are packed greedily into chunks.

    A single token longer than ``max_length`` becomes its own chunk and may
    therefore exceed the limit. The original code also emitted a spurious
    empty string in that case when the token came first; that is fixed here.

    Args:
        sentence: Text to split.
        max_length: Maximum number of characters per piece.

    Returns:
        list[str]: The pieces in order.
    """
    if len(sentence) <= max_length or not is_chinese(sentence):
        return [sentence]

    new_sentences = []
    for part in split_by_punctuation(sentence):
        if len(part) <= max_length:
            new_sentences.append(part)
            continue

        # Still too long: pack jieba tokens greedily up to max_length.
        current_sentence = ""
        for word in jieba.lcut(part):
            if current_sentence and len(current_sentence) + len(word) > max_length:
                new_sentences.append(current_sentence)
                current_sentence = word
            else:
                # Also taken when the chunk is empty, so an oversized first
                # token never flushes an empty chunk.
                current_sentence += word
        if current_sentence:
            new_sentences.append(current_sentence)

    return new_sentences
253
+
254
def extract_and_convert(text):
    """Replace every English word in ``text`` with its katakana rendering.

    Each run of ASCII letters delimited by word boundaries is converted
    with ``romajitable.to_kana(...).katakana`` and inserted on its own
    line (wrapped in ``\\n``).

    Two defects of the original implementation are fixed:

    * Word boundaries are evaluated with ``re.ASCII``. With Unicode
      semantics CJK and kana characters count as word characters, so a word
      directly adjacent to Japanese/Chinese text (``これはtestです``) was never
      matched.
    * Substitution happens at the matched position via ``re.sub``. The old
      ``str.replace(word, kana, 1)`` loop replaced the first occurrence of
      the same letters anywhere, which could be inside another token
      (``x`` in ``x1``).

    Args:
        text: Input string.

    Returns:
        str: ``text`` with each matched word replaced by ``"\\n<katakana>\\n"``.
    """
    def _to_katakana(match):
        return '\n{}\n'.format(romajitable.to_kana(match.group()).katakana)

    return re.sub(r'\b[A-Za-z]+\b', _to_katakana, text, flags=re.ASCII)
267
 
268
  if __name__ == "__main__":
269
+ text = ",如“520”,【23】和〔83〕等。.我亲爱的读者,你也许在某一刻会遇上这样的情形,不禁对那著名哲学句子“那内在的就是那外在的,那外在的就是那内在的”“3”的正确性有了或多或少的怀疑。也许你自己就怀着某种秘密,对之你有着这样一种感觉:因为这秘密在它所具有的喜悦或者痛楚对你来说是太亲切了,以至于你不愿意让他人来和你共享它。也许你的生活使得你和一些人有所接触,对于他们你有着某种预感,隐约感觉到如此的某些事情是可能的,尽管你并不一定能够通过权力或者诱惑来揭示这隐秘。也许你感受到的这些情形并不对你和你的生活发生作用,然而你对这种怀疑却不陌生;它时而在你的思绪中像一种匆匆的形影飘忽而过。这样的一种怀疑来而又去,没有人知道它从哪里来或者它到什么地方去“4”。就我自己而言,我一直对哲学的这一点怀有一种异端的想法,并且因此也尽可能地习惯于自己去深思和考究;我从在这方面与我有同感的作家们那里听取了指导,简言之,我尽了我的努力来弥补那些哲学文本们所遗留下的匮乏。渐渐地,听觉对于我来说倒成了最亲密的感觉功能;因为,正如声音是那相对外在之物而言是无法比较的内在性的揭示,于是耳朵就是用来使这内在性得以被人领会的工具,而听觉就是用来获取这内在性的感觉功能的。每当我在我所见和所听之间发现一个矛盾时,我就觉得我的怀疑得到了强化,而我的观察愿望得到了放大。一个听忏悔的神父与忏悔者之间有窗格子隔开,这神父不看,他只是听。听着听着,他渐渐构想出一个与此相应的外在;这就是说,他不会进入矛盾。相反,在你同时看和听的时候则不同,你看着的是你和言述者之间的一道窗格子。就结果而言,我为在这方面进行观察而做出的努力是非常不同的。有时候我是幸运的,有时候则不,而想要在这些道路上赢得一些战利品,幸运总是一个必须被考虑进去的因素。然而我却从来没有失去继续进行我的调查研究的愿望。如果我真的在什么时候几乎对我的坚定感到了懊悔,那么一种意外幸运也就在这样的时候为我的努力进行了加冕。于是这就是一种意外的幸运,它以一种最奇怪的方式使得我拥有了这些文稿,因而我荣幸地在此向阅读着的关注者们展示这些文稿。在这些文稿中,我得到机会去审视进两个人的生活,这强化了我关于“那外在的不是那内在的”的怀疑。尤其是他们中的一个有着这样的情形。他的外在完全与他的内在相矛盾。而他们中另一个的情形在一定的程度上也是如此,只要他在一种较为无足轻重的外在之下隐藏起了一种更���意义重大的内在,那么他就是处在这样的矛盾中。也许,考虑到顺序,我最好还是先讲述一下,我是怎样获得这些文稿的。现在算来,差不多是在七年前,我在城里的一个旧货商家那里留意到一张文书写字柜“5”,一见之下,它就吸引了我的注意力。它不是出自现代的工艺,很陈旧,但它还是吸引住了我。要解说这一印象的依据,对于我来说是不可能的,但是大多数人在他们的生命中肯定也曾经历过类似的情形。我每天的路径使我经过那旧货商和他的柜桌,在任何一天经过那里时我都从不曾放过时机盯着它看。渐渐地,这个文书写字柜在我心中有了它的故事;看着它,对于我来说成了一种必然,到最后,即使是在我有必要走另一条路的时候,我也毫不犹豫地为它的缘故而绕一段远路。由于我总这样看它,它在我心中也渐渐唤醒一种想要拥有它的愿望。其实我完全能感觉到,这是一种奇怪的愿望,既然我并不需要这家具;对于我来说,买下它就是一种浪费。正如我们所知,愿望有着一种非常诡辩性的说服力。我去了那旧货商家,推说是询问一些别的东西,在我要离开的时候,我漫不经心地就那张文书写字柜问了一个非常低的价钱。我想着,那旧货商人可能会抬价。如果是那个价,那我就占了便宜。不管怎么说,我这样做不是为了钱的缘故,而是为了要在良心上说得过去。但没有成功,那旧货商人有着一种非同寻常的坚定。又是很长一段时间,我每天都去那里,然后以一种钟情着迷的目光看着这文书写字柜。你必须下决心,我寻思着,试想一下,如果它被卖掉了,那就太晚了;哪怕你终于又找到它,你也永远得不到对它的这种印象了。在我走进旧货商家的时候,我的心狂跳着。买下了它,付了钱。这是最后一次了,我想着,你这么浪费;对了,你买下它,这恰恰是一种幸运,因为你这么老是看着它,你就该想着你曾是多么浪费,以这个文书写字柜为起点,你生活中该有一个新的段落开始了。啊,愿望有着一种非常诡辩性的说服力,那些良好的意图总是现成地摆在那里。另外参看阿德勒尔(A.P.Adler)的《对黑格尔的客观逻辑的普及讲演》。“5”[文书写字柜(Secretair)] 法国式柜子,有着许多小的、有时是隐秘的抽屉用于保存文件,并且有一块垂直翻板可以拴出来并且当写字台用。"
270
+ #print("原文本:", text)
271
+ print("处理后的文本:", extrac(text))