Spaces:
Running
on
A10G
Running
on
A10G
File size: 1,348 Bytes
9d434bb 39af5eb 9d434bb c0d010f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 |
import nltk
import jieba
import sudachipy
import langid
# Import-time side effect: request NLTK's 'punkt' resource so it is available
# before nltk.tokenize.sent_tokenize is used for English text below.
nltk.download('punkt')
# Limit langid's candidate languages to English, Chinese and Japanese, the
# three languages split_text_into_sentences has a branch for.
langid.set_languages(['en', 'zh', 'ja'])
# Tokens that close a sentence in the Chinese branch. Both the fullwidth marks
# used in Chinese typography and their ASCII counterparts are accepted.
# NOTE(review): "……" only matches if jieba yields the ellipsis as one single
# token -- confirm against jieba's actual output.
_ZH_SENTENCE_ENDINGS = frozenset(["。", "！", "？", "!", "?", "……"])


def _split_chinese(text):
    """Segment *text* with jieba and cut after every sentence-ending token."""
    segs = list(jieba.cut(text, cut_all=False))
    sentences = []
    start = 0
    for i, seg in enumerate(segs):
        if seg in _ZH_SENTENCE_ENDINGS:
            # Include the terminator itself in the sentence it closes.
            sentences.append("".join(segs[start:i + 1]))
            start = i + 1
    if start < len(segs):
        # Keep trailing text that has no closing punctuation.
        sentences.append("".join(segs[start:]))
    return sentences


def _split_japanese(text):
    """Tokenize *text* with Sudachi and cut after every 句点 (full stop) token."""
    # NOTE(review): a fresh dictionary/tokenizer is built on each call; consider
    # reusing one if this shows up as a cost -- confirm it is safe to share.
    tokenizer = sudachipy.Dictionary().create()
    sentences = []
    parts = []
    for token in tokenizer.tokenize(text):
        parts.append(token.surface())
        pos = token.part_of_speech()
        # A sentence ends on a token tagged 補助記号 / 句点.
        if pos[0] == "補助記号" and pos[1] == "句点":
            sentences.append("".join(parts))
            parts = []
    # Keep trailing text that has no closing full stop.
    remainder = "".join(parts)
    if remainder:
        sentences.append(remainder)
    return sentences


def split_text_into_sentences(text):
    """Split *text* into sentences using a language-specific strategy.

    The language is detected once with ``langid.classify`` and dispatched:

    * ``"en"`` -- ``nltk.tokenize.sent_tokenize``.
    * ``"zh"`` -- jieba segmentation, cutting after 。 ！ ？ ! ? or ……
    * ``"ja"`` -- Sudachi tokenization, cutting after 句点 tokens.

    Args:
        text: The text to split.

    Returns:
        A list of sentence strings. In the Chinese and Japanese branches the
        terminating punctuation stays attached to its sentence, and trailing
        text without a terminator becomes the final item.

    Raises:
        RuntimeError: If the detected language is none of ``en``/``zh``/``ja``.
    """
    # Classify once; the result does not change between branches.
    language = langid.classify(text)[0]
    if language == "en":
        return nltk.tokenize.sent_tokenize(text)
    if language == "zh":
        return _split_chinese(text)
    if language == "ja":
        return _split_japanese(text)
    # Presumably unreachable while langid is restricted to the three languages
    # above; kept as a guard in case that restriction changes.
    raise RuntimeError("It is impossible to reach here.")