|
|
|
|
|
import argparse |
|
import os |
|
import re |
|
from typing import List |
|
|
|
from project_settings import project_path |
|
|
|
# Point NLTK at the project-local data directory (Punkt tokenizer models live
# under thirdparty_data/nltk_data).  This MUST be set before `import nltk` so
# the library picks the path up when it builds its data search path.
os.environ['NLTK_DATA'] = (project_path / "thirdparty_data/nltk_data").as_posix()



import nltk
|
|
|
|
|
# Languages for which NLTK ships a Punkt sentence-tokenizer model.
# NOTE: "flemish" was removed — Punkt has no such model, so selecting it made
# nltk.sent_tokenize raise LookupError at runtime.  "greek" and "slovene",
# which Punkt does ship, were added.
nltk_sent_tokenize_languages = [
    "czech", "danish", "dutch", "english", "estonian",
    "finnish", "french", "german", "greek", "italian", "norwegian",
    "polish", "portuguese", "russian", "slovene", "spanish", "swedish",
    "turkish",
]
|
|
|
|
|
def get_args():
    """Build the CLI parser and return the parsed arguments.

    Options:
        --text: the text to split into sentences.
        --language: tokenizer language; restricted to the languages NLTK's
            Punkt models support.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--text",
        type=str,
        default="M2M100 is a multilingual encoder-decoder (seq-to-seq) model trained for Many-to-Many multilingual translation. It was introduced in this paper and first released in this repository.",
    )
    parser.add_argument(
        "--language",
        type=str,
        default="english",
        choices=nltk_sent_tokenize_languages,
    )
    return parser.parse_args()
|
|
|
|
|
def chinese_sent_tokenize(text: str):
    """Split Chinese text into sentences with punctuation-based rules.

    A newline is inserted after each detected sentence boundary and the text
    is then split on newlines.  Boundaries are: a terminal punctuation mark
    (。!?? ), a six-dot ASCII ellipsis (......), a doubled ellipsis character
    (……), and terminal punctuation followed by a closing quote (” or ’) —
    each only when the following character does not indicate that the
    sentence continues.
    """
    # (pattern, replacement) pairs applied in order; order matters because
    # the quote rule must see the text the plain-terminator rule left behind.
    boundary_rules = (
        (r"([。!?\?])([^”’])", r"\1\n\2"),          # bare terminator
        (r"(\.{6})([^”’])", r"\1\n\2"),             # six-dot ellipsis
        (r"(\…{2})([^”’])", r"\1\n\2"),             # doubled … ellipsis
        (r"([。!?\?][”’])([^,。!?\?])", r"\1\n\2"),  # terminator + close quote
    )
    for pattern, replacement in boundary_rules:
        text = re.sub(pattern, replacement, text)
    # Drop trailing whitespace so a terminal boundary does not yield an
    # empty final sentence.
    return text.rstrip().split("\n")
|
|
|
|
|
def sent_tokenize(text: str, language: str) -> List[str]:
    """Split `text` into sentences using a language-appropriate tokenizer.

    Chinese goes through the rule-based splitter; languages covered by
    NLTK's Punkt models go through `nltk.sent_tokenize`; any other language
    is returned unsplit as a single-element list.
    """
    if language == "chinese":
        return chinese_sent_tokenize(text)
    if language in nltk_sent_tokenize_languages:
        return nltk.sent_tokenize(text, language)
    # No tokenizer available for this language: treat the whole text as one
    # sentence rather than failing.
    return [text]
|
|
|
|
|
def main():
    """CLI entry point: tokenize `--text` and print one sentence per line."""
    args = get_args()

    for sentence in sent_tokenize(args.text, language=args.language):
        print(sentence)

    return
|
|
|
|
|
# Run the CLI only when executed as a script (not on import).
if __name__ == '__main__':

    main()
|
|