qgyd2021 committed on
Commit
3b5b01c
·
1 Parent(s): aa4e278

[update]add sent_tokenize

Browse files
examples/sent_tokenize/sent_tokenize.py CHANGED
@@ -9,7 +9,6 @@ from project_settings import project_path
9
 
10
  os.environ['NLTK_DATA'] = (project_path / "thirdparty_data/nltk_data").as_posix()
11
 
12
- import jieba
13
  import nltk
14
 
15
 
@@ -58,8 +57,10 @@ def chinese_sent_tokenize(text: str):
58
  def sent_tokenize(text: str, language: str) -> List[str]:
59
  if language in ["chinese"]:
60
  sent_list = chinese_sent_tokenize(text)
61
- else:
62
  sent_list = nltk.sent_tokenize(text, language)
 
 
63
  return sent_list
64
 
65
 
 
9
 
10
  os.environ['NLTK_DATA'] = (project_path / "thirdparty_data/nltk_data").as_posix()
11
 
 
12
  import nltk
13
 
14
 
 
57
  def sent_tokenize(text: str, language: str) -> List[str]:
58
  if language in ["chinese"]:
59
  sent_list = chinese_sent_tokenize(text)
60
+ elif language in nltk_sent_tokenize_languages:
61
  sent_list = nltk.sent_tokenize(text, language)
62
+ else:
63
+ sent_list = [text]
64
  return sent_list
65
 
66
 
main.py CHANGED
@@ -17,6 +17,7 @@ from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
17
 
18
 
19
  language_map = {
 
20
  "Chinese": "zh",
21
  "Czech": "cs",
22
  "Danish": "da",
@@ -65,8 +66,10 @@ def chinese_sent_tokenize(text: str):
65
  def sent_tokenize(text: str, language: str) -> List[str]:
66
  if language in ["chinese"]:
67
  sent_list = chinese_sent_tokenize(text)
68
- else:
69
  sent_list = nltk.sent_tokenize(text, language)
 
 
70
  return sent_list
71
 
72
 
@@ -159,8 +162,6 @@ It was introduced in this [paper](https://arxiv.org/abs/2010.11125) and first re
159
  gr.Textbox(lines=4, placeholder="text", label="Input Text"),
160
  gr.Dropdown(choices=list(language_map.keys()), value="English", label="Source Language"),
161
  gr.Dropdown(choices=list(language_map.keys()), value="Chinese", label="Target Language"),
162
- # gr.Textbox(lines=1, value="en", label="Source Language"),
163
- # gr.Textbox(lines=1, value="zh", label="Target Language"),
164
  gr.Dropdown(choices=model_choices, value="facebook/m2m100_418M", label="model_name")
165
  ]
166
 
 
17
 
18
 
19
  language_map = {
20
+ "Arabic": "ar",
21
  "Chinese": "zh",
22
  "Czech": "cs",
23
  "Danish": "da",
 
66
  def sent_tokenize(text: str, language: str) -> List[str]:
67
  if language in ["chinese"]:
68
  sent_list = chinese_sent_tokenize(text)
69
+ elif language in nltk_sent_tokenize_languages:
70
  sent_list = nltk.sent_tokenize(text, language)
71
+ else:
72
+ sent_list = [text]
73
  return sent_list
74
 
75
 
 
162
  gr.Textbox(lines=4, placeholder="text", label="Input Text"),
163
  gr.Dropdown(choices=list(language_map.keys()), value="English", label="Source Language"),
164
  gr.Dropdown(choices=list(language_map.keys()), value="Chinese", label="Target Language"),
 
 
165
  gr.Dropdown(choices=model_choices, value="facebook/m2m100_418M", label="model_name")
166
  ]
167