1-800-BAD-CODE committed on
Commit
2a1d736
·
1 Parent(s): dd44bba

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +98 -196
README.md CHANGED
@@ -60,6 +60,103 @@ This model accepts as input lower-cased, unpunctuated, unsegmented text in 47 la
60
  All languages are processed with the same algorithm with no need for language tags or language-specific branches in the graph.
61
  This includes continuous-script and non-continuous script languages, predicting language-specific punctuation, etc.
62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  # Model Details
64
 
65
  This model generally follows the graph shown below, with brief descriptions for each step following.
@@ -136,201 +233,6 @@ This model predicts the following set of "post" punctuation tokens:
136
  | ¿ | Inverted question mark | Spanish |
137
 
138
 
139
- # Usage
140
- This model is released in two parts:
141
-
142
- 1. The ONNX graph
143
- 2. The SentencePiece tokenizer
144
-
145
-
146
- The following code snippet will instantiate a `SimplePCSWrapper`, which will download the model files from this repository.
147
- It will then run a few example sentences in a few languages, and print the processed output.
148
-
149
-
150
- <details>
151
- <summary>Example Code</summary>
152
-
153
- ```python
154
- import logging
155
-
156
- from sentencepiece import SentencePieceProcessor
157
- import onnxruntime as ort
158
- import numpy as np
159
- from huggingface_hub import hf_hub_download
160
- from typing import List
161
-
162
-
163
- class SimplePCSWrapper:
164
- def __init__(self):
165
- spe_path = hf_hub_download(
166
- repo_id="1-800-BAD-CODE/punct_cap_seg_47_language", filename="spe_unigram_64k_lowercase_47lang.model"
167
- )
168
- onnx_path = hf_hub_download(
169
- repo_id="1-800-BAD-CODE/punct_cap_seg_47_language", filename="punct_cap_seg_47lang.onnx"
170
- )
171
- self._tokenizer: SentencePieceProcessor = SentencePieceProcessor(spe_path)
172
- self._ort_session: ort.InferenceSession = ort.InferenceSession(onnx_path)
173
- # This model has max length 128. Real code should wrap inputs; example code will truncate.
174
- self._max_len = 128
175
-
176
- # Hard-coding labels, for now
177
- self._pre_labels = [
178
- "<NULL>",
179
- "¿",
180
- ]
181
-
182
- self._post_labels = [
183
- "<NULL>",
184
- ".",
185
- ",",
186
- "?",
187
- "?",
188
- ",",
189
- "。",
190
- "、",
191
- "・",
192
- "।",
193
- "؟",
194
- "،",
195
- ";",
196
- "።",
197
- "፣",
198
- "፧",
199
- ]
200
-
201
- def infer_one_text(self, text: str) -> List[str]:
202
- input_ids = self._tokenizer.EncodeAsIds(text)
203
- # Limit sequence to model's positional encoding limit. Leave 2 slots for BOS/EOS tags.
204
- if len(input_ids) > self._max_len - 2:
205
- logging.warning(f"Truncating input sequence from {len(input_ids)} to {self._max_len - 2}")
206
- input_ids = input_ids[: self._max_len - 2]
207
- # Append BOS and EOS.
208
- input_ids = [self._tokenizer.bos_id()] + input_ids + [self._tokenizer.eos_id()]
209
- # Add empty batch dimension. With real batches, sequence padding should be `self._tokenizer.pad_id()`.
210
- input_ids = [input_ids]
211
-
212
- # ORT input should be np.array
213
- input_ids = np.array(input_ids)
214
- # Get predictions.
215
- pre_preds, post_preds, cap_preds, seg_preds = self._ort_session.run(None, {"input_ids": input_ids})
216
- # Remove all batch dims. Remove BOS/EOS from time dim
217
- pre_preds = pre_preds[0, 1:-1]
218
- post_preds = post_preds[0, 1:-1]
219
- cap_preds = cap_preds[0, 1:-1]
220
- seg_preds = seg_preds[0, 1:-1]
221
-
222
- # Apply predictions to input tokens
223
- input_tokens = self._tokenizer.EncodeAsPieces(text)
224
- # Segmented sentences
225
- output_strings: List[str] = []
226
- # Current sentence, which is built until we hit a sentence boundary prediction
227
- current_chars: List[str] = []
228
- for token_idx, token in enumerate(input_tokens):
229
- # Simple SP decoding
230
- if token.startswith("▁") and current_chars:
231
- current_chars.append(" ")
232
- # Skip non-printable chars
233
- char_start = 1 if token.startswith("▁") else 0
234
- for token_char_idx, char in enumerate(token[char_start:], start=char_start):
235
- # If this is the first char in the subtoken, and we predict "pre-punct", insert it
236
- if token_char_idx == char_start and pre_preds[token_idx] != 0:
237
- current_chars.append(self._pre_labels[pre_preds[token_idx]])
238
- # If this char should be capitalized, apply upper case
239
- if cap_preds[token_idx][token_char_idx]:
240
- char = char.upper()
241
- # Append char after pre-punct and upper-casing, before post-punct
242
- current_chars.append(char)
243
- # If this is the final char in the subtoken, and we predict "post-punct", insert it
244
- if token_char_idx == len(token) - 1 and post_preds[token_idx] != 0:
245
- current_chars.append(self._post_labels[post_preds[token_idx]])
246
- # If this token is a sentence boundary, finalize the current sentence and reset
247
- if token_char_idx == len(token) - 1 and seg_preds[token_idx]:
248
- output_strings.append("".join(current_chars))
249
- current_chars = []
250
- return output_strings
251
-
252
-
253
- # Upon instantiation, will automatically download models from HF Hub
254
- pcs_wrapper: SimplePCSWrapper = SimplePCSWrapper()
255
-
256
-
257
- # Function for pretty-printing raw input and segmented output
258
- def print_processed_text(input_text: str, output_texts: List[str]):
259
- print(f"Input: {input_text}")
260
- print(f"Outputs:")
261
- for text in output_texts:
262
- print(f"\t{text}")
263
- print()
264
-
265
-
266
- # Process and print each text, one at a time
267
- texts = [
268
- "hola mundo cómo estás estamos bajo el sol y hace mucho calor santa coloma abre los huertos urbanos a las escuelas de la ciudad",
269
- "hello friend how's it going it's snowing outside right now in connecticut a large storm is moving in",
270
- "未來疫苗將有望覆蓋3歲以上全年齡段美國與北約軍隊已全部撤離還有鐵路公路在內的各項基建的來源都將枯竭",
271
- "በባለፈው ሳምንት ኢትዮጵያ ከሶማሊያ 3 ሺህ ወታደሮቿንም እንዳስወጣች የሶማሊያው ዳልሳን ሬድዮ ዘግቦ ነበር ጸጥታ ሃይሉና ህዝቡ ተቀናጅቶ በመስራቱ በመዲናዋ ላይ የታቀደው የጥፋት ሴራ ከሽፏል",
272
- "all human beings are born free and equal in dignity and rights they are endowed with reason and conscience and should act towards one another in a spirit of brotherhood",
273
- "सभी मनुष्य जन्म से मर्यादा और अधिकारों में स्वतंत्र और समान होते हैं वे तर्क और विवेक से संपन्न हैं तथा उन्हें भ्रातृत्व की भावना से परस्पर के प्रति कार्य करना चाहिए",
274
- "wszyscy ludzie rodzą się wolni i równi pod względem swej godności i swych praw są oni obdarzeni rozumem i sumieniem i powinni postępować wobec innych w duchu braterstwa",
275
- "tous les êtres humains naissent libres et égaux en dignité et en droits ils sont doués de raison et de conscience et doivent agir les uns envers les autres dans un esprit de fraternité",
276
- ]
277
- for text in texts:
278
- outputs = pcs_wrapper.infer_one_text(text)
279
- print_processed_text(text, outputs)
280
- ```
281
- </details>
282
-
283
-
284
- <details>
285
- <summary>Expected output</summary>
286
-
287
- ```text
288
- Input: hola mundo cómo estás estamos bajo el sol y hace mucho calor santa coloma abre los huertos urbanos a las escuelas de la ciudad
289
- Outputs:
290
- Hola Mundo, ¿cómo estás?
291
- Estamos bajo el sol y hace mucho calor.
292
- Santa Coloma abre los huertos urbanos a las escuelas de la ciudad.
293
-
294
- Input: hello friend how's it going it's snowing outside right now in connecticut a large storm is moving in
295
- Outputs:
296
- Hello Friend, how's it going?
297
- It's snowing outside right now.
298
- In Connecticut, a large storm is moving in.
299
-
300
- Input: 未來疫苗將有望覆蓋3歲以上全年齡段美國與北約軍隊已全部撤離還有鐵路公路在內的各項基建的來源都將枯竭
301
- Outputs:
302
- 未來,疫苗將有望覆蓋3歲以上全年齡段。
303
- 美國與北約軍隊已全部撤離。
304
- 還有鐵路公路在內的各項基建的來源都將枯竭。
305
-
306
- Input: በባለፈው ሳምንት ኢትዮጵያ ከሶማሊያ 3 ሺህ ወታደሮቿንም እንዳስወጣች የሶማሊያው ዳልሳን ሬድዮ ዘግቦ ነበር ጸጥታ ሃይሉና ህዝቡ ተቀናጅቶ በመስራቱ በመዲናዋ ላይ የታቀደው የጥፋት ሴራ ከሽፏል
307
- Outputs:
308
- በባለፈው ሳምንት ኢትዮጵያ ከሶማሊያ 3 ሺህ ወታደሮቿንም እንዳስወጣች የሶማሊያው ዳልሳን ሬድዮ ዘግቦ ነበር።
309
- ጸጥታ ሃይሉና ህዝቡ ተቀናጅቶ በመስራቱ በመዲናዋ ላይ የታቀደው የጥፋት ሴራ ከሽፏል።
310
-
311
- Input: all human beings are born free and equal in dignity and rights they are endowed with reason and conscience and should act towards one another in a spirit of brotherhood
312
- Outputs:
313
- All human beings are born free and equal in dignity and rights.
314
- They are endowed with reason and conscience and should act towards one another in a spirit of brotherhood.
315
-
316
- Input: सभी मनुष्य जन्म से मर्यादा और अधिकारों में स्वतंत्र और समान होते हैं वे तर्क और विवेक से संपन्न हैं तथा उन्हें भ्रातृत्व की भावना से परस्पर के प्रति कार्य करना चाहिए
317
- Outputs:
318
- सभी मनुष्य जन्म से मर्यादा और अधिकारों में स्वतंत्र और समान होते हैं।
319
- वे तर्क और विवेक से संपन्न हैं तथा उन्हें भ्रातृत्व की भावना से परस्पर के प्रति कार्य करना चाहिए।
320
-
321
- Input: wszyscy ludzie rodzą się wolni i równi pod względem swej godności i swych praw są oni obdarzeni rozumem i sumieniem i powinni postępować wobec innych w duchu braterstwa
322
- Outputs:
323
- Wszyscy ludzie rodzą się wolni i równi pod względem swej godności i swych praw.
324
- Są oni obdarzeni rozumem i sumieniem i powinni postępować wobec innych w duchu braterstwa.
325
-
326
- Input: tous les êtres humains naissent libres et égaux en dignité et en droits ils sont doués de raison et de conscience et doivent agir les uns envers les autres dans un esprit de fraternité
327
- Outputs:
328
- Tous les êtres humains naissent libres et égaux, en dignité et en droits.
329
- Ils sont doués de raison et de conscience et doivent agir les uns envers les autres.
330
- Dans un esprit de fraternité.
331
- ```
332
- </details>
333
-
334
 
335
  # Training Details
336
  This model was trained in the NeMo framework.
@@ -346,7 +248,7 @@ Languages were chosen based on whether the News Crawl corpus contained enough re
346
  This model was trained on news data, and may not perform well on conversational or informal data.
347
 
348
  This model predicts punctuation only once per subword.
349
- This implies that some acronyms, e.g., 'U.S.', cannot properly be punctuation.
350
  This concession was accepted on two grounds:
351
  1. Such acronyms are rare, especially in the context of multi-lingual models
352
  2. Punctuated acronyms are typically pronounced as individual characters, e.g., 'U.S.' vs. 'NATO'.
 
60
  All languages are processed with the same algorithm with no need for language tags or language-specific branches in the graph.
61
  This includes continuous-script and non-continuous script languages, predicting language-specific punctuation, etc.
62
 
63
+ # Usage
64
+ The easy way to use this model is to install `punctuators`:
65
+
66
+ ```bash
67
+ pip install punctuators
68
+ ```
69
+
70
+ Running the following script should load this model and run some texts:
71
+ <details open>
72
+
73
+ <summary>Example Usage</summary>
74
+
75
+ ```python
76
+ from punctuators.models import PunctCapSegModelONNX
77
+
78
+ # Instantiate this model
79
+ # This will download the ONNX and SPE models. To clean up, delete this model from your HF cache directory.
80
+ m = PunctCapSegModelONNX.from_pretrained("pcs_47lang")
81
+
82
+ # Define some input texts to punctuate
83
+ input_texts: List[str] = [
84
+ "hola mundo cómo estás estamos bajo el sol y hace mucho calor santa coloma abre los huertos urbanos a las escuelas de la ciudad",
85
+ "hello friend how's it going it's snowing outside right now in connecticut a large storm is moving in",
86
+ "未來疫苗將有望覆蓋3歲以上全年齡段美國與北約軍隊已全部撤離還有鐵路公路在內的各項基建的來源都將枯竭",
87
+ "በባለፈው ሳምንት ኢትዮጵያ ከሶማሊያ 3 ሺህ ወታደሮቿንም እንዳስወጣች የሶማሊያው ዳልሳን ሬድዮ ዘግቦ ነበር ጸጥታ ሃይሉና ህዝቡ ተቀናጅቶ በመስራቱ በመዲናዋ ላይ የታቀደው የጥፋት ሴራ ከሽፏል",
88
+ "all human beings are born free and equal in dignity and rights they are endowed with reason and conscience and should act towards one another in a spirit of brotherhood",
89
+ "सभी मनुष्य जन्म से मर्यादा और अधिकारों में स्वतंत्र और समान होते हैं वे तर्क और विवेक से संपन्न हैं तथा उन्हें भ्रातृत्व की भावना से परस्पर के प्रति कार्य करना चाहिए",
90
+ "wszyscy ludzie rodzą się wolni i równi pod względem swej godności i swych praw są oni obdarzeni rozumem i sumieniem i powinni postępować wobec innych w duchu braterstwa",
91
+ "tous les êtres humains naissent libres et égaux en dignité et en droits ils sont doués de raison et de conscience et doivent agir les uns envers les autres dans un esprit de fraternité",
92
+ ]
93
+ results: List[List[str]] = m.infer(input_texts)
94
+ for input_text, output_texts in zip(input_texts, results):
95
+ print(f"Input: {input_text}")
96
+ print(f"Outputs:")
97
+ for text in output_texts:
98
+ print(f"\t{text}")
99
+ print()
100
+
101
+ ```
102
+
103
+ </details>
104
+
105
+ <details open>
106
+
107
+ <summary>Expected Output</summary>
108
+
109
+ ```text
110
+ Input: hola mundo cómo estás estamos bajo el sol y hace mucho calor santa coloma abre los huertos urbanos a las escuelas de la ciudad
111
+ Outputs:
112
+ Hola Mundo, ¿cómo estás?
113
+ Estamos bajo el sol y hace mucho calor.
114
+ Santa Coloma abre los huertos urbanos a las escuelas de la ciudad.
115
+
116
+ Input: hello friend how's it going it's snowing outside right now in connecticut a large storm is moving in
117
+ Outputs:
118
+ Hello Friend, how's it going?
119
+ It's snowing outside right now.
120
+ In Connecticut, a large storm is moving in.
121
+
122
+ Input: 未來疫苗將有望覆蓋3歲以上全年齡段美國與北約軍隊已全部撤離還有鐵路公路在內的各項基建的來源都將枯竭
123
+ Outputs:
124
+ 未來,疫苗將有望覆蓋3歲以上全年齡段。
125
+ 美國與北約軍隊已全部撤離。
126
+ 還有鐵路公路在內的各項基建的來源都將枯竭。
127
+
128
+ Input: በባለፈው ሳምንት ኢትዮጵያ ከሶማሊያ 3 ሺህ ወታደሮቿንም እንዳስወጣች የሶማሊያው ዳልሳን ሬድዮ ዘግቦ ነበር ጸጥታ ሃይሉና ህዝቡ ተቀናጅቶ በመስራቱ በመዲናዋ ላይ የታቀደው የጥፋት ሴራ ከሽፏል
129
+ Outputs:
130
+ በባለፈው ሳምንት ኢትዮጵያ ከሶማሊያ 3 ሺህ ወታደሮቿንም እንዳስወጣች የሶማሊያው ዳልሳን ሬድዮ ዘግቦ ነበር።
131
+ ጸጥታ ሃይሉና ህዝቡ ተቀናጅቶ በመስራቱ በመዲናዋ ላይ የታቀደው የጥፋት ሴራ ከሽፏል።
132
+
133
+ Input: all human beings are born free and equal in dignity and rights they are endowed with reason and conscience and should act towards one another in a spirit of brotherhood
134
+ Outputs:
135
+ All human beings are born free and equal in dignity and rights.
136
+ They are endowed with reason and conscience and should act towards one another in a spirit of brotherhood.
137
+
138
+ Input: सभी मनुष्य जन्म से मर्यादा और अधिकारों में स्वतंत्र और समान होते हैं वे तर्क और विवेक से संपन्न हैं तथा उन्हें भ्रातृत्व की भावना से परस्पर के प्रति कार्य करना चाहिए
139
+ Outputs:
140
+ सभी मनुष्य जन्म से मर्यादा और अधिकारों में स्वतंत्र और समान होते हैं।
141
+ वे तर्क और विवेक से संपन्न हैं तथा उन्हें भ्रातृत्व की भावना से परस्पर के प्रति कार्य करना चाहिए।
142
+
143
+ Input: wszyscy ludzie rodzą się wolni i równi pod względem swej godności i swych praw są oni obdarzeni rozumem i sumieniem i powinni postępować wobec innych w duchu braterstwa
144
+ Outputs:
145
+ Wszyscy ludzie rodzą się wolni i równi pod względem swej godności i swych praw.
146
+ Są oni obdarzeni rozumem i sumieniem i powinni postępować wobec innych w duchu braterstwa.
147
+
148
+ Input: tous les êtres humains naissent libres et égaux en dignité et en droits ils sont doués de raison et de conscience et doivent agir les uns envers les autres dans un esprit de fraternité
149
+ Outputs:
150
+ Tous les êtres humains naissent libres et égaux, en dignité et en droits.
151
+ Ils sont doués de raison et de conscience et doivent agir les uns envers les autres.
152
+ Dans un esprit de fraternité.
153
+
154
+ ```
155
+
156
+ Note that "Mundo" and "Friend" are proper nouns in this usage, which is why the model consistently upper-cases similar tokens in multiple languages.
157
+
158
+ </details>
159
+
160
  # Model Details
161
 
162
  This model generally follows the graph shown below, with brief descriptions for each step following.
 
233
  | ¿ | Inverted question mark | Spanish |
234
 
235
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
236
 
237
  # Training Details
238
  This model was trained in the NeMo framework.
 
248
  This model was trained on news data, and may not perform well on conversational or informal data.
249
 
250
  This model predicts punctuation only once per subword.
251
+ This implies that some acronyms, e.g., 'U.S.', cannot properly be punctuated.
252
  This concession was accepted on two grounds:
253
  1. Such acronyms are rare, especially in the context of multi-lingual models
254
  2. Punctuated acronyms are typically pronounced as individual characters, e.g., 'U.S.' vs. 'NATO'.