rbawden committed on
Commit d60f838
1 Parent(s): 758c67e

commit files to HF hub

config.json CHANGED
@@ -1,38 +1,123 @@
1
  {
2
- "architectures": [
3
- "AutoModelForSeq2SeqLM"
4
- ],
5
- "model_type": "fsmt",
6
  "activation_dropout": 0.0,
7
  "activation_function": "relu",
8
  "attention_dropout": 0.0,
9
  "d_model": 256,
10
  "dropout": 0.3,
11
- "init_std": 0.02,
12
- "max_position_embeddings": 1024,
13
- "num_hidden_layers": 2,
14
- "src_vocab_size": 1000,
15
- "tgt_vocab_size": 1000,
16
- "langs": [
17
- "src",
18
- "trg"
19
- ],
20
  "encoder_attention_heads": 4,
21
  "encoder_ffn_dim": 1024,
22
  "encoder_layerdrop": 0,
23
  "encoder_layers": 2,
24
- "decoder_attention_heads": 8,
25
- "decoder_ffn_dim": 1024,
26
- "decoder_layerdrop": 0,
27
- "decoder_layers": 4,
28
- "bos_token_id": 0,
29
- "pad_token_id": 1,
30
  "eos_token_id": 2,
31
- "unk_token_id": 3,
32
  "is_encoder_decoder": true,
33
  "scale_embedding": true,
34
  "tie_word_embeddings": true,
35
- "num_beams": 5,
36
- "early_stopping": false,
37
- "length_penalty": 1.0
38
- }
1
  {
2
+ "_name_or_path": "rbawden/modern_french_normalisation",
3
  "activation_dropout": 0.0,
4
  "activation_function": "relu",
5
+ "architectures": [
6
+ "FSMTForConditionalGeneration"
7
+ ],
8
  "attention_dropout": 0.0,
9
+ "bos_token_id": 0,
10
+ "custom_pipelines": {
11
+ "modern-french-normalisation": {
12
+ "default": {
13
+ "model": {
14
+ "pt": [
15
+ "rbawden/modern_french_normalisation",
16
+ "main"
17
+ ]
18
+ }
19
+ },
20
+ "impl": "pipeline.NormalisationPipeline",
21
+ "pt": [
22
+ "AutoModelForSeq2SeqLM"
23
+ ],
24
+ "tf": []
25
+ }
26
+ },
27
  "d_model": 256,
28
+ "decoder": {
29
+ "_name_or_path": "",
30
+ "add_cross_attention": false,
31
+ "architectures": null,
32
+ "bad_words_ids": null,
33
+ "bos_token_id": 2,
34
+ "chunk_size_feed_forward": 0,
35
+ "cross_attention_hidden_size": null,
36
+ "decoder_start_token_id": null,
37
+ "diversity_penalty": 0.0,
38
+ "do_sample": false,
39
+ "early_stopping": false,
40
+ "encoder_no_repeat_ngram_size": 0,
41
+ "eos_token_id": null,
42
+ "exponential_decay_length_penalty": null,
43
+ "finetuning_task": null,
44
+ "forced_bos_token_id": null,
45
+ "forced_eos_token_id": null,
46
+ "id2label": {
47
+ "0": "LABEL_0",
48
+ "1": "LABEL_1"
49
+ },
50
+ "is_decoder": false,
51
+ "is_encoder_decoder": false,
52
+ "label2id": {
53
+ "LABEL_0": 0,
54
+ "LABEL_1": 1
55
+ },
56
+ "length_penalty": 1.0,
57
+ "max_length": 20,
58
+ "min_length": 0,
59
+ "model_type": "fsmt_decoder",
60
+ "no_repeat_ngram_size": 0,
61
+ "num_beam_groups": 1,
62
+ "num_beams": 1,
63
+ "num_return_sequences": 1,
64
+ "output_attentions": false,
65
+ "output_hidden_states": false,
66
+ "output_scores": false,
67
+ "pad_token_id": null,
68
+ "prefix": null,
69
+ "problem_type": null,
70
+ "pruned_heads": {},
71
+ "remove_invalid_values": false,
72
+ "repetition_penalty": 1.0,
73
+ "return_dict": true,
74
+ "return_dict_in_generate": false,
75
+ "sep_token_id": null,
76
+ "task_specific_params": null,
77
+ "temperature": 1.0,
78
+ "tf_legacy_loss": false,
79
+ "tie_encoder_decoder": false,
80
+ "tie_word_embeddings": true,
81
+ "tokenizer_class": null,
82
+ "top_k": 50,
83
+ "top_p": 1.0,
84
+ "torch_dtype": null,
85
+ "torchscript": false,
86
+ "transformers_version": "4.21.2",
87
+ "typical_p": 1.0,
88
+ "use_bfloat16": false,
89
+ "vocab_size": 1000
90
+ },
91
+ "decoder_attention_heads": 8,
92
+ "decoder_ffn_dim": 1024,
93
+ "decoder_layerdrop": 0,
94
+ "decoder_layers": 4,
95
+ "decoder_start_token_id": 2,
96
  "dropout": 0.3,
97
  "encoder_attention_heads": 4,
98
  "encoder_ffn_dim": 1024,
99
  "encoder_layerdrop": 0,
100
  "encoder_layers": 2,
101
  "eos_token_id": 2,
102
+ "forced_eos_token_id": 2,
103
+ "init_std": 0.02,
104
  "is_encoder_decoder": true,
105
+ "langs": [
106
+ "src",
107
+ "trg"
108
+ ],
109
+ "max_length": 200,
110
+ "max_position_embeddings": 1024,
111
+ "model_type": "fsmt",
112
+ "num_beams": 5,
113
+ "num_hidden_layers": 2,
114
+ "pad_token_id": 1,
115
  "scale_embedding": true,
116
+ "src_vocab_size": 1000,
117
+ "tgt_vocab_size": 1000,
118
  "tie_word_embeddings": true,
119
+ "torch_dtype": "float32",
120
+ "transformers_version": null,
121
+ "unk_token_id": 3,
122
+ "use_cache": true
123
+ }
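
The custom_pipelines block added above is what lets transformers resolve this repository to the NormalisationPipeline class shipped in pipeline.py (added below). A minimal loading sketch, assuming a transformers version with Hub custom-pipeline support; trust_remote_code=True is needed because the implementation lives in the repository rather than in the library:

# Sketch: load the custom pipeline declared in config.json above.
# trust_remote_code=True is required because NormalisationPipeline
# is defined in pipeline.py inside this model repository.
from transformers import pipeline

normaliser = pipeline(model="rbawden/modern_french_normalisation", trust_remote_code=True)

# each result is a dict holding the normalised text and character alignment spans
result = normaliser(["Qu'vne force plus grande de ſi peu que l'on voudra"])
print(result[0]["text"])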
pipeline.py ADDED
@@ -0,0 +1,856 @@
1
+ #!/usr/bin/python
2
+ from transformers import Pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
3
+ from transformers.tokenization_utils_base import TruncationStrategy
4
+ from torch import Tensor
5
+ import html.parser
6
+ import unicodedata
7
+ import sys, os
8
+ import re
9
+ import pickle
10
+ from tqdm.auto import tqdm
11
+ import operator
12
+ from datasets import load_dataset
13
+ from transformers.pipelines import PIPELINE_REGISTRY
14
+
15
+ def _create_modified_versions(entry=None):
16
+ if entry is None:
17
+ return []
18
+ return _remove_diacritics(entry), _vu_vowel_to_v_vowel(entry), _vowel_u_to_vowel_v(entry), _consonant_v_to_consonant_u(entry), _y_to_i(entry), _i_to_y(entry), _eacute_to_e_s(entry), _final_eacute_to_e_z(entry), _egrave_to_eacute(entry), _vowelcircumflex_to_vowel_s(entry), _ce_to_ee(entry)
19
+
20
+ def _create_further_modified_versions(entry=None):
21
+ if entry is None:
22
+ return []
23
+ return _s_to_f(entry), _ss_to_ff(entry), _s_to_ff(entry), _first_s_to_f(entry), _first_s_to_ff(entry), _last_s_to_f(entry), _last_s_to_ff(entry), _sit_to_st(entry), _ee_to_ce(entry), _z_to_s(entry)
24
+
25
+ def _remove_diacritics(s, allow_alter_length=True):
26
+ # 1-1 replacements only (must not change the number of characters)
27
+ replace_from = "ǽǣáàâäąãăåćčçďéèêëęěğìíîĩĭıïĺľłńñňòóôõöøŕřśšşťţùúûũüǔỳýŷÿźẑżžÁÀÂÄĄÃĂÅĆČÇĎÉÈÊËĘĚĞÌÍÎĨĬİÏĹĽŁŃÑŇÒÓÔÕÖØŔŘŚŠŞŤŢÙÚÛŨÜǓỲÝŶŸŹẐŻŽſ"
28
+ replace_into = "ææaaaaaaaacccdeeeeeegiiiiiiilllnnnoooooorrsssttuuuuuuyyyyzzzzAAAAAAAACCCDEEEEEEGIIIIIIILLLNNNOOOOOORRSSSTTUUUUUUYYYYZZZZs"
29
+ table = s.maketrans(replace_from, replace_into)
30
+ s = s.translate(table)
31
+ # n-m replacements
32
+ if allow_alter_length:
33
+ for before, after in [('œ', 'oe'), ('æ', 'ae'), ('ƣ', 'oi'), ('ij', 'ij'),
34
+ ('ȣ', 'ou'), ('Œ', 'OE'), ('Æ', 'AE'), ('Ƣ', 'OI'), ('IJ', 'IJ'), ('Ȣ', 'OU')]:
35
+ s = s.replace(before, after)
36
+ s = s.strip('-')
37
+ return s
38
+
39
+ def _vu_vowel_to_v_vowel(s):
40
+ s = re.sub('v([aeiou])' , r'vu\1', s)
41
+ return s
42
+
43
+ def _vowel_u_to_vowel_v(s):
44
+ s = re.sub('([aeiou])u' , r'\1v', s)
45
+ return s
46
+
47
+ def _consonant_v_to_consonant_u(s):
48
+ s = re.sub('([^aeiou])v' , r'\1u', s)
49
+ return s
50
+
51
+ def _y_to_i(s):
52
+ s = s.replace('y', 'i')
53
+ return s
54
+
55
+ def _i_to_y(s):
56
+ s = s.replace('i', 'y')
57
+ return s
58
+
59
+ def _ss_to_ff(s):
60
+ s = s.replace('ss', 'ff')
61
+ return s
62
+
63
+ def _s_to_f(s):
64
+ s = s.replace('s', 'f')
65
+ return s
66
+
67
+ def _s_to_ff(s):
68
+ s = s.replace('s', 'ff')
69
+ return s
70
+
71
+ def _first_s_to_f(s):
72
+ s = re.sub('s', 'f', s, count=1)
73
+ return s
74
+
75
+ def _last_s_to_f(s):
76
+ s = re.sub('^(.*)s' , r'\1f', s)
77
+ return s
78
+
79
+ def _first_s_to_ff(s):
80
+ s = re.sub('s', 'ff', s, count=1)
81
+ return s
82
+
83
+ def _last_s_to_ff(s):
84
+ s = re.sub('^(.*)s' , r'\1ff', s)
85
+ return s
86
+
87
+ def _ee_to_ce(s):
88
+ s = s.replace('ee', 'ce')
89
+ return s
90
+
91
+ def _sit_to_st(s):
92
+ s = s.replace('sit', 'st')
93
+ return s
94
+
95
+ def _z_to_s(s):
96
+ s = s.replace('z', 's')
97
+ return s
98
+
99
+ def _ce_to_ee(s):
100
+ s = s.replace('ce', 'ee')
101
+ return s
102
+
103
+ def _eacute_to_e_s(s, allow_alter_length=True):
104
+ if allow_alter_length:
105
+ s = re.sub('é(.)' , r'es\1', s)
106
+ s = re.sub('ê(.)' , r'es\1', s)
107
+ return s
108
+
109
+ def _final_eacute_to_e_z(s, allow_alter_length=True):
110
+ if allow_alter_length:
111
+ s = re.sub('é$' , r'ez', s)
112
+ s = re.sub('ê$' , r'ez', s)
113
+ return s
114
+
115
+ def _egrave_to_eacute(s):
116
+ s = re.sub('è(.)' , r'é\1', s)
117
+ return s
118
+
119
+ def _vowelcircumflex_to_vowel_s(s, allow_alter_length=True):
120
+ if allow_alter_length:
121
+ for before, after in [('â', 'as'), ('ê', 'es'), ('î', 'is'), ('ô', 'os'), ('û', 'us')]:
122
+ s = s.replace(before, after)
123
+ return s
124
+
125
+
126
+ def basic_tokenise(string):
127
+ # separate punctuation
128
+ for char in r',.;?!:)("…-':
129
+ string = re.sub('(?<! )' + re.escape(char) + '+', ' ' + char, string)
130
+ for char in '\'"’':
131
+ string = re.sub(char + '(?! )' , char + ' ', string)
132
+ return string.strip()
133
+
134
+ def basic_tokenise_bs(string):
135
+ # separate punctuation
136
+ string = re.sub('(?<! )([,\.;\?!:\)\("…\'‘’”“«»\-])', r' \1', string)
137
+ string = re.sub('([,\.;\?!:\)\("…\'‘’”“«»\-])(?! )' , r'\1 ', string)
138
+ return string.strip()
139
+
140
+ def homogenise(sent, allow_alter_length=False):
141
+ '''
142
+ Homogenise an input sentence by lowercasing, removing diacritics, etc.
143
+ If allow_alter_length is False, then only applies changes that do not alter
144
+ the length of the original sentence (i.e. one-to-one modifications). If True,
145
+ then also apply n-m replacements.
146
+ '''
147
+ sent = sent.lower()
148
+ # n-m replacements
149
+ if allow_alter_length:
150
+ for before, after in [('ã', 'an'), ('xoe', 'œ')]:
151
+ sent = sent.replace(before, after)
152
+ sent = sent.strip('-')
153
+ # 1-1 replacements only (must not change the number of characters)
154
+ replace_from = "ǽǣáàâäąãăåćčçďéèêëęěğìíîĩĭıïĺľłńñňòóôõöøŕřśšşťţùúûũüǔỳýŷÿźẑżžÁÀÂÄĄÃĂÅĆČÇĎÉÈÊËĘĚĞÌÍÎĨĬİÏĹĽŁŃÑŇÒÓÔÕÖØŔŘŚŠŞŤŢÙÚÛŨÜǓỲÝŶŸŹẐŻŽſ"
155
+ replace_into = "ææaaaaaaaacccdeeeeeegiiiiiiilllnnnoooooorrsssttuuuuuuyyyyzzzzAAAAAAAACCCDEEEEEEGIIIIIIILLLNNNOOOOOORRSSSTTUUUUUUYYYYZZZZs"
156
+ table = sent.maketrans(replace_from, replace_into)
157
+ return sent.translate(table)
158
+
159
+ def get_surrounding_punct(word):
160
+ beginning_match = re.match("^(['\-]*)", word)
161
+ beginning, end = '', ''
162
+ if beginning_match:
163
+ beginning = beginning_match.group(1)
164
+ end_match = re.match("(['\-]*)$", word)
165
+ if end_match:
166
+ end = end_match.group(1)
167
+ return beginning, end
168
+
169
+
170
+ def add_orig_punct(old_word, new_word):
171
+ beginning, end = get_surrounding_punct(old_word)
172
+ output = ''
173
+ if beginning != None and not re.match("^"+re.escape(beginning), new_word):
174
+ output += beginning
175
+ if new_word != None:
176
+ output += new_word
177
+ if end != None and not re.match(re.escape(end)+"$", new_word):
178
+ output += end
179
+ return output
180
+
181
+ def get_caps(word):
182
+ # remove any non-alphabetic characters at beginning or end
183
+ word = word.strip("-' ")
184
+ first, second, allcaps = False, False, False
185
+ if len(word) > 0 and word[0].lower() != word[0]:
186
+ first = True
187
+ if len(word) > 1 and word[1].lower() != word[1]:
188
+ second = True
189
+ if word.upper() == word and word.lower() != word:
190
+ allcaps = True
191
+ return first, second, allcaps
192
+
193
+ def set_caps(word, first, second, allcaps):
194
+ if word == None:
195
+ return None
196
+ if allcaps:
197
+ return word.upper()
198
+ elif first and second:
199
+ return word[0].upper() + word[1].upper() + word[2:]
200
+ elif first:
201
+ if len(word) > 1:
202
+ return word[0].upper() + word[1:]
203
+ elif len(word) == 1:
204
+ return word[0]
205
+ else:
206
+ return word
207
+ elif second:
208
+ if len(word) > 2:
209
+ return word[0] + word[1].upper() + word[2:]
210
+ elif len(word) > 1:
211
+ return word[0] + word[1].upper() + word[2:]
212
+ elif len(word) == 1:
213
+ return word[0]
214
+ else:
215
+ return word
216
+ else:
217
+ return word
218
+
219
+
220
+ ######## Edit distance functions #######
221
+ def _wedit_dist_init(len1, len2):
222
+ lev = []
223
+ for i in range(len1):
224
+ lev.append([0] * len2) # initialize 2D array to zero
225
+ for i in range(len1):
226
+ lev[i][0] = i # column 0: 0,1,2,3,4,...
227
+ for j in range(len2):
228
+ lev[0][j] = j # row 0: 0,1,2,3,4,...
229
+ return lev
230
+
231
+
232
+ def _wedit_dist_step(
233
+ lev, i, j, s1, s2, last_left, last_right, transpositions=False
234
+ ):
235
+ c1 = s1[i - 1]
236
+ c2 = s2[j - 1]
237
+
238
+ # skipping a character in s1
239
+ a = lev[i - 1][j] + _wedit_dist_deletion_cost(c1,c2)
240
+ # skipping a character in s2
241
+ b = lev[i][j - 1] + _wedit_dist_insertion_cost(c1,c2)
242
+ # substitution
243
+ c = lev[i - 1][j - 1] + (_wedit_dist_substitution_cost(c1, c2) if c1 != c2 else 0)
244
+
245
+ # pick the cheapest
246
+ lev[i][j] = min(a, b, c)#, d)
247
+
248
+ def _wedit_dist_backtrace(lev):
249
+ i, j = len(lev) - 1, len(lev[0]) - 1
250
+ alignment = [(i, j, lev[i][j])]
251
+
252
+ while (i, j) != (0, 0):
253
+ directions = [
254
+ (i - 1, j), # skip s1
255
+ (i, j - 1), # skip s2
256
+ (i - 1, j - 1), # substitution
257
+ ]
258
+
259
+ direction_costs = (
260
+ (lev[i][j] if (i >= 0 and j >= 0) else float("inf"), (i, j))
261
+ for i, j in directions
262
+ )
263
+ _, (i, j) = min(direction_costs, key=operator.itemgetter(0))
264
+
265
+ alignment.append((i, j, lev[i][j]))
266
+ return list(reversed(alignment))
267
+
268
+ def _wedit_dist_substitution_cost(c1, c2):
269
+ if c1 == ' ' and c2 != ' ':
270
+ return 1000000
271
+ if c2 == ' ' and c1 != ' ':
272
+ return 30
273
+ for c in ",.;-!?'":
274
+ if c1 == c and c2 != c:
275
+ return 20
276
+ if c2 == c and c1 != c:
277
+ return 20
278
+ return 1
279
+
280
+ def _wedit_dist_deletion_cost(c1, c2):
281
+ if c1 == ' ':
282
+ return 2
283
+ if c2 == ' ':
284
+ return 1000000
285
+ return 0.8
286
+
287
+ def _wedit_dist_insertion_cost(c1, c2):
288
+ if c1 == ' ':
289
+ return 1000000
290
+ if c2 == ' ':
291
+ return 2
292
+ return 0.8
293
+
294
+ def wedit_distance_align(s1, s2):
295
+ """
296
+ Calculate the minimum Levenshtein weighted edit-distance based alignment
297
+ mapping between two strings. The alignment finds the mapping
298
+ from string s1 to s2 that minimizes the edit distance cost, where each
299
+ operation is weighted by a dedicated weighting function.
300
+ For example, mapping "rain" to "shine" would involve 2
301
+ substitutions, 2 matches and an insertion resulting in
302
+ the following mapping:
303
+ [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (4, 5)]
304
+ NB: (0, 0) is the start state without any letters associated
305
+ See more: https://web.stanford.edu/class/cs124/lec/med.pdf
306
+ In case of multiple valid minimum-distance alignments, the
307
+ backtrace has the following operation precedence:
308
+ 1. Skip s1 character
309
+ 2. Skip s2 character
310
+ 3. Substitute s1 and s2 characters
311
+ The backtrace is carried out in reverse string order.
312
+ This function does not support transposition.
313
+ :param s1, s2: The strings to be aligned
314
+ :type s1: str
315
+ :type s2: str
316
+ :rtype: List[Tuple(int, int)]
317
+ """
318
+ # set up a 2-D array
319
+ len1 = len(s1)
320
+ len2 = len(s2)
321
+ lev = _wedit_dist_init(len1 + 1, len2 + 1)
322
+
323
+ # iterate over the array
324
+ for i in range(len1):
325
+ for j in range(len2):
326
+ _wedit_dist_step(
327
+ lev,
328
+ i + 1,
329
+ j + 1,
330
+ s1,
331
+ s2,
332
+ 0,
333
+ 0,
334
+ transpositions=False,
335
+ )
336
+
337
+ # backtrace to find alignment
338
+ alignment = _wedit_dist_backtrace(lev)
339
+ return alignment
340
+
341
+ def _last_left_t_init(sigma):
342
+ return {c: 0 for c in sigma}
343
+
344
+ def wedit_distance(s1, s2):
345
+ """
346
+ Calculate the Levenshtein weighted edit-distance between two strings.
347
+ The weighted edit distance is the number of characters that need to be
348
+ substituted, inserted, or deleted, to transform s1 into s2, weighted
349
+ by a dedicated weighting function.
350
+ For example, transforming "rain" to "shine" requires three steps,
351
+ consisting of two substitutions and one insertion:
352
+ "rain" -> "sain" -> "shin" -> "shine". These operations could have
353
+ been done in other orders, but at least three steps are needed.
354
+
355
+ The substitution, insertion and deletion costs are given by the dedicated
+ weighting functions above, which penalise edits involving spaces and
+ punctuation much more heavily than ordinary character edits.
+ Transpositions are not supported.
+
+ :param s1, s2: The strings to be analysed
+ :type s1: str
+ :type s2: str
+ :rtype: float
369
+ """
370
+ # set up a 2-D array
371
+ len1 = len(s1)
372
+ len2 = len(s2)
373
+ lev = _wedit_dist_init(len1 + 1, len2 + 1)
374
+
375
+ # retrieve alphabet
376
+ sigma = set()
377
+ sigma.update(s1)
378
+ sigma.update(s2)
379
+
380
+ # set up table to remember positions of last seen occurrence in s1
381
+ last_left_t = _last_left_t_init(sigma)
382
+
383
+ # iterate over the array
384
+ # i and j start from 1 and not 0 to stay close to the wikipedia pseudo-code
385
+ # see https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
386
+ for i in range(len1):
387
+ last_right_buf = 0
388
+ for j in range(len2):
389
+ last_left = last_left_t[s2[j - 1]]
390
+ last_right = last_right_buf
391
+ if s1[i - 1] == s2[j - 1]:
392
+ last_right_buf = j
393
+ _wedit_dist_step(
394
+ lev,
395
+ i + 1,
396
+ j + 1,
397
+ s1,
398
+ s2,
399
+ last_left,
400
+ last_right,
401
+ transpositions=False,
402
+ )
403
+ last_left_t[s1[i - 1]] = i
404
+ return lev[len1-1][len2-1]
405
+
406
+ def space_after(idx, sent):
407
+ if idx < len(sent) -1 and sent[idx + 1] == ' ':
408
+ return True
409
+ return False
410
+
411
+ def space_before(idx, sent):
412
+ if idx > 0 and sent[idx - 1] == ' ':
413
+ return True
414
+ return False
415
+
416
+ ######## Normalisation pipeline #########
417
+ class NormalisationPipeline(Pipeline):
418
+
419
+ def __init__(self, beam_size=5, batch_size=32, tokenise_func=None, cache_file=None, no_postproc_lex=False,
420
+ no_post_clean=False, **kwargs):
421
+ self.beam_size = beam_size
422
+ # classic tokeniser function (used for alignments)
423
+ if tokenise_func is not None:
424
+ self.classic_tokenise = tokenise_func
425
+ else:
426
+ self.classic_tokenise = basic_tokenise
427
+
428
+ self.no_post_clean = no_post_clean
429
+ self.no_postproc_lex = no_postproc_lex
430
+ # load lexicon
431
+ if no_postproc_lex:
432
+ self.orig_lefff_words, self.mapping_to_lefff, self.mapping_to_lefff2 = None, None, None
433
+ else:
434
+ self.orig_lefff_words, self.mapping_to_lefff, self.mapping_to_lefff2 = self.load_lexicon(cache_file=cache_file)
435
+ super().__init__(**kwargs)
436
+
437
+
438
+ def load_lexicon(self, cache_file=None):
439
+ orig_lefff_words = []
440
+ mapping_to_lefff = {}
441
+ mapping_to_lefff2 = {}
442
+ remove = set([])
443
+ remove2 = set([])
444
+
445
+ # load pickled version if there
446
+ if cache_file is not None and os.path.exists(cache_file):
447
+ return pickle.load(open(cache_file, 'rb'))
448
+ dataset = load_dataset("sagot/lefff_morpho")
449
+
450
+ for entry in set([x['form'].lower() for x in dataset['test']]):
451
+ orig_lefff_words.append(entry)
452
+ orig_lefff_words.append("-"+entry)
453
+ for mod_entry in set(_create_modified_versions(entry)):
454
+ if mod_entry in mapping_to_lefff and mapping_to_lefff[mod_entry] != entry:
455
+ remove.add(mod_entry)
456
+ if mod_entry != mod_entry.upper():
457
+ remove.add(mod_entry)
458
+ if mod_entry not in mapping_to_lefff and mod_entry != entry:
459
+ mapping_to_lefff[mod_entry] = entry
460
+ if mod_entry != mod_entry.upper():
461
+ mapping_to_lefff2[mod_entry.upper()] = entry.upper()
462
+ for mod_entry2 in set(_create_modified_versions(mod_entry)):
463
+ if mod_entry2 in mapping_to_lefff2 and mapping_to_lefff2[mod_entry2] != entry:
464
+ remove2.add(mod_entry2)
465
+ if mod_entry2 != mod_entry2.upper():
466
+ remove2.add(mod_entry2)
467
+ if mod_entry2 not in mapping_to_lefff2 and mod_entry2 != entry:
468
+ mapping_to_lefff2[mod_entry2] = entry
469
+ if mod_entry2 != mod_entry2.upper():
470
+ mapping_to_lefff2[mod_entry2.upper()] = entry.upper()
471
+ for mod_entry2 in set(_create_further_modified_versions(mod_entry)):
472
+ if mod_entry2 in mapping_to_lefff2 and mapping_to_lefff2[mod_entry2] != entry:
473
+ remove2.add(mod_entry2)
474
+ if mod_entry2 != mod_entry2.upper():
475
+ remove2.add(mod_entry2)
476
+ if mod_entry2 not in mapping_to_lefff2 and mod_entry2 != entry:
477
+ mapping_to_lefff2[mod_entry2] = entry
478
+ if mod_entry2 != mod_entry2.upper():
479
+ mapping_to_lefff2[mod_entry2.upper()] = entry.upper()
480
+ for mod_entry2 in set(_create_further_modified_versions(entry)):
481
+ if mod_entry2 in mapping_to_lefff2 and mapping_to_lefff2[mod_entry2] != entry:
482
+ remove2.add(mod_entry2)
483
+ if mod_entry2 != mod_entry2.upper():
484
+ remove2.add(mod_entry2)
485
+ if mod_entry2 not in mapping_to_lefff2 and mod_entry2 != entry:
486
+ mapping_to_lefff2[mod_entry2] = entry
487
+ if mod_entry2 != mod_entry2.upper():
488
+ mapping_to_lefff2[mod_entry2.upper()] = entry.upper()
489
+
490
+ for mod_entry in list(mapping_to_lefff.keys()):
491
+ if mod_entry != "":
492
+ mapping_to_lefff["-"+mod_entry] = "-"+mapping_to_lefff[mod_entry]
493
+ for mod_entry2 in list(mapping_to_lefff2.keys()):
494
+ if mod_entry2 != "":
495
+ mapping_to_lefff2["-"+mod_entry2] = "-"+mapping_to_lefff2[mod_entry2]
496
+
497
+ for entry in remove:
498
+ del mapping_to_lefff[entry]
499
+ for entry in remove2:
500
+ del mapping_to_lefff2[entry]
501
+
502
+ if cache_file is not None:
503
+ pickle.dump((orig_lefff_words, mapping_to_lefff, mapping_to_lefff2), open(cache_file, 'wb'))
504
+ return orig_lefff_words, mapping_to_lefff, mapping_to_lefff2
505
+
506
+ def _sanitize_parameters(self, clean_up_tokenisation_spaces=None, truncation=None, **generate_kwargs):
507
+ preprocess_params = {}
508
+ if truncation is not None:
509
+ preprocess_params["truncation"] = truncation
510
+ forward_params = generate_kwargs
511
+ postprocess_params = {}
512
+ if clean_up_tokenisation_spaces is not None:
513
+ postprocess_params["clean_up_tokenisation_spaces"] = clean_up_tokenisation_spaces
514
+
515
+ return preprocess_params, forward_params, postprocess_params
516
+
517
+
518
+ def check_inputs(self, input_length: int, min_length: int, max_length: int):
519
+ """
520
+ Checks whether there might be something wrong with the given input with regard to the model.
521
+ """
522
+ return True
523
+
524
+ def make_printable(self, s):
525
+ '''Remove non-printable characters from a string.'''
+ return ''.join(c for c in s if c.isprintable())
527
+
528
+
529
+ def normalise(self, line):
530
+ for before, after in [('[«»\“\”]', '"'), ('[‘’]', "'"), (' +', ' '), ('\"+', '"'),
531
+ ("'+", "'"), ('^ *', ''), (' *$', '')]:
532
+ line = re.sub(before, after, line)
533
+ return line.strip() + ' </s>'
534
+
535
+ def _parse_and_tokenise(self, *args, truncation):
536
+ prefix = ""
537
+ if isinstance(args[0], list):
538
+ if self.tokenizer.pad_token_id is None:
539
+ raise ValueError("Please make sure that the tokeniser has a pad_token_id when using a batch input")
540
+ args = ([prefix + arg for arg in args[0]],)
541
+ padding = True
542
+
543
+ elif isinstance(args[0], str):
544
+ args = (prefix + args[0],)
545
+ padding = False
546
+ else:
547
+ raise ValueError(
548
+ f" `args[0]`: {args[0]} has the wrong format. It should be either of type `str` or type `list`"
549
+ )
550
+ inputs = [self.normalise(x) for x in args]
551
+ inputs = self.tokenizer(inputs, padding=padding, truncation=truncation, return_tensors=self.framework)
552
+ toks = []
553
+ for tok_ids in inputs.input_ids:
554
+ toks.append(" ".join(self.tokenizer.convert_ids_to_tokens(tok_ids)))
555
+ # This is produced by tokenisers but is an invalid generate kwarg
556
+ if "token_type_ids" in inputs:
557
+ del inputs["token_type_ids"]
558
+ return inputs
559
+
560
+ def preprocess(self, inputs, truncation=TruncationStrategy.DO_NOT_TRUNCATE, **kwargs):
561
+ inputs = self._parse_and_tokenise(inputs, truncation=truncation, **kwargs)
562
+ return inputs
563
+
564
+ def _forward(self, model_inputs, **generate_kwargs):
565
+ in_b, input_length = model_inputs["input_ids"].shape
566
+ generate_kwargs["min_length"] = generate_kwargs.get("min_length", self.model.config.min_length)
567
+ generate_kwargs["max_length"] = generate_kwargs.get("max_length", self.model.config.max_length)
568
+ generate_kwargs['num_beams'] = self.beam_size
569
+ self.check_inputs(input_length, generate_kwargs["min_length"], generate_kwargs["max_length"])
570
+ output_ids = self.model.generate(**model_inputs, **generate_kwargs)
571
+ out_b = output_ids.shape[0]
572
+ output_ids = output_ids.reshape(in_b, out_b // in_b, *output_ids.shape[1:])
573
+ return {"output_ids": output_ids}
574
+
575
+ def postprocess(self, model_outputs, clean_up_tok_spaces=False):
576
+ records = []
577
+ for output_ids in model_outputs["output_ids"][0]:
578
+ record = {"text": self.tokenizer.decode(output_ids, skip_special_tokens=True,
579
+ clean_up_tokenisation_spaces=clean_up_tok_spaces).strip()}
580
+ records.append(record)
581
+ return records
582
+
583
+ def postprocess_correct_sent(self, alignment):
584
+ output = []
585
+ for i, (orig_word, pred_word, _) in enumerate(alignment):
586
+ if orig_word != '':
587
+ postproc_word = self.postprocess_correct_word(orig_word, pred_word, alignment)
588
+ alignment[i] = (orig_word, postproc_word, -1) # replace prediction in the alignment
589
+ return alignment
590
+
591
+ def postprocess_correct_word(self, orig_word, pred_word, alignment):
592
+ # pred_word exists in lexicon, take it
593
+ orig_caps = get_caps(orig_word)
594
+ if re.match("^[0-9]+$", orig_word) or re.match("^[XVUI]+$", orig_word):
595
+ orig_word = orig_word.replace('U', 'V')
596
+ return orig_word
597
+ if pred_word.lower() in self.orig_lefff_words:
598
+ return set_caps(pred_word, *orig_caps)
599
+ # otherwise, if original word exists, take that
600
+ if orig_word.lower() in self.orig_lefff_words:
601
+ return orig_word
602
+
603
+ pred_replacement = None
604
+ # otherwise if pred word is in the lexicon with some changes, take that
605
+ if pred_word != '' and pred_word != ' ':
606
+ pred_replacement = self.mapping_to_lefff.get(pred_word, None)
607
+ if pred_replacement is not None:
608
+ return add_orig_punct(pred_word, set_caps(pred_replacement, *orig_caps))
609
+ # otherwise if orig word is in the lexicon with some changes, take that
610
+ orig_replacement = self.mapping_to_lefff.get(orig_word, None)
611
+ if orig_replacement is not None:
612
+ return add_orig_punct(pred_word, set_caps(orig_replacement, *orig_caps))
613
+
614
+ # otherwise if pred word is in the lexicon with more changes, take that
615
+ if pred_word != '' and pred_word != ' ':
616
+ pred_replacement = self.mapping_to_lefff2.get(pred_word, None)
617
+ if pred_replacement is not None:
618
+ return add_orig_punct(pred_word, set_caps(pred_replacement, *orig_caps))
619
+ # otherwise if orig word is in the lexicon with more changes, take that
620
+ orig_replacement = self.mapping_to_lefff2.get(orig_word, None)
621
+ if orig_replacement is not None:
622
+ return add_orig_punct(pred_word, set_caps(orig_replacement, *orig_caps))
623
+
624
+ if orig_word == pred_word:
625
+ return orig_word
626
+ if orig_word == " " and pred_word == "":
627
+ return orig_word
628
+
629
+ wed = wedit_distance(pred_word,orig_word)
630
+ if wed > 2:
631
+ return orig_word
632
+ return add_orig_punct(pred_word, set_caps(pred_word, *orig_caps))
633
+
634
+
635
+ def __call__(self, input_sents, **kwargs):
636
+ r"""
637
+ Normalise the texts given as inputs and align them with the predictions.
+ Args:
+ input_sents (`List[str]`):
+ Input sentences for the encoder.
+ generate_kwargs:
+ Additional keyword arguments to pass along to the generate method of the model.
+ Return:
+ A list of `dict`: each result comes as a dictionary with the following keys:
+ - **text** (`str`) -- The normalised sentence.
+ - **alignment** -- Character-level spans aligning the input sentence to its normalised output.
651
+ """
652
+ result = super().__call__(input_sents, **kwargs)
653
+
654
+ output = []
655
+ for i in range(len(result)):
656
+ input_sent, pred_sent = input_sents[i].strip(), result[i][0]['text'].strip()
657
+ input_sent = input_sent.replace('ſ' , 's')
658
+ if not self.no_post_clean:
659
+ pred_sent = self.post_cleaning(pred_sent)
660
+ alignment, pred_sent_tok = self.align(input_sent, pred_sent)
661
+
662
+ if not self.no_postproc_lex:
663
+ alignment = self.postprocess_correct_sent(alignment)
664
+ pred_sent = self.get_pred_from_alignment(alignment)
665
+ if not self.no_post_clean:
666
+ pred_sent = self.post_cleaning(pred_sent)
667
+ char_spans = self.get_char_idx_align(input_sent, pred_sent, alignment)
668
+ output.append({'text': pred_sent, 'alignment': char_spans})
669
+ return output
670
+
671
+ def post_cleaning(self, s):
672
+ s = s.replace(' ' , '')
673
+ s = s.replace('ſ' , 's')
674
+ s = s.replace('ß' , 'ss')
675
+ s = s.replace('&' , 'et')
676
+ s = re.sub('ẽ([mbp])' , r'em\1', s)
677
+ s = s.replace('ẽ' , 'en')
678
+ s = re.sub('ã([mbp])' , r'am\1', s)
679
+ s = s.replace('ã' , 'an')
680
+ s = re.sub('õ([mbp])' , r'om\1', s)
681
+ s = s.replace('õ' , 'on')
682
+ s = re.sub('ũ([mbp])' , r'um\1', s)
683
+ s = s.replace('ũ' , 'un')
684
+ return s
685
+
686
+ def align(self, sent_ref, sent_pred):
687
+ sent_ref_tok = self.classic_tokenise(re.sub('[ ]', ' ', sent_ref))
688
+ sent_pred_tok = self.classic_tokenise(re.sub('[ ]', ' ', sent_pred))
689
+ backpointers = wedit_distance_align(homogenise(sent_ref_tok), homogenise(sent_pred_tok))
690
+ alignment, current_word, seen1, seen2, last_weight = [], ['', ''], [], [], 0
691
+ for i_ref, i_pred, weight in backpointers:
692
+ if i_ref == 0 and i_pred == 0:
693
+ continue
694
+ # next characters are both spaces -> add current word straight away
695
+ if i_ref <= len(sent_ref_tok) and sent_ref_tok[i_ref-1] == ' ' \
696
+ and i_pred <= len(sent_pred_tok) and sent_pred_tok[i_pred-1] == ' ' \
697
+ and i_ref not in seen1 and i_pred not in seen2:
698
+
699
+ # if current word is empty -> insert a space on both sides
700
+ if current_word[0] == '' and current_word[1] == '':
701
+ alignment.append((' ', ' ', weight-last_weight))
702
+ # else add the current word to both sides
703
+ else:
704
+ alignment.append((current_word[0], current_word[1], weight-last_weight))
705
+ last_weight = weight
706
+ current_word = ['', '']
707
+ seen1.append(i_ref)
708
+ seen2.append(i_pred)
709
+ # if space in ref and dash in pred
710
+ elif i_ref <= len(sent_ref_tok) and sent_ref_tok[i_ref-1] == ' ' \
711
+ and i_pred <= len(sent_pred_tok) and sent_pred_tok[i_pred-1] == '-' \
712
+ and i_ref not in seen1 and i_pred not in seen2 \
713
+ and current_word[0] == '' and current_word[1] == '':
714
+ alignment.append((' ', '', weight-last_weight))
715
+ last_weight = weight
716
+ current_word = ['', '-']
717
+ seen1.append(i_ref)
718
+ seen2.append(i_pred)
719
+ else:
720
+ end_space = '' #'░'
721
+ # add new character to ref
722
+ if i_ref <= len(sent_ref_tok) and i_ref not in seen1:
723
+ if i_ref > 0:
724
+ current_word[0] += sent_ref_tok[i_ref-1]
725
+ seen1.append(i_ref)
726
+ # add new character to pred
727
+ if i_pred <= len(sent_pred_tok) and i_pred not in seen2:
728
+ if i_pred > 0:
729
+ current_word[1] += sent_pred_tok[i_pred-1] if sent_pred_tok[i_pred-1] != ' ' else ' ' #'▁'
730
+ end_space = '' if space_after(i_pred, sent_pred_tok) else ''# '░'
731
+ seen2.append(i_pred)
732
+ if i_ref <= len(sent_ref_tok) and sent_ref_tok[i_ref-1] == ' ' and current_word[0].strip() != '':
733
+ alignment.append((current_word[0].strip(), current_word[1].strip() + end_space, weight-last_weight))
734
+ last_weight = weight
735
+ current_word = ['', '']
736
+ # space in ref but aligned to nothing in pred (under-translation)
737
+ elif i_ref <= len(sent_ref_tok) and sent_ref_tok[i_ref-1] == ' ' and current_word[1].strip() == '':
738
+ alignment.append((current_word[0], current_word[1], weight-last_weight))
739
+ last_weight = weight
740
+ current_word = ['', '']
741
+ seen1.append(i_ref)
742
+ seen2.append(i_pred)
743
+ # final word
744
+ alignment.append((current_word[0].strip(), current_word[1].strip(), weight-last_weight))
745
+ # check that both strings are entirely covered
746
+ recovered1 = re.sub(' +', ' ', ' '.join([x[0] for x in alignment]))
747
+ recovered2 = re.sub(' +', ' ', ' '.join([x[1] for x in alignment]))
748
+
749
+ assert re.sub('[  ]+', ' ', recovered1) == re.sub('[  ]+', ' ', sent_ref_tok), \
750
+ '\n1: *' + re.sub('[  ]+', ' ', recovered1) + "*\n1: *" + re.sub('[  ]+', ' ', sent_ref_tok) + '*'
751
+ assert re.sub('[░▁ ]+', '', recovered2) == re.sub('[▁ ]+', '', sent_pred_tok), \
752
+ '\n2: ' + re.sub('[  ]+', ' ', recovered2) + "\n2: " + re.sub('[  ]+', ' ', sent_pred_tok)
753
+ return alignment, sent_pred_tok
754
+
755
+ def get_pred_from_alignment(self, alignment):
756
+ return re.sub(' +', ' ', ''.join([x[1] if x[1] != '' else '\n' for x in alignment]).replace('\n', ''))
757
+
758
+ def get_char_idx_align(self, sent_ref, sent_pred, alignment):
759
+ covered_ref, covered_pred = 0, 0
760
+ ref_chars = [i for i, character in enumerate(sent_ref)] + [len(sent_ref)] #
761
+ pred_chars = [i for i, character in enumerate(sent_pred)] + [len(sent_pred)]# if character not in [' ']]
762
+ align_idx = []
763
+
764
+ for a_ref, a_pred, _ in alignment:
765
+ if a_ref == '' and a_pred == '':
766
+ covered_pred += 1
767
+ continue
768
+ a_pred = re.sub(' +', ' ', a_pred).strip()
769
+ span_ref = [ref_chars[covered_ref], ref_chars[covered_ref + len(a_ref)]]
770
+ covered_ref += len(a_ref)
771
+ span_pred = [pred_chars[covered_pred], pred_chars[covered_pred + len(a_pred)]]
772
+ covered_pred += len(a_pred)
773
+ align_idx.append((span_ref, span_pred))
774
+
775
+ return align_idx
776
+
777
+ def normalise_text(list_sents, batch_size=32, beam_size=5, cache_file=None, no_postproc_lex=False, no_post_clean=False):
778
+ tokeniser = AutoTokenizer.from_pretrained("rbawden/modern_french_normalisation")
779
+ model = AutoModelForSeq2SeqLM.from_pretrained("rbawden/modern_french_normalisation")
780
+ normalisation_pipeline = NormalisationPipeline(model=model,
781
+ tokenizer=tokeniser,
782
+ batch_size=batch_size,
783
+ beam_size=beam_size,
784
+ cache_file=cache_file,
785
+ no_postproc_lex=no_postproc_lex,
786
+ no_post_clean=no_post_clean)
787
+ normalised_outputs = normalisation_pipeline(list_sents)
788
+ return normalised_outputs
789
+
790
+ def normalise_from_stdin(batch_size=32, beam_size=5, cache_file=None, no_postproc_lex=False, no_post_clean=False):
791
+ tokeniser = AutoTokenizer.from_pretrained("rbawden/modern_french_normalisation")
792
+ model = AutoModelForSeq2SeqLM.from_pretrained("rbawden/modern_french_normalisation")
793
+ normalisation_pipeline = NormalisationPipeline(model=model,
794
+ tokenizer=tokeniser,
795
+ batch_size=batch_size,
796
+ beam_size=beam_size,
797
+ cache_file=cache_file,
798
+ no_postproc_lex=no_postproc_lex,
799
+ no_post_clean=no_post_clean
800
+ )
801
+ list_sents = []
802
+ ex = ["7. Qu'vne force plus grande de ſi peu que l'on voudra, que celle auec laquelle l'eau de la hauteur de trente & vn pieds, tend à couler en bas, ſuffit pour faire admettre ce vuide apparent, & meſme ſi grãd que l'on voudra, c'eſt à dire, pour faire des-vnir les corps d'vn ſi grand interualle que l'on voudra, pourueu qu'il n'y ait point d'autre obſtacle à leur ſeparation ny à leur eſloignement, que l'horreur que la Nature a pour ce vuide apparent."]
803
+ for sent in sys.stdin:
804
+ list_sents.append(sent.strip())
805
+ normalised_outputs = normalisation_pipeline(list_sents)
806
+ for s, sent in enumerate(normalised_outputs):
807
+ alignment=sent['alignment']
808
+
809
+ print(sent['text'])
810
+ # checking that the alignment makes sense
811
+ #for b, a in alignment:
812
+ # print('input: ' + ''.join([list_sents[s][x] for x in range(b[0], max(len(b), b[1]))]) + '')
813
+ # print('pred: ' + ''.join([sent['text'][x] for x in range(a[0], max(len(a), a[1]))]) + '')
814
+
815
+ return normalised_outputs
816
+
817
+
818
+ PIPELINE_REGISTRY.register_pipeline(
819
+ "modern-french-normalisation",
820
+ pipeline_class=NormalisationPipeline,
821
+ pt_model=AutoModelForSeq2SeqLM,
822
+ default={"pt": ("rbawden/modern_french_normalisation", "main")},
823
+ type="text",
824
+ )
825
+
826
+ if __name__ == '__main__':
827
+ import argparse
828
+ parser = argparse.ArgumentParser()
829
+ parser.add_argument('-k', '--batch_size', type=int, default=32, help='Set the batch size for decoding')
830
+ parser.add_argument('-b', '--beam_size', type=int, default=5, help='Set the beam size for decoding')
831
+ parser.add_argument('-i', '--input_file', type=str, default=None, help='Input file. If None, read from STDIN')
832
+ parser.add_argument('-c', '--cache_lexicon', type=str, default=None, help='Path to cache the lexicon file to speed up loading')
833
+ parser.add_argument('-n', '--no_postproc_lex', default=False, action='store_true', help='Deactivate postprocessing to speed up normalisation, but this may degrade the output')
834
+ parser.add_argument('-m', '--no_post_clean', default=False, action='store_true', help='Deactivate the post-generation cleaning of the output to speed up normalisation, but this may degrade the output')
835
+
836
+ args = parser.parse_args()
837
+
838
+ if args.input_file is None:
839
+ normalise_from_stdin(batch_size=args.batch_size,
840
+ beam_size=args.beam_size,
841
+ cache_file=args.cache_lexicon,
842
+ no_postproc_lex=args.no_postproc_lex,
843
+ no_post_clean=args.no_post_clean)
844
+ else:
845
+ list_sents = []
846
+ with open(args.input_file) as fp:
847
+ for line in fp:
848
+ list_sents.append(line.strip())
849
+ output_sents = normalise_text(list_sents,
850
+ batch_size=args.batch_size,
851
+ beam_size=args.beam_size,
852
+ cache_file=args.cache_lexicon,
853
+ no_postproc_lex=args.no_postproc_lex,
854
+ no_post_clean=args.no_post_clean)
855
+ for output_sent in output_sents:
856
+ print(output_sent['text'])
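
For reference, a usage sketch of the entry points defined above, assuming the file is importable as pipeline and that the model, tokeniser and lexicon downloads succeed; the script can also be run directly on a file or on STDIN:

# Sketch: programmatic use of normalise_text() from pipeline.py.
from pipeline import normalise_text

sents = ["Qu'vne force plus grande de ſi peu que l'on voudra"]
outputs = normalise_text(sents, batch_size=32, beam_size=5)
for out in outputs:
    print(out["text"])       # normalised sentence
    print(out["alignment"])  # character spans aligning input to output

# Command-line use (hypothetical file names):
#   python pipeline.py -i input.txt -c lexicon.pickle
#   cat input.txt | python pipeline.py -k 32 -b 5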
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:703a6fe3ab4f7fe7682f887a5a8c4f0c4bc0abbdc93af2555084c99baccf72ec
3
- size 25266477
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:777e3ebf4be88372d6fa982cdff430b06d61461574236c7a213a37d70bd47085
3
+ size 25265973
special_tokens_map.json CHANGED
@@ -1 +1,6 @@
1
- {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}
1
+ {
2
+ "bos_token": "<s>",
3
+ "eos_token": "</s>",
4
+ "pad_token": "<pad>",
5
+ "unk_token": "<unk>"
6
+ }
tokenizer.json CHANGED
@@ -2,7 +2,44 @@
2
  "version": "1.0",
3
  "truncation": null,
4
  "padding": null,
5
- "added_tokens": [],
6
  "normalizer": {
7
  "type": "Sequence",
8
  "normalizers": [
2
  "version": "1.0",
3
  "truncation": null,
4
  "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "<s>",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "<pad>",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "</s>",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ },
33
+ {
34
+ "id": 3,
35
+ "content": "<unk>",
36
+ "single_word": false,
37
+ "lstrip": false,
38
+ "rstrip": false,
39
+ "normalized": false,
40
+ "special": true
41
+ }
42
+ ],
43
  "normalizer": {
44
  "type": "Sequence",
45
  "normalizers": [
tokenizer_config.json CHANGED
@@ -1 +1,9 @@
1
- {"unk_token": "<unk>", "eos_token": "</s>", "bos_token": "<s>", "pad_token": "<pad>", "tokenizer_class": "PreTrainedTokenizerFast"}
1
+ {
2
+ "bos_token": "<s>",
3
+ "eos_token": "</s>",
4
+ "name_or_path": "rbawden/modern_french_normalisation",
5
+ "pad_token": "<pad>",
6
+ "special_tokens_map_file": "/home/rbawden/.cache/huggingface/transformers/b256f782c7622ee7cd8f990f24154fee35ec73f5b93466b241d479575da80255.9d6cd81ef646692fb1c169a880161ea1cb95f49694f220aced9b704b457e51dd",
7
+ "tokenizer_class": "PreTrainedTokenizerFast",
8
+ "unk_token": "<unk>"
9
+ }
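
As a quick sanity check, the special tokens now listed explicitly in tokenizer.json (ids 0-3) should line up with the bos/pad/eos/unk ids declared in config.json; a small verification sketch:

# Sketch: check that the tokenizer's special tokens map to the ids
# declared in config.json (bos=0, pad=1, eos=2, unk=3).
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("rbawden/modern_french_normalisation")
for name in ("bos_token", "pad_token", "eos_token", "unk_token"):
    token = getattr(tok, name)
    print(name, token, tok.convert_tokens_to_ids(token))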