Update Tokenizer
Adds the Egyptian Arabic tag [LANG_AR_EG] as an additional special token and refreshes the Git LFS pointer for tokenizer.json.

Files changed:
- special_tokens_map.json (+2 -1)
- tokenizer.json (+2 -2)
- tokenizer_config.json (+9 -3)
special_tokens_map.json (CHANGED)

@@ -1,7 +1,8 @@
 {
   "additional_special_tokens": [
     "[LANG_EN]",
-    "[LANG_AR]"
+    "[LANG_AR]",
+    "[LANG_AR_EG]"
   ],
   "cls_token": "[CLS]",
   "mask_token": "[MASK]",
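The new tag is usable as soon as the updated files are pulled. A minimal sketch via transformers, assuming a placeholder repo id (`your-org/your-model` stands in for the actual repository):

```python
from transformers import AutoTokenizer

# Placeholder repo id: substitute the actual model repository.
tokenizer = AutoTokenizer.from_pretrained("your-org/your-model")

# The language tags are additional special tokens, so each one maps to a
# single id instead of being split into subword pieces.
print(tokenizer.additional_special_tokens)
# expected to include '[LANG_EN]', '[LANG_AR]', '[LANG_AR_EG]'

ids = tokenizer("[LANG_AR_EG] some input text")["input_ids"]
print(tokenizer.convert_ids_to_tokens(ids))
```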
tokenizer.json (CHANGED)

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:ae919b28537a24ab14728cdb016a2c3b0bdda997cb9e53779b168ab5500caf89
+size 14975662
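tokenizer.json is stored through Git LFS, so the commit only rewrites the pointer file; the oid is the SHA-256 of the actual ~15 MB payload. A hedged sketch for checking a locally downloaded copy against that pointer (the local path is an assumption):

```python
import hashlib

# Assumed local path to the LFS-resolved file.
path = "tokenizer.json"

# Hash in 1 MiB chunks so the file never has to fit in memory at once.
digest = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        digest.update(chunk)

# Should match the oid recorded in the LFS pointer above.
print(digest.hexdigest() ==
      "ae919b28537a24ab14728cdb016a2c3b0bdda997cb9e53779b168ab5500caf89")
```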
tokenizer_config.json (CHANGED)

@@ -55,6 +55,14 @@
       "rstrip": false,
       "single_word": false,
       "special": true
+    },
+    "7": {
+      "content": "[LANG_AR_EG]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
     }
   },
   "additional_special_tokens": [
@@ -64,11 +72,9 @@
   "clean_up_tokenization_spaces": true,
   "cls_token": "[CLS]",
   "mask_token": "[MASK]",
-  "model_max_length":
+  "model_max_length": 1000000000000000019884624838656,
   "pad_token": "[PAD]",
-  "padding_side": "right",
   "sep_token": "[SEP]",
   "tokenizer_class": "PreTrainedTokenizerFast",
-  "truncation_side": "right",
   "unk_token": "[UNK]"
 }
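Two details are worth sanity-checking after pulling the update: the new added_tokens_decoder entry pins [LANG_AR_EG] to token id 7, and the new model_max_length equals int(1e30), the transformers sentinel meaning no explicit length limit is configured. The removed padding_side and truncation_side keys simply fall back to the library default, "right" in both cases. A sketch, reusing the placeholder repo id from the earlier example:

```python
from transformers import AutoTokenizer

# Same placeholder repo id as in the earlier sketch.
tokenizer = AutoTokenizer.from_pretrained("your-org/your-model")

# added_tokens_decoder maps id 7 to the new tag in the diff above.
assert tokenizer.convert_tokens_to_ids("[LANG_AR_EG]") == 7

# 1000000000000000019884624838656 == int(1e30): no configured max length.
print(tokenizer.model_max_length == int(1e30))

# padding_side / truncation_side were removed from the config, so they
# fall back to the library defaults, both "right".
print(tokenizer.padding_side, tokenizer.truncation_side)
```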