miscovery
/

tokenizer

Model card | Files and versions | Community

mahrnoud committed on Apr 4

Commit

6825cf2

·

1 Parent(s): f832d2f

Update Tokenizer

Files changed (3) hide show

special_tokens_map.json +2 -1
tokenizer.json +2 -2
tokenizer_config.json +9 -3

special_tokens_map.json CHANGED Viewed

@@ -1,7 +1,8 @@
 {
   "additional_special_tokens": [
     "[LANG_EN]",
-    "[LANG_AR]"
   ],
   "cls_token": "[CLS]",
   "mask_token": "[MASK]",

 {
   "additional_special_tokens": [
     "[LANG_EN]",
+    "[LANG_AR]",
+    "[LANG_AR_EG]"
   ],
   "cls_token": "[CLS]",
   "mask_token": "[MASK]",

tokenizer.json CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c46dde2c5e1933f0e68ad13b9d6adbcabc2d07d6b35b137263b6fa688e4ff0b0
-size 20618844

 version https://git-lfs.github.com/spec/v1
+oid sha256:ae919b28537a24ab14728cdb016a2c3b0bdda997cb9e53779b168ab5500caf89
+size 14975662

tokenizer_config.json CHANGED Viewed

@@ -55,6 +55,14 @@
       "rstrip": false,
       "single_word": false,
       "special": true
     }
   },
   "additional_special_tokens": [
@@ -64,11 +72,9 @@
   "clean_up_tokenization_spaces": true,
   "cls_token": "[CLS]",
   "mask_token": "[MASK]",
-  "model_max_length": 1024,
   "pad_token": "[PAD]",
-  "padding_side": "right",
   "sep_token": "[SEP]",
   "tokenizer_class": "PreTrainedTokenizerFast",
-  "truncation_side": "right",
   "unk_token": "[UNK]"
 }

       "rstrip": false,
       "single_word": false,
       "special": true
+    },
+    "7": {
+      "content": "[LANG_AR_EG]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
     }
   },
   "additional_special_tokens": [
   "clean_up_tokenization_spaces": true,
   "cls_token": "[CLS]",
   "mask_token": "[MASK]",
+  "model_max_length": 1000000000000000019884624838656,
   "pad_token": "[PAD]",
   "sep_token": "[SEP]",
   "tokenizer_class": "PreTrainedTokenizerFast",
   "unk_token": "[UNK]"
 }