mahrnoud committed on
Commit
6825cf2
·
1 Parent(s): f832d2f

Update Tokenizer

Browse files
special_tokens_map.json CHANGED
@@ -1,7 +1,8 @@
1
  {
2
  "additional_special_tokens": [
3
  "[LANG_EN]",
4
- "[LANG_AR]"
 
5
  ],
6
  "cls_token": "[CLS]",
7
  "mask_token": "[MASK]",
 
1
  {
2
  "additional_special_tokens": [
3
  "[LANG_EN]",
4
+ "[LANG_AR]",
5
+ "[LANG_AR_EG]"
6
  ],
7
  "cls_token": "[CLS]",
8
  "mask_token": "[MASK]",
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c46dde2c5e1933f0e68ad13b9d6adbcabc2d07d6b35b137263b6fa688e4ff0b0
3
- size 20618844
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae919b28537a24ab14728cdb016a2c3b0bdda997cb9e53779b168ab5500caf89
3
+ size 14975662
tokenizer_config.json CHANGED
@@ -55,6 +55,14 @@
55
  "rstrip": false,
56
  "single_word": false,
57
  "special": true
 
 
 
 
 
 
 
 
58
  }
59
  },
60
  "additional_special_tokens": [
@@ -64,11 +72,9 @@
64
  "clean_up_tokenization_spaces": true,
65
  "cls_token": "[CLS]",
66
  "mask_token": "[MASK]",
67
- "model_max_length": 1024,
68
  "pad_token": "[PAD]",
69
- "padding_side": "right",
70
  "sep_token": "[SEP]",
71
  "tokenizer_class": "PreTrainedTokenizerFast",
72
- "truncation_side": "right",
73
  "unk_token": "[UNK]"
74
  }
 
55
  "rstrip": false,
56
  "single_word": false,
57
  "special": true
58
+ },
59
+ "7": {
60
+ "content": "[LANG_AR_EG]",
61
+ "lstrip": false,
62
+ "normalized": false,
63
+ "rstrip": false,
64
+ "single_word": false,
65
+ "special": true
66
  }
67
  },
68
  "additional_special_tokens": [
 
72
  "clean_up_tokenization_spaces": true,
73
  "cls_token": "[CLS]",
74
  "mask_token": "[MASK]",
75
+ "model_max_length": 1000000000000000019884624838656,
76
  "pad_token": "[PAD]",
 
77
  "sep_token": "[SEP]",
78
  "tokenizer_class": "PreTrainedTokenizerFast",
 
79
  "unk_token": "[UNK]"
80
  }