miscovery committed on
Commit
f832d2f
·
verified ·
1 Parent(s): cab0cd2

Update Tokenizer

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
special_tokens_map.json CHANGED
@@ -1,4 +1,8 @@
1
  {
 
 
 
 
2
  "cls_token": "[CLS]",
3
  "mask_token": "[MASK]",
4
  "pad_token": "[PAD]",
 
1
  {
2
+ "additional_special_tokens": [
3
+ "[LANG_EN]",
4
+ "[LANG_AR]"
5
+ ],
6
  "cls_token": "[CLS]",
7
  "mask_token": "[MASK]",
8
  "pad_token": "[PAD]",
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -39,12 +39,32 @@
39
  "rstrip": false,
40
  "single_word": false,
41
  "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  }
43
  },
 
 
 
 
44
  "clean_up_tokenization_spaces": true,
45
  "cls_token": "[CLS]",
46
  "mask_token": "[MASK]",
47
- "model_max_length": 512,
48
  "pad_token": "[PAD]",
49
  "padding_side": "right",
50
  "sep_token": "[SEP]",
 
39
  "rstrip": false,
40
  "single_word": false,
41
  "special": true
42
+ },
43
+ "5": {
44
+ "content": "[LANG_EN]",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ },
51
+ "6": {
52
+ "content": "[LANG_AR]",
53
+ "lstrip": false,
54
+ "normalized": false,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": true
58
  }
59
  },
60
+ "additional_special_tokens": [
61
+ "[LANG_EN]",
62
+ "[LANG_AR]"
63
+ ],
64
  "clean_up_tokenization_spaces": true,
65
  "cls_token": "[CLS]",
66
  "mask_token": "[MASK]",
67
+ "model_max_length": 1024,
68
  "pad_token": "[PAD]",
69
  "padding_side": "right",
70
  "sep_token": "[SEP]",