eliebak HF Staff committed on
Commit
fddfcaa
·
1 Parent(s): c969ba6

commit of fast smoltokenizer

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
added_tokens.json DELETED
@@ -1,10 +0,0 @@
1
- {
2
- "<file_sep>": 264,
3
- "<fim_middle>": 260,
4
- "<fim_pad>": 262,
5
- "<fim_prefix>": 259,
6
- "<fim_suffix>": 261,
7
- "<repo_name>": 263,
8
- "<|im_end|>": 265,
9
- "<|im_start|>": 266
10
- }
 
 
 
 
 
 
 
 
 
 
 
special_tokens_map.json CHANGED
@@ -5,30 +5,7 @@
5
  "<fim_middle>",
6
  "<fim_suffix>",
7
  "<fim_pad>",
8
- "<repo_name>",
9
- "<file_sep>",
10
  "<|im_end|>",
11
  "<|im_start|>"
12
- ],
13
- "eos_token": {
14
- "content": "<|endoftext|>",
15
- "lstrip": true,
16
- "normalized": true,
17
- "rstrip": true,
18
- "single_word": false
19
- },
20
- "pad_token": {
21
- "content": "<pad>",
22
- "lstrip": true,
23
- "normalized": true,
24
- "rstrip": true,
25
- "single_word": false
26
- },
27
- "unk_token": {
28
- "content": "<unk>",
29
- "lstrip": true,
30
- "normalized": true,
31
- "rstrip": true,
32
- "single_word": false
33
- }
34
  }
 
5
  "<fim_middle>",
6
  "<fim_suffix>",
7
  "<fim_pad>",
 
 
8
  "<|im_end|>",
9
  "<|im_start|>"
10
+ ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a5078c05e94e5980118e913ed49426e7b1cdf93dcc6ccefa7967fafd2f7e255e
3
+ size 6166
tokenizer_config.json CHANGED
@@ -1,31 +1,7 @@
1
  {
2
  "added_tokens_decoder": {
3
- "0": {
4
- "content": "<pad>",
5
- "lstrip": true,
6
- "normalized": true,
7
- "rstrip": true,
8
- "single_word": false,
9
- "special": true
10
- },
11
- "1": {
12
- "content": "<|endoftext|>",
13
- "lstrip": true,
14
- "normalized": true,
15
- "rstrip": true,
16
- "single_word": false,
17
- "special": true
18
- },
19
- "2": {
20
- "content": "<unk>",
21
- "lstrip": true,
22
- "normalized": true,
23
- "rstrip": true,
24
- "single_word": false,
25
- "special": true
26
- },
27
  "259": {
28
- "content": "<fim_prefix>",
29
  "lstrip": false,
30
  "normalized": false,
31
  "rstrip": false,
@@ -33,7 +9,7 @@
33
  "special": true
34
  },
35
  "260": {
36
- "content": "<fim_middle>",
37
  "lstrip": false,
38
  "normalized": false,
39
  "rstrip": false,
@@ -41,7 +17,7 @@
41
  "special": true
42
  },
43
  "261": {
44
- "content": "<fim_suffix>",
45
  "lstrip": false,
46
  "normalized": false,
47
  "rstrip": false,
@@ -49,7 +25,7 @@
49
  "special": true
50
  },
51
  "262": {
52
- "content": "<fim_pad>",
53
  "lstrip": false,
54
  "normalized": false,
55
  "rstrip": false,
@@ -57,7 +33,7 @@
57
  "special": true
58
  },
59
  "263": {
60
- "content": "<repo_name>",
61
  "lstrip": false,
62
  "normalized": false,
63
  "rstrip": false,
@@ -65,14 +41,6 @@
65
  "special": true
66
  },
67
  "264": {
68
- "content": "<file_sep>",
69
- "lstrip": false,
70
- "normalized": false,
71
- "rstrip": false,
72
- "single_word": false,
73
- "special": true
74
- },
75
- "265": {
76
  "content": "<|im_end|>",
77
  "lstrip": false,
78
  "normalized": false,
@@ -80,7 +48,7 @@
80
  "single_word": false,
81
  "special": true
82
  },
83
- "266": {
84
  "content": "<|im_start|>",
85
  "lstrip": false,
86
  "normalized": false,
@@ -95,16 +63,10 @@
95
  "<fim_middle>",
96
  "<fim_suffix>",
97
  "<fim_pad>",
98
- "<repo_name>",
99
- "<file_sep>",
100
  "<|im_end|>",
101
  "<|im_start|>"
102
  ],
103
  "clean_up_tokenization_spaces": true,
104
- "eos_token": "<|endoftext|>",
105
- "extra_ids": 0,
106
  "model_max_length": 1000000000000000019884624838656,
107
- "pad_token": "<pad>",
108
- "tokenizer_class": "ByT5Tokenizer",
109
- "unk_token": "<unk>"
110
  }
 
1
  {
2
  "added_tokens_decoder": {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  "259": {
4
+ "content": "<|endoftext|>",
5
  "lstrip": false,
6
  "normalized": false,
7
  "rstrip": false,
 
9
  "special": true
10
  },
11
  "260": {
12
+ "content": "<fim_prefix>",
13
  "lstrip": false,
14
  "normalized": false,
15
  "rstrip": false,
 
17
  "special": true
18
  },
19
  "261": {
20
+ "content": "<fim_middle>",
21
  "lstrip": false,
22
  "normalized": false,
23
  "rstrip": false,
 
25
  "special": true
26
  },
27
  "262": {
28
+ "content": "<fim_suffix>",
29
  "lstrip": false,
30
  "normalized": false,
31
  "rstrip": false,
 
33
  "special": true
34
  },
35
  "263": {
36
+ "content": "<fim_pad>",
37
  "lstrip": false,
38
  "normalized": false,
39
  "rstrip": false,
 
41
  "special": true
42
  },
43
  "264": {
 
 
 
 
 
 
 
 
44
  "content": "<|im_end|>",
45
  "lstrip": false,
46
  "normalized": false,
 
48
  "single_word": false,
49
  "special": true
50
  },
51
+ "265": {
52
  "content": "<|im_start|>",
53
  "lstrip": false,
54
  "normalized": false,
 
63
  "<fim_middle>",
64
  "<fim_suffix>",
65
  "<fim_pad>",
 
 
66
  "<|im_end|>",
67
  "<|im_start|>"
68
  ],
69
  "clean_up_tokenization_spaces": true,
 
 
70
  "model_max_length": 1000000000000000019884624838656,
71
+ "tokenizer_class": "PreTrainedTokenizerFast"
 
 
72
  }