lchakkei committed on
Commit
3fd5d53
·
verified ·
1 Parent(s): eaee752

Upload tokenizer

Browse files
added_tokens.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "<|im_end|>": 32000,
3
+ "<|im_start|>": 32001
4
+ }
special_tokens_map.json CHANGED
@@ -1,24 +1,24 @@
1
  {
2
  "bos_token": {
3
  "content": "<s>",
4
- "lstrip": false,
5
  "normalized": false,
6
- "rstrip": false,
7
  "single_word": false
8
  },
9
  "eos_token": {
10
- "content": "</s>",
11
  "lstrip": false,
12
  "normalized": false,
13
  "rstrip": false,
14
  "single_word": false
15
  },
16
- "pad_token": "</s>",
17
  "unk_token": {
18
  "content": "<unk>",
19
- "lstrip": false,
20
  "normalized": false,
21
- "rstrip": false,
22
  "single_word": false
23
  }
24
  }
 
1
  {
2
  "bos_token": {
3
  "content": "<s>",
4
+ "lstrip": true,
5
  "normalized": false,
6
+ "rstrip": true,
7
  "single_word": false
8
  },
9
  "eos_token": {
10
+ "content": "<|im_end|>",
11
  "lstrip": false,
12
  "normalized": false,
13
  "rstrip": false,
14
  "single_word": false
15
  },
16
+ "pad_token": "<|im_end|>",
17
  "unk_token": {
18
  "content": "<unk>",
19
+ "lstrip": true,
20
  "normalized": false,
21
+ "rstrip": true,
22
  "single_word": false
23
  }
24
  }
tokenizer.json CHANGED
@@ -12,8 +12,8 @@
12
  "id": 0,
13
  "content": "<unk>",
14
  "single_word": false,
15
- "lstrip": false,
16
- "rstrip": false,
17
  "normalized": false,
18
  "special": true
19
  },
@@ -21,8 +21,8 @@
21
  "id": 1,
22
  "content": "<s>",
23
  "single_word": false,
24
- "lstrip": false,
25
- "rstrip": false,
26
  "normalized": false,
27
  "special": true
28
  },
@@ -34,6 +34,24 @@
34
  "rstrip": false,
35
  "normalized": false,
36
  "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  }
38
  ],
39
  "normalizer": {
@@ -70,7 +88,7 @@
70
  },
71
  {
72
  "SpecialToken": {
73
- "id": "</s>",
74
  "type_id": 0
75
  }
76
  }
@@ -90,7 +108,7 @@
90
  },
91
  {
92
  "SpecialToken": {
93
- "id": "</s>",
94
  "type_id": 0
95
  }
96
  },
@@ -108,28 +126,28 @@
108
  },
109
  {
110
  "SpecialToken": {
111
- "id": "</s>",
112
  "type_id": 1
113
  }
114
  }
115
  ],
116
  "special_tokens": {
117
- "</s>": {
118
- "id": "</s>",
119
  "ids": [
120
- 2
121
  ],
122
  "tokens": [
123
- "</s>"
124
  ]
125
  },
126
- "<s>": {
127
- "id": "<s>",
128
  "ids": [
129
- 1
130
  ],
131
  "tokens": [
132
- "<s>"
133
  ]
134
  }
135
  }
 
12
  "id": 0,
13
  "content": "<unk>",
14
  "single_word": false,
15
+ "lstrip": true,
16
+ "rstrip": true,
17
  "normalized": false,
18
  "special": true
19
  },
 
21
  "id": 1,
22
  "content": "<s>",
23
  "single_word": false,
24
+ "lstrip": true,
25
+ "rstrip": true,
26
  "normalized": false,
27
  "special": true
28
  },
 
34
  "rstrip": false,
35
  "normalized": false,
36
  "special": true
37
+ },
38
+ {
39
+ "id": 32000,
40
+ "content": "<|im_end|>",
41
+ "single_word": false,
42
+ "lstrip": false,
43
+ "rstrip": false,
44
+ "normalized": false,
45
+ "special": true
46
+ },
47
+ {
48
+ "id": 32001,
49
+ "content": "<|im_start|>",
50
+ "single_word": false,
51
+ "lstrip": true,
52
+ "rstrip": true,
53
+ "normalized": false,
54
+ "special": true
55
  }
56
  ],
57
  "normalizer": {
 
88
  },
89
  {
90
  "SpecialToken": {
91
+ "id": "<|im_end|>",
92
  "type_id": 0
93
  }
94
  }
 
108
  },
109
  {
110
  "SpecialToken": {
111
+ "id": "<|im_end|>",
112
  "type_id": 0
113
  }
114
  },
 
126
  },
127
  {
128
  "SpecialToken": {
129
+ "id": "<|im_end|>",
130
  "type_id": 1
131
  }
132
  }
133
  ],
134
  "special_tokens": {
135
+ "<s>": {
136
+ "id": "<s>",
137
  "ids": [
138
+ 1
139
  ],
140
  "tokens": [
141
+ "<s>"
142
  ]
143
  },
144
+ "<|im_end|>": {
145
+ "id": "<|im_end|>",
146
  "ids": [
147
+ 32000
148
  ],
149
  "tokens": [
150
+ "<|im_end|>"
151
  ]
152
  }
153
  }
tokenizer_config.json CHANGED
@@ -4,17 +4,17 @@
4
  "added_tokens_decoder": {
5
  "0": {
6
  "content": "<unk>",
7
- "lstrip": false,
8
  "normalized": false,
9
- "rstrip": false,
10
  "single_word": false,
11
  "special": true
12
  },
13
  "1": {
14
  "content": "<s>",
15
- "lstrip": false,
16
  "normalized": false,
17
- "rstrip": false,
18
  "single_word": false,
19
  "special": true
20
  },
@@ -25,17 +25,33 @@
25
  "rstrip": false,
26
  "single_word": false,
27
  "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  }
29
  },
30
  "additional_special_tokens": [],
31
  "bos_token": "<s>",
32
- "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
33
  "clean_up_tokenization_spaces": false,
34
- "eos_token": "</s>",
35
  "legacy": true,
36
  "max_length": 1024,
37
  "model_max_length": 1000000000000000019884624838656,
38
- "pad_token": "</s>",
39
  "padding_side": "left",
40
  "sp_model_kwargs": {},
41
  "spaces_between_special_tokens": false,
@@ -43,6 +59,8 @@
43
  "tokenizer_class": "LlamaTokenizer",
44
  "truncation_side": "right",
45
  "truncation_strategy": "longest_first",
 
46
  "unk_token": "<unk>",
47
- "use_default_system_prompt": false
 
48
  }
 
4
  "added_tokens_decoder": {
5
  "0": {
6
  "content": "<unk>",
7
+ "lstrip": true,
8
  "normalized": false,
9
+ "rstrip": true,
10
  "single_word": false,
11
  "special": true
12
  },
13
  "1": {
14
  "content": "<s>",
15
+ "lstrip": true,
16
  "normalized": false,
17
+ "rstrip": true,
18
  "single_word": false,
19
  "special": true
20
  },
 
25
  "rstrip": false,
26
  "single_word": false,
27
  "special": true
28
+ },
29
+ "32000": {
30
+ "content": "<|im_end|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "32001": {
38
+ "content": "<|im_start|>",
39
+ "lstrip": true,
40
+ "normalized": false,
41
+ "rstrip": true,
42
+ "single_word": false,
43
+ "special": true
44
  }
45
  },
46
  "additional_special_tokens": [],
47
  "bos_token": "<s>",
48
+ "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
49
  "clean_up_tokenization_spaces": false,
50
+ "eos_token": "<|im_end|>",
51
  "legacy": true,
52
  "max_length": 1024,
53
  "model_max_length": 1000000000000000019884624838656,
54
+ "pad_token": "<|im_end|>",
55
  "padding_side": "left",
56
  "sp_model_kwargs": {},
57
  "spaces_between_special_tokens": false,
 
59
  "tokenizer_class": "LlamaTokenizer",
60
  "truncation_side": "right",
61
  "truncation_strategy": "longest_first",
62
+ "trust_remote_code": false,
63
  "unk_token": "<unk>",
64
+ "use_default_system_prompt": true,
65
+ "use_fast": true
66
  }