mlabonne committed on
Commit
df27122
1 Parent(s): 33a62b5

Upload tokenizer

Browse files
Files changed (3) hide show
  1. special_tokens_map.json +21 -19
  2. tokenizer.json +0 -18
  3. tokenizer_config.json +5 -29
special_tokens_map.json CHANGED
@@ -1,21 +1,23 @@
1
  {
2
- "additional_special_tokens": [
3
- {
4
- "content": "<|im_start|>",
5
- "lstrip": false,
6
- "normalized": false,
7
- "rstrip": false,
8
- "single_word": false
9
- },
10
- {
11
- "content": "<|im_end|>",
12
- "lstrip": false,
13
- "normalized": false,
14
- "rstrip": false,
15
- "single_word": false
16
- }
17
- ],
18
- "bos_token": "<|im_start|>",
19
- "eos_token": "<|im_end|>",
20
- "pad_token": "<|im_end|>"
 
 
21
  }
 
1
  {
2
+ "bos_token": {
3
+ "content": "<|begin_of_text|>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|end_of_text|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<|finetune_right_pad_id|>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
  }
tokenizer.json CHANGED
@@ -2306,24 +2306,6 @@
2306
  "rstrip": false,
2307
  "normalized": false,
2308
  "special": true
2309
- },
2310
- {
2311
- "id": 128256,
2312
- "content": "<|im_start|>",
2313
- "single_word": false,
2314
- "lstrip": false,
2315
- "rstrip": false,
2316
- "normalized": false,
2317
- "special": true
2318
- },
2319
- {
2320
- "id": 128257,
2321
- "content": "<|im_end|>",
2322
- "single_word": false,
2323
- "lstrip": false,
2324
- "rstrip": false,
2325
- "normalized": false,
2326
- "special": true
2327
  }
2328
  ],
2329
  "normalizer": null,
 
2306
  "rstrip": false,
2307
  "normalized": false,
2308
  "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2309
  }
2310
  ],
2311
  "normalizer": null,
tokenizer_config.json CHANGED
@@ -2047,41 +2047,17 @@
2047
  "rstrip": false,
2048
  "single_word": false,
2049
  "special": true
2050
- },
2051
- "128256": {
2052
- "content": "<|im_start|>",
2053
- "lstrip": false,
2054
- "normalized": false,
2055
- "rstrip": false,
2056
- "single_word": false,
2057
- "special": true
2058
- },
2059
- "128257": {
2060
- "content": "<|im_end|>",
2061
- "lstrip": false,
2062
- "normalized": false,
2063
- "rstrip": false,
2064
- "single_word": false,
2065
- "special": true
2066
  }
2067
  },
2068
- "additional_special_tokens": [
2069
- "<|im_start|>",
2070
- "<|im_end|>"
2071
- ],
2072
- "bos_token": "<|im_start|>",
2073
- "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
2074
  "clean_up_tokenization_spaces": true,
2075
- "eos_token": "<|im_end|>",
2076
- "max_length": 2048,
2077
  "model_input_names": [
2078
  "input_ids",
2079
  "attention_mask"
2080
  ],
2081
  "model_max_length": 131072,
2082
- "pad_token": "<|im_end|>",
2083
- "stride": 0,
2084
- "tokenizer_class": "PreTrainedTokenizerFast",
2085
- "truncation_side": "right",
2086
- "truncation_strategy": "longest_first"
2087
  }
 
2047
  "rstrip": false,
2048
  "single_word": false,
2049
  "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2050
  }
2051
  },
2052
+ "bos_token": "<|begin_of_text|>",
 
 
 
 
 
2053
  "clean_up_tokenization_spaces": true,
2054
+ "eos_token": "<|end_of_text|>",
 
2055
  "model_input_names": [
2056
  "input_ids",
2057
  "attention_mask"
2058
  ],
2059
  "model_max_length": 131072,
2060
+ "pad_token": "<|finetune_right_pad_id|>",
2061
+ "padding_side": "left",
2062
+ "tokenizer_class": "PreTrainedTokenizerFast"
 
 
2063
  }