armandnlp committed
Commit 2aa787c · 1 Parent(s): beb3420

Upload tokenizer

added_tokens.json ADDED
@@ -0,0 +1,40 @@
+ {
+   "<|action|>": 50259,
+   "<|belief|>": 50257,
+   "<|context|>": 50263,
+   "<|endofaction|>": 50260,
+   "<|endofbelief|>": 50258,
+   "<|endofcontext|>": 50264,
+   "<|endofresponse|>": 50262,
+   "<|endofuseremotion|>": 50268,
+   "<|response|>": 50261,
+   "<|system|>": 50266,
+   "<|useremotion|>": 50267,
+   "<|user|>": 50265,
+   "[address]": 50269,
+   "[area]": 50270,
+   "[arriveby]": 50271,
+   "[bookday]": 50272,
+   "[bookpeople]": 50273,
+   "[bookstay]": 50274,
+   "[booktime]": 50275,
+   "[choice]": 50276,
+   "[day]": 50277,
+   "[department]": 50278,
+   "[departure]": 50279,
+   "[destination]": 50280,
+   "[duration]": 50281,
+   "[entrancefee]": 50282,
+   "[food]": 50283,
+   "[leaveat]": 50284,
+   "[name]": 50285,
+   "[openhours]": 50286,
+   "[phone]": 50287,
+   "[postcode]": 50288,
+   "[price]": 50289,
+   "[pricerange]": 50290,
+   "[ref]": 50291,
+   "[stars]": 50292,
+   "[trainid]": 50293,
+   "[type]": 50294
+ }
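
These IDs pick up immediately after GPT-2's base vocabulary (0–50256, where 50256 is `<|endoftext|>`), so the 38 added tokens run from `<|belief|>` at 50257 up to `[type]` at 50294. A minimal sketch for checking this once the repository's tokenizer files are available (the path below is a placeholder, not the actual Hub id):

```python
from transformers import AutoTokenizer

# Placeholder: substitute the Hub id or local directory of this repository.
tokenizer = AutoTokenizer.from_pretrained("path/to/this/tokenizer")

print(tokenizer.vocab_size)                           # 50257: GPT-2's base vocabulary
print(len(tokenizer))                                 # 50295: base vocab + 38 added tokens
print(tokenizer.convert_tokens_to_ids("<|belief|>"))  # 50257, first added token
print(tokenizer.convert_tokens_to_ids("[type]"))      # 50294, last added token
```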
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,46 @@
+ {
+   "additional_special_tokens": [
+     "<|belief|>",
+     "<|endofbelief|>",
+     "<|action|>",
+     "<|endofaction|>",
+     "<|response|>",
+     "<|endofresponse|>",
+     "<|context|>",
+     "<|endofcontext|>",
+     "<|user|>",
+     "<|system|>",
+     "<|useremotion|>",
+     "<|endofuseremotion|>",
+     "[address]",
+     "[area]",
+     "[arriveby]",
+     "[bookday]",
+     "[bookpeople]",
+     "[bookstay]",
+     "[booktime]",
+     "[choice]",
+     "[day]",
+     "[department]",
+     "[departure]",
+     "[destination]",
+     "[duration]",
+     "[entrancefee]",
+     "[food]",
+     "[leaveat]",
+     "[name]",
+     "[openhours]",
+     "[phone]",
+     "[postcode]",
+     "[price]",
+     "[pricerange]",
+     "[ref]",
+     "[stars]",
+     "[trainid]",
+     "[type]"
+   ],
+   "bos_token": "<|endoftext|>",
+   "eos_token": "<|endoftext|>",
+   "pad_token": "<|endoftext|>",
+   "unk_token": "<|endoftext|>"
+ }
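
This map lists the same 38 markers as `added_tokens.json` and reuses GPT-2's `<|endoftext|>` as BOS, EOS, PAD, and UNK. A file like this is typically produced by calling `add_special_tokens` on a stock GPT-2 tokenizer before fine-tuning and then saving it; the sketch below is an assumption about how that could have been done, not something shown in this commit:

```python
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

num_added = tokenizer.add_special_tokens({
    "pad_token": "<|endoftext|>",   # already in the vocab, so it adds no new id
    "additional_special_tokens": [
        "<|belief|>", "<|endofbelief|>", "<|action|>", "<|endofaction|>",
        "<|response|>", "<|endofresponse|>", "<|context|>", "<|endofcontext|>",
        "<|user|>", "<|system|>", "<|useremotion|>", "<|endofuseremotion|>",
        "[address]", "[area]", "[arriveby]", "[bookday]", "[bookpeople]",
        "[bookstay]", "[booktime]", "[choice]", "[day]", "[department]",
        "[departure]", "[destination]", "[duration]", "[entrancefee]", "[food]",
        "[leaveat]", "[name]", "[openhours]", "[phone]", "[postcode]", "[price]",
        "[pricerange]", "[ref]", "[stars]", "[trainid]", "[type]",
    ],
})
print(num_added)  # 38 new ids appended after GPT-2's base vocabulary

# The fine-tuned model's embedding matrix must grow to match, e.g.:
# model.resize_token_embeddings(len(tokenizer))

tokenizer.save_pretrained("output_dir")  # writes special_tokens_map.json and friends
```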
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,10 @@
+ {
+   "add_prefix_space": false,
+   "bos_token": "<|endoftext|>",
+   "clean_up_tokenization_spaces": true,
+   "eos_token": "<|endoftext|>",
+   "model_max_length": 1024,
+   "special_tokens_map_file": "checkpoint-5100/special_tokens_map.json",
+   "tokenizer_class": "GPT2Tokenizer",
+   "unk_token": "<|endoftext|>"
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff
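
`tokenizer_config.json` pins the class to `GPT2Tokenizer`, caps `model_max_length` at 1024 (GPT-2's context window), and records that the special-tokens map was carried over from `checkpoint-5100`. With `vocab.json`, `merges.txt`, and the files above all present, loading the tokenizer and round-tripping a dialogue-style string should work roughly as follows (the path is again a placeholder):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("path/to/this/tokenizer")  # placeholder path

text = "<|context|> <|user|> i am looking for a cheap restaurant <|endofcontext|> <|belief|>"
ids = tokenizer(text).input_ids
print(ids)                    # each special marker encodes as a single id >= 50257
print(tokenizer.decode(ids))  # decodes back to the original markers
```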