srujan00123 committed on
Commit
78f1089
1 Parent(s): 218b839

Upload tokenizer

Browse files
Files changed (3) hide show
  1. added_tokens.json +6 -0
  2. tokenizer_config.json +35 -0
  3. vocab.json +46 -37
added_tokens.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "</s>": 49,
3
+ "<s>": 48,
4
+ "[PAD]": 47,
5
+ "[UNK]": 46
6
+ }
tokenizer_config.json CHANGED
@@ -1,4 +1,39 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "bos_token": "<s>",
3
  "clean_up_tokenization_spaces": true,
4
  "do_lower_case": false,
 
1
  {
2
+ "added_tokens_decoder": {
3
+ "46": {
4
+ "content": "[UNK]",
5
+ "lstrip": true,
6
+ "normalized": false,
7
+ "rstrip": true,
8
+ "single_word": false,
9
+ "special": false
10
+ },
11
+ "47": {
12
+ "content": "[PAD]",
13
+ "lstrip": true,
14
+ "normalized": false,
15
+ "rstrip": true,
16
+ "single_word": false,
17
+ "special": false
18
+ },
19
+ "48": {
20
+ "content": "<s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "49": {
28
+ "content": "</s>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ }
35
+ },
36
+ "additional_special_tokens": [],
37
  "bos_token": "<s>",
38
  "clean_up_tokenization_spaces": true,
39
  "do_lower_case": false,
vocab.json CHANGED
@@ -1,41 +1,50 @@
1
  {
2
- "&": 23,
3
- "0": 34,
4
- "1": 32,
5
- "2": 33,
6
- "3": 36,
7
- "5": 18,
8
- "8": 22,
9
- "9": 19,
10
- "[": 29,
11
- "[PAD]": 38,
12
- "[UNK]": 37,
13
- "]": 31,
14
- "a": 24,
15
- "b": 15,
16
- "c": 14,
17
- "d": 8,
18
- "e": 10,
19
- "f": 30,
20
- "g": 35,
21
- "h": 27,
22
- "i": 3,
 
 
 
 
 
 
 
 
 
23
  "j": 13,
24
- "k": 0,
25
  "l": 12,
26
- "m": 1,
27
- "n": 11,
28
- "o": 4,
29
- "p": 7,
30
- "q": 5,
31
- "r": 16,
32
- "s": 17,
33
- "t": 28,
34
- "u": 20,
35
- "v": 6,
36
- "w": 26,
37
- "x": 25,
38
- "y": 9,
39
- "z": 21,
40
- "|": 2
41
  }
 
1
  {
2
+ "%": 14,
3
+ "'": 16,
4
+ "(": 15,
5
+ ")": 3,
6
+ "+": 25,
7
+ "/": 31,
8
+ "0": 20,
9
+ "1": 11,
10
+ "2": 19,
11
+ "3": 26,
12
+ "4": 27,
13
+ "5": 2,
14
+ "6": 32,
15
+ "7": 6,
16
+ "8": 35,
17
+ "9": 29,
18
+ "[": 30,
19
+ "[PAD]": 47,
20
+ "[UNK]": 46,
21
+ "\\": 17,
22
+ "]": 22,
23
+ "a": 43,
24
+ "b": 40,
25
+ "c": 38,
26
+ "d": 45,
27
+ "e": 23,
28
+ "f": 5,
29
+ "g": 42,
30
+ "h": 39,
31
+ "i": 34,
32
  "j": 13,
33
+ "k": 10,
34
  "l": 12,
35
+ "m": 37,
36
+ "n": 18,
37
+ "o": 8,
38
+ "p": 44,
39
+ "q": 24,
40
+ "r": 41,
41
+ "s": 28,
42
+ "t": 33,
43
+ "u": 1,
44
+ "v": 4,
45
+ "w": 21,
46
+ "x": 9,
47
+ "y": 7,
48
+ "z": 36,
49
+ "|": 0
50
  }