rohitp1 committed on
Commit
2806ec0
·
1 Parent(s): b606154

Upload tokenizer

Browse files
Files changed (4) hide show
  1. added_tokens.json +2 -106
  2. special_tokens_map.json +46 -110
  3. tokenizer_config.json +10 -32
  4. vocab.json +0 -0
added_tokens.json CHANGED
@@ -1,108 +1,4 @@
1
  {
2
- "<|af|>": 50326,
3
- "<|am|>": 50333,
4
- "<|ar|>": 50271,
5
- "<|as|>": 50349,
6
- "<|az|>": 50303,
7
- "<|ba|>": 50354,
8
- "<|be|>": 50329,
9
- "<|bg|>": 50291,
10
- "<|bn|>": 50301,
11
- "<|bo|>": 50346,
12
- "<|br|>": 50308,
13
- "<|bs|>": 50314,
14
- "<|ca|>": 50269,
15
- "<|cs|>": 50282,
16
- "<|cy|>": 50296,
17
- "<|da|>": 50284,
18
- "<|de|>": 50260,
19
- "<|el|>": 50280,
20
- "<|en|>": 50258,
21
- "<|es|>": 50261,
22
- "<|et|>": 50306,
23
- "<|eu|>": 50309,
24
- "<|fa|>": 50299,
25
- "<|fi|>": 50276,
26
- "<|fo|>": 50337,
27
- "<|fr|>": 50264,
28
- "<|gl|>": 50318,
29
- "<|gu|>": 50332,
30
- "<|haw|>": 50351,
31
- "<|ha|>": 50353,
32
- "<|hi|>": 50275,
33
- "<|hr|>": 50290,
34
- "<|ht|>": 50338,
35
- "<|hu|>": 50285,
36
- "<|hy|>": 50311,
37
- "<|id|>": 50274,
38
- "<|is|>": 50310,
39
- "<|it|>": 50273,
40
- "<|iw|>": 50278,
41
- "<|ja|>": 50265,
42
- "<|jw|>": 50355,
43
- "<|ka|>": 50328,
44
- "<|kk|>": 50315,
45
- "<|km|>": 50322,
46
- "<|kn|>": 50305,
47
- "<|ko|>": 50263,
48
- "<|la|>": 50293,
49
- "<|lb|>": 50344,
50
- "<|ln|>": 50352,
51
- "<|lo|>": 50335,
52
- "<|lt|>": 50292,
53
- "<|lv|>": 50300,
54
- "<|mg|>": 50348,
55
- "<|mi|>": 50294,
56
- "<|mk|>": 50307,
57
- "<|ml|>": 50295,
58
- "<|mn|>": 50313,
59
- "<|mr|>": 50319,
60
- "<|ms|>": 50281,
61
- "<|mt|>": 50342,
62
- "<|my|>": 50345,
63
- "<|ne|>": 50312,
64
- "<|nl|>": 50270,
65
- "<|nn|>": 50341,
66
- "<|nocaptions|>": 50361,
67
- "<|notimestamps|>": 50362,
68
- "<|no|>": 50287,
69
- "<|oc|>": 50327,
70
- "<|pa|>": 50320,
71
- "<|pl|>": 50268,
72
- "<|ps|>": 50339,
73
- "<|pt|>": 50266,
74
- "<|ro|>": 50283,
75
- "<|ru|>": 50262,
76
- "<|sa|>": 50343,
77
- "<|sd|>": 50331,
78
- "<|si|>": 50321,
79
- "<|sk|>": 50297,
80
- "<|sl|>": 50304,
81
- "<|sn|>": 50323,
82
- "<|so|>": 50325,
83
- "<|sq|>": 50316,
84
- "<|sr|>": 50302,
85
- "<|startoflm|>": 50359,
86
- "<|startofprev|>": 50360,
87
- "<|startoftranscript|>": 50257,
88
- "<|su|>": 50356,
89
- "<|sv|>": 50272,
90
- "<|sw|>": 50317,
91
- "<|ta|>": 50286,
92
- "<|te|>": 50298,
93
- "<|tg|>": 50330,
94
- "<|th|>": 50288,
95
- "<|tk|>": 50340,
96
- "<|tl|>": 50347,
97
- "<|transcribe|>": 50358,
98
- "<|translate|>": 50357,
99
- "<|tr|>": 50267,
100
- "<|tt|>": 50350,
101
- "<|uk|>": 50279,
102
- "<|ur|>": 50289,
103
- "<|uz|>": 50336,
104
- "<|vi|>": 50277,
105
- "<|yi|>": 50334,
106
- "<|yo|>": 50324,
107
- "<|zh|>": 50259
108
  }
 
1
  {
2
+ "</s>": 30,
3
+ "<s>": 29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  }
special_tokens_map.json CHANGED
@@ -1,114 +1,50 @@
1
  {
2
  "additional_special_tokens": [
3
- "<|startoftranscript|>",
4
- "<|en|>",
5
- "<|zh|>",
6
- "<|de|>",
7
- "<|es|>",
8
- "<|ru|>",
9
- "<|ko|>",
10
- "<|fr|>",
11
- "<|ja|>",
12
- "<|pt|>",
13
- "<|tr|>",
14
- "<|pl|>",
15
- "<|ca|>",
16
- "<|nl|>",
17
- "<|ar|>",
18
- "<|sv|>",
19
- "<|it|>",
20
- "<|id|>",
21
- "<|hi|>",
22
- "<|fi|>",
23
- "<|vi|>",
24
- "<|iw|>",
25
- "<|uk|>",
26
- "<|el|>",
27
- "<|ms|>",
28
- "<|cs|>",
29
- "<|ro|>",
30
- "<|da|>",
31
- "<|hu|>",
32
- "<|ta|>",
33
- "<|no|>",
34
- "<|th|>",
35
- "<|ur|>",
36
- "<|hr|>",
37
- "<|bg|>",
38
- "<|lt|>",
39
- "<|la|>",
40
- "<|mi|>",
41
- "<|ml|>",
42
- "<|cy|>",
43
- "<|sk|>",
44
- "<|te|>",
45
- "<|fa|>",
46
- "<|lv|>",
47
- "<|bn|>",
48
- "<|sr|>",
49
- "<|az|>",
50
- "<|sl|>",
51
- "<|kn|>",
52
- "<|et|>",
53
- "<|mk|>",
54
- "<|br|>",
55
- "<|eu|>",
56
- "<|is|>",
57
- "<|hy|>",
58
- "<|ne|>",
59
- "<|mn|>",
60
- "<|bs|>",
61
- "<|kk|>",
62
- "<|sq|>",
63
- "<|sw|>",
64
- "<|gl|>",
65
- "<|mr|>",
66
- "<|pa|>",
67
- "<|si|>",
68
- "<|km|>",
69
- "<|sn|>",
70
- "<|yo|>",
71
- "<|so|>",
72
- "<|af|>",
73
- "<|oc|>",
74
- "<|ka|>",
75
- "<|be|>",
76
- "<|tg|>",
77
- "<|sd|>",
78
- "<|gu|>",
79
- "<|am|>",
80
- "<|yi|>",
81
- "<|lo|>",
82
- "<|uz|>",
83
- "<|fo|>",
84
- "<|ht|>",
85
- "<|ps|>",
86
- "<|tk|>",
87
- "<|nn|>",
88
- "<|mt|>",
89
- "<|sa|>",
90
- "<|lb|>",
91
- "<|my|>",
92
- "<|bo|>",
93
- "<|tl|>",
94
- "<|mg|>",
95
- "<|as|>",
96
- "<|tt|>",
97
- "<|haw|>",
98
- "<|ln|>",
99
- "<|ha|>",
100
- "<|ba|>",
101
- "<|jw|>",
102
- "<|su|>",
103
- "<|translate|>",
104
- "<|transcribe|>",
105
- "<|startoflm|>",
106
- "<|startofprev|>",
107
- "<|nocaptions|>",
108
- "<|notimestamps|>"
109
  ],
110
- "bos_token": "<|endoftext|>",
111
- "eos_token": "<|endoftext|>",
112
- "pad_token": "<|endoftext|>",
113
- "unk_token": "<|endoftext|>"
114
  }
 
1
  {
2
  "additional_special_tokens": [
3
+ {
4
+ "content": "<s>",
5
+ "lstrip": false,
6
+ "normalized": true,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ {
11
+ "content": "</s>",
12
+ "lstrip": false,
13
+ "normalized": true,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ },
17
+ {
18
+ "content": "<s>",
19
+ "lstrip": false,
20
+ "normalized": true,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ {
25
+ "content": "</s>",
26
+ "lstrip": false,
27
+ "normalized": true,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ },
31
+ {
32
+ "content": "<s>",
33
+ "lstrip": false,
34
+ "normalized": true,
35
+ "rstrip": false,
36
+ "single_word": false
37
+ },
38
+ {
39
+ "content": "</s>",
40
+ "lstrip": false,
41
+ "normalized": true,
42
+ "rstrip": false,
43
+ "single_word": false
44
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  ],
46
+ "bos_token": "<s>",
47
+ "eos_token": "</s>",
48
+ "pad_token": "[PAD]",
49
+ "unk_token": "[UNK]"
50
  }
tokenizer_config.json CHANGED
@@ -1,35 +1,13 @@
1
  {
2
- "add_bos_token": false,
3
- "add_prefix_space": false,
4
- "bos_token": {
5
- "__type": "AddedToken",
6
- "content": "<|endoftext|>",
7
- "lstrip": false,
8
- "normalized": true,
9
- "rstrip": false,
10
- "single_word": false
11
- },
12
  "clean_up_tokenization_spaces": true,
13
- "eos_token": {
14
- "__type": "AddedToken",
15
- "content": "<|endoftext|>",
16
- "lstrip": false,
17
- "normalized": true,
18
- "rstrip": false,
19
- "single_word": false
20
- },
21
- "errors": "replace",
22
- "model_max_length": 1024,
23
- "pad_token": null,
24
- "processor_class": "WhisperProcessor",
25
- "return_attention_mask": false,
26
- "tokenizer_class": "WhisperTokenizer",
27
- "unk_token": {
28
- "__type": "AddedToken",
29
- "content": "<|endoftext|>",
30
- "lstrip": false,
31
- "normalized": true,
32
- "rstrip": false,
33
- "single_word": false
34
- }
35
  }
 
1
  {
2
+ "bos_token": "<s>",
 
 
 
 
 
 
 
 
 
3
  "clean_up_tokenization_spaces": true,
4
+ "do_lower_case": false,
5
+ "eos_token": "</s>",
6
+ "model_max_length": 1000000000000000019884624838656,
7
+ "pad_token": "[PAD]",
8
+ "replace_word_delimiter_char": " ",
9
+ "tokenizer_class": "Wav2Vec2CTCTokenizer",
10
+ "tokenizer_file": null,
11
+ "unk_token": "[UNK]",
12
+ "word_delimiter_token": "|"
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  }
vocab.json CHANGED
The diff for this file is too large to render. See raw diff