Hezar: Upload tokenizer and config
Browse files
preprocessor/tokenizer_config.yaml
ADDED
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: whisper_bpe_tokenizer
|
2 |
+
config_type: preprocessor
|
3 |
+
pretrained_path: hezarai/whisper-small
|
4 |
+
max_length: 512
|
5 |
+
truncation_strategy: longest_first
|
6 |
+
truncation_direction: right
|
7 |
+
stride: 0
|
8 |
+
padding_strategy: longest
|
9 |
+
padding_direction: right
|
10 |
+
pad_to_multiple_of: 0
|
11 |
+
pad_token_id: 0
|
12 |
+
pad_token: <pad>
|
13 |
+
pad_token_type_id: 0
|
14 |
+
unk_token: <|endoftext|>
|
15 |
+
special_tokens:
|
16 |
+
- <|endoftext|>
|
17 |
+
- <|endoftext|>
|
18 |
+
- <|startoftranscript|>
|
19 |
+
- <|en|>
|
20 |
+
- <|zh|>
|
21 |
+
- <|de|>
|
22 |
+
- <|es|>
|
23 |
+
- <|ru|>
|
24 |
+
- <|ko|>
|
25 |
+
- <|fr|>
|
26 |
+
- <|ja|>
|
27 |
+
- <|pt|>
|
28 |
+
- <|tr|>
|
29 |
+
- <|pl|>
|
30 |
+
- <|ca|>
|
31 |
+
- <|nl|>
|
32 |
+
- <|ar|>
|
33 |
+
- <|sv|>
|
34 |
+
- <|it|>
|
35 |
+
- <|id|>
|
36 |
+
- <|hi|>
|
37 |
+
- <|fi|>
|
38 |
+
- <|vi|>
|
39 |
+
- <|he|>
|
40 |
+
- <|uk|>
|
41 |
+
- <|el|>
|
42 |
+
- <|ms|>
|
43 |
+
- <|cs|>
|
44 |
+
- <|ro|>
|
45 |
+
- <|da|>
|
46 |
+
- <|hu|>
|
47 |
+
- <|ta|>
|
48 |
+
- <|no|>
|
49 |
+
- <|th|>
|
50 |
+
- <|ur|>
|
51 |
+
- <|hr|>
|
52 |
+
- <|bg|>
|
53 |
+
- <|lt|>
|
54 |
+
- <|la|>
|
55 |
+
- <|mi|>
|
56 |
+
- <|ml|>
|
57 |
+
- <|cy|>
|
58 |
+
- <|sk|>
|
59 |
+
- <|te|>
|
60 |
+
- <|fa|>
|
61 |
+
- <|lv|>
|
62 |
+
- <|bn|>
|
63 |
+
- <|sr|>
|
64 |
+
- <|az|>
|
65 |
+
- <|sl|>
|
66 |
+
- <|kn|>
|
67 |
+
- <|et|>
|
68 |
+
- <|mk|>
|
69 |
+
- <|br|>
|
70 |
+
- <|eu|>
|
71 |
+
- <|is|>
|
72 |
+
- <|hy|>
|
73 |
+
- <|ne|>
|
74 |
+
- <|mn|>
|
75 |
+
- <|bs|>
|
76 |
+
- <|kk|>
|
77 |
+
- <|sq|>
|
78 |
+
- <|sw|>
|
79 |
+
- <|gl|>
|
80 |
+
- <|mr|>
|
81 |
+
- <|pa|>
|
82 |
+
- <|si|>
|
83 |
+
- <|km|>
|
84 |
+
- <|sn|>
|
85 |
+
- <|yo|>
|
86 |
+
- <|so|>
|
87 |
+
- <|af|>
|
88 |
+
- <|oc|>
|
89 |
+
- <|ka|>
|
90 |
+
- <|be|>
|
91 |
+
- <|tg|>
|
92 |
+
- <|sd|>
|
93 |
+
- <|gu|>
|
94 |
+
- <|am|>
|
95 |
+
- <|yi|>
|
96 |
+
- <|lo|>
|
97 |
+
- <|uz|>
|
98 |
+
- <|fo|>
|
99 |
+
- <|ht|>
|
100 |
+
- <|ps|>
|
101 |
+
- <|tk|>
|
102 |
+
- <|nn|>
|
103 |
+
- <|mt|>
|
104 |
+
- <|sa|>
|
105 |
+
- <|lb|>
|
106 |
+
- <|my|>
|
107 |
+
- <|bo|>
|
108 |
+
- <|tl|>
|
109 |
+
- <|mg|>
|
110 |
+
- <|as|>
|
111 |
+
- <|tt|>
|
112 |
+
- <|haw|>
|
113 |
+
- <|ln|>
|
114 |
+
- <|ha|>
|
115 |
+
- <|ba|>
|
116 |
+
- <|jw|>
|
117 |
+
- <|su|>
|
118 |
+
- <|translate|>
|
119 |
+
- <|transcribe|>
|
120 |
+
- <|startoflm|>
|
121 |
+
- <|startofprev|>
|
122 |
+
- <|nocaptions|>
|
123 |
+
- <|notimestamps|>
|
124 |
+
continuing_subword_prefix: ''
|
125 |
+
end_of_word_suffix: ''
|
126 |
+
fuse_unk: false
|
127 |
+
vocab_size: 50364
|
128 |
+
min_frequency: 2
|
129 |
+
limit_alphabet: 1000
|
130 |
+
initial_alphabet: []
|
131 |
+
show_progress: true
|
132 |
+
unk_token_id: 50257
|
133 |
+
bos_token: <|startoftranscript|>
|
134 |
+
bos_token_id: 50257
|
135 |
+
eos_token: <|endoftext|>
|
136 |
+
eos_token_id: 50257
|
137 |
+
add_prefix_space: false
|
138 |
+
add_bos_token: false
|
139 |
+
model_max_length: 1024
|
140 |
+
predict_timestamps: false
|