# Configuration for the `whisper_bpe_tokenizer` preprocessor.
# Token strings are single-quoted so their `<`, `|` and `>` characters are
# always read as literal text rather than YAML indicators.
name: whisper_bpe_tokenizer
config_type: preprocessor
pretrained_path: hezarai/whisper-small

# Truncation
max_length: 512
truncation_strategy: longest_first
truncation_direction: right
stride: 0

# Padding
padding_strategy: longest
padding_direction: right
pad_to_multiple_of: 0
# NOTE(review): '<pad>' is not listed under `special_tokens` below; confirm
# that id 0 really is the padding token in the pretrained vocabulary.
pad_token_id: 0
pad_token: '<pad>'
pad_token_type_id: 0

unk_token: '<|endoftext|>'

# Special tokens: end-of-text / start-of-transcript markers, one tag per
# language, then the task and control tags.
# NOTE(review): '<|endoftext|>' appears twice at the head of this list
# (108 entries, 107 unique). Kept as-is because the consumer of this list is
# not visible from this file; drop the duplicate if order/length is not relied on.
special_tokens:
- '<|endoftext|>'
- '<|endoftext|>'
- '<|startoftranscript|>'
- '<|en|>'
- '<|zh|>'
- '<|de|>'
- '<|es|>'
- '<|ru|>'
- '<|ko|>'
- '<|fr|>'
- '<|ja|>'
- '<|pt|>'
- '<|tr|>'
- '<|pl|>'
- '<|ca|>'
- '<|nl|>'
- '<|ar|>'
- '<|sv|>'
- '<|it|>'
- '<|id|>'
- '<|hi|>'
- '<|fi|>'
- '<|vi|>'
- '<|he|>'
- '<|uk|>'
- '<|el|>'
- '<|ms|>'
- '<|cs|>'
- '<|ro|>'
- '<|da|>'
- '<|hu|>'
- '<|ta|>'
- '<|no|>'
- '<|th|>'
- '<|ur|>'
- '<|hr|>'
- '<|bg|>'
- '<|lt|>'
- '<|la|>'
- '<|mi|>'
- '<|ml|>'
- '<|cy|>'
- '<|sk|>'
- '<|te|>'
- '<|fa|>'
- '<|lv|>'
- '<|bn|>'
- '<|sr|>'
- '<|az|>'
- '<|sl|>'
- '<|kn|>'
- '<|et|>'
- '<|mk|>'
- '<|br|>'
- '<|eu|>'
- '<|is|>'
- '<|hy|>'
- '<|ne|>'
- '<|mn|>'
- '<|bs|>'
- '<|kk|>'
- '<|sq|>'
- '<|sw|>'
- '<|gl|>'
- '<|mr|>'
- '<|pa|>'
- '<|si|>'
- '<|km|>'
- '<|sn|>'
- '<|yo|>'
- '<|so|>'
- '<|af|>'
- '<|oc|>'
- '<|ka|>'
- '<|be|>'
- '<|tg|>'
- '<|sd|>'
- '<|gu|>'
- '<|am|>'
- '<|yi|>'
- '<|lo|>'
- '<|uz|>'
- '<|fo|>'
- '<|ht|>'
- '<|ps|>'
- '<|tk|>'
- '<|nn|>'
- '<|mt|>'
- '<|sa|>'
- '<|lb|>'
- '<|my|>'
- '<|bo|>'
- '<|tl|>'
- '<|mg|>'
- '<|as|>'
- '<|tt|>'
- '<|haw|>'
- '<|ln|>'
- '<|ha|>'
- '<|ba|>'
- '<|jw|>'
- '<|su|>'
- '<|translate|>'
- '<|transcribe|>'
- '<|startoflm|>'
- '<|startofprev|>'
- '<|nocaptions|>'
- '<|notimestamps|>'

# BPE model / trainer options
continuing_subword_prefix: ''
end_of_word_suffix: ''
fuse_unk: false
vocab_size: 50364
min_frequency: 2
limit_alphabet: 1000
initial_alphabet: []
show_progress: true

# Token <-> id bindings
# NOTE(review): unk, bos and eos are all bound to id 50257 even though
# `bos_token` ('<|startoftranscript|>') is a different string from
# `eos_token` / `unk_token` ('<|endoftext|>'). Confirm the intended
# `bos_token_id` against the pretrained vocabulary before relying on it.
unk_token_id: 50257
bos_token: '<|startoftranscript|>'
bos_token_id: 50257
eos_token: '<|endoftext|>'
eos_token_id: 50257

add_prefix_space: false
add_bos_token: false
model_max_length: 1024
predict_timestamps: false