hezarai
/

whisper-small-fa

Automatic Speech Recognition

Hezar

Persian

Model card Files and versions Community

arxyzan committed on Aug 15, 2023

Commit

72c619d

•

1 Parent(s): 8d057f4

Hezar: Upload tokenizer and config

Browse files

Files changed (1) hide show

preprocessor/tokenizer_config.yaml +140 -0

preprocessor/tokenizer_config.yaml ADDED Viewed

	@@ -0,0 +1,140 @@

+name: whisper_bpe_tokenizer
+config_type: preprocessor
+pretrained_path: hezarai/whisper-small
+max_length: 512
+truncation_strategy: longest_first
+truncation_direction: right
+stride: 0
+padding_strategy: longest
+padding_direction: right
+pad_to_multiple_of: 0
+pad_token_id: 0
+pad_token: <pad>
+pad_token_type_id: 0
+unk_token: <|endoftext|>
+special_tokens:
+- <|endoftext|>
+- <|endoftext|>
+- <|startoftranscript|>
+- <|en|>
+- <|zh|>
+- <|de|>
+- <|es|>
+- <|ru|>
+- <|ko|>
+- <|fr|>
+- <|ja|>
+- <|pt|>
+- <|tr|>
+- <|pl|>
+- <|ca|>
+- <|nl|>
+- <|ar|>
+- <|sv|>
+- <|it|>
+- <|id|>
+- <|hi|>
+- <|fi|>
+- <|vi|>
+- <|he|>
+- <|uk|>
+- <|el|>
+- <|ms|>
+- <|cs|>
+- <|ro|>
+- <|da|>
+- <|hu|>
+- <|ta|>
+- <|no|>
+- <|th|>
+- <|ur|>
+- <|hr|>
+- <|bg|>
+- <|lt|>
+- <|la|>
+- <|mi|>
+- <|ml|>
+- <|cy|>
+- <|sk|>
+- <|te|>
+- <|fa|>
+- <|lv|>
+- <|bn|>
+- <|sr|>
+- <|az|>
+- <|sl|>
+- <|kn|>
+- <|et|>
+- <|mk|>
+- <|br|>
+- <|eu|>
+- <|is|>
+- <|hy|>
+- <|ne|>
+- <|mn|>
+- <|bs|>
+- <|kk|>
+- <|sq|>
+- <|sw|>
+- <|gl|>
+- <|mr|>
+- <|pa|>
+- <|si|>
+- <|km|>
+- <|sn|>
+- <|yo|>
+- <|so|>
+- <|af|>
+- <|oc|>
+- <|ka|>
+- <|be|>
+- <|tg|>
+- <|sd|>
+- <|gu|>
+- <|am|>
+- <|yi|>
+- <|lo|>
+- <|uz|>
+- <|fo|>
+- <|ht|>
+- <|ps|>
+- <|tk|>
+- <|nn|>
+- <|mt|>
+- <|sa|>
+- <|lb|>
+- <|my|>
+- <|bo|>
+- <|tl|>
+- <|mg|>
+- <|as|>
+- <|tt|>
+- <|haw|>
+- <|ln|>
+- <|ha|>
+- <|ba|>
+- <|jw|>
+- <|su|>
+- <|translate|>
+- <|transcribe|>
+- <|startoflm|>
+- <|startofprev|>
+- <|nocaptions|>
+- <|notimestamps|>
+continuing_subword_prefix: ''
+end_of_word_suffix: ''
+fuse_unk: false
+vocab_size: 50364
+min_frequency: 2
+limit_alphabet: 1000
+initial_alphabet: []
+show_progress: true
+unk_token_id: 50257
+bos_token: <|startoftranscript|>
+bos_token_id: 50257
+eos_token: <|endoftext|>
+eos_token_id: 50257
+add_prefix_space: false
+add_bos_token: false
+model_max_length: 1024
+predict_timestamps: false