name: whisper_speech_recognition
config_type: model
vocab_size: 51865
num_mel_bins: 80
encoder_layers: 12
encoder_attention_heads: 12
decoder_layers: 12
decoder_attention_heads: 12
num_hidden_layers: 12
decoder_ffn_dim: 3072
encoder_ffn_dim: 3072
encoder_layerdrop: 0.0
decoder_layerdrop: 0.0
decoder_start_token_id: 50258
use_cache: true
sampling_rate: 16000
is_encoder_decoder: true
activation_function: gelu
d_model: 768
dropout: 0.0
torch_dtype: float32
attention_dropout: 0.0
activation_dropout: 0.0
init_std: 0.02
scale_embedding: false
max_source_positions: 1500
max_target_positions: 448
pad_token_id: 50257
bos_token_id: 50257
eos_token_id: 50257
suppress_tokens: []
begin_suppress_tokens:
- 220
- 50257
use_weighted_layer_sum: false
classifier_proj_size: 256
apply_spec_augment: false
mask_time_prob: 0.05
mask_time_length: 10
mask_time_min_masks: 2
mask_feature_prob: 0.0
mask_feature_length: 10
mask_feature_min_masks: 0
max_new_tokens: 448
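The keys above map directly onto Hugging Face's `WhisperConfig`. The sketch below is an illustration of how such a dump could be turned into a config object, not part of this configuration itself: the file name `whisper_speech_recognition.yaml` and the `NON_MODEL_KEYS` filter are assumptions made for the example; only the hyperparameter values come from the listing.

```python
# Minimal sketch: load the YAML dump and build a transformers WhisperConfig.
# Assumes the listing above is saved as "whisper_speech_recognition.yaml".
import yaml
from transformers import WhisperConfig

with open("whisper_speech_recognition.yaml") as f:
    raw = yaml.safe_load(f)

# Keys that describe the run/preprocessing rather than WhisperConfig arguments
# (this split is an assumption for the example, not defined by the dump itself).
NON_MODEL_KEYS = {"name", "config_type", "sampling_rate", "torch_dtype", "max_new_tokens"}
model_kwargs = {k: v for k, v in raw.items() if k not in NON_MODEL_KEYS}

config = WhisperConfig(**model_kwargs)
print(config.d_model, config.encoder_layers, config.max_target_positions)
# 768 12 448 -- width, depth, and decoder context matching a Whisper "small"-sized model
```

The dropped keys are still useful downstream: `sampling_rate: 16000` belongs to the feature extractor, and `max_new_tokens: 448` is a generation-time setting rather than an architecture parameter.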