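# checkpoints/sensevoice/sensevoice.yaml
# Page-header residue from the web file viewer (not configuration), kept for
# provenance: "Night-Quiet's picture" / "zjt" / "5e0c4e5"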
# NOTE(review): this file had lost all indentation, which left repeated keys
# (`model`, `_target_`, `encoder`, `frontend`, `tokenizer`, `input_size`, ...)
# in a single flat mapping. The nesting below is reconstructed from the
# `_target_` markers and the key order; confirm it against the original
# checkpoint config, in particular that `vad_model`, `kwargs` and `vad_kwargs`
# belong under the outer `model`.
#
# Presumably a Hydra-style config in which `_target_` names the class to
# build from the sibling keys -- confirm with the loader.
model:
  _target_: modules.sensevoice.model.SenseVoice

  model:
    _target_: modules.sensevoice.sensevoicesmall.SenseVoiceSmall
    specaug:
      _target_: modules.sensevoice.specaug.SpecAugLFR
      apply_freq_mask: true
      apply_time_mask: true
      apply_time_warp: false
      freq_mask_width_range: [0, 30]
      lfr_rate: 6
      num_freq_mask: 1
      num_time_mask: 1
      time_mask_width_range: [0, 12]
      time_warp_mode: bicubic
      time_warp_window: 5
    encoder:
      _target_: modules.sensevoice.sensevoicesmall.SenseVoiceEncoderSmall
      attention_dropout_rate: 0.1
      attention_heads: 4
      dropout_rate: 0.1
      kernel_size: 11
      linear_units: 2048
      normalize_before: true
      num_blocks: 50
      output_size: 512
      # NOTE(review): the spelling `sanm_shfit` (sic) is kept on purpose; it
      # is a runtime key, so rename it only together with the code reading it.
      sanm_shfit: 0
      tp_blocks: 20
      # 560 equals lfr_m (7) x n_mels (80) of the frontend below; presumably
      # these must stay in sync -- confirm.
      input_size: 560
    tokenizer:
      _target_: modules.sensevoice.tokenizer.SentencepiecesTokenizer
      # NOTE(review): machine-specific absolute path.
      bpemodel: /storage-root/datasets/zhaojingtong/checkpoints/sensevoice/chn_jpn_yue_eng_ko_spectok.bpe.model
      unk_symbol: <unk>
      split_with_space: true
    frontend:
      _target_: modules.sensevoice.frontend.WavFrontend
      fs: 16000
      window: hamming
      n_mels: 80
      frame_length: 25
      frame_shift: 10
      lfr_m: 7
      lfr_n: 6
    length_normalized_loss: true
    input_size: 560
    vocab_size: 25055
    sos: 1
    eos: 2
    ignore_id: -1

  vad_model:
    _target_: modules.sensevoice.vad.FsmnVADStreaming
    encoder:
      _target_: modules.sensevoice.vad.FSMN
      # 400 equals lfr_m (5) x n_mels (80) of the frontend below; presumably
      # these must stay in sync -- confirm.
      input_dim: 400
      input_affine_dim: 140
      fsmn_layers: 4
      linear_dim: 250
      proj_dim: 128
      lorder: 20
      rorder: 0
      lstride: 1
      rstride: 0
      output_affine_dim: 140
      output_dim: 248
    frontend:
      _target_: modules.sensevoice.frontend.WavFrontendOnline
      fs: 16000
      window: hamming
      n_mels: 80
      frame_length: 25
      frame_shift: 10
      dither: 0.0
      lfr_m: 5
      lfr_n: 1
      # NOTE(review): machine-specific absolute path.
      cmvn_file: /storage-root/datasets/zhaojingtong/checkpoints/sensevoice/am.mvn
    sample_rate: 16000
    detect_mode: 1
    snr_mode: 0
    max_end_silence_time: 800
    max_start_silence_time: 3000
    do_start_point_detection: true
    do_end_point_detection: true
    window_size_ms: 200
    sil_to_speech_time_thres: 150
    speech_to_sil_time_thres: 150
    speech_2_noise_ratio: 1.0
    do_extend: 1
    lookback_time_start_point: 200
    lookahead_time_end_point: 100
    max_single_segment_time: 60000
    snr_thres: -100.0
    noise_frame_num_used_for_snr: 100
    decibel_thres: -100.0
    speech_noise_thres: 0.6
    fe_prior_thres: 0.0001
    silence_pdf_num: 1
    sil_pdf_ids: [0]
    speech_noise_thresh_low: -0.1
    speech_noise_thresh_high: 0.3
    output_frame_probs: false
    frame_in_ms: 10
    frame_length_ms: 25
    # NOTE(review): the plain scalar `None` loads as the string "None", not as
    # a YAML null. Value left unchanged; use `null` if a real null is intended.
    tokenizer: None
    vocab_size: -1
    input_size: 400

  # Same values as `model.model` above, without the `_target_` entries.
  # Keep the two sections in sync when editing.
  kwargs:
    specaug:
      apply_freq_mask: true
      apply_time_mask: true
      apply_time_warp: false
      freq_mask_width_range: [0, 30]
      lfr_rate: 6
      num_freq_mask: 1
      num_time_mask: 1
      time_mask_width_range: [0, 12]
      time_warp_mode: bicubic
      time_warp_window: 5
    encoder:
      attention_dropout_rate: 0.1
      attention_heads: 4
      dropout_rate: 0.1
      kernel_size: 11
      linear_units: 2048
      normalize_before: true
      num_blocks: 50
      output_size: 512
      # NOTE(review): spelling `sanm_shfit` (sic) kept; it is a runtime key.
      sanm_shfit: 0
      tp_blocks: 20
      input_size: 560
    tokenizer:
      # NOTE(review): machine-specific absolute path.
      bpemodel: /storage-root/datasets/zhaojingtong/checkpoints/sensevoice/chn_jpn_yue_eng_ko_spectok.bpe.model
      unk_symbol: <unk>
      split_with_space: true
    frontend:
      fs: 16000
      window: hamming
      n_mels: 80
      frame_length: 25
      frame_shift: 10
      lfr_m: 7
      lfr_n: 6
    length_normalized_loss: true
    input_size: 560
    vocab_size: 25055
    sos: 1
    eos: 2
    ignore_id: -1

  # Same values as `model.vad_model` above, without the `_target_` entries.
  # Keep the two sections in sync when editing.
  vad_kwargs:
    encoder:
      input_dim: 400
      input_affine_dim: 140
      fsmn_layers: 4
      linear_dim: 250
      proj_dim: 128
      lorder: 20
      rorder: 0
      lstride: 1
      rstride: 0
      output_affine_dim: 140
      output_dim: 248
    frontend:
      fs: 16000
      window: hamming
      n_mels: 80
      frame_length: 25
      frame_shift: 10
      dither: 0.0
      lfr_m: 5
      lfr_n: 1
      # NOTE(review): machine-specific absolute path.
      cmvn_file: /storage-root/datasets/zhaojingtong/checkpoints/sensevoice/am.mvn
    sample_rate: 16000
    detect_mode: 1
    snr_mode: 0
    max_end_silence_time: 800
    max_start_silence_time: 3000
    do_start_point_detection: true
    do_end_point_detection: true
    window_size_ms: 200
    sil_to_speech_time_thres: 150
    speech_to_sil_time_thres: 150
    speech_2_noise_ratio: 1.0
    do_extend: 1
    lookback_time_start_point: 200
    lookahead_time_end_point: 100
    max_single_segment_time: 60000
    snr_thres: -100.0
    noise_frame_num_used_for_snr: 100
    decibel_thres: -100.0
    speech_noise_thres: 0.6
    fe_prior_thres: 0.0001
    silence_pdf_num: 1
    sil_pdf_ids: [0]
    speech_noise_thresh_low: -0.1
    speech_noise_thresh_high: 0.3
    output_frame_probs: false
    frame_in_ms: 10
    frame_length_ms: 25
    # NOTE(review): the plain scalar `None` loads as the string "None", not as
    # a YAML null. Value left unchanged; use `null` if a real null is intended.
    tokenizer: None
    vocab_size: -1
    input_size: 400