# Configuration for modules.sensevoice.model.SenseVoice: a SenseVoiceSmall
# model plus an FsmnVADStreaming model, each followed by a `*kwargs` mapping
# that repeats the same settings without the `_target_` keys.
#
# NOTE(review): nesting was reconstructed from the `_target_` markers and
# from the rule that a key may occur only once per mapping (e.g. the two
# `input_size: 560` entries must live at different levels). Confirm the
# placement of `vad_model`, `kwargs` and `vad_kwargs` against the consumer.
model:
  _target_: modules.sensevoice.model.SenseVoice

  model:
    _target_: modules.sensevoice.sensevoicesmall.SenseVoiceSmall

    specaug:
      _target_: modules.sensevoice.specaug.SpecAugLFR
      apply_freq_mask: true
      apply_time_mask: true
      apply_time_warp: false
      freq_mask_width_range: [0, 30]
      lfr_rate: 6
      num_freq_mask: 1
      num_time_mask: 1
      time_mask_width_range: [0, 12]
      time_warp_mode: bicubic
      time_warp_window: 5

    encoder:
      _target_: modules.sensevoice.sensevoicesmall.SenseVoiceEncoderSmall
      attention_dropout_rate: 0.1
      attention_heads: 4
      dropout_rate: 0.1
      kernel_size: 11
      linear_units: 2048
      normalize_before: true
      num_blocks: 50
      output_size: 512
      # NOTE(review): spelled `shfit` here and under `kwargs`; presumably it
      # matches the encoder's parameter name - verify before renaming.
      sanm_shfit: 0
      tp_blocks: 20
      input_size: 560

    tokenizer:
      _target_: modules.sensevoice.tokenizer.SentencepiecesTokenizer
      bpemodel: /storage-root/datasets/zhaojingtong/checkpoints/sensevoice/chn_jpn_yue_eng_ko_spectok.bpe.model
      unk_symbol: <unk>
      split_with_space: true

    frontend:
      _target_: modules.sensevoice.frontend.WavFrontend
      fs: 16000
      window: hamming
      n_mels: 80
      frame_length: 25
      frame_shift: 10
      lfr_m: 7
      lfr_n: 6

    length_normalized_loss: true
    input_size: 560
    vocab_size: 25055
    sos: 1
    eos: 2
    ignore_id: -1

  vad_model:
    _target_: modules.sensevoice.vad.FsmnVADStreaming

    encoder:
      _target_: modules.sensevoice.vad.FSMN
      input_dim: 400
      input_affine_dim: 140
      fsmn_layers: 4
      linear_dim: 250
      proj_dim: 128
      lorder: 20
      rorder: 0
      lstride: 1
      rstride: 0
      output_affine_dim: 140
      output_dim: 248

    frontend:
      _target_: modules.sensevoice.frontend.WavFrontendOnline
      fs: 16000
      window: hamming
      n_mels: 80
      frame_length: 25
      frame_shift: 10
      dither: 0.0
      lfr_m: 5
      lfr_n: 1
      # NOTE(review): assumed to belong to the frontend; it may instead be a
      # direct child of `vad_model` - confirm against the consumer.
      cmvn_file: /storage-root/datasets/zhaojingtong/checkpoints/sensevoice/am.mvn

    sample_rate: 16000
    detect_mode: 1
    snr_mode: 0
    max_end_silence_time: 800
    max_start_silence_time: 3000
    do_start_point_detection: true
    do_end_point_detection: true
    window_size_ms: 200
    sil_to_speech_time_thres: 150
    speech_to_sil_time_thres: 150
    speech_2_noise_ratio: 1.0
    do_extend: 1
    lookback_time_start_point: 200
    lookahead_time_end_point: 100
    max_single_segment_time: 60000
    snr_thres: -100.0
    noise_frame_num_used_for_snr: 100
    decibel_thres: -100.0
    speech_noise_thres: 0.6
    fe_prior_thres: 0.0001
    silence_pdf_num: 1
    sil_pdf_ids: [0]
    speech_noise_thresh_low: -0.1
    speech_noise_thresh_high: 0.3
    output_frame_probs: false
    frame_in_ms: 10
    frame_length_ms: 25
    # NOTE(review): YAML loads `None` as the string "None", not as null.
    # Write `null` if "no tokenizer" is what is meant.
    tokenizer: None
    vocab_size: -1
    input_size: 400

  # Same values as `model.model` above, without the `_target_` keys.
  # Keep the two mappings in sync when editing either one.
  kwargs:
    specaug:
      apply_freq_mask: true
      apply_time_mask: true
      apply_time_warp: false
      freq_mask_width_range: [0, 30]
      lfr_rate: 6
      num_freq_mask: 1
      num_time_mask: 1
      time_mask_width_range: [0, 12]
      time_warp_mode: bicubic
      time_warp_window: 5

    encoder:
      attention_dropout_rate: 0.1
      attention_heads: 4
      dropout_rate: 0.1
      kernel_size: 11
      linear_units: 2048
      normalize_before: true
      num_blocks: 50
      output_size: 512
      sanm_shfit: 0
      tp_blocks: 20
      input_size: 560

    tokenizer:
      bpemodel: /storage-root/datasets/zhaojingtong/checkpoints/sensevoice/chn_jpn_yue_eng_ko_spectok.bpe.model
      unk_symbol: <unk>
      split_with_space: true

    frontend:
      fs: 16000
      window: hamming
      n_mels: 80
      frame_length: 25
      frame_shift: 10
      lfr_m: 7
      lfr_n: 6

    length_normalized_loss: true
    input_size: 560
    vocab_size: 25055
    sos: 1
    eos: 2
    ignore_id: -1

  # Same values as `model.vad_model` above, without the `_target_` keys.
  # Keep the two mappings in sync when editing either one.
  vad_kwargs:
    encoder:
      input_dim: 400
      input_affine_dim: 140
      fsmn_layers: 4
      linear_dim: 250
      proj_dim: 128
      lorder: 20
      rorder: 0
      lstride: 1
      rstride: 0
      output_affine_dim: 140
      output_dim: 248

    frontend:
      fs: 16000
      window: hamming
      n_mels: 80
      frame_length: 25
      frame_shift: 10
      dither: 0.0
      lfr_m: 5
      lfr_n: 1
      # NOTE(review): assumed to belong to the frontend; it may instead be a
      # direct child of `vad_kwargs` - confirm against the consumer.
      cmvn_file: /storage-root/datasets/zhaojingtong/checkpoints/sensevoice/am.mvn

    sample_rate: 16000
    detect_mode: 1
    snr_mode: 0
    max_end_silence_time: 800
    max_start_silence_time: 3000
    do_start_point_detection: true
    do_end_point_detection: true
    window_size_ms: 200
    sil_to_speech_time_thres: 150
    speech_to_sil_time_thres: 150
    speech_2_noise_ratio: 1.0
    do_extend: 1
    lookback_time_start_point: 200
    lookahead_time_end_point: 100
    max_single_segment_time: 60000
    snr_thres: -100.0
    noise_frame_num_used_for_snr: 100
    decibel_thres: -100.0
    speech_noise_thres: 0.6
    fe_prior_thres: 0.0001
    silence_pdf_num: 1
    sil_pdf_ids: [0]
    speech_noise_thresh_low: -0.1
    speech_noise_thresh_high: 0.3
    output_frame_probs: false
    frame_in_ms: 10
    frame_length_ms: 25
    # NOTE(review): YAML loads `None` as the string "None", not as null.
    # Write `null` if "no tokenizer" is what is meant.
    tokenizer: None
    vocab_size: -1
    input_size: 400