ESPnet
multilingual
audio
codec
ftshijt commited on
Commit
c8247b0
·
1 Parent(s): de598d2

Update model

Browse files
Files changed (29) hide show
  1. README.md +367 -3
  2. exp_16k/codec_train_dac_fs16000_raw_fs16000/120epoch.pth +3 -0
  3. exp_16k/codec_train_dac_fs16000_raw_fs16000/config.yaml +292 -0
  4. exp_16k/codec_train_dac_fs16000_raw_fs16000/images/adv_loss.png +0 -0
  5. exp_16k/codec_train_dac_fs16000_raw_fs16000/images/codec_commit_loss.png +0 -0
  6. exp_16k/codec_train_dac_fs16000_raw_fs16000/images/codec_loss.png +0 -0
  7. exp_16k/codec_train_dac_fs16000_raw_fs16000/images/codec_quantization_loss.png +0 -0
  8. exp_16k/codec_train_dac_fs16000_raw_fs16000/images/discriminator_backward_time.png +0 -0
  9. exp_16k/codec_train_dac_fs16000_raw_fs16000/images/discriminator_forward_time.png +0 -0
  10. exp_16k/codec_train_dac_fs16000_raw_fs16000/images/discriminator_loss.png +0 -0
  11. exp_16k/codec_train_dac_fs16000_raw_fs16000/images/discriminator_optim_step_time.png +0 -0
  12. exp_16k/codec_train_dac_fs16000_raw_fs16000/images/discriminator_train_time.png +0 -0
  13. exp_16k/codec_train_dac_fs16000_raw_fs16000/images/fake_loss.png +0 -0
  14. exp_16k/codec_train_dac_fs16000_raw_fs16000/images/feat_match_loss.png +0 -0
  15. exp_16k/codec_train_dac_fs16000_raw_fs16000/images/generator_backward_time.png +0 -0
  16. exp_16k/codec_train_dac_fs16000_raw_fs16000/images/generator_forward_time.png +0 -0
  17. exp_16k/codec_train_dac_fs16000_raw_fs16000/images/generator_optim_step_time.png +0 -0
  18. exp_16k/codec_train_dac_fs16000_raw_fs16000/images/generator_train_time.png +0 -0
  19. exp_16k/codec_train_dac_fs16000_raw_fs16000/images/gpu_max_cached_mem_GB.png +0 -0
  20. exp_16k/codec_train_dac_fs16000_raw_fs16000/images/iter_time.png +0 -0
  21. exp_16k/codec_train_dac_fs16000_raw_fs16000/images/loss.png +0 -0
  22. exp_16k/codec_train_dac_fs16000_raw_fs16000/images/mel_loss.png +0 -0
  23. exp_16k/codec_train_dac_fs16000_raw_fs16000/images/mel_loss_real.png +0 -0
  24. exp_16k/codec_train_dac_fs16000_raw_fs16000/images/optim0_lr0.png +0 -0
  25. exp_16k/codec_train_dac_fs16000_raw_fs16000/images/optim1_lr0.png +0 -0
  26. exp_16k/codec_train_dac_fs16000_raw_fs16000/images/real_loss.png +0 -0
  27. exp_16k/codec_train_dac_fs16000_raw_fs16000/images/reconstruct_loss.png +0 -0
  28. exp_16k/codec_train_dac_fs16000_raw_fs16000/images/train_time.png +0 -0
  29. meta.yaml +8 -0
README.md CHANGED
@@ -1,3 +1,367 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - codec
6
+ language: multilingual
7
+ datasets:
8
+ - amuse
9
+ license: cc-by-4.0
10
+ ---
11
+
12
+ ## ESPnet2 Codec model
13
+
14
+ ### `espnet/amuse_dac_16k`
15
+
16
+ This model was trained by ftshijt using the amuse recipe in [espnet](https://github.com/espnet/espnet/).
17
+
18
+ ### Demo: How to use in ESPnet2
19
+
20
+ Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html)
21
+ if you haven't done that already.
22
+
23
+ ```bash
24
+ cd espnet
25
+ git checkout 5201685018b0e8fb9826bc51a710623140a06627
26
+ pip install -e .
27
+ cd egs2/amuse/codec1
28
+ ./run.sh --skip_data_prep false --skip_train true --download_model espnet/amuse_dac_16k
29
+ ```
30
+
31
+
32
+
33
+ ## Codec config
34
+
35
+ <details><summary>expand</summary>
36
+
37
+ ```
38
+ config: conf/train_dac_fs16000.yaml
39
+ print_config: false
40
+ log_level: INFO
41
+ drop_last_iter: false
42
+ dry_run: false
43
+ iterator_type: chunk
44
+ valid_iterator_type: null
45
+ output_dir: exp_16k/codec_train_dac_fs16000_raw_fs16000
46
+ ngpu: 1
47
+ seed: 777
48
+ num_workers: 1
49
+ num_att_plot: 0
50
+ dist_backend: nccl
51
+ dist_init_method: env://
52
+ dist_world_size: 4
53
+ dist_rank: 0
54
+ local_rank: 0
55
+ dist_master_addr: localhost
56
+ dist_master_port: 50493
57
+ dist_launcher: null
58
+ multiprocessing_distributed: true
59
+ unused_parameters: true
60
+ sharded_ddp: false
61
+ cudnn_enabled: true
62
+ cudnn_benchmark: false
63
+ cudnn_deterministic: false
64
+ use_tf32: true
65
+ collect_stats: false
66
+ write_collected_feats: false
67
+ max_epoch: 120
68
+ patience: null
69
+ val_scheduler_criterion:
70
+ - valid
71
+ - loss
72
+ early_stopping_criterion:
73
+ - valid
74
+ - loss
75
+ - min
76
+ best_model_criterion:
77
+ - - valid
78
+ - mel_loss
79
+ - min
80
+ - - train
81
+ - mel_loss
82
+ - min
83
+ - - train
84
+ - total_count
85
+ - max
86
+ keep_nbest_models: 5
87
+ nbest_averaging_interval: 0
88
+ grad_clip: -1
89
+ grad_clip_type: 2.0
90
+ grad_noise: false
91
+ accum_grad: 1
92
+ no_forward_run: false
93
+ resume: true
94
+ train_dtype: float32
95
+ use_amp: false
96
+ log_interval: 1000
97
+ use_matplotlib: true
98
+ use_tensorboard: true
99
+ create_graph_in_tensorboard: false
100
+ use_wandb: false
101
+ wandb_project: null
102
+ wandb_id: null
103
+ wandb_entity: null
104
+ wandb_name: null
105
+ wandb_model_log_interval: -1
106
+ detect_anomaly: false
107
+ use_adapter: false
108
+ adapter: lora
109
+ save_strategy: all
110
+ adapter_conf: {}
111
+ pretrain_path: null
112
+ init_param: []
113
+ ignore_init_mismatch: false
114
+ freeze_param: []
115
+ num_iters_per_epoch: 5000
116
+ batch_size: 64
117
+ valid_batch_size: null
118
+ batch_bins: 1000000
119
+ valid_batch_bins: null
120
+ train_shape_file:
121
+ - exp_16k/codec_stats_raw/train/audio_shape
122
+ valid_shape_file:
123
+ - exp_16k/codec_stats_raw/valid/audio_shape
124
+ batch_type: unsorted
125
+ valid_batch_type: null
126
+ fold_length:
127
+ - 256000
128
+ sort_in_batch: descending
129
+ shuffle_within_batch: false
130
+ sort_batch: descending
131
+ multiple_iterator: false
132
+ chunk_length: 32000
133
+ chunk_shift_ratio: 0.5
134
+ num_cache_chunks: 128
135
+ chunk_excluded_key_prefixes: []
136
+ chunk_default_fs: null
137
+ train_data_path_and_name_and_type:
138
+ - - dump_16k/raw/train/wav.scp
139
+ - audio
140
+ - kaldi_ark
141
+ valid_data_path_and_name_and_type:
142
+ - - dump_16k/raw/dev-small/wav.scp
143
+ - audio
144
+ - kaldi_ark
145
+ multi_task_dataset: false
146
+ allow_variable_data_keys: false
147
+ max_cache_size: 0.0
148
+ max_cache_fd: 32
149
+ allow_multi_rates: false
150
+ valid_max_cache_size: null
151
+ exclude_weight_decay: false
152
+ exclude_weight_decay_conf: {}
153
+ optim: adam
154
+ optim_conf:
155
+ lr: 0.0002
156
+ betas:
157
+ - 0.5
158
+ - 0.9
159
+ eps: 1.0e-09
160
+ weight_decay: 0.0
161
+ scheduler: exponentiallr
162
+ scheduler_conf:
163
+ gamma: 0.999875
164
+ optim2: adam
165
+ optim2_conf:
166
+ lr: 0.0002
167
+ betas:
168
+ - 0.5
169
+ - 0.9
170
+ eps: 1.0e-09
171
+ weight_decay: 0.0
172
+ scheduler2: exponentiallr
173
+ scheduler2_conf:
174
+ gamma: 0.999875
175
+ generator_first: true
176
+ skip_discriminator_prob: 0.0
177
+ model_conf: {}
178
+ use_preprocessor: true
179
+ codec: dac
180
+ codec_conf:
181
+ sampling_rate: 16000
182
+ generator_params:
183
+ hidden_dim: 512
184
+ codebook_dim: 512
185
+ encdec_channels: 1
186
+ encdec_n_filters: 32
187
+ encdec_n_residual_layers: 3
188
+ encdec_ratios:
189
+ - 8
190
+ - 5
191
+ - 4
192
+ - 2
193
+ encdec_activation: Snake
194
+ encdec_norm: weight_norm
195
+ encdec_kernel_size: 7
196
+ encdec_residual_kernel_size: 7
197
+ encdec_last_kernel_size: 7
198
+ encdec_dilation_base: 2
199
+ encdec_causal: false
200
+ encdec_pad_mode: reflect
201
+ encdec_true_skip: false
202
+ encdec_compress: 2
203
+ encdec_lstm: 2
204
+ decoder_trim_right_ratio: 1.0
205
+ decoder_final_activation: null
206
+ decoder_final_activation_params: null
207
+ quantizer_n_q: 32
208
+ quantizer_bins: 1024
209
+ quantizer_decay: 0.99
210
+ quantizer_kmeans_init: true
211
+ quantizer_kmeans_iters: 50
212
+ quantizer_threshold_ema_dead_code: 2
213
+ quantizer_target_bandwidth:
214
+ - 2
215
+ - 4
216
+ - 8
217
+ - 16
218
+ - 32
219
+ quantizer_dropout: true
220
+ sample_rate: 16000
221
+ discriminator_params:
222
+ scales: 3
223
+ scale_downsample_pooling: AvgPool1d
224
+ scale_downsample_pooling_params:
225
+ kernel_size: 4
226
+ stride: 2
227
+ padding: 2
228
+ scale_discriminator_params:
229
+ in_channels: 1
230
+ out_channels: 1
231
+ kernel_sizes:
232
+ - 15
233
+ - 41
234
+ - 5
235
+ - 3
236
+ channels: 128
237
+ max_downsample_channels: 1024
238
+ max_groups: 16
239
+ bias: true
240
+ downsample_scales:
241
+ - 2
242
+ - 2
243
+ - 4
244
+ - 4
245
+ - 1
246
+ nonlinear_activation: LeakyReLU
247
+ nonlinear_activation_params:
248
+ negative_slope: 0.1
249
+ scale_follow_official_norm: false
250
+ msmpmb_discriminator_params:
251
+ rates: []
252
+ sample_rate: 16000
253
+ fft_sizes:
254
+ - 2048
255
+ - 1024
256
+ - 512
257
+ periods:
258
+ - 2
259
+ - 3
260
+ - 5
261
+ - 7
262
+ - 11
263
+ period_discriminator_params:
264
+ in_channels: 1
265
+ out_channels: 1
266
+ kernel_sizes:
267
+ - 5
268
+ - 3
269
+ channels: 32
270
+ downsample_scales:
271
+ - 3
272
+ - 3
273
+ - 3
274
+ - 3
275
+ - 1
276
+ max_downsample_channels: 1024
277
+ bias: true
278
+ nonlinear_activation: LeakyReLU
279
+ nonlinear_activation_params:
280
+ negative_slope: 0.1
281
+ use_weight_norm: true
282
+ use_spectral_norm: false
283
+ band_discriminator_params:
284
+ hop_factor: 0.25
285
+ sample_rate: 16000
286
+ bands:
287
+ - - 0.0
288
+ - 0.1
289
+ - - 0.1
290
+ - 0.25
291
+ - - 0.25
292
+ - 0.5
293
+ - - 0.5
294
+ - 0.75
295
+ - - 0.75
296
+ - 1.0
297
+ channel: 32
298
+ generator_adv_loss_params:
299
+ average_by_discriminators: false
300
+ loss_type: mse
301
+ discriminator_adv_loss_params:
302
+ average_by_discriminators: false
303
+ loss_type: mse
304
+ use_feat_match_loss: true
305
+ feat_match_loss_params:
306
+ average_by_discriminators: false
307
+ average_by_layers: false
308
+ include_final_outputs: true
309
+ use_mel_loss: true
310
+ mel_loss_params:
311
+ range_start: 6
312
+ range_end: 11
313
+ window: hann
314
+ n_mels: 80
315
+ fmin: 0
316
+ fmax: null
317
+ log_base: null
318
+ fs: 16000
319
+ lambda_quantization: 0.25
320
+ lambda_commit: 1.0
321
+ lambda_reconstruct: 1.0
322
+ lambda_adv: 1.0
323
+ lambda_mel: 45.0
324
+ lambda_feat_match: 2.0
325
+ cache_generator_outputs: true
326
+ required:
327
+ - output_dir
328
+ version: '202402'
329
+ distributed: true
330
+ ```
331
+
332
+ </details>
333
+
334
+
335
+
336
+ ### Citing ESPnet
337
+
338
+ ```bibtex
339
+ @inproceedings{watanabe2018espnet,
340
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
341
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
342
+ year={2018},
343
+ booktitle={Proceedings of Interspeech},
344
+ pages={2207--2211},
345
+ doi={10.21437/Interspeech.2018-1456},
346
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
347
+ }
348
+
349
+
350
+
351
+
352
+
353
+
354
+ ```
355
+
356
+ or arXiv:
357
+
358
+ ```bibtex
359
+ @misc{watanabe2018espnet,
360
+ title={ESPnet: End-to-End Speech Processing Toolkit},
361
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
362
+ year={2018},
363
+ eprint={1804.00015},
364
+ archivePrefix={arXiv},
365
+ primaryClass={cs.CL}
366
+ }
367
+ ```
exp_16k/codec_train_dac_fs16000_raw_fs16000/120epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba776ac0027d65d5e835be6675ac6e2fa589a6fb14ec299cf0d7a1b491695102
3
+ size 383902703
exp_16k/codec_train_dac_fs16000_raw_fs16000/config.yaml ADDED
@@ -0,0 +1,292 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/train_dac_fs16000.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ drop_last_iter: false
5
+ dry_run: false
6
+ iterator_type: chunk
7
+ valid_iterator_type: null
8
+ output_dir: exp_16k/codec_train_dac_fs16000_raw_fs16000
9
+ ngpu: 1
10
+ seed: 777
11
+ num_workers: 1
12
+ num_att_plot: 0
13
+ dist_backend: nccl
14
+ dist_init_method: env://
15
+ dist_world_size: 4
16
+ dist_rank: 0
17
+ local_rank: 0
18
+ dist_master_addr: localhost
19
+ dist_master_port: 50493
20
+ dist_launcher: null
21
+ multiprocessing_distributed: true
22
+ unused_parameters: true
23
+ sharded_ddp: false
24
+ cudnn_enabled: true
25
+ cudnn_benchmark: false
26
+ cudnn_deterministic: false
27
+ use_tf32: true
28
+ collect_stats: false
29
+ write_collected_feats: false
30
+ max_epoch: 120
31
+ patience: null
32
+ val_scheduler_criterion:
33
+ - valid
34
+ - loss
35
+ early_stopping_criterion:
36
+ - valid
37
+ - loss
38
+ - min
39
+ best_model_criterion:
40
+ - - valid
41
+ - mel_loss
42
+ - min
43
+ - - train
44
+ - mel_loss
45
+ - min
46
+ - - train
47
+ - total_count
48
+ - max
49
+ keep_nbest_models: 5
50
+ nbest_averaging_interval: 0
51
+ grad_clip: -1
52
+ grad_clip_type: 2.0
53
+ grad_noise: false
54
+ accum_grad: 1
55
+ no_forward_run: false
56
+ resume: true
57
+ train_dtype: float32
58
+ use_amp: false
59
+ log_interval: 1000
60
+ use_matplotlib: true
61
+ use_tensorboard: true
62
+ create_graph_in_tensorboard: false
63
+ use_wandb: false
64
+ wandb_project: null
65
+ wandb_id: null
66
+ wandb_entity: null
67
+ wandb_name: null
68
+ wandb_model_log_interval: -1
69
+ detect_anomaly: false
70
+ use_adapter: false
71
+ adapter: lora
72
+ save_strategy: all
73
+ adapter_conf: {}
74
+ pretrain_path: null
75
+ init_param: []
76
+ ignore_init_mismatch: false
77
+ freeze_param: []
78
+ num_iters_per_epoch: 5000
79
+ batch_size: 64
80
+ valid_batch_size: null
81
+ batch_bins: 1000000
82
+ valid_batch_bins: null
83
+ train_shape_file:
84
+ - exp_16k/codec_stats_raw/train/audio_shape
85
+ valid_shape_file:
86
+ - exp_16k/codec_stats_raw/valid/audio_shape
87
+ batch_type: unsorted
88
+ valid_batch_type: null
89
+ fold_length:
90
+ - 256000
91
+ sort_in_batch: descending
92
+ shuffle_within_batch: false
93
+ sort_batch: descending
94
+ multiple_iterator: false
95
+ chunk_length: 32000
96
+ chunk_shift_ratio: 0.5
97
+ num_cache_chunks: 128
98
+ chunk_excluded_key_prefixes: []
99
+ chunk_default_fs: null
100
+ train_data_path_and_name_and_type:
101
+ - - dump_16k/raw/train/wav.scp
102
+ - audio
103
+ - kaldi_ark
104
+ valid_data_path_and_name_and_type:
105
+ - - dump_16k/raw/dev-small/wav.scp
106
+ - audio
107
+ - kaldi_ark
108
+ multi_task_dataset: false
109
+ allow_variable_data_keys: false
110
+ max_cache_size: 0.0
111
+ max_cache_fd: 32
112
+ allow_multi_rates: false
113
+ valid_max_cache_size: null
114
+ exclude_weight_decay: false
115
+ exclude_weight_decay_conf: {}
116
+ optim: adam
117
+ optim_conf:
118
+ lr: 0.0002
119
+ betas:
120
+ - 0.5
121
+ - 0.9
122
+ eps: 1.0e-09
123
+ weight_decay: 0.0
124
+ scheduler: exponentiallr
125
+ scheduler_conf:
126
+ gamma: 0.999875
127
+ optim2: adam
128
+ optim2_conf:
129
+ lr: 0.0002
130
+ betas:
131
+ - 0.5
132
+ - 0.9
133
+ eps: 1.0e-09
134
+ weight_decay: 0.0
135
+ scheduler2: exponentiallr
136
+ scheduler2_conf:
137
+ gamma: 0.999875
138
+ generator_first: true
139
+ skip_discriminator_prob: 0.0
140
+ model_conf: {}
141
+ use_preprocessor: true
142
+ codec: dac
143
+ codec_conf:
144
+ sampling_rate: 16000
145
+ generator_params:
146
+ hidden_dim: 512
147
+ codebook_dim: 512
148
+ encdec_channels: 1
149
+ encdec_n_filters: 32
150
+ encdec_n_residual_layers: 3
151
+ encdec_ratios:
152
+ - 8
153
+ - 5
154
+ - 4
155
+ - 2
156
+ encdec_activation: Snake
157
+ encdec_norm: weight_norm
158
+ encdec_kernel_size: 7
159
+ encdec_residual_kernel_size: 7
160
+ encdec_last_kernel_size: 7
161
+ encdec_dilation_base: 2
162
+ encdec_causal: false
163
+ encdec_pad_mode: reflect
164
+ encdec_true_skip: false
165
+ encdec_compress: 2
166
+ encdec_lstm: 2
167
+ decoder_trim_right_ratio: 1.0
168
+ decoder_final_activation: null
169
+ decoder_final_activation_params: null
170
+ quantizer_n_q: 32
171
+ quantizer_bins: 1024
172
+ quantizer_decay: 0.99
173
+ quantizer_kmeans_init: true
174
+ quantizer_kmeans_iters: 50
175
+ quantizer_threshold_ema_dead_code: 2
176
+ quantizer_target_bandwidth:
177
+ - 2
178
+ - 4
179
+ - 8
180
+ - 16
181
+ - 32
182
+ quantizer_dropout: true
183
+ sample_rate: 16000
184
+ discriminator_params:
185
+ scales: 3
186
+ scale_downsample_pooling: AvgPool1d
187
+ scale_downsample_pooling_params:
188
+ kernel_size: 4
189
+ stride: 2
190
+ padding: 2
191
+ scale_discriminator_params:
192
+ in_channels: 1
193
+ out_channels: 1
194
+ kernel_sizes:
195
+ - 15
196
+ - 41
197
+ - 5
198
+ - 3
199
+ channels: 128
200
+ max_downsample_channels: 1024
201
+ max_groups: 16
202
+ bias: true
203
+ downsample_scales:
204
+ - 2
205
+ - 2
206
+ - 4
207
+ - 4
208
+ - 1
209
+ nonlinear_activation: LeakyReLU
210
+ nonlinear_activation_params:
211
+ negative_slope: 0.1
212
+ scale_follow_official_norm: false
213
+ msmpmb_discriminator_params:
214
+ rates: []
215
+ sample_rate: 16000
216
+ fft_sizes:
217
+ - 2048
218
+ - 1024
219
+ - 512
220
+ periods:
221
+ - 2
222
+ - 3
223
+ - 5
224
+ - 7
225
+ - 11
226
+ period_discriminator_params:
227
+ in_channels: 1
228
+ out_channels: 1
229
+ kernel_sizes:
230
+ - 5
231
+ - 3
232
+ channels: 32
233
+ downsample_scales:
234
+ - 3
235
+ - 3
236
+ - 3
237
+ - 3
238
+ - 1
239
+ max_downsample_channels: 1024
240
+ bias: true
241
+ nonlinear_activation: LeakyReLU
242
+ nonlinear_activation_params:
243
+ negative_slope: 0.1
244
+ use_weight_norm: true
245
+ use_spectral_norm: false
246
+ band_discriminator_params:
247
+ hop_factor: 0.25
248
+ sample_rate: 16000
249
+ bands:
250
+ - - 0.0
251
+ - 0.1
252
+ - - 0.1
253
+ - 0.25
254
+ - - 0.25
255
+ - 0.5
256
+ - - 0.5
257
+ - 0.75
258
+ - - 0.75
259
+ - 1.0
260
+ channel: 32
261
+ generator_adv_loss_params:
262
+ average_by_discriminators: false
263
+ loss_type: mse
264
+ discriminator_adv_loss_params:
265
+ average_by_discriminators: false
266
+ loss_type: mse
267
+ use_feat_match_loss: true
268
+ feat_match_loss_params:
269
+ average_by_discriminators: false
270
+ average_by_layers: false
271
+ include_final_outputs: true
272
+ use_mel_loss: true
273
+ mel_loss_params:
274
+ range_start: 6
275
+ range_end: 11
276
+ window: hann
277
+ n_mels: 80
278
+ fmin: 0
279
+ fmax: null
280
+ log_base: null
281
+ fs: 16000
282
+ lambda_quantization: 0.25
283
+ lambda_commit: 1.0
284
+ lambda_reconstruct: 1.0
285
+ lambda_adv: 1.0
286
+ lambda_mel: 45.0
287
+ lambda_feat_match: 2.0
288
+ cache_generator_outputs: true
289
+ required:
290
+ - output_dir
291
+ version: '202402'
292
+ distributed: true
exp_16k/codec_train_dac_fs16000_raw_fs16000/images/adv_loss.png ADDED
exp_16k/codec_train_dac_fs16000_raw_fs16000/images/codec_commit_loss.png ADDED
exp_16k/codec_train_dac_fs16000_raw_fs16000/images/codec_loss.png ADDED
exp_16k/codec_train_dac_fs16000_raw_fs16000/images/codec_quantization_loss.png ADDED
exp_16k/codec_train_dac_fs16000_raw_fs16000/images/discriminator_backward_time.png ADDED
exp_16k/codec_train_dac_fs16000_raw_fs16000/images/discriminator_forward_time.png ADDED
exp_16k/codec_train_dac_fs16000_raw_fs16000/images/discriminator_loss.png ADDED
exp_16k/codec_train_dac_fs16000_raw_fs16000/images/discriminator_optim_step_time.png ADDED
exp_16k/codec_train_dac_fs16000_raw_fs16000/images/discriminator_train_time.png ADDED
exp_16k/codec_train_dac_fs16000_raw_fs16000/images/fake_loss.png ADDED
exp_16k/codec_train_dac_fs16000_raw_fs16000/images/feat_match_loss.png ADDED
exp_16k/codec_train_dac_fs16000_raw_fs16000/images/generator_backward_time.png ADDED
exp_16k/codec_train_dac_fs16000_raw_fs16000/images/generator_forward_time.png ADDED
exp_16k/codec_train_dac_fs16000_raw_fs16000/images/generator_optim_step_time.png ADDED
exp_16k/codec_train_dac_fs16000_raw_fs16000/images/generator_train_time.png ADDED
exp_16k/codec_train_dac_fs16000_raw_fs16000/images/gpu_max_cached_mem_GB.png ADDED
exp_16k/codec_train_dac_fs16000_raw_fs16000/images/iter_time.png ADDED
exp_16k/codec_train_dac_fs16000_raw_fs16000/images/loss.png ADDED
exp_16k/codec_train_dac_fs16000_raw_fs16000/images/mel_loss.png ADDED
exp_16k/codec_train_dac_fs16000_raw_fs16000/images/mel_loss_real.png ADDED
exp_16k/codec_train_dac_fs16000_raw_fs16000/images/optim0_lr0.png ADDED
exp_16k/codec_train_dac_fs16000_raw_fs16000/images/optim1_lr0.png ADDED
exp_16k/codec_train_dac_fs16000_raw_fs16000/images/real_loss.png ADDED
exp_16k/codec_train_dac_fs16000_raw_fs16000/images/reconstruct_loss.png ADDED
exp_16k/codec_train_dac_fs16000_raw_fs16000/images/train_time.png ADDED
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: '202402'
2
+ files:
3
+ model_file: exp_16k/codec_train_dac_fs16000_raw_fs16000/120epoch.pth
4
+ python: 3.10.14 | packaged by conda-forge | (main, Mar 20 2024, 12:45:18) [GCC 12.3.0]
5
+ timestamp: 1719068420.822983
6
+ torch: 2.3.0+cu118
7
+ yaml_files:
8
+ train_config: exp_16k/codec_train_dac_fs16000_raw_fs16000/config.yaml