seed_everything: 3407

data:
  class_path: decoder.dataset.VocosDataModule
  init_args:
    train_params:
      filelist_path: ./WavTokenizer/data/train/libritts_train
      sampling_rate: 24000
      num_samples: 72000  # 3-second crops: 72000 samples / 24000 Hz
      batch_size: 40  # alternative value: 20
      num_workers: 8

    val_params:
      filelist_path: ./WavTokenizer/data/infer/libritts_val
      sampling_rate: 24000
      num_samples: 72000
      batch_size: 5  # alternative value: 10
      num_workers: 8
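    # Each filelist is assumed to be a plain-text file listing one audio path
    # per line, as in the Vocos-style data pipeline this module is based on.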

model:
  class_path: decoder.experiment.WavTokenizer
  init_args:
    sample_rate: 24000
    initial_learning_rate: 2e-4
    mel_loss_coeff: 45
    mrd_loss_coeff: 1.0
    num_warmup_steps: 0  # warmup steps for the optimizers
    pretrain_mel_steps: 0  # 0 means GAN objective from the first iteration

    # automatic evaluation
    evaluate_utmos: true
    evaluate_pesq: true
    evaluate_periodicty: true  # spelling matches the parameter name used by the training code

    resume: false
    resume_config: ./WavTokenizer/configs/wavtokenizer_smalldata_frame75_3s_nq1_code16384_dim512_kmeans800_attn.yaml
    resume_model: ./WavTokenizer/result/train/wavtokenizer_smalldata_frame75_3s_nq1_code16384_dim512_kmeans800_attn/vocos_checkpoint_epoch=35_step=187848_val_loss=4.1611.ckpt
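    # Assumption: resume_config and resume_model are only consulted when
    # resume above is set to true.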

    feature_extractor:
      class_path: decoder.feature_extractors.EncodecFeatures
      init_args:
        encodec_model: encodec_24khz
        bandwidths: [6.6, 6.6, 6.6, 6.6]
        train_codebooks: true
        num_quantizers: 1  
        dowmsamples: [8, 5, 4, 2]  # spelling matches the EncodecFeatures argument; 8*5*4*2 = 320 (the hop size)
        vq_bins: 4096
        vq_kmeans: 200
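        # With num_quantizers: 1 and vq_bins: 4096, each token carries
        # log2(4096) = 12 bits; at 75 tokens per second (see the head below)
        # this is a bitrate of roughly 0.9 kbps.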

    backbone:
      class_path: decoder.models.VocosBackbone
      init_args:
        input_channels: 512
        dim: 768
        intermediate_dim: 2304
        num_layers: 12
        adanorm_num_embeddings: 4  # one conditioning embedding per entry in bandwidths

    head:
      class_path: decoder.heads.ISTFTHead
      init_args:
        dim: 768
        n_fft: 1280 
        hop_length: 320  
        padding: same
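        # 24000 Hz / hop_length 320 = 75 frames per second, matching the
        # "frame75" tag in the checkpoint and log paths in this file.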

trainer:
  logger:
    class_path: pytorch_lightning.loggers.TensorBoardLogger
    init_args:
      save_dir: ./WavTokenizer/result/train/wavtokenizer_smalldata_frame75_3s_nq1_code4096_dim512_kmeans200_attn/
  callbacks:
    - class_path: pytorch_lightning.callbacks.LearningRateMonitor
    - class_path: pytorch_lightning.callbacks.ModelSummary
      init_args:
        max_depth: 2
    - class_path: pytorch_lightning.callbacks.ModelCheckpoint
      init_args:
        monitor: val_loss
        filename: vocos_checkpoint_{epoch}_{step}_{val_loss:.4f}
        save_top_k: 10
        save_last: true
    - class_path: decoder.helpers.GradNormCallback

  # Lightning counts max_steps across all optimizer steps (rather than the number of batches).
  # With the two optimizers here, 20M total steps equals 10M steps for the generator and 10M for the discriminator.
  max_steps: 20000000
  # You might want to limit val batches when evaluating all the metrics, as they are time-consuming
  limit_val_batches: 100
  accelerator: gpu
  strategy: ddp
  devices: [0,1,2,3,4,5,6,7]
  log_every_n_steps: 1000
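
# Usage sketch (assumption: the repo's train.py exposes this config via
# pytorch_lightning's LightningCLI, as described in the WavTokenizer README):
#   cd ./WavTokenizer
#   python train.py fit --config ./configs/<this_config>.yaml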