poiqazwsx commited on
Commit
9db67e9
·
1 Parent(s): 84ccf60

Delete configs

Browse files
configs/config_dnr_bandit_bsrnn_multi_mus64.yaml DELETED
@@ -1,78 +0,0 @@
1
- name: "MultiMaskMultiSourceBandSplitRNN"
2
- audio:
3
- chunk_size: 264600
4
- num_channels: 2
5
- sample_rate: 44100
6
- min_mean_abs: 0.001
7
-
8
- model:
9
- in_channel: 1
10
- stems: ['speech', 'music', 'effects']
11
- band_specs: "musical"
12
- n_bands: 64
13
- fs: 44100
14
- require_no_overlap: false
15
- require_no_gap: true
16
- normalize_channel_independently: false
17
- treat_channel_as_feature: true
18
- n_sqm_modules: 8
19
- emb_dim: 128
20
- rnn_dim: 256
21
- bidirectional: true
22
- rnn_type: "GRU"
23
- mlp_dim: 512
24
- hidden_activation: "Tanh"
25
- hidden_activation_kwargs: null
26
- complex_mask: true
27
- n_fft: 2048
28
- win_length: 2048
29
- hop_length: 512
30
- window_fn: "hann_window"
31
- wkwargs: null
32
- power: null
33
- center: true
34
- normalized: true
35
- pad_mode: "constant"
36
- onesided: true
37
-
38
- training:
39
- batch_size: 4
40
- gradient_accumulation_steps: 4
41
- grad_clip: 0
42
- instruments:
43
- - speech
44
- - music
45
- - effects
46
- lr: 9.0e-05
47
- patience: 2
48
- reduce_factor: 0.95
49
- target_instrument: null
50
- num_epochs: 1000
51
- num_steps: 1000
52
- q: 0.95
53
- coarse_loss_clip: true
54
- ema_momentum: 0.999
55
- optimizer: adam
56
- other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
57
- use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
58
-
59
- augmentations:
60
- enable: true # enable or disable all augmentations (to fast disable if needed)
61
- loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
62
- loudness_min: 0.5
63
- loudness_max: 1.5
64
- mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
65
- mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
66
- - 0.2
67
- - 0.02
68
- mixup_loudness_min: 0.5
69
- mixup_loudness_max: 1.5
70
- all:
71
- channel_shuffle: 0.5 # Set 0 or lower to disable
72
- random_inverse: 0.1 # inverse track (better lower probability)
73
- random_polarity: 0.5 # polarity change (multiply waveform to -1)
74
-
75
- inference:
76
- batch_size: 1
77
- dim_t: 256
78
- num_overlap: 4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
configs/config_htdemucs_6stems.yaml DELETED
@@ -1,127 +0,0 @@
1
- audio:
2
- chunk_size: 485100 # samplerate * segment
3
- min_mean_abs: 0.001
4
- hop_length: 1024
5
-
6
- training:
7
- batch_size: 8
8
- gradient_accumulation_steps: 1
9
- grad_clip: 0
10
- segment: 11
11
- shift: 1
12
- samplerate: 44100
13
- channels: 2
14
- normalize: true
15
- instruments: ['drums', 'bass', 'other', 'vocals', 'guitar', 'piano']
16
- target_instrument: null
17
- num_epochs: 1000
18
- num_steps: 1000
19
- optimizer: adam
20
- lr: 9.0e-05
21
- patience: 2
22
- reduce_factor: 0.95
23
- q: 0.95
24
- coarse_loss_clip: true
25
- ema_momentum: 0.999
26
- other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
27
- use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
28
-
29
- augmentations:
30
- enable: true # enable or disable all augmentations (to fast disable if needed)
31
- loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
32
- loudness_min: 0.5
33
- loudness_max: 1.5
34
- mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
35
- mixup_probs: [0.2, 0.02]
36
- mixup_loudness_min: 0.5
37
- mixup_loudness_max: 1.5
38
- all:
39
- channel_shuffle: 0.5 # Set 0 or lower to disable
40
- random_inverse: 0.1 # inverse track (better lower probability)
41
- random_polarity: 0.5 # polarity change (multiply waveform to -1)
42
-
43
- inference:
44
- num_overlap: 4
45
- batch_size: 8
46
-
47
- model: htdemucs
48
-
49
- htdemucs: # see demucs/htdemucs.py for a detailed description
50
- # Channels
51
- channels: 48
52
- channels_time:
53
- growth: 2
54
- # STFT
55
- num_subbands: 1
56
- nfft: 4096
57
- wiener_iters: 0
58
- end_iters: 0
59
- wiener_residual: false
60
- cac: true
61
- # Main structure
62
- depth: 4
63
- rewrite: true
64
- # Frequency Branch
65
- multi_freqs: []
66
- multi_freqs_depth: 3
67
- freq_emb: 0.2
68
- emb_scale: 10
69
- emb_smooth: true
70
- # Convolutions
71
- kernel_size: 8
72
- stride: 4
73
- time_stride: 2
74
- context: 1
75
- context_enc: 0
76
- # normalization
77
- norm_starts: 4
78
- norm_groups: 4
79
- # DConv residual branch
80
- dconv_mode: 3
81
- dconv_depth: 2
82
- dconv_comp: 8
83
- dconv_init: 1e-3
84
- # Before the Transformer
85
- bottom_channels: 0
86
- # CrossTransformer
87
- # ------ Common to all
88
- # Regular parameters
89
- t_layers: 5
90
- t_hidden_scale: 4.0
91
- t_heads: 8
92
- t_dropout: 0.0
93
- t_layer_scale: True
94
- t_gelu: True
95
- # ------------- Positional Embedding
96
- t_emb: sin
97
- t_max_positions: 10000 # for the scaled embedding
98
- t_max_period: 10000.0
99
- t_weight_pos_embed: 1.0
100
- t_cape_mean_normalize: True
101
- t_cape_augment: True
102
- t_cape_glob_loc_scale: [5000.0, 1.0, 1.4]
103
- t_sin_random_shift: 0
104
- # ------------- norm before a transformer encoder
105
- t_norm_in: True
106
- t_norm_in_group: False
107
- # ------------- norm inside the encoder
108
- t_group_norm: False
109
- t_norm_first: True
110
- t_norm_out: True
111
- # ------------- optim
112
- t_weight_decay: 0.0
113
- t_lr:
114
- # ------------- sparsity
115
- t_sparse_self_attn: False
116
- t_sparse_cross_attn: False
117
- t_mask_type: diag
118
- t_mask_random_seed: 42
119
- t_sparse_attn_window: 400
120
- t_global_window: 100
121
- t_sparsity: 0.95
122
- t_auto_sparsity: False
123
- # Cross Encoder First (False)
124
- t_cross_first: False
125
- # Weight init
126
- rescale: 0.1
127
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
configs/config_musdb18_bs_roformer.yaml DELETED
@@ -1,134 +0,0 @@
1
- audio:
2
- chunk_size: 131584
3
- dim_f: 1024
4
- dim_t: 256
5
- hop_length: 512
6
- n_fft: 2048
7
- num_channels: 2
8
- sample_rate: 44100
9
- min_mean_abs: 0.001
10
-
11
- model:
12
- dim: 192
13
- depth: 6
14
- stereo: true
15
- num_stems: 1
16
- time_transformer_depth: 1
17
- freq_transformer_depth: 1
18
- linear_transformer_depth: 0
19
- freqs_per_bands: !!python/tuple
20
- - 2
21
- - 2
22
- - 2
23
- - 2
24
- - 2
25
- - 2
26
- - 2
27
- - 2
28
- - 2
29
- - 2
30
- - 2
31
- - 2
32
- - 2
33
- - 2
34
- - 2
35
- - 2
36
- - 2
37
- - 2
38
- - 2
39
- - 2
40
- - 2
41
- - 2
42
- - 2
43
- - 2
44
- - 4
45
- - 4
46
- - 4
47
- - 4
48
- - 4
49
- - 4
50
- - 4
51
- - 4
52
- - 4
53
- - 4
54
- - 4
55
- - 4
56
- - 12
57
- - 12
58
- - 12
59
- - 12
60
- - 12
61
- - 12
62
- - 12
63
- - 12
64
- - 24
65
- - 24
66
- - 24
67
- - 24
68
- - 24
69
- - 24
70
- - 24
71
- - 24
72
- - 48
73
- - 48
74
- - 48
75
- - 48
76
- - 48
77
- - 48
78
- - 48
79
- - 48
80
- - 128
81
- - 129
82
- dim_head: 64
83
- heads: 8
84
- attn_dropout: 0.1
85
- ff_dropout: 0.1
86
- flash_attn: true
87
- dim_freqs_in: 1025
88
- stft_n_fft: 2048
89
- stft_hop_length: 512
90
- stft_win_length: 2048
91
- stft_normalized: false
92
- mask_estimator_depth: 2
93
- multi_stft_resolution_loss_weight: 1.0
94
- multi_stft_resolutions_window_sizes: !!python/tuple
95
- - 4096
96
- - 2048
97
- - 1024
98
- - 512
99
- - 256
100
- multi_stft_hop_size: 147
101
- multi_stft_normalized: False
102
-
103
- training:
104
- batch_size: 10
105
- gradient_accumulation_steps: 1
106
- grad_clip: 0
107
- instruments:
108
- - vocals
109
- - bass
110
- - drums
111
- - other
112
- lr: 5.0e-05
113
- patience: 2
114
- reduce_factor: 0.95
115
- target_instrument: vocals
116
- num_epochs: 1000
117
- num_steps: 1000
118
- q: 0.95
119
- coarse_loss_clip: true
120
- ema_momentum: 0.999
121
- optimizer: adam
122
- other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
123
- use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
124
-
125
- augmentations:
126
- enable: true # enable or disable all augmentations (to fast disable if needed)
127
- loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
128
- loudness_min: 0.5
129
- loudness_max: 1.5
130
-
131
- inference:
132
- batch_size: 1
133
- dim_t: 256
134
- num_overlap: 4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
configs/config_musdb18_demucs3_mmi.yaml DELETED
@@ -1,72 +0,0 @@
1
- audio:
2
- chunk_size: 485100 # samplerate * segment
3
- min_mean_abs: 0.000
4
- hop_length: 1024
5
-
6
- training:
7
- batch_size: 8
8
- gradient_accumulation_steps: 1
9
- grad_clip: 0
10
- segment: 11
11
- shift: 1
12
- samplerate: 44100
13
- channels: 2
14
- normalize: true
15
- instruments: ['drums', 'bass', 'other', 'vocals']
16
- target_instrument: null
17
- num_epochs: 1000
18
- num_steps: 1000
19
- optimizer: adam
20
- lr: 9.0e-05
21
- patience: 2
22
- reduce_factor: 0.95
23
- q: 0.95
24
- coarse_loss_clip: true
25
- ema_momentum: 0.999
26
- other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
27
- use_amp: false # enable or disable usage of mixed precision (float16) - usually it must be true
28
-
29
- augmentations:
30
- enable: true # enable or disable all augmentations (to fast disable if needed)
31
- loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
32
- loudness_min: 0.5
33
- loudness_max: 1.5
34
-
35
- inference:
36
- num_overlap: 4
37
- batch_size: 8
38
-
39
- model: hdemucs
40
-
41
- hdemucs: # see demucs/hdemucs.py for a detailed description
42
- channels: 48
43
- channels_time: null
44
- growth: 2
45
- nfft: 4096
46
- wiener_iters: 0
47
- end_iters: 0
48
- wiener_residual: False
49
- cac: True
50
- depth: 6
51
- rewrite: True
52
- hybrid: True
53
- hybrid_old: False
54
- multi_freqs: []
55
- multi_freqs_depth: 3
56
- freq_emb: 0.2
57
- emb_scale: 10
58
- emb_smooth: True
59
- kernel_size: 8
60
- stride: 4
61
- time_stride: 2
62
- context: 1
63
- context_enc: 0
64
- norm_starts: 4
65
- norm_groups: 4
66
- dconv_mode: 1
67
- dconv_depth: 2
68
- dconv_comp: 4
69
- dconv_attn: 4
70
- dconv_lstm: 4
71
- dconv_init: 0.001
72
- rescale: 0.1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
configs/config_musdb18_htdemucs.yaml DELETED
@@ -1,119 +0,0 @@
1
- audio:
2
- chunk_size: 485100 # samplerate * segment
3
- min_mean_abs: 0.001
4
- hop_length: 1024
5
-
6
- training:
7
- batch_size: 8
8
- gradient_accumulation_steps: 1
9
- grad_clip: 0
10
- segment: 11
11
- shift: 1
12
- samplerate: 44100
13
- channels: 2
14
- normalize: true
15
- instruments: ['drums', 'bass', 'other', 'vocals']
16
- target_instrument: null
17
- num_epochs: 1000
18
- num_steps: 1000
19
- optimizer: adam
20
- lr: 9.0e-05
21
- patience: 2
22
- reduce_factor: 0.95
23
- q: 0.95
24
- coarse_loss_clip: true
25
- ema_momentum: 0.999
26
- other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
27
- use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
28
-
29
- augmentations:
30
- enable: true # enable or disable all augmentations (to fast disable if needed)
31
- loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
32
- loudness_min: 0.5
33
- loudness_max: 1.5
34
-
35
- inference:
36
- num_overlap: 4
37
- batch_size: 8
38
-
39
- model: htdemucs
40
-
41
- htdemucs: # see demucs/htdemucs.py for a detailed description
42
- # Channels
43
- channels: 48
44
- channels_time:
45
- growth: 2
46
- # STFT
47
- num_subbands: 1
48
- nfft: 4096
49
- wiener_iters: 0
50
- end_iters: 0
51
- wiener_residual: false
52
- cac: true
53
- # Main structure
54
- depth: 4
55
- rewrite: true
56
- # Frequency Branch
57
- multi_freqs: []
58
- multi_freqs_depth: 3
59
- freq_emb: 0.2
60
- emb_scale: 10
61
- emb_smooth: true
62
- # Convolutions
63
- kernel_size: 8
64
- stride: 4
65
- time_stride: 2
66
- context: 1
67
- context_enc: 0
68
- # normalization
69
- norm_starts: 4
70
- norm_groups: 4
71
- # DConv residual branch
72
- dconv_mode: 3
73
- dconv_depth: 2
74
- dconv_comp: 8
75
- dconv_init: 1e-3
76
- # Before the Transformer
77
- bottom_channels: 512
78
- # CrossTransformer
79
- # ------ Common to all
80
- # Regular parameters
81
- t_layers: 5
82
- t_hidden_scale: 4.0
83
- t_heads: 8
84
- t_dropout: 0.0
85
- t_layer_scale: True
86
- t_gelu: True
87
- # ------------- Positional Embedding
88
- t_emb: sin
89
- t_max_positions: 10000 # for the scaled embedding
90
- t_max_period: 10000.0
91
- t_weight_pos_embed: 1.0
92
- t_cape_mean_normalize: True
93
- t_cape_augment: True
94
- t_cape_glob_loc_scale: [5000.0, 1.0, 1.4]
95
- t_sin_random_shift: 0
96
- # ------------- norm before a transformer encoder
97
- t_norm_in: True
98
- t_norm_in_group: False
99
- # ------------- norm inside the encoder
100
- t_group_norm: False
101
- t_norm_first: True
102
- t_norm_out: True
103
- # ------------- optim
104
- t_weight_decay: 0.0
105
- t_lr:
106
- # ------------- sparsity
107
- t_sparse_self_attn: False
108
- t_sparse_cross_attn: False
109
- t_mask_type: diag
110
- t_mask_random_seed: 42
111
- t_sparse_attn_window: 400
112
- t_global_window: 100
113
- t_sparsity: 0.95
114
- t_auto_sparsity: False
115
- # Cross Encoder First (False)
116
- t_cross_first: False
117
- # Weight init
118
- rescale: 0.1
119
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
configs/config_musdb18_mdx23c.yaml DELETED
@@ -1,182 +0,0 @@
1
- audio:
2
- chunk_size: 261120
3
- dim_f: 4096
4
- dim_t: 256
5
- hop_length: 1024
6
- n_fft: 8192
7
- num_channels: 2
8
- sample_rate: 44100
9
- min_mean_abs: 0.001
10
-
11
- model:
12
- act: gelu
13
- bottleneck_factor: 4
14
- growth: 128
15
- norm: InstanceNorm
16
- num_blocks_per_scale: 2
17
- num_channels: 128
18
- num_scales: 5
19
- num_subbands: 4
20
- scale:
21
- - 2
22
- - 2
23
-
24
- training:
25
- batch_size: 6
26
- gradient_accumulation_steps: 1
27
- grad_clip: 0
28
- instruments:
29
- - vocals
30
- - bass
31
- - drums
32
- - other
33
- lr: 9.0e-05
34
- patience: 2
35
- reduce_factor: 0.95
36
- target_instrument: null
37
- num_epochs: 1000
38
- num_steps: 1000
39
- q: 0.95
40
- coarse_loss_clip: true
41
- ema_momentum: 0.999
42
- optimizer: adam
43
- other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
44
- use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
45
-
46
- augmentations:
47
- enable: true # enable or disable all augmentations (to fast disable if needed)
48
- loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
49
- loudness_min: 0.5
50
- loudness_max: 1.5
51
- mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
52
- mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
53
- - 0.2
54
- - 0.02
55
- mixup_loudness_min: 0.5
56
- mixup_loudness_max: 1.5
57
-
58
- # apply mp3 compression to mixture only (emulate downloading mp3 from internet)
59
- mp3_compression_on_mixture: 0.01
60
- mp3_compression_on_mixture_bitrate_min: 32
61
- mp3_compression_on_mixture_bitrate_max: 320
62
- mp3_compression_on_mixture_backend: "lameenc"
63
-
64
- all:
65
- channel_shuffle: 0.5 # Set 0 or lower to disable
66
- random_inverse: 0.1 # inverse track (better lower probability)
67
- random_polarity: 0.5 # polarity change (multiply waveform to -1)
68
- mp3_compression: 0.01
69
- mp3_compression_min_bitrate: 32
70
- mp3_compression_max_bitrate: 320
71
- mp3_compression_backend: "lameenc"
72
-
73
- # pedalboard reverb block
74
- pedalboard_reverb: 0.01
75
- pedalboard_reverb_room_size_min: 0.1
76
- pedalboard_reverb_room_size_max: 0.9
77
- pedalboard_reverb_damping_min: 0.1
78
- pedalboard_reverb_damping_max: 0.9
79
- pedalboard_reverb_wet_level_min: 0.1
80
- pedalboard_reverb_wet_level_max: 0.9
81
- pedalboard_reverb_dry_level_min: 0.1
82
- pedalboard_reverb_dry_level_max: 0.9
83
- pedalboard_reverb_width_min: 0.9
84
- pedalboard_reverb_width_max: 1.0
85
-
86
- # pedalboard chorus block
87
- pedalboard_chorus: 0.01
88
- pedalboard_chorus_rate_hz_min: 1.0
89
- pedalboard_chorus_rate_hz_max: 7.0
90
- pedalboard_chorus_depth_min: 0.25
91
- pedalboard_chorus_depth_max: 0.95
92
- pedalboard_chorus_centre_delay_ms_min: 3
93
- pedalboard_chorus_centre_delay_ms_max: 10
94
- pedalboard_chorus_feedback_min: 0.0
95
- pedalboard_chorus_feedback_max: 0.5
96
- pedalboard_chorus_mix_min: 0.1
97
- pedalboard_chorus_mix_max: 0.9
98
-
99
- # pedalboard phazer block
100
- pedalboard_phazer: 0.01
101
- pedalboard_phazer_rate_hz_min: 1.0
102
- pedalboard_phazer_rate_hz_max: 10.0
103
- pedalboard_phazer_depth_min: 0.25
104
- pedalboard_phazer_depth_max: 0.95
105
- pedalboard_phazer_centre_frequency_hz_min: 200
106
- pedalboard_phazer_centre_frequency_hz_max: 12000
107
- pedalboard_phazer_feedback_min: 0.0
108
- pedalboard_phazer_feedback_max: 0.5
109
- pedalboard_phazer_mix_min: 0.1
110
- pedalboard_phazer_mix_max: 0.9
111
-
112
- # pedalboard distortion block
113
- pedalboard_distortion: 0.01
114
- pedalboard_distortion_drive_db_min: 1.0
115
- pedalboard_distortion_drive_db_max: 25.0
116
-
117
- # pedalboard pitch shift block
118
- pedalboard_pitch_shift: 0.01
119
- pedalboard_pitch_shift_semitones_min: -7
120
- pedalboard_pitch_shift_semitones_max: 7
121
-
122
- # pedalboard resample block
123
- pedalboard_resample: 0.01
124
- pedalboard_resample_target_sample_rate_min: 4000
125
- pedalboard_resample_target_sample_rate_max: 44100
126
-
127
- # pedalboard bitcrash block
128
- pedalboard_bitcrash: 0.01
129
- pedalboard_bitcrash_bit_depth_min: 4
130
- pedalboard_bitcrash_bit_depth_max: 16
131
-
132
- # pedalboard mp3 compressor block
133
- pedalboard_mp3_compressor: 0.01
134
- pedalboard_mp3_compressor_pedalboard_mp3_compressor_min: 0
135
- pedalboard_mp3_compressor_pedalboard_mp3_compressor_max: 9.999
136
-
137
- vocals:
138
- pitch_shift: 0.1
139
- pitch_shift_min_semitones: -5
140
- pitch_shift_max_semitones: 5
141
- seven_band_parametric_eq: 0.25
142
- seven_band_parametric_eq_min_gain_db: -9
143
- seven_band_parametric_eq_max_gain_db: 9
144
- tanh_distortion: 0.1
145
- tanh_distortion_min: 0.1
146
- tanh_distortion_max: 0.7
147
- bass:
148
- pitch_shift: 0.1
149
- pitch_shift_min_semitones: -2
150
- pitch_shift_max_semitones: 2
151
- seven_band_parametric_eq: 0.25
152
- seven_band_parametric_eq_min_gain_db: -3
153
- seven_band_parametric_eq_max_gain_db: 6
154
- tanh_distortion: 0.2
155
- tanh_distortion_min: 0.1
156
- tanh_distortion_max: 0.5
157
- drums:
158
- pitch_shift: 0.33
159
- pitch_shift_min_semitones: -5
160
- pitch_shift_max_semitones: 5
161
- seven_band_parametric_eq: 0.25
162
- seven_band_parametric_eq_min_gain_db: -9
163
- seven_band_parametric_eq_max_gain_db: 9
164
- tanh_distortion: 0.33
165
- tanh_distortion_min: 0.1
166
- tanh_distortion_max: 0.6
167
- other:
168
- pitch_shift: 0.1
169
- pitch_shift_min_semitones: -4
170
- pitch_shift_max_semitones: 4
171
- gaussian_noise: 0.1
172
- gaussian_noise_min_amplitude: 0.001
173
- gaussian_noise_max_amplitude: 0.015
174
- time_stretch: 0.01
175
- time_stretch_min_rate: 0.8
176
- time_stretch_max_rate: 1.25
177
-
178
-
179
- inference:
180
- batch_size: 1
181
- dim_t: 256
182
- num_overlap: 4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
configs/config_musdb18_mel_band_roformer.yaml DELETED
@@ -1,73 +0,0 @@
1
- audio:
2
- chunk_size: 131584
3
- dim_f: 1024
4
- dim_t: 256
5
- hop_length: 512
6
- n_fft: 2048
7
- num_channels: 2
8
- sample_rate: 44100
9
- min_mean_abs: 0.001
10
-
11
- model:
12
- dim: 192
13
- depth: 8
14
- stereo: true
15
- num_stems: 1
16
- time_transformer_depth: 1
17
- freq_transformer_depth: 1
18
- linear_transformer_depth: 0
19
- num_bands: 60
20
- dim_head: 64
21
- heads: 8
22
- attn_dropout: 0.1
23
- ff_dropout: 0.1
24
- flash_attn: True
25
- dim_freqs_in: 1025
26
- sample_rate: 44100 # needed for mel filter bank from librosa
27
- stft_n_fft: 2048
28
- stft_hop_length: 512
29
- stft_win_length: 2048
30
- stft_normalized: False
31
- mask_estimator_depth: 2
32
- multi_stft_resolution_loss_weight: 1.0
33
- multi_stft_resolutions_window_sizes: !!python/tuple
34
- - 4096
35
- - 2048
36
- - 1024
37
- - 512
38
- - 256
39
- multi_stft_hop_size: 147
40
- multi_stft_normalized: False
41
-
42
- training:
43
- batch_size: 7
44
- gradient_accumulation_steps: 1
45
- grad_clip: 0
46
- instruments:
47
- - vocals
48
- - bass
49
- - drums
50
- - other
51
- lr: 5.0e-05
52
- patience: 2
53
- reduce_factor: 0.95
54
- target_instrument: vocals
55
- num_epochs: 1000
56
- num_steps: 1000
57
- q: 0.95
58
- coarse_loss_clip: true
59
- ema_momentum: 0.999
60
- optimizer: adam
61
- other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
62
- use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
63
-
64
- augmentations:
65
- enable: true # enable or disable all augmentations (to fast disable if needed)
66
- loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
67
- loudness_min: 0.5
68
- loudness_max: 1.5
69
-
70
- inference:
71
- batch_size: 1
72
- dim_t: 256
73
- num_overlap: 4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
configs/config_musdb18_scnet.yaml DELETED
@@ -1,64 +0,0 @@
1
- audio:
2
- chunk_size: 264600
3
- num_channels: 2
4
- sample_rate: 44100
5
- min_mean_abs: 0.001
6
-
7
- model:
8
- dims: [4, 32, 64, 128]
9
- bandsplit_ratios: [.175, .392, .433]
10
- downsample_strides: [1, 4, 16]
11
- n_conv_modules: [3, 2, 1]
12
- n_rnn_layers: 6
13
- rnn_hidden_dim: 128
14
- n_sources: 4
15
-
16
- n_fft: 4096
17
- hop_length: 1024
18
- win_length: 4096
19
- stft_normalized: false
20
-
21
- use_mamba: true
22
- d_state: 16
23
- d_conv: 4
24
- d_expand: 2
25
-
26
- training:
27
- batch_size: 10
28
- gradient_accumulation_steps: 1
29
- grad_clip: 0
30
- instruments:
31
- - vocals
32
- - bass
33
- - drums
34
- - other
35
- lr: 5.0e-04
36
- patience: 2
37
- reduce_factor: 0.95
38
- target_instrument: null
39
- num_epochs: 1000
40
- num_steps: 1000
41
- q: 0.95
42
- coarse_loss_clip: true
43
- ema_momentum: 0.999
44
- optimizer: adam
45
- other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
46
- use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
47
-
48
- augmentations:
49
- enable: true # enable or disable all augmentations (to fast disable if needed)
50
- loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
51
- loudness_min: 0.5
52
- loudness_max: 1.5
53
- mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
54
- mixup_probs:
55
- !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
56
- - 0.2
57
- - 0.02
58
- mixup_loudness_min: 0.5
59
- mixup_loudness_max: 1.5
60
-
61
- inference:
62
- batch_size: 1
63
- dim_t: 256
64
- num_overlap: 4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
configs/config_musdb18_segm_models.yaml DELETED
@@ -1,92 +0,0 @@
1
- audio:
2
- chunk_size: 261632
3
- dim_f: 4096
4
- dim_t: 512
5
- hop_length: 512
6
- n_fft: 8192
7
- num_channels: 2
8
- sample_rate: 44100
9
- min_mean_abs: 0.001
10
-
11
- model:
12
- encoder_name: tu-maxvit_large_tf_512 # look here for possibilities: https://github.com/qubvel/segmentation_models.pytorch#encoders-
13
- decoder_type: unet # unet, fpn
14
- act: gelu
15
- num_channels: 128
16
- num_subbands: 8
17
-
18
- training:
19
- batch_size: 7
20
- gradient_accumulation_steps: 1
21
- grad_clip: 0
22
- instruments:
23
- - vocals
24
- - bass
25
- - drums
26
- - other
27
- lr: 5.0e-05
28
- patience: 2
29
- reduce_factor: 0.95
30
- target_instrument: null
31
- num_epochs: 1000
32
- num_steps: 2000
33
- q: 0.95
34
- coarse_loss_clip: true
35
- ema_momentum: 0.999
36
- optimizer: adamw
37
- other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
38
- use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
39
-
40
- augmentations:
41
- enable: true # enable or disable all augmentations (to fast disable if needed)
42
- loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
43
- loudness_min: 0.5
44
- loudness_max: 1.5
45
- mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
46
- mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
47
- - 0.2
48
- - 0.02
49
- mixup_loudness_min: 0.5
50
- mixup_loudness_max: 1.5
51
-
52
- # apply mp3 compression to mixture only (emulate downloading mp3 from internet)
53
- mp3_compression_on_mixture: 0.01
54
- mp3_compression_on_mixture_bitrate_min: 32
55
- mp3_compression_on_mixture_bitrate_max: 320
56
- mp3_compression_on_mixture_backend: "lameenc"
57
-
58
- all:
59
- channel_shuffle: 0.5 # Set 0 or lower to disable
60
- random_inverse: 0.1 # inverse track (better lower probability)
61
- random_polarity: 0.5 # polarity change (multiply waveform to -1)
62
- mp3_compression: 0.01
63
- mp3_compression_min_bitrate: 32
64
- mp3_compression_max_bitrate: 320
65
- mp3_compression_backend: "lameenc"
66
-
67
- vocals:
68
- pitch_shift: 0.1
69
- pitch_shift_min_semitones: -5
70
- pitch_shift_max_semitones: 5
71
- seven_band_parametric_eq: 0.25
72
- seven_band_parametric_eq_min_gain_db: -9
73
- seven_band_parametric_eq_max_gain_db: 9
74
- tanh_distortion: 0.1
75
- tanh_distortion_min: 0.1
76
- tanh_distortion_max: 0.7
77
- other:
78
- pitch_shift: 0.1
79
- pitch_shift_min_semitones: -4
80
- pitch_shift_max_semitones: 4
81
- gaussian_noise: 0.1
82
- gaussian_noise_min_amplitude: 0.001
83
- gaussian_noise_max_amplitude: 0.015
84
- time_stretch: 0.01
85
- time_stretch_min_rate: 0.8
86
- time_stretch_max_rate: 1.25
87
-
88
-
89
- inference:
90
- batch_size: 1
91
- dim_t: 512
92
- num_overlap: 4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
configs/config_musdb18_torchseg.yaml DELETED
@@ -1,92 +0,0 @@
1
- audio:
2
- chunk_size: 261632
3
- dim_f: 4096
4
- dim_t: 512
5
- hop_length: 512
6
- n_fft: 8192
7
- num_channels: 2
8
- sample_rate: 44100
9
- min_mean_abs: 0.001
10
-
11
- model:
12
- encoder_name: maxvit_tiny_tf_512 # look with torchseg.list_encoders(). Currently 858 available
13
- decoder_type: unet # unet, fpn
14
- act: gelu
15
- num_channels: 128
16
- num_subbands: 8
17
-
18
- training:
19
- batch_size: 18
20
- gradient_accumulation_steps: 1
21
- grad_clip: 0
22
- instruments:
23
- - vocals
24
- - bass
25
- - drums
26
- - other
27
- lr: 5.0e-05
28
- patience: 2
29
- reduce_factor: 0.95
30
- target_instrument: null
31
- num_epochs: 1000
32
- num_steps: 2000
33
- q: 0.95
34
- coarse_loss_clip: true
35
- ema_momentum: 0.999
36
- optimizer: adamw
37
- other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
38
- use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
39
-
40
- augmentations:
41
- enable: true # enable or disable all augmentations (to fast disable if needed)
42
- loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
43
- loudness_min: 0.5
44
- loudness_max: 1.5
45
- mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
46
- mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
47
- - 0.2
48
- - 0.02
49
- mixup_loudness_min: 0.5
50
- mixup_loudness_max: 1.5
51
-
52
- # apply mp3 compression to mixture only (emulate downloading mp3 from internet)
53
- mp3_compression_on_mixture: 0.01
54
- mp3_compression_on_mixture_bitrate_min: 32
55
- mp3_compression_on_mixture_bitrate_max: 320
56
- mp3_compression_on_mixture_backend: "lameenc"
57
-
58
- all:
59
- channel_shuffle: 0.5 # Set 0 or lower to disable
60
- random_inverse: 0.1 # inverse track (better lower probability)
61
- random_polarity: 0.5 # polarity change (multiply waveform to -1)
62
- mp3_compression: 0.01
63
- mp3_compression_min_bitrate: 32
64
- mp3_compression_max_bitrate: 320
65
- mp3_compression_backend: "lameenc"
66
-
67
- vocals:
68
- pitch_shift: 0.1
69
- pitch_shift_min_semitones: -5
70
- pitch_shift_max_semitones: 5
71
- seven_band_parametric_eq: 0.25
72
- seven_band_parametric_eq_min_gain_db: -9
73
- seven_band_parametric_eq_max_gain_db: 9
74
- tanh_distortion: 0.1
75
- tanh_distortion_min: 0.1
76
- tanh_distortion_max: 0.7
77
- other:
78
- pitch_shift: 0.1
79
- pitch_shift_min_semitones: -4
80
- pitch_shift_max_semitones: 4
81
- gaussian_noise: 0.1
82
- gaussian_noise_min_amplitude: 0.001
83
- gaussian_noise_max_amplitude: 0.015
84
- time_stretch: 0.01
85
- time_stretch_min_rate: 0.8
86
- time_stretch_max_rate: 1.25
87
-
88
-
89
- inference:
90
- batch_size: 1
91
- dim_t: 512
92
- num_overlap: 4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
configs/config_vocals_bandit_bsrnn_multi_mus64.yaml DELETED
@@ -1,73 +0,0 @@
1
- name: "MultiMaskMultiSourceBandSplitRNN"
2
- audio:
3
- chunk_size: 264600
4
- num_channels: 2
5
- sample_rate: 44100
6
- min_mean_abs: 0.001
7
-
8
- model:
9
- in_channel: 1
10
- stems: ['vocals', 'other']
11
- band_specs: "musical"
12
- n_bands: 64
13
- fs: 44100
14
- require_no_overlap: false
15
- require_no_gap: true
16
- normalize_channel_independently: false
17
- treat_channel_as_feature: true
18
- n_sqm_modules: 8
19
- emb_dim: 128
20
- rnn_dim: 256
21
- bidirectional: true
22
- rnn_type: "GRU"
23
- mlp_dim: 512
24
- hidden_activation: "Tanh"
25
- hidden_activation_kwargs: null
26
- complex_mask: true
27
- n_fft: 2048
28
- win_length: 2048
29
- hop_length: 512
30
- window_fn: "hann_window"
31
- wkwargs: null
32
- power: null
33
- center: true
34
- normalized: true
35
- pad_mode: "constant"
36
- onesided: true
37
-
38
- training:
39
- batch_size: 4
40
- gradient_accumulation_steps: 4
41
- grad_clip: 0
42
- instruments:
43
- - vocals
44
- - other
45
- lr: 9.0e-05
46
- patience: 2
47
- reduce_factor: 0.95
48
- target_instrument: null
49
- num_epochs: 1000
50
- num_steps: 1000
51
- q: 0.95
52
- coarse_loss_clip: true
53
- ema_momentum: 0.999
54
- optimizer: adam
55
- other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
56
- use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
57
-
58
- augmentations:
59
- enable: true # enable or disable all augmentations (to fast disable if needed)
60
- loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
61
- loudness_min: 0.5
62
- loudness_max: 1.5
63
- mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
64
- mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
65
- - 0.2
66
- - 0.02
67
- mixup_loudness_min: 0.5
68
- mixup_loudness_max: 1.5
69
-
70
- inference:
71
- batch_size: 1
72
- dim_t: 256
73
- num_overlap: 4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
configs/config_vocals_bs_roformer.yaml DELETED
@@ -1,138 +0,0 @@
1
- audio:
2
- chunk_size: 131584
3
- dim_f: 1024
4
- dim_t: 256
5
- hop_length: 512
6
- n_fft: 2048
7
- num_channels: 2
8
- sample_rate: 44100
9
- min_mean_abs: 0.001
10
-
11
- model:
12
- dim: 192
13
- depth: 6
14
- stereo: true
15
- num_stems: 1
16
- time_transformer_depth: 1
17
- freq_transformer_depth: 1
18
- linear_transformer_depth: 0
19
- freqs_per_bands: !!python/tuple
20
- - 2
21
- - 2
22
- - 2
23
- - 2
24
- - 2
25
- - 2
26
- - 2
27
- - 2
28
- - 2
29
- - 2
30
- - 2
31
- - 2
32
- - 2
33
- - 2
34
- - 2
35
- - 2
36
- - 2
37
- - 2
38
- - 2
39
- - 2
40
- - 2
41
- - 2
42
- - 2
43
- - 2
44
- - 4
45
- - 4
46
- - 4
47
- - 4
48
- - 4
49
- - 4
50
- - 4
51
- - 4
52
- - 4
53
- - 4
54
- - 4
55
- - 4
56
- - 12
57
- - 12
58
- - 12
59
- - 12
60
- - 12
61
- - 12
62
- - 12
63
- - 12
64
- - 24
65
- - 24
66
- - 24
67
- - 24
68
- - 24
69
- - 24
70
- - 24
71
- - 24
72
- - 48
73
- - 48
74
- - 48
75
- - 48
76
- - 48
77
- - 48
78
- - 48
79
- - 48
80
- - 128
81
- - 129
82
- dim_head: 64
83
- heads: 8
84
- attn_dropout: 0.1
85
- ff_dropout: 0.1
86
- flash_attn: true
87
- dim_freqs_in: 1025
88
- stft_n_fft: 2048
89
- stft_hop_length: 512
90
- stft_win_length: 2048
91
- stft_normalized: false
92
- mask_estimator_depth: 2
93
- multi_stft_resolution_loss_weight: 1.0
94
- multi_stft_resolutions_window_sizes: !!python/tuple
95
- - 4096
96
- - 2048
97
- - 1024
98
- - 512
99
- - 256
100
- multi_stft_hop_size: 147
101
- multi_stft_normalized: False
102
-
103
- training:
104
- batch_size: 10
105
- gradient_accumulation_steps: 1
106
- grad_clip: 0
107
- instruments:
108
- - vocals
109
- - other
110
- lr: 5.0e-05
111
- patience: 2
112
- reduce_factor: 0.95
113
- target_instrument: vocals
114
- num_epochs: 1000
115
- num_steps: 1000
116
- q: 0.95
117
- coarse_loss_clip: true
118
- ema_momentum: 0.999
119
- optimizer: adam
120
- other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
121
- use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
122
-
123
- augmentations:
124
- enable: true # enable or disable all augmentations (to fast disable if needed)
125
- loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
126
- loudness_min: 0.5
127
- loudness_max: 1.5
128
- mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
129
- mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
130
- - 0.2
131
- - 0.02
132
- mixup_loudness_min: 0.5
133
- mixup_loudness_max: 1.5
134
-
135
- inference:
136
- batch_size: 1
137
- dim_t: 256
138
- num_overlap: 4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
configs/config_vocals_htdemucs.yaml DELETED
@@ -1,123 +0,0 @@
1
- audio:
2
- chunk_size: 485100 # samplerate * segment
3
- min_mean_abs: 0.001
4
- hop_length: 1024
5
-
6
- training:
7
- batch_size: 10
8
- gradient_accumulation_steps: 1
9
- grad_clip: 0
10
- segment: 11
11
- shift: 1
12
- samplerate: 44100
13
- channels: 2
14
- normalize: true
15
- instruments: ['vocals', 'other']
16
- target_instrument: null
17
- num_epochs: 1000
18
- num_steps: 1000
19
- optimizer: adam
20
- lr: 9.0e-05
21
- patience: 2
22
- reduce_factor: 0.95
23
- q: 0.95
24
- coarse_loss_clip: true
25
- ema_momentum: 0.999
26
- other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
27
- use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
28
-
29
- augmentations:
30
- enable: true # enable or disable all augmentations (to fast disable if needed)
31
- loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
32
- loudness_min: 0.5
33
- loudness_max: 1.5
34
- mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
35
- mixup_probs: [0.2, 0.02]
36
- mixup_loudness_min: 0.5
37
- mixup_loudness_max: 1.5
38
-
39
- inference:
40
- num_overlap: 2
41
- batch_size: 8
42
-
43
- model: htdemucs
44
-
45
- htdemucs: # see demucs/htdemucs.py for a detailed description
46
- # Channels
47
- channels: 48
48
- channels_time:
49
- growth: 2
50
- # STFT
51
- num_subbands: 1
52
- nfft: 4096
53
- wiener_iters: 0
54
- end_iters: 0
55
- wiener_residual: false
56
- cac: true
57
- # Main structure
58
- depth: 4
59
- rewrite: true
60
- # Frequency Branch
61
- multi_freqs: []
62
- multi_freqs_depth: 3
63
- freq_emb: 0.2
64
- emb_scale: 10
65
- emb_smooth: true
66
- # Convolutions
67
- kernel_size: 8
68
- stride: 4
69
- time_stride: 2
70
- context: 1
71
- context_enc: 0
72
- # normalization
73
- norm_starts: 4
74
- norm_groups: 4
75
- # DConv residual branch
76
- dconv_mode: 3
77
- dconv_depth: 2
78
- dconv_comp: 8
79
- dconv_init: 1e-3
80
- # Before the Transformer
81
- bottom_channels: 512
82
- # CrossTransformer
83
- # ------ Common to all
84
- # Regular parameters
85
- t_layers: 5
86
- t_hidden_scale: 4.0
87
- t_heads: 8
88
- t_dropout: 0.0
89
- t_layer_scale: True
90
- t_gelu: True
91
- # ------------- Positional Embedding
92
- t_emb: sin
93
- t_max_positions: 10000 # for the scaled embedding
94
- t_max_period: 10000.0
95
- t_weight_pos_embed: 1.0
96
- t_cape_mean_normalize: True
97
- t_cape_augment: True
98
- t_cape_glob_loc_scale: [5000.0, 1.0, 1.4]
99
- t_sin_random_shift: 0
100
- # ------------- norm before a transformer encoder
101
- t_norm_in: True
102
- t_norm_in_group: False
103
- # ------------- norm inside the encoder
104
- t_group_norm: False
105
- t_norm_first: True
106
- t_norm_out: True
107
- # ------------- optim
108
- t_weight_decay: 0.0
109
- t_lr:
110
- # ------------- sparsity
111
- t_sparse_self_attn: False
112
- t_sparse_cross_attn: False
113
- t_mask_type: diag
114
- t_mask_random_seed: 42
115
- t_sparse_attn_window: 400
116
- t_global_window: 100
117
- t_sparsity: 0.95
118
- t_auto_sparsity: False
119
- # Cross Encoder First (False)
120
- t_cross_first: False
121
- # Weight init
122
- rescale: 0.1
123
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
configs/config_vocals_mdx23c.yaml DELETED
@@ -1,95 +0,0 @@
1
- audio:
2
- chunk_size: 261120
3
- dim_f: 4096
4
- dim_t: 256
5
- hop_length: 1024
6
- n_fft: 8192
7
- num_channels: 2
8
- sample_rate: 44100
9
- min_mean_abs: 0.001
10
-
11
- model:
12
- act: gelu
13
- bottleneck_factor: 4
14
- growth: 128
15
- norm: InstanceNorm
16
- num_blocks_per_scale: 2
17
- num_channels: 128
18
- num_scales: 5
19
- num_subbands: 4
20
- scale:
21
- - 2
22
- - 2
23
-
24
- training:
25
- batch_size: 6
26
- gradient_accumulation_steps: 1
27
- grad_clip: 0
28
- instruments:
29
- - vocals
30
- - other
31
- lr: 9.0e-05
32
- patience: 2
33
- reduce_factor: 0.95
34
- target_instrument: null
35
- num_epochs: 1000
36
- num_steps: 1000
37
- q: 0.95
38
- coarse_loss_clip: true
39
- ema_momentum: 0.999
40
- optimizer: adam
41
- other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
42
- use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
43
-
44
- augmentations:
45
- enable: true # enable or disable all augmentations (to fast disable if needed)
46
- loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
47
- loudness_min: 0.5
48
- loudness_max: 1.5
49
- mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
50
- mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
51
- - 0.2
52
- - 0.02
53
- mixup_loudness_min: 0.5
54
- mixup_loudness_max: 1.5
55
-
56
- # apply mp3 compression to mixture only (emulate downloading mp3 from internet)
57
- mp3_compression_on_mixture: 0.01
58
- mp3_compression_on_mixture_bitrate_min: 32
59
- mp3_compression_on_mixture_bitrate_max: 320
60
- mp3_compression_on_mixture_backend: "lameenc"
61
-
62
- all:
63
- channel_shuffle: 0.5 # Set 0 or lower to disable
64
- random_inverse: 0.1 # inverse track (better lower probability)
65
- random_polarity: 0.5 # polarity change (multiply waveform to -1)
66
- mp3_compression: 0.01
67
- mp3_compression_min_bitrate: 32
68
- mp3_compression_max_bitrate: 320
69
- mp3_compression_backend: "lameenc"
70
-
71
- vocals:
72
- pitch_shift: 0.1
73
- pitch_shift_min_semitones: -5
74
- pitch_shift_max_semitones: 5
75
- seven_band_parametric_eq: 0.25
76
- seven_band_parametric_eq_min_gain_db: -9
77
- seven_band_parametric_eq_max_gain_db: 9
78
- tanh_distortion: 0.1
79
- tanh_distortion_min: 0.1
80
- tanh_distortion_max: 0.7
81
- other:
82
- pitch_shift: 0.1
83
- pitch_shift_min_semitones: -4
84
- pitch_shift_max_semitones: 4
85
- gaussian_noise: 0.1
86
- gaussian_noise_min_amplitude: 0.001
87
- gaussian_noise_max_amplitude: 0.015
88
- time_stretch: 0.01
89
- time_stretch_min_rate: 0.8
90
- time_stretch_max_rate: 1.25
91
-
92
- inference:
93
- batch_size: 1
94
- dim_t: 256
95
- num_overlap: 4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
configs/config_vocals_mel_band_roformer.yaml DELETED
@@ -1,77 +0,0 @@
1
- audio:
2
- chunk_size: 131584
3
- dim_f: 1024
4
- dim_t: 256
5
- hop_length: 512
6
- n_fft: 2048
7
- num_channels: 2
8
- sample_rate: 44100
9
- min_mean_abs: 0.001
10
-
11
- model:
12
- dim: 192
13
- depth: 8
14
- stereo: true
15
- num_stems: 1
16
- time_transformer_depth: 1
17
- freq_transformer_depth: 1
18
- linear_transformer_depth: 0
19
- num_bands: 60
20
- dim_head: 64
21
- heads: 8
22
- attn_dropout: 0.1
23
- ff_dropout: 0.1
24
- flash_attn: True
25
- dim_freqs_in: 1025
26
- sample_rate: 44100 # needed for mel filter bank from librosa
27
- stft_n_fft: 2048
28
- stft_hop_length: 512
29
- stft_win_length: 2048
30
- stft_normalized: False
31
- mask_estimator_depth: 2
32
- multi_stft_resolution_loss_weight: 1.0
33
- multi_stft_resolutions_window_sizes: !!python/tuple
34
- - 4096
35
- - 2048
36
- - 1024
37
- - 512
38
- - 256
39
- multi_stft_hop_size: 147
40
- multi_stft_normalized: False
41
-
42
- training:
43
- batch_size: 7
44
- gradient_accumulation_steps: 1
45
- grad_clip: 0
46
- instruments:
47
- - vocals
48
- - other
49
- lr: 5.0e-05
50
- patience: 2
51
- reduce_factor: 0.95
52
- target_instrument: vocals
53
- num_epochs: 1000
54
- num_steps: 1000
55
- q: 0.95
56
- coarse_loss_clip: true
57
- ema_momentum: 0.999
58
- optimizer: adam
59
- other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
60
- use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
61
-
62
- augmentations:
63
- enable: true # enable or disable all augmentations (to fast disable if needed)
64
- loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
65
- loudness_min: 0.5
66
- loudness_max: 1.5
67
- mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
68
- mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
69
- - 0.2
70
- - 0.02
71
- mixup_loudness_min: 0.5
72
- mixup_loudness_max: 1.5
73
-
74
- inference:
75
- batch_size: 1
76
- dim_t: 256
77
- num_overlap: 4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
configs/config_vocals_scnet.yaml DELETED
@@ -1,71 +0,0 @@
1
- audio:
2
- chunk_size: 264600
3
- num_channels: 2
4
- sample_rate: 44100
5
- min_mean_abs: 0.000
6
-
7
- model:
8
- sources: ['vocals', 'other']
9
- audio_channels: 2
10
- # dims: [4, 32, 64, 128] # small version
11
- dims: [4, 64, 128, 256]
12
- nfft: 4096
13
- hop_size: 1024
14
- win_size: 4096
15
- normalized: True
16
- band_configs: {
17
- 'low': { 'SR': .175, 'stride': 1, 'kernel': 3 },
18
- 'mid': { 'SR': .392, 'stride': 4, 'kernel': 4 },
19
- 'high': { 'SR': .433, 'stride': 16, 'kernel': 16 }
20
- }
21
- conv_depths: [3, 2, 1]
22
- compress: 4
23
- conv_kernel: 3
24
- # Dual-path RNN
25
- num_dplayer: 6
26
- expand: 1
27
- # mamba
28
- use_mamba: False
29
- mamba_config: {
30
- 'd_stat': 16,
31
- 'd_conv': 4,
32
- 'd_expand': 2
33
- }
34
-
35
- training:
36
- batch_size: 4
37
- gradient_accumulation_steps: 2
38
- grad_clip: 0
39
- instruments:
40
- - vocals
41
- - other
42
- lr: 5.0e-04
43
- patience: 2
44
- reduce_factor: 0.95
45
- target_instrument: null
46
- num_epochs: 1000
47
- num_steps: 1000
48
- q: 0.95
49
- coarse_loss_clip: true
50
- ema_momentum: 0.999
51
- optimizer: adam
52
- other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
53
- use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
54
-
55
- augmentations:
56
- enable: true # enable or disable all augmentations (to fast disable if needed)
57
- loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
58
- loudness_min: 0.5
59
- loudness_max: 1.5
60
- mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
61
- mixup_probs:
62
- !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
63
- - 0.2
64
- - 0.02
65
- mixup_loudness_min: 0.5
66
- mixup_loudness_max: 1.5
67
-
68
- inference:
69
- batch_size: 8
70
- dim_t: 256
71
- num_overlap: 4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
configs/config_vocals_scnet_unofficial.yaml DELETED
@@ -1,62 +0,0 @@
1
- audio:
2
- chunk_size: 264600
3
- num_channels: 2
4
- sample_rate: 44100
5
- min_mean_abs: 0.000
6
-
7
- model:
8
- dims: [4, 32, 64, 128]
9
- bandsplit_ratios: [.175, .392, .433]
10
- downsample_strides: [1, 4, 16]
11
- n_conv_modules: [3, 2, 1]
12
- n_rnn_layers: 6
13
- rnn_hidden_dim: 128
14
- n_sources: 2
15
-
16
- n_fft: 4096
17
- hop_length: 1024
18
- win_length: 4096
19
- stft_normalized: false
20
-
21
- use_mamba: false
22
- d_state: 16
23
- d_conv: 4
24
- d_expand: 2
25
-
26
- training:
27
- batch_size: 10
28
- gradient_accumulation_steps: 2
29
- grad_clip: 0
30
- instruments:
31
- - vocals
32
- - other
33
- lr: 5.0e-04
34
- patience: 2
35
- reduce_factor: 0.95
36
- target_instrument: null
37
- num_epochs: 1000
38
- num_steps: 1000
39
- q: 0.95
40
- coarse_loss_clip: true
41
- ema_momentum: 0.999
42
- optimizer: adam
43
- other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
44
- use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
45
-
46
- augmentations:
47
- enable: true # enable or disable all augmentations (to fast disable if needed)
48
- loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
49
- loudness_min: 0.5
50
- loudness_max: 1.5
51
- mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
52
- mixup_probs:
53
- !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
54
- - 0.2
55
- - 0.02
56
- mixup_loudness_min: 0.5
57
- mixup_loudness_max: 1.5
58
-
59
- inference:
60
- batch_size: 8
61
- dim_t: 256
62
- num_overlap: 4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
configs/config_vocals_segm_models.yaml DELETED
@@ -1,78 +0,0 @@
1
- audio:
2
- chunk_size: 261632
3
- dim_f: 4096
4
- dim_t: 512
5
- hop_length: 512
6
- n_fft: 8192
7
- num_channels: 2
8
- sample_rate: 44100
9
- min_mean_abs: 0.001
10
-
11
- model:
12
- encoder_name: tu-maxvit_large_tf_512 # look here for possibilities: https://github.com/qubvel/segmentation_models.pytorch#encoders-
13
- decoder_type: unet # unet, fpn
14
- act: gelu
15
- num_channels: 128
16
- num_subbands: 8
17
-
18
- loss_multistft:
19
- fft_sizes:
20
- - 1024
21
- - 2048
22
- - 4096
23
- hop_sizes:
24
- - 512
25
- - 1024
26
- - 2048
27
- win_lengths:
28
- - 1024
29
- - 2048
30
- - 4096
31
- window: "hann_window"
32
- scale: "mel"
33
- n_bins: 128
34
- sample_rate: 44100
35
- perceptual_weighting: true
36
- w_sc: 1.0
37
- w_log_mag: 1.0
38
- w_lin_mag: 0.0
39
- w_phs: 0.0
40
- mag_distance: "L1"
41
-
42
-
43
- training:
44
- batch_size: 8
45
- gradient_accumulation_steps: 1
46
- grad_clip: 0
47
- instruments:
48
- - vocals
49
- - other
50
- lr: 5.0e-05
51
- patience: 2
52
- reduce_factor: 0.95
53
- target_instrument: null
54
- num_epochs: 1000
55
- num_steps: 2000
56
- q: 0.95
57
- coarse_loss_clip: true
58
- ema_momentum: 0.999
59
- optimizer: adamw
60
- other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
61
- use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
62
-
63
- augmentations:
64
- enable: true # enable or disable all augmentations (to fast disable if needed)
65
- loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
66
- loudness_min: 0.5
67
- loudness_max: 1.5
68
- mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
69
- mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
70
- - 0.2
71
- - 0.02
72
- mixup_loudness_min: 0.5
73
- mixup_loudness_max: 1.5
74
-
75
- inference:
76
- batch_size: 1
77
- dim_t: 512
78
- num_overlap: 4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
configs/config_vocals_swin_upernet.yaml DELETED
@@ -1,50 +0,0 @@
1
- audio:
2
- chunk_size: 261632
3
- dim_f: 4096
4
- dim_t: 512
5
- hop_length: 512
6
- n_fft: 8192
7
- num_channels: 2
8
- sample_rate: 44100
9
- min_mean_abs: 0.001
10
-
11
- model:
12
- act: gelu
13
- num_channels: 16
14
- num_subbands: 8
15
-
16
- training:
17
- batch_size: 14
18
- gradient_accumulation_steps: 4
19
- grad_clip: 0
20
- instruments:
21
- - vocals
22
- - other
23
- lr: 3.0e-05
24
- patience: 2
25
- reduce_factor: 0.95
26
- target_instrument: null
27
- num_epochs: 1000
28
- num_steps: 1000
29
- q: 0.95
30
- coarse_loss_clip: true
31
- ema_momentum: 0.999
32
- optimizer: adamw
33
- other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
34
-
35
- augmentations:
36
- enable: true # enable or disable all augmentations (to fast disable if needed)
37
- loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
38
- loudness_min: 0.5
39
- loudness_max: 1.5
40
- mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
41
- mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
42
- - 0.2
43
- - 0.02
44
- mixup_loudness_min: 0.5
45
- mixup_loudness_max: 1.5
46
-
47
- inference:
48
- batch_size: 1
49
- dim_t: 512
50
- num_overlap: 4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
configs/config_vocals_torchseg.yaml DELETED
@@ -1,58 +0,0 @@
1
- audio:
2
- chunk_size: 261632
3
- dim_f: 4096
4
- dim_t: 512
5
- hop_length: 512
6
- n_fft: 8192
7
- num_channels: 2
8
- sample_rate: 44100
9
- min_mean_abs: 0.000
10
-
11
- model:
12
- encoder_name: maxvit_tiny_tf_512 # look with torchseg.list_encoders(). Currently 858 available
13
- decoder_type: unet # unet, fpn
14
- act: gelu
15
- num_channels: 128
16
- num_subbands: 8
17
-
18
- training:
19
- batch_size: 18
20
- gradient_accumulation_steps: 1
21
- grad_clip: 1.0
22
- instruments:
23
- - vocals
24
- - other
25
- lr: 1.0e-04
26
- patience: 2
27
- reduce_factor: 0.95
28
- target_instrument: null
29
- num_epochs: 1000
30
- num_steps: 1000
31
- q: 0.95
32
- coarse_loss_clip: true
33
- ema_momentum: 0.999
34
- optimizer: adam
35
- other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
36
- use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
37
-
38
- augmentations:
39
- enable: false # enable or disable all augmentations (to fast disable if needed)
40
- loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
41
- loudness_min: 0.5
42
- loudness_max: 1.5
43
- mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
44
- mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
45
- - 0.2
46
- - 0.02
47
- mixup_loudness_min: 0.5
48
- mixup_loudness_max: 1.5
49
-
50
- all:
51
- channel_shuffle: 0.5 # Set 0 or lower to disable
52
- random_inverse: 0.1 # inverse track (better lower probability)
53
- random_polarity: 0.5 # polarity change (multiply waveform to -1)
54
-
55
- inference:
56
- batch_size: 8
57
- dim_t: 512
58
- num_overlap: 2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
configs/viperx/model_bs_roformer_ep_317_sdr_12.9755.yaml DELETED
@@ -1,126 +0,0 @@
1
- audio:
2
- chunk_size: 352800
3
- dim_f: 1024
4
- dim_t: 801 # don't work (use in model)
5
- hop_length: 441 # don't work (use in model)
6
- n_fft: 2048
7
- num_channels: 2
8
- sample_rate: 44100
9
- min_mean_abs: 0.000
10
-
11
- model:
12
- dim: 512
13
- depth: 12
14
- stereo: true
15
- num_stems: 1
16
- time_transformer_depth: 1
17
- freq_transformer_depth: 1
18
- linear_transformer_depth: 0
19
- freqs_per_bands: !!python/tuple
20
- - 2
21
- - 2
22
- - 2
23
- - 2
24
- - 2
25
- - 2
26
- - 2
27
- - 2
28
- - 2
29
- - 2
30
- - 2
31
- - 2
32
- - 2
33
- - 2
34
- - 2
35
- - 2
36
- - 2
37
- - 2
38
- - 2
39
- - 2
40
- - 2
41
- - 2
42
- - 2
43
- - 2
44
- - 4
45
- - 4
46
- - 4
47
- - 4
48
- - 4
49
- - 4
50
- - 4
51
- - 4
52
- - 4
53
- - 4
54
- - 4
55
- - 4
56
- - 12
57
- - 12
58
- - 12
59
- - 12
60
- - 12
61
- - 12
62
- - 12
63
- - 12
64
- - 24
65
- - 24
66
- - 24
67
- - 24
68
- - 24
69
- - 24
70
- - 24
71
- - 24
72
- - 48
73
- - 48
74
- - 48
75
- - 48
76
- - 48
77
- - 48
78
- - 48
79
- - 48
80
- - 128
81
- - 129
82
- dim_head: 64
83
- heads: 8
84
- attn_dropout: 0.1
85
- ff_dropout: 0.1
86
- flash_attn: true
87
- dim_freqs_in: 1025
88
- stft_n_fft: 2048
89
- stft_hop_length: 441
90
- stft_win_length: 2048
91
- stft_normalized: false
92
- mask_estimator_depth: 2
93
- multi_stft_resolution_loss_weight: 1.0
94
- multi_stft_resolutions_window_sizes: !!python/tuple
95
- - 4096
96
- - 2048
97
- - 1024
98
- - 512
99
- - 256
100
- multi_stft_hop_size: 147
101
- multi_stft_normalized: False
102
-
103
- training:
104
- batch_size: 2
105
- gradient_accumulation_steps: 1
106
- grad_clip: 0
107
- instruments:
108
- - vocals
109
- - other
110
- lr: 1.0e-05
111
- patience: 2
112
- reduce_factor: 0.95
113
- target_instrument: vocals
114
- num_epochs: 1000
115
- num_steps: 1000
116
- q: 0.95
117
- coarse_loss_clip: true
118
- ema_momentum: 0.999
119
- optimizer: adam
120
- other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
121
- use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
122
-
123
- inference:
124
- batch_size: 4
125
- dim_t: 801
126
- num_overlap: 2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
configs/viperx/model_bs_roformer_ep_937_sdr_10.5309.yaml DELETED
@@ -1,138 +0,0 @@
1
- audio:
2
- chunk_size: 131584
3
- dim_f: 1024
4
- dim_t: 256
5
- hop_length: 512
6
- n_fft: 2048
7
- num_channels: 2
8
- sample_rate: 44100
9
- min_mean_abs: 0.001
10
-
11
- model:
12
- dim: 384
13
- depth: 12
14
- stereo: true
15
- num_stems: 1
16
- time_transformer_depth: 1
17
- freq_transformer_depth: 1
18
- linear_transformer_depth: 0
19
- freqs_per_bands: !!python/tuple
20
- - 2
21
- - 2
22
- - 2
23
- - 2
24
- - 2
25
- - 2
26
- - 2
27
- - 2
28
- - 2
29
- - 2
30
- - 2
31
- - 2
32
- - 2
33
- - 2
34
- - 2
35
- - 2
36
- - 2
37
- - 2
38
- - 2
39
- - 2
40
- - 2
41
- - 2
42
- - 2
43
- - 2
44
- - 4
45
- - 4
46
- - 4
47
- - 4
48
- - 4
49
- - 4
50
- - 4
51
- - 4
52
- - 4
53
- - 4
54
- - 4
55
- - 4
56
- - 12
57
- - 12
58
- - 12
59
- - 12
60
- - 12
61
- - 12
62
- - 12
63
- - 12
64
- - 24
65
- - 24
66
- - 24
67
- - 24
68
- - 24
69
- - 24
70
- - 24
71
- - 24
72
- - 48
73
- - 48
74
- - 48
75
- - 48
76
- - 48
77
- - 48
78
- - 48
79
- - 48
80
- - 128
81
- - 129
82
- dim_head: 64
83
- heads: 8
84
- attn_dropout: 0.1
85
- ff_dropout: 0.1
86
- flash_attn: true
87
- dim_freqs_in: 1025
88
- stft_n_fft: 2048
89
- stft_hop_length: 512
90
- stft_win_length: 2048
91
- stft_normalized: false
92
- mask_estimator_depth: 2
93
- multi_stft_resolution_loss_weight: 1.0
94
- multi_stft_resolutions_window_sizes: !!python/tuple
95
- - 4096
96
- - 2048
97
- - 1024
98
- - 512
99
- - 256
100
- multi_stft_hop_size: 147
101
- multi_stft_normalized: False
102
-
103
- training:
104
- batch_size: 4
105
- gradient_accumulation_steps: 1
106
- grad_clip: 0
107
- instruments:
108
- - vocals
109
- - other
110
- lr: 5.0e-05
111
- patience: 2
112
- reduce_factor: 0.95
113
- target_instrument: other
114
- num_epochs: 1000
115
- num_steps: 1000
116
- q: 0.95
117
- coarse_loss_clip: true
118
- ema_momentum: 0.999
119
- optimizer: adam
120
- other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
121
- use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
122
-
123
- augmentations:
124
- enable: true # enable or disable all augmentations (to fast disable if needed)
125
- loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
126
- loudness_min: 0.5
127
- loudness_max: 1.5
128
- mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
129
- mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
130
- - 0.2
131
- - 0.02
132
- mixup_loudness_min: 0.5
133
- mixup_loudness_max: 1.5
134
-
135
- inference:
136
- batch_size: 8
137
- dim_t: 512
138
- num_overlap: 2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
configs/viperx/model_mel_band_roformer_ep_3005_sdr_11.4360.yaml DELETED
@@ -1,65 +0,0 @@
1
- audio:
2
- chunk_size: 352800
3
- dim_f: 1024
4
- dim_t: 801 # don't work (use in model)
5
- hop_length: 441 # don't work (use in model)
6
- n_fft: 2048
7
- num_channels: 2
8
- sample_rate: 44100
9
- min_mean_abs: 0.000
10
-
11
- model:
12
- dim: 384
13
- depth: 12
14
- stereo: true
15
- num_stems: 1
16
- time_transformer_depth: 1
17
- freq_transformer_depth: 1
18
- linear_transformer_depth: 0
19
- num_bands: 60
20
- dim_head: 64
21
- heads: 8
22
- attn_dropout: 0.1
23
- ff_dropout: 0.1
24
- flash_attn: True
25
- dim_freqs_in: 1025
26
- sample_rate: 44100 # needed for mel filter bank from librosa
27
- stft_n_fft: 2048
28
- stft_hop_length: 441
29
- stft_win_length: 2048
30
- stft_normalized: False
31
- mask_estimator_depth: 2
32
- multi_stft_resolution_loss_weight: 1.0
33
- multi_stft_resolutions_window_sizes: !!python/tuple
34
- - 4096
35
- - 2048
36
- - 1024
37
- - 512
38
- - 256
39
- multi_stft_hop_size: 147
40
- multi_stft_normalized: False
41
-
42
- training:
43
- batch_size: 1
44
- gradient_accumulation_steps: 8
45
- grad_clip: 0
46
- instruments:
47
- - vocals
48
- - other
49
- lr: 4.0e-05
50
- patience: 2
51
- reduce_factor: 0.95
52
- target_instrument: vocals
53
- num_epochs: 1000
54
- num_steps: 1000
55
- q: 0.95
56
- coarse_loss_clip: true
57
- ema_momentum: 0.999
58
- optimizer: adam
59
- other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
60
- use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
61
-
62
- inference:
63
- batch_size: 4
64
- dim_t: 801
65
- num_overlap: 2