poiqazwsx committed on
Commit
84ccf60
·
1 Parent(s): 51e2f90

Upload 23 files

Browse files
configs/config_dnr_bandit_bsrnn_multi_mus64.yaml ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: "MultiMaskMultiSourceBandSplitRNN"
2
+ audio:
3
+ chunk_size: 264600
4
+ num_channels: 2
5
+ sample_rate: 44100
6
+ min_mean_abs: 0.001
7
+
8
+ model:
9
+ in_channel: 1
10
+ stems: ['speech', 'music', 'effects']
11
+ band_specs: "musical"
12
+ n_bands: 64
13
+ fs: 44100
14
+ require_no_overlap: false
15
+ require_no_gap: true
16
+ normalize_channel_independently: false
17
+ treat_channel_as_feature: true
18
+ n_sqm_modules: 8
19
+ emb_dim: 128
20
+ rnn_dim: 256
21
+ bidirectional: true
22
+ rnn_type: "GRU"
23
+ mlp_dim: 512
24
+ hidden_activation: "Tanh"
25
+ hidden_activation_kwargs: null
26
+ complex_mask: true
27
+ n_fft: 2048
28
+ win_length: 2048
29
+ hop_length: 512
30
+ window_fn: "hann_window"
31
+ wkwargs: null
32
+ power: null
33
+ center: true
34
+ normalized: true
35
+ pad_mode: "constant"
36
+ onesided: true
37
+
38
+ training:
39
+ batch_size: 4
40
+ gradient_accumulation_steps: 4
41
+ grad_clip: 0
42
+ instruments:
43
+ - speech
44
+ - music
45
+ - effects
46
+ lr: 9.0e-05
47
+ patience: 2
48
+ reduce_factor: 0.95
49
+ target_instrument: null
50
+ num_epochs: 1000
51
+ num_steps: 1000
52
+ q: 0.95
53
+ coarse_loss_clip: true
54
+ ema_momentum: 0.999
55
+ optimizer: adam
56
+ other_fix: true # needed when checking on the multisong dataset whether 'other' is actually the instrumental
57
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
58
+
59
+ augmentations:
60
+ enable: true # enable or disable all augmentations (to quickly disable them if needed)
61
+ loudness: true # randomly change the loudness of each stem within the range (loudness_min; loudness_max)
62
+ loudness_min: 0.5
63
+ loudness_max: 1.5
64
+ mixup: true # mix several stems of the same type with some probability (only works for dataset types: 1, 2, 3)
65
+ mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
66
+ - 0.2
67
+ - 0.02
68
+ mixup_loudness_min: 0.5
69
+ mixup_loudness_max: 1.5
70
+ all:
71
+ channel_shuffle: 0.5 # Set 0 or lower to disable
72
+ random_inverse: 0.1 # invert the track (a lower probability is better)
73
+ random_polarity: 0.5 # polarity change (multiply waveform by -1)
74
+
75
+ inference:
76
+ batch_size: 1
77
+ dim_t: 256
78
+ num_overlap: 4
configs/config_htdemucs_6stems.yaml ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 485100 # samplerate * segment
3
+ min_mean_abs: 0.001
4
+ hop_length: 1024
5
+
6
+ training:
7
+ batch_size: 8
8
+ gradient_accumulation_steps: 1
9
+ grad_clip: 0
10
+ segment: 11
11
+ shift: 1
12
+ samplerate: 44100
13
+ channels: 2
14
+ normalize: true
15
+ instruments: ['drums', 'bass', 'other', 'vocals', 'guitar', 'piano']
16
+ target_instrument: null
17
+ num_epochs: 1000
18
+ num_steps: 1000
19
+ optimizer: adam
20
+ lr: 9.0e-05
21
+ patience: 2
22
+ reduce_factor: 0.95
23
+ q: 0.95
24
+ coarse_loss_clip: true
25
+ ema_momentum: 0.999
26
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
27
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
28
+
29
+ augmentations:
30
+ enable: true # enable or disable all augmentations (to quickly disable them if needed)
31
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
32
+ loudness_min: 0.5
33
+ loudness_max: 1.5
34
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
35
+ mixup_probs: [0.2, 0.02]
36
+ mixup_loudness_min: 0.5
37
+ mixup_loudness_max: 1.5
38
+ all:
39
+ channel_shuffle: 0.5 # Set 0 or lower to disable
40
+ random_inverse: 0.1 # inverse track (better lower probability)
41
+ random_polarity: 0.5 # polarity change (multiply waveform by -1)
42
+
43
+ inference:
44
+ num_overlap: 4
45
+ batch_size: 8
46
+
47
+ model: htdemucs
48
+
49
+ htdemucs: # see demucs/htdemucs.py for a detailed description
50
+ # Channels
51
+ channels: 48
52
+ channels_time:
53
+ growth: 2
54
+ # STFT
55
+ num_subbands: 1
56
+ nfft: 4096
57
+ wiener_iters: 0
58
+ end_iters: 0
59
+ wiener_residual: false
60
+ cac: true
61
+ # Main structure
62
+ depth: 4
63
+ rewrite: true
64
+ # Frequency Branch
65
+ multi_freqs: []
66
+ multi_freqs_depth: 3
67
+ freq_emb: 0.2
68
+ emb_scale: 10
69
+ emb_smooth: true
70
+ # Convolutions
71
+ kernel_size: 8
72
+ stride: 4
73
+ time_stride: 2
74
+ context: 1
75
+ context_enc: 0
76
+ # normalization
77
+ norm_starts: 4
78
+ norm_groups: 4
79
+ # DConv residual branch
80
+ dconv_mode: 3
81
+ dconv_depth: 2
82
+ dconv_comp: 8
83
+ dconv_init: 1e-3
84
+ # Before the Transformer
85
+ bottom_channels: 0
86
+ # CrossTransformer
87
+ # ------ Common to all
88
+ # Regular parameters
89
+ t_layers: 5
90
+ t_hidden_scale: 4.0
91
+ t_heads: 8
92
+ t_dropout: 0.0
93
+ t_layer_scale: True
94
+ t_gelu: True
95
+ # ------------- Positional Embedding
96
+ t_emb: sin
97
+ t_max_positions: 10000 # for the scaled embedding
98
+ t_max_period: 10000.0
99
+ t_weight_pos_embed: 1.0
100
+ t_cape_mean_normalize: True
101
+ t_cape_augment: True
102
+ t_cape_glob_loc_scale: [5000.0, 1.0, 1.4]
103
+ t_sin_random_shift: 0
104
+ # ------------- norm before a transformer encoder
105
+ t_norm_in: True
106
+ t_norm_in_group: False
107
+ # ------------- norm inside the encoder
108
+ t_group_norm: False
109
+ t_norm_first: True
110
+ t_norm_out: True
111
+ # ------------- optim
112
+ t_weight_decay: 0.0
113
+ t_lr:
114
+ # ------------- sparsity
115
+ t_sparse_self_attn: False
116
+ t_sparse_cross_attn: False
117
+ t_mask_type: diag
118
+ t_mask_random_seed: 42
119
+ t_sparse_attn_window: 400
120
+ t_global_window: 100
121
+ t_sparsity: 0.95
122
+ t_auto_sparsity: False
123
+ # Cross Encoder First (False)
124
+ t_cross_first: False
125
+ # Weight init
126
+ rescale: 0.1
127
+
configs/config_musdb18_bs_roformer.yaml ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 131584
3
+ dim_f: 1024
4
+ dim_t: 256
5
+ hop_length: 512
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.001
10
+
11
+ model:
12
+ dim: 192
13
+ depth: 6
14
+ stereo: true
15
+ num_stems: 1
16
+ time_transformer_depth: 1
17
+ freq_transformer_depth: 1
18
+ linear_transformer_depth: 0
19
+ freqs_per_bands: !!python/tuple
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 2
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 4
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 12
63
+ - 12
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 24
71
+ - 24
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 48
79
+ - 48
80
+ - 128
81
+ - 129
82
+ dim_head: 64
83
+ heads: 8
84
+ attn_dropout: 0.1
85
+ ff_dropout: 0.1
86
+ flash_attn: true
87
+ dim_freqs_in: 1025
88
+ stft_n_fft: 2048
89
+ stft_hop_length: 512
90
+ stft_win_length: 2048
91
+ stft_normalized: false
92
+ mask_estimator_depth: 2
93
+ multi_stft_resolution_loss_weight: 1.0
94
+ multi_stft_resolutions_window_sizes: !!python/tuple
95
+ - 4096
96
+ - 2048
97
+ - 1024
98
+ - 512
99
+ - 256
100
+ multi_stft_hop_size: 147
101
+ multi_stft_normalized: False
102
+
103
+ training:
104
+ batch_size: 10
105
+ gradient_accumulation_steps: 1
106
+ grad_clip: 0
107
+ instruments:
108
+ - vocals
109
+ - bass
110
+ - drums
111
+ - other
112
+ lr: 5.0e-05
113
+ patience: 2
114
+ reduce_factor: 0.95
115
+ target_instrument: vocals
116
+ num_epochs: 1000
117
+ num_steps: 1000
118
+ q: 0.95
119
+ coarse_loss_clip: true
120
+ ema_momentum: 0.999
121
+ optimizer: adam
122
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
123
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
124
+
125
+ augmentations:
126
+ enable: true # enable or disable all augmentations (to fast disable if needed)
127
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
128
+ loudness_min: 0.5
129
+ loudness_max: 1.5
130
+
131
+ inference:
132
+ batch_size: 1
133
+ dim_t: 256
134
+ num_overlap: 4
configs/config_musdb18_demucs3_mmi.yaml ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 485100 # samplerate * segment
3
+ min_mean_abs: 0.000
4
+ hop_length: 1024
5
+
6
+ training:
7
+ batch_size: 8
8
+ gradient_accumulation_steps: 1
9
+ grad_clip: 0
10
+ segment: 11
11
+ shift: 1
12
+ samplerate: 44100
13
+ channels: 2
14
+ normalize: true
15
+ instruments: ['drums', 'bass', 'other', 'vocals']
16
+ target_instrument: null
17
+ num_epochs: 1000
18
+ num_steps: 1000
19
+ optimizer: adam
20
+ lr: 9.0e-05
21
+ patience: 2
22
+ reduce_factor: 0.95
23
+ q: 0.95
24
+ coarse_loss_clip: true
25
+ ema_momentum: 0.999
26
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
27
+ use_amp: false # enable or disable usage of mixed precision (float16) - usually it must be true
28
+
29
+ augmentations:
30
+ enable: true # enable or disable all augmentations (to fast disable if needed)
31
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
32
+ loudness_min: 0.5
33
+ loudness_max: 1.5
34
+
35
+ inference:
36
+ num_overlap: 4
37
+ batch_size: 8
38
+
39
+ model: hdemucs
40
+
41
+ hdemucs: # see demucs/hdemucs.py for a detailed description
42
+ channels: 48
43
+ channels_time: null
44
+ growth: 2
45
+ nfft: 4096
46
+ wiener_iters: 0
47
+ end_iters: 0
48
+ wiener_residual: False
49
+ cac: True
50
+ depth: 6
51
+ rewrite: True
52
+ hybrid: True
53
+ hybrid_old: False
54
+ multi_freqs: []
55
+ multi_freqs_depth: 3
56
+ freq_emb: 0.2
57
+ emb_scale: 10
58
+ emb_smooth: True
59
+ kernel_size: 8
60
+ stride: 4
61
+ time_stride: 2
62
+ context: 1
63
+ context_enc: 0
64
+ norm_starts: 4
65
+ norm_groups: 4
66
+ dconv_mode: 1
67
+ dconv_depth: 2
68
+ dconv_comp: 4
69
+ dconv_attn: 4
70
+ dconv_lstm: 4
71
+ dconv_init: 0.001
72
+ rescale: 0.1
configs/config_musdb18_htdemucs.yaml ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 485100 # samplerate * segment
3
+ min_mean_abs: 0.001
4
+ hop_length: 1024
5
+
6
+ training:
7
+ batch_size: 8
8
+ gradient_accumulation_steps: 1
9
+ grad_clip: 0
10
+ segment: 11
11
+ shift: 1
12
+ samplerate: 44100
13
+ channels: 2
14
+ normalize: true
15
+ instruments: ['drums', 'bass', 'other', 'vocals']
16
+ target_instrument: null
17
+ num_epochs: 1000
18
+ num_steps: 1000
19
+ optimizer: adam
20
+ lr: 9.0e-05
21
+ patience: 2
22
+ reduce_factor: 0.95
23
+ q: 0.95
24
+ coarse_loss_clip: true
25
+ ema_momentum: 0.999
26
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
27
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
28
+
29
+ augmentations:
30
+ enable: true # enable or disable all augmentations (to fast disable if needed)
31
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
32
+ loudness_min: 0.5
33
+ loudness_max: 1.5
34
+
35
+ inference:
36
+ num_overlap: 4
37
+ batch_size: 8
38
+
39
+ model: htdemucs
40
+
41
+ htdemucs: # see demucs/htdemucs.py for a detailed description
42
+ # Channels
43
+ channels: 48
44
+ channels_time:
45
+ growth: 2
46
+ # STFT
47
+ num_subbands: 1
48
+ nfft: 4096
49
+ wiener_iters: 0
50
+ end_iters: 0
51
+ wiener_residual: false
52
+ cac: true
53
+ # Main structure
54
+ depth: 4
55
+ rewrite: true
56
+ # Frequency Branch
57
+ multi_freqs: []
58
+ multi_freqs_depth: 3
59
+ freq_emb: 0.2
60
+ emb_scale: 10
61
+ emb_smooth: true
62
+ # Convolutions
63
+ kernel_size: 8
64
+ stride: 4
65
+ time_stride: 2
66
+ context: 1
67
+ context_enc: 0
68
+ # normalization
69
+ norm_starts: 4
70
+ norm_groups: 4
71
+ # DConv residual branch
72
+ dconv_mode: 3
73
+ dconv_depth: 2
74
+ dconv_comp: 8
75
+ dconv_init: 1e-3
76
+ # Before the Transformer
77
+ bottom_channels: 512
78
+ # CrossTransformer
79
+ # ------ Common to all
80
+ # Regular parameters
81
+ t_layers: 5
82
+ t_hidden_scale: 4.0
83
+ t_heads: 8
84
+ t_dropout: 0.0
85
+ t_layer_scale: True
86
+ t_gelu: True
87
+ # ------------- Positional Embedding
88
+ t_emb: sin
89
+ t_max_positions: 10000 # for the scaled embedding
90
+ t_max_period: 10000.0
91
+ t_weight_pos_embed: 1.0
92
+ t_cape_mean_normalize: True
93
+ t_cape_augment: True
94
+ t_cape_glob_loc_scale: [5000.0, 1.0, 1.4]
95
+ t_sin_random_shift: 0
96
+ # ------------- norm before a transformer encoder
97
+ t_norm_in: True
98
+ t_norm_in_group: False
99
+ # ------------- norm inside the encoder
100
+ t_group_norm: False
101
+ t_norm_first: True
102
+ t_norm_out: True
103
+ # ------------- optim
104
+ t_weight_decay: 0.0
105
+ t_lr:
106
+ # ------------- sparsity
107
+ t_sparse_self_attn: False
108
+ t_sparse_cross_attn: False
109
+ t_mask_type: diag
110
+ t_mask_random_seed: 42
111
+ t_sparse_attn_window: 400
112
+ t_global_window: 100
113
+ t_sparsity: 0.95
114
+ t_auto_sparsity: False
115
+ # Cross Encoder First (False)
116
+ t_cross_first: False
117
+ # Weight init
118
+ rescale: 0.1
119
+
configs/config_musdb18_mdx23c.yaml ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 261120
3
+ dim_f: 4096
4
+ dim_t: 256
5
+ hop_length: 1024
6
+ n_fft: 8192
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.001
10
+
11
+ model:
12
+ act: gelu
13
+ bottleneck_factor: 4
14
+ growth: 128
15
+ norm: InstanceNorm
16
+ num_blocks_per_scale: 2
17
+ num_channels: 128
18
+ num_scales: 5
19
+ num_subbands: 4
20
+ scale:
21
+ - 2
22
+ - 2
23
+
24
+ training:
25
+ batch_size: 6
26
+ gradient_accumulation_steps: 1
27
+ grad_clip: 0
28
+ instruments:
29
+ - vocals
30
+ - bass
31
+ - drums
32
+ - other
33
+ lr: 9.0e-05
34
+ patience: 2
35
+ reduce_factor: 0.95
36
+ target_instrument: null
37
+ num_epochs: 1000
38
+ num_steps: 1000
39
+ q: 0.95
40
+ coarse_loss_clip: true
41
+ ema_momentum: 0.999
42
+ optimizer: adam
43
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
44
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
45
+
46
+ augmentations:
47
+ enable: true # enable or disable all augmentations (to fast disable if needed)
48
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
49
+ loudness_min: 0.5
50
+ loudness_max: 1.5
51
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
52
+ mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
53
+ - 0.2
54
+ - 0.02
55
+ mixup_loudness_min: 0.5
56
+ mixup_loudness_max: 1.5
57
+
58
+ # apply mp3 compression to mixture only (emulate downloading mp3 from internet)
59
+ mp3_compression_on_mixture: 0.01
60
+ mp3_compression_on_mixture_bitrate_min: 32
61
+ mp3_compression_on_mixture_bitrate_max: 320
62
+ mp3_compression_on_mixture_backend: "lameenc"
63
+
64
+ all:
65
+ channel_shuffle: 0.5 # Set 0 or lower to disable
66
+ random_inverse: 0.1 # inverse track (better lower probability)
67
+ random_polarity: 0.5 # polarity change (multiply waveform by -1)
68
+ mp3_compression: 0.01
69
+ mp3_compression_min_bitrate: 32
70
+ mp3_compression_max_bitrate: 320
71
+ mp3_compression_backend: "lameenc"
72
+
73
+ # pedalboard reverb block
74
+ pedalboard_reverb: 0.01
75
+ pedalboard_reverb_room_size_min: 0.1
76
+ pedalboard_reverb_room_size_max: 0.9
77
+ pedalboard_reverb_damping_min: 0.1
78
+ pedalboard_reverb_damping_max: 0.9
79
+ pedalboard_reverb_wet_level_min: 0.1
80
+ pedalboard_reverb_wet_level_max: 0.9
81
+ pedalboard_reverb_dry_level_min: 0.1
82
+ pedalboard_reverb_dry_level_max: 0.9
83
+ pedalboard_reverb_width_min: 0.9
84
+ pedalboard_reverb_width_max: 1.0
85
+
86
+ # pedalboard chorus block
87
+ pedalboard_chorus: 0.01
88
+ pedalboard_chorus_rate_hz_min: 1.0
89
+ pedalboard_chorus_rate_hz_max: 7.0
90
+ pedalboard_chorus_depth_min: 0.25
91
+ pedalboard_chorus_depth_max: 0.95
92
+ pedalboard_chorus_centre_delay_ms_min: 3
93
+ pedalboard_chorus_centre_delay_ms_max: 10
94
+ pedalboard_chorus_feedback_min: 0.0
95
+ pedalboard_chorus_feedback_max: 0.5
96
+ pedalboard_chorus_mix_min: 0.1
97
+ pedalboard_chorus_mix_max: 0.9
98
+
99
+ # pedalboard phaser block
100
+ pedalboard_phazer: 0.01
101
+ pedalboard_phazer_rate_hz_min: 1.0
102
+ pedalboard_phazer_rate_hz_max: 10.0
103
+ pedalboard_phazer_depth_min: 0.25
104
+ pedalboard_phazer_depth_max: 0.95
105
+ pedalboard_phazer_centre_frequency_hz_min: 200
106
+ pedalboard_phazer_centre_frequency_hz_max: 12000
107
+ pedalboard_phazer_feedback_min: 0.0
108
+ pedalboard_phazer_feedback_max: 0.5
109
+ pedalboard_phazer_mix_min: 0.1
110
+ pedalboard_phazer_mix_max: 0.9
111
+
112
+ # pedalboard distortion block
113
+ pedalboard_distortion: 0.01
114
+ pedalboard_distortion_drive_db_min: 1.0
115
+ pedalboard_distortion_drive_db_max: 25.0
116
+
117
+ # pedalboard pitch shift block
118
+ pedalboard_pitch_shift: 0.01
119
+ pedalboard_pitch_shift_semitones_min: -7
120
+ pedalboard_pitch_shift_semitones_max: 7
121
+
122
+ # pedalboard resample block
123
+ pedalboard_resample: 0.01
124
+ pedalboard_resample_target_sample_rate_min: 4000
125
+ pedalboard_resample_target_sample_rate_max: 44100
126
+
127
+ # pedalboard bitcrush block
128
+ pedalboard_bitcrash: 0.01
129
+ pedalboard_bitcrash_bit_depth_min: 4
130
+ pedalboard_bitcrash_bit_depth_max: 16
131
+
132
+ # pedalboard mp3 compressor block
133
+ pedalboard_mp3_compressor: 0.01
134
+ pedalboard_mp3_compressor_pedalboard_mp3_compressor_min: 0
135
+ pedalboard_mp3_compressor_pedalboard_mp3_compressor_max: 9.999
136
+
137
+ vocals:
138
+ pitch_shift: 0.1
139
+ pitch_shift_min_semitones: -5
140
+ pitch_shift_max_semitones: 5
141
+ seven_band_parametric_eq: 0.25
142
+ seven_band_parametric_eq_min_gain_db: -9
143
+ seven_band_parametric_eq_max_gain_db: 9
144
+ tanh_distortion: 0.1
145
+ tanh_distortion_min: 0.1
146
+ tanh_distortion_max: 0.7
147
+ bass:
148
+ pitch_shift: 0.1
149
+ pitch_shift_min_semitones: -2
150
+ pitch_shift_max_semitones: 2
151
+ seven_band_parametric_eq: 0.25
152
+ seven_band_parametric_eq_min_gain_db: -3
153
+ seven_band_parametric_eq_max_gain_db: 6
154
+ tanh_distortion: 0.2
155
+ tanh_distortion_min: 0.1
156
+ tanh_distortion_max: 0.5
157
+ drums:
158
+ pitch_shift: 0.33
159
+ pitch_shift_min_semitones: -5
160
+ pitch_shift_max_semitones: 5
161
+ seven_band_parametric_eq: 0.25
162
+ seven_band_parametric_eq_min_gain_db: -9
163
+ seven_band_parametric_eq_max_gain_db: 9
164
+ tanh_distortion: 0.33
165
+ tanh_distortion_min: 0.1
166
+ tanh_distortion_max: 0.6
167
+ other:
168
+ pitch_shift: 0.1
169
+ pitch_shift_min_semitones: -4
170
+ pitch_shift_max_semitones: 4
171
+ gaussian_noise: 0.1
172
+ gaussian_noise_min_amplitude: 0.001
173
+ gaussian_noise_max_amplitude: 0.015
174
+ time_stretch: 0.01
175
+ time_stretch_min_rate: 0.8
176
+ time_stretch_max_rate: 1.25
177
+
178
+
179
+ inference:
180
+ batch_size: 1
181
+ dim_t: 256
182
+ num_overlap: 4
configs/config_musdb18_mel_band_roformer.yaml ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 131584
3
+ dim_f: 1024
4
+ dim_t: 256
5
+ hop_length: 512
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.001
10
+
11
+ model:
12
+ dim: 192
13
+ depth: 8
14
+ stereo: true
15
+ num_stems: 1
16
+ time_transformer_depth: 1
17
+ freq_transformer_depth: 1
18
+ linear_transformer_depth: 0
19
+ num_bands: 60
20
+ dim_head: 64
21
+ heads: 8
22
+ attn_dropout: 0.1
23
+ ff_dropout: 0.1
24
+ flash_attn: True
25
+ dim_freqs_in: 1025
26
+ sample_rate: 44100 # needed for mel filter bank from librosa
27
+ stft_n_fft: 2048
28
+ stft_hop_length: 512
29
+ stft_win_length: 2048
30
+ stft_normalized: False
31
+ mask_estimator_depth: 2
32
+ multi_stft_resolution_loss_weight: 1.0
33
+ multi_stft_resolutions_window_sizes: !!python/tuple
34
+ - 4096
35
+ - 2048
36
+ - 1024
37
+ - 512
38
+ - 256
39
+ multi_stft_hop_size: 147
40
+ multi_stft_normalized: False
41
+
42
+ training:
43
+ batch_size: 7
44
+ gradient_accumulation_steps: 1
45
+ grad_clip: 0
46
+ instruments:
47
+ - vocals
48
+ - bass
49
+ - drums
50
+ - other
51
+ lr: 5.0e-05
52
+ patience: 2
53
+ reduce_factor: 0.95
54
+ target_instrument: vocals
55
+ num_epochs: 1000
56
+ num_steps: 1000
57
+ q: 0.95
58
+ coarse_loss_clip: true
59
+ ema_momentum: 0.999
60
+ optimizer: adam
61
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
62
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
63
+
64
+ augmentations:
65
+ enable: true # enable or disable all augmentations (to fast disable if needed)
66
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
67
+ loudness_min: 0.5
68
+ loudness_max: 1.5
69
+
70
+ inference:
71
+ batch_size: 1
72
+ dim_t: 256
73
+ num_overlap: 4
configs/config_musdb18_scnet.yaml ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 264600
3
+ num_channels: 2
4
+ sample_rate: 44100
5
+ min_mean_abs: 0.001
6
+
7
+ model:
8
+ dims: [4, 32, 64, 128]
9
+ bandsplit_ratios: [.175, .392, .433]
10
+ downsample_strides: [1, 4, 16]
11
+ n_conv_modules: [3, 2, 1]
12
+ n_rnn_layers: 6
13
+ rnn_hidden_dim: 128
14
+ n_sources: 4
15
+
16
+ n_fft: 4096
17
+ hop_length: 1024
18
+ win_length: 4096
19
+ stft_normalized: false
20
+
21
+ use_mamba: true
22
+ d_state: 16
23
+ d_conv: 4
24
+ d_expand: 2
25
+
26
+ training:
27
+ batch_size: 10
28
+ gradient_accumulation_steps: 1
29
+ grad_clip: 0
30
+ instruments:
31
+ - vocals
32
+ - bass
33
+ - drums
34
+ - other
35
+ lr: 5.0e-04
36
+ patience: 2
37
+ reduce_factor: 0.95
38
+ target_instrument: null
39
+ num_epochs: 1000
40
+ num_steps: 1000
41
+ q: 0.95
42
+ coarse_loss_clip: true
43
+ ema_momentum: 0.999
44
+ optimizer: adam
45
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
46
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
47
+
48
+ augmentations:
49
+ enable: true # enable or disable all augmentations (to fast disable if needed)
50
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
51
+ loudness_min: 0.5
52
+ loudness_max: 1.5
53
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
54
+ mixup_probs:
55
+ !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
56
+ - 0.2
57
+ - 0.02
58
+ mixup_loudness_min: 0.5
59
+ mixup_loudness_max: 1.5
60
+
61
+ inference:
62
+ batch_size: 1
63
+ dim_t: 256
64
+ num_overlap: 4
configs/config_musdb18_segm_models.yaml ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 261632
3
+ dim_f: 4096
4
+ dim_t: 512
5
+ hop_length: 512
6
+ n_fft: 8192
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.001
10
+
11
+ model:
12
+ encoder_name: tu-maxvit_large_tf_512 # look here for possibilities: https://github.com/qubvel/segmentation_models.pytorch#encoders-
13
+ decoder_type: unet # unet, fpn
14
+ act: gelu
15
+ num_channels: 128
16
+ num_subbands: 8
17
+
18
+ training:
19
+ batch_size: 7
20
+ gradient_accumulation_steps: 1
21
+ grad_clip: 0
22
+ instruments:
23
+ - vocals
24
+ - bass
25
+ - drums
26
+ - other
27
+ lr: 5.0e-05
28
+ patience: 2
29
+ reduce_factor: 0.95
30
+ target_instrument: null
31
+ num_epochs: 1000
32
+ num_steps: 2000
33
+ q: 0.95
34
+ coarse_loss_clip: true
35
+ ema_momentum: 0.999
36
+ optimizer: adamw
37
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
38
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
39
+
40
+ augmentations:
41
+ enable: true # enable or disable all augmentations (to fast disable if needed)
42
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
43
+ loudness_min: 0.5
44
+ loudness_max: 1.5
45
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
46
+ mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
47
+ - 0.2
48
+ - 0.02
49
+ mixup_loudness_min: 0.5
50
+ mixup_loudness_max: 1.5
51
+
52
+ # apply mp3 compression to mixture only (emulate downloading mp3 from internet)
53
+ mp3_compression_on_mixture: 0.01
54
+ mp3_compression_on_mixture_bitrate_min: 32
55
+ mp3_compression_on_mixture_bitrate_max: 320
56
+ mp3_compression_on_mixture_backend: "lameenc"
57
+
58
+ all:
59
+ channel_shuffle: 0.5 # Set 0 or lower to disable
60
+ random_inverse: 0.1 # inverse track (better lower probability)
61
+ random_polarity: 0.5 # polarity change (multiply waveform by -1)
62
+ mp3_compression: 0.01
63
+ mp3_compression_min_bitrate: 32
64
+ mp3_compression_max_bitrate: 320
65
+ mp3_compression_backend: "lameenc"
66
+
67
+ vocals:
68
+ pitch_shift: 0.1
69
+ pitch_shift_min_semitones: -5
70
+ pitch_shift_max_semitones: 5
71
+ seven_band_parametric_eq: 0.25
72
+ seven_band_parametric_eq_min_gain_db: -9
73
+ seven_band_parametric_eq_max_gain_db: 9
74
+ tanh_distortion: 0.1
75
+ tanh_distortion_min: 0.1
76
+ tanh_distortion_max: 0.7
77
+ other:
78
+ pitch_shift: 0.1
79
+ pitch_shift_min_semitones: -4
80
+ pitch_shift_max_semitones: 4
81
+ gaussian_noise: 0.1
82
+ gaussian_noise_min_amplitude: 0.001
83
+ gaussian_noise_max_amplitude: 0.015
84
+ time_stretch: 0.01
85
+ time_stretch_min_rate: 0.8
86
+ time_stretch_max_rate: 1.25
87
+
88
+
89
+ inference:
90
+ batch_size: 1
91
+ dim_t: 512
92
+ num_overlap: 4
configs/config_musdb18_torchseg.yaml ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 261632
3
+ dim_f: 4096
4
+ dim_t: 512
5
+ hop_length: 512
6
+ n_fft: 8192
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.001
10
+
11
+ model:
12
+ encoder_name: maxvit_tiny_tf_512 # look with torchseg.list_encoders(). Currently 858 available
13
+ decoder_type: unet # unet, fpn
14
+ act: gelu
15
+ num_channels: 128
16
+ num_subbands: 8
17
+
18
+ training:
19
+ batch_size: 18
20
+ gradient_accumulation_steps: 1
21
+ grad_clip: 0
22
+ instruments:
23
+ - vocals
24
+ - bass
25
+ - drums
26
+ - other
27
+ lr: 5.0e-05
28
+ patience: 2
29
+ reduce_factor: 0.95
30
+ target_instrument: null
31
+ num_epochs: 1000
32
+ num_steps: 2000
33
+ q: 0.95
34
+ coarse_loss_clip: true
35
+ ema_momentum: 0.999
36
+ optimizer: adamw
37
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
38
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
39
+
40
+ augmentations:
41
+ enable: true # enable or disable all augmentations (to fast disable if needed)
42
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
43
+ loudness_min: 0.5
44
+ loudness_max: 1.5
45
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
46
+ mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
47
+ - 0.2
48
+ - 0.02
49
+ mixup_loudness_min: 0.5
50
+ mixup_loudness_max: 1.5
51
+
52
+ # apply mp3 compression to mixture only (emulate downloading mp3 from internet)
53
+ mp3_compression_on_mixture: 0.01
54
+ mp3_compression_on_mixture_bitrate_min: 32
55
+ mp3_compression_on_mixture_bitrate_max: 320
56
+ mp3_compression_on_mixture_backend: "lameenc"
57
+
58
+ all:
59
+ channel_shuffle: 0.5 # Set 0 or lower to disable
60
+ random_inverse: 0.1 # inverse track (better lower probability)
61
+ random_polarity: 0.5 # polarity change (multiply waveform by -1)
62
+ mp3_compression: 0.01
63
+ mp3_compression_min_bitrate: 32
64
+ mp3_compression_max_bitrate: 320
65
+ mp3_compression_backend: "lameenc"
66
+
67
+ vocals:
68
+ pitch_shift: 0.1
69
+ pitch_shift_min_semitones: -5
70
+ pitch_shift_max_semitones: 5
71
+ seven_band_parametric_eq: 0.25
72
+ seven_band_parametric_eq_min_gain_db: -9
73
+ seven_band_parametric_eq_max_gain_db: 9
74
+ tanh_distortion: 0.1
75
+ tanh_distortion_min: 0.1
76
+ tanh_distortion_max: 0.7
77
+ other:
78
+ pitch_shift: 0.1
79
+ pitch_shift_min_semitones: -4
80
+ pitch_shift_max_semitones: 4
81
+ gaussian_noise: 0.1
82
+ gaussian_noise_min_amplitude: 0.001
83
+ gaussian_noise_max_amplitude: 0.015
84
+ time_stretch: 0.01
85
+ time_stretch_min_rate: 0.8
86
+ time_stretch_max_rate: 1.25
87
+
88
+
89
+ inference:
90
+ batch_size: 1
91
+ dim_t: 512
92
+ num_overlap: 4
configs/config_vocals_bandit_bsrnn_multi_mus64.yaml ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: "MultiMaskMultiSourceBandSplitRNN"
2
+ audio:
3
+ chunk_size: 264600
4
+ num_channels: 2
5
+ sample_rate: 44100
6
+ min_mean_abs: 0.001
7
+
8
+ model:
9
+ in_channel: 1
10
+ stems: ['vocals', 'other']
11
+ band_specs: "musical"
12
+ n_bands: 64
13
+ fs: 44100
14
+ require_no_overlap: false
15
+ require_no_gap: true
16
+ normalize_channel_independently: false
17
+ treat_channel_as_feature: true
18
+ n_sqm_modules: 8
19
+ emb_dim: 128
20
+ rnn_dim: 256
21
+ bidirectional: true
22
+ rnn_type: "GRU"
23
+ mlp_dim: 512
24
+ hidden_activation: "Tanh"
25
+ hidden_activation_kwargs: null
26
+ complex_mask: true
27
+ n_fft: 2048
28
+ win_length: 2048
29
+ hop_length: 512
30
+ window_fn: "hann_window"
31
+ wkwargs: null
32
+ power: null
33
+ center: true
34
+ normalized: true
35
+ pad_mode: "constant"
36
+ onesided: true
37
+
38
+ training:
39
+ batch_size: 4
40
+ gradient_accumulation_steps: 4
41
+ grad_clip: 0
42
+ instruments:
43
+ - vocals
44
+ - other
45
+ lr: 9.0e-05
46
+ patience: 2
47
+ reduce_factor: 0.95
48
+ target_instrument: null
49
+ num_epochs: 1000
50
+ num_steps: 1000
51
+ q: 0.95
52
+ coarse_loss_clip: true
53
+ ema_momentum: 0.999
54
+ optimizer: adam
55
+ other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
56
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
57
+
58
+ augmentations:
59
+ enable: true # enable or disable all augmentations (to fast disable if needed)
60
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
61
+ loudness_min: 0.5
62
+ loudness_max: 1.5
63
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
64
+ mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
65
+ - 0.2
66
+ - 0.02
67
+ mixup_loudness_min: 0.5
68
+ mixup_loudness_max: 1.5
69
+
70
+ inference:
71
+ batch_size: 1
72
+ dim_t: 256
73
+ num_overlap: 4
configs/config_vocals_bs_roformer.yaml ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 131584
3
+ dim_f: 1024
4
+ dim_t: 256
5
+ hop_length: 512
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.001
10
+
11
+ model:
12
+ dim: 192
13
+ depth: 6
14
+ stereo: true
15
+ num_stems: 1
16
+ time_transformer_depth: 1
17
+ freq_transformer_depth: 1
18
+ linear_transformer_depth: 0
19
+ freqs_per_bands: !!python/tuple
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 2
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 4
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 12
63
+ - 12
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 24
71
+ - 24
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 48
79
+ - 48
80
+ - 128
81
+ - 129
82
+ dim_head: 64
83
+ heads: 8
84
+ attn_dropout: 0.1
85
+ ff_dropout: 0.1
86
+ flash_attn: true
87
+ dim_freqs_in: 1025
88
+ stft_n_fft: 2048
89
+ stft_hop_length: 512
90
+ stft_win_length: 2048
91
+ stft_normalized: false
92
+ mask_estimator_depth: 2
93
+ multi_stft_resolution_loss_weight: 1.0
94
+ multi_stft_resolutions_window_sizes: !!python/tuple
95
+ - 4096
96
+ - 2048
97
+ - 1024
98
+ - 512
99
+ - 256
100
+ multi_stft_hop_size: 147
101
+ multi_stft_normalized: False
102
+
103
+ training:
104
+ batch_size: 10
105
+ gradient_accumulation_steps: 1
106
+ grad_clip: 0
107
+ instruments:
108
+ - vocals
109
+ - other
110
+ lr: 5.0e-05
111
+ patience: 2
112
+ reduce_factor: 0.95
113
+ target_instrument: vocals
114
+ num_epochs: 1000
115
+ num_steps: 1000
116
+ q: 0.95
117
+ coarse_loss_clip: true
118
+ ema_momentum: 0.999
119
+ optimizer: adam
120
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
121
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
122
+
123
+ augmentations:
124
+ enable: true # enable or disable all augmentations (a quick way to turn them all off)
125
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
126
+ loudness_min: 0.5
127
+ loudness_max: 1.5
128
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
129
+ mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
130
+ - 0.2
131
+ - 0.02
132
+ mixup_loudness_min: 0.5
133
+ mixup_loudness_max: 1.5
134
+
135
+ inference:
136
+ batch_size: 1
137
+ dim_t: 256
138
+ num_overlap: 4
configs/config_vocals_htdemucs.yaml ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 485100 # samplerate * segment
3
+ min_mean_abs: 0.001
4
+ hop_length: 1024
5
+
6
+ training:
7
+ batch_size: 10
8
+ gradient_accumulation_steps: 1
9
+ grad_clip: 0
10
+ segment: 11
11
+ shift: 1
12
+ samplerate: 44100
13
+ channels: 2
14
+ normalize: true
15
+ instruments: ['vocals', 'other']
16
+ target_instrument: null
17
+ num_epochs: 1000
18
+ num_steps: 1000
19
+ optimizer: adam
20
+ lr: 9.0e-05
21
+ patience: 2
22
+ reduce_factor: 0.95
23
+ q: 0.95
24
+ coarse_loss_clip: true
25
+ ema_momentum: 0.999
26
+ other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
27
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
28
+
29
+ augmentations:
30
+ enable: true # enable or disable all augmentations (a quick way to turn them all off)
31
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
32
+ loudness_min: 0.5
33
+ loudness_max: 1.5
34
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
35
+ mixup_probs: [0.2, 0.02]
36
+ mixup_loudness_min: 0.5
37
+ mixup_loudness_max: 1.5
38
+
39
+ inference:
40
+ num_overlap: 2
41
+ batch_size: 8
42
+
43
+ model: htdemucs
44
+
45
+ htdemucs: # see demucs/htdemucs.py for a detailed description
46
+ # Channels
47
+ channels: 48
48
+ channels_time:
49
+ growth: 2
50
+ # STFT
51
+ num_subbands: 1
52
+ nfft: 4096
53
+ wiener_iters: 0
54
+ end_iters: 0
55
+ wiener_residual: false
56
+ cac: true
57
+ # Main structure
58
+ depth: 4
59
+ rewrite: true
60
+ # Frequency Branch
61
+ multi_freqs: []
62
+ multi_freqs_depth: 3
63
+ freq_emb: 0.2
64
+ emb_scale: 10
65
+ emb_smooth: true
66
+ # Convolutions
67
+ kernel_size: 8
68
+ stride: 4
69
+ time_stride: 2
70
+ context: 1
71
+ context_enc: 0
72
+ # normalization
73
+ norm_starts: 4
74
+ norm_groups: 4
75
+ # DConv residual branch
76
+ dconv_mode: 3
77
+ dconv_depth: 2
78
+ dconv_comp: 8
79
+ dconv_init: 1e-3
80
+ # Before the Transformer
81
+ bottom_channels: 512
82
+ # CrossTransformer
83
+ # ------ Common to all
84
+ # Regular parameters
85
+ t_layers: 5
86
+ t_hidden_scale: 4.0
87
+ t_heads: 8
88
+ t_dropout: 0.0
89
+ t_layer_scale: True
90
+ t_gelu: True
91
+ # ------------- Positional Embedding
92
+ t_emb: sin
93
+ t_max_positions: 10000 # for the scaled embedding
94
+ t_max_period: 10000.0
95
+ t_weight_pos_embed: 1.0
96
+ t_cape_mean_normalize: True
97
+ t_cape_augment: True
98
+ t_cape_glob_loc_scale: [5000.0, 1.0, 1.4]
99
+ t_sin_random_shift: 0
100
+ # ------------- norm before a transformer encoder
101
+ t_norm_in: True
102
+ t_norm_in_group: False
103
+ # ------------- norm inside the encoder
104
+ t_group_norm: False
105
+ t_norm_first: True
106
+ t_norm_out: True
107
+ # ------------- optim
108
+ t_weight_decay: 0.0
109
+ t_lr:
110
+ # ------------- sparsity
111
+ t_sparse_self_attn: False
112
+ t_sparse_cross_attn: False
113
+ t_mask_type: diag
114
+ t_mask_random_seed: 42
115
+ t_sparse_attn_window: 400
116
+ t_global_window: 100
117
+ t_sparsity: 0.95
118
+ t_auto_sparsity: False
119
+ # Cross Encoder First (False)
120
+ t_cross_first: False
121
+ # Weight init
122
+ rescale: 0.1
123
+
configs/config_vocals_mdx23c.yaml ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 261120
3
+ dim_f: 4096
4
+ dim_t: 256
5
+ hop_length: 1024
6
+ n_fft: 8192
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.001
10
+
11
+ model:
12
+ act: gelu
13
+ bottleneck_factor: 4
14
+ growth: 128
15
+ norm: InstanceNorm
16
+ num_blocks_per_scale: 2
17
+ num_channels: 128
18
+ num_scales: 5
19
+ num_subbands: 4
20
+ scale:
21
+ - 2
22
+ - 2
23
+
24
+ training:
25
+ batch_size: 6
26
+ gradient_accumulation_steps: 1
27
+ grad_clip: 0
28
+ instruments:
29
+ - vocals
30
+ - other
31
+ lr: 9.0e-05
32
+ patience: 2
33
+ reduce_factor: 0.95
34
+ target_instrument: null
35
+ num_epochs: 1000
36
+ num_steps: 1000
37
+ q: 0.95
38
+ coarse_loss_clip: true
39
+ ema_momentum: 0.999
40
+ optimizer: adam
41
+ other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
42
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
43
+
44
+ augmentations:
45
+ enable: true # enable or disable all augmentations (a quick way to turn them all off)
46
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
47
+ loudness_min: 0.5
48
+ loudness_max: 1.5
49
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
50
+ mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
51
+ - 0.2
52
+ - 0.02
53
+ mixup_loudness_min: 0.5
54
+ mixup_loudness_max: 1.5
55
+
56
+ # apply mp3 compression to mixture only (emulate downloading mp3 from internet)
57
+ mp3_compression_on_mixture: 0.01
58
+ mp3_compression_on_mixture_bitrate_min: 32
59
+ mp3_compression_on_mixture_bitrate_max: 320
60
+ mp3_compression_on_mixture_backend: "lameenc"
61
+
62
+ all:
63
+ channel_shuffle: 0.5 # Set 0 or lower to disable
64
+ random_inverse: 0.1 # inverse track (better lower probability)
65
+ random_polarity: 0.5 # polarity change (multiply waveform by -1)
66
+ mp3_compression: 0.01
67
+ mp3_compression_min_bitrate: 32
68
+ mp3_compression_max_bitrate: 320
69
+ mp3_compression_backend: "lameenc"
70
+
71
+ vocals:
72
+ pitch_shift: 0.1
73
+ pitch_shift_min_semitones: -5
74
+ pitch_shift_max_semitones: 5
75
+ seven_band_parametric_eq: 0.25
76
+ seven_band_parametric_eq_min_gain_db: -9
77
+ seven_band_parametric_eq_max_gain_db: 9
78
+ tanh_distortion: 0.1
79
+ tanh_distortion_min: 0.1
80
+ tanh_distortion_max: 0.7
81
+ other:
82
+ pitch_shift: 0.1
83
+ pitch_shift_min_semitones: -4
84
+ pitch_shift_max_semitones: 4
85
+ gaussian_noise: 0.1
86
+ gaussian_noise_min_amplitude: 0.001
87
+ gaussian_noise_max_amplitude: 0.015
88
+ time_stretch: 0.01
89
+ time_stretch_min_rate: 0.8
90
+ time_stretch_max_rate: 1.25
91
+
92
+ inference:
93
+ batch_size: 1
94
+ dim_t: 256
95
+ num_overlap: 4
configs/config_vocals_mel_band_roformer.yaml ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 131584
3
+ dim_f: 1024
4
+ dim_t: 256
5
+ hop_length: 512
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.001
10
+
11
+ model:
12
+ dim: 192
13
+ depth: 8
14
+ stereo: true
15
+ num_stems: 1
16
+ time_transformer_depth: 1
17
+ freq_transformer_depth: 1
18
+ linear_transformer_depth: 0
19
+ num_bands: 60
20
+ dim_head: 64
21
+ heads: 8
22
+ attn_dropout: 0.1
23
+ ff_dropout: 0.1
24
+ flash_attn: True
25
+ dim_freqs_in: 1025
26
+ sample_rate: 44100 # needed for mel filter bank from librosa
27
+ stft_n_fft: 2048
28
+ stft_hop_length: 512
29
+ stft_win_length: 2048
30
+ stft_normalized: False
31
+ mask_estimator_depth: 2
32
+ multi_stft_resolution_loss_weight: 1.0
33
+ multi_stft_resolutions_window_sizes: !!python/tuple
34
+ - 4096
35
+ - 2048
36
+ - 1024
37
+ - 512
38
+ - 256
39
+ multi_stft_hop_size: 147
40
+ multi_stft_normalized: False
41
+
42
+ training:
43
+ batch_size: 7
44
+ gradient_accumulation_steps: 1
45
+ grad_clip: 0
46
+ instruments:
47
+ - vocals
48
+ - other
49
+ lr: 5.0e-05
50
+ patience: 2
51
+ reduce_factor: 0.95
52
+ target_instrument: vocals
53
+ num_epochs: 1000
54
+ num_steps: 1000
55
+ q: 0.95
56
+ coarse_loss_clip: true
57
+ ema_momentum: 0.999
58
+ optimizer: adam
59
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
60
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
61
+
62
+ augmentations:
63
+ enable: true # enable or disable all augmentations (a quick way to turn them all off)
64
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
65
+ loudness_min: 0.5
66
+ loudness_max: 1.5
67
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
68
+ mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
69
+ - 0.2
70
+ - 0.02
71
+ mixup_loudness_min: 0.5
72
+ mixup_loudness_max: 1.5
73
+
74
+ inference:
75
+ batch_size: 1
76
+ dim_t: 256
77
+ num_overlap: 4
configs/config_vocals_scnet.yaml ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 264600
3
+ num_channels: 2
4
+ sample_rate: 44100
5
+ min_mean_abs: 0.000
6
+
7
+ model:
8
+ sources: ['vocals', 'other']
9
+ audio_channels: 2
10
+ # dims: [4, 32, 64, 128] # small version
11
+ dims: [4, 64, 128, 256]
12
+ nfft: 4096
13
+ hop_size: 1024
14
+ win_size: 4096
15
+ normalized: True
16
+ band_configs: {
17
+ 'low': { 'SR': .175, 'stride': 1, 'kernel': 3 },
18
+ 'mid': { 'SR': .392, 'stride': 4, 'kernel': 4 },
19
+ 'high': { 'SR': .433, 'stride': 16, 'kernel': 16 }
20
+ }
21
+ conv_depths: [3, 2, 1]
22
+ compress: 4
23
+ conv_kernel: 3
24
+ # Dual-path RNN
25
+ num_dplayer: 6
26
+ expand: 1
27
+ # mamba
28
+ use_mamba: False
29
+ mamba_config: {
30
+ 'd_stat': 16,
31
+ 'd_conv': 4,
32
+ 'd_expand': 2
33
+ }
34
+
35
+ training:
36
+ batch_size: 4
37
+ gradient_accumulation_steps: 2
38
+ grad_clip: 0
39
+ instruments:
40
+ - vocals
41
+ - other
42
+ lr: 5.0e-04
43
+ patience: 2
44
+ reduce_factor: 0.95
45
+ target_instrument: null
46
+ num_epochs: 1000
47
+ num_steps: 1000
48
+ q: 0.95
49
+ coarse_loss_clip: true
50
+ ema_momentum: 0.999
51
+ optimizer: adam
52
+ other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
53
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
54
+
55
+ augmentations:
56
+ enable: true # enable or disable all augmentations (a quick way to turn them all off)
57
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
58
+ loudness_min: 0.5
59
+ loudness_max: 1.5
60
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
61
+ mixup_probs:
62
+ !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
63
+ - 0.2
64
+ - 0.02
65
+ mixup_loudness_min: 0.5
66
+ mixup_loudness_max: 1.5
67
+
68
+ inference:
69
+ batch_size: 8
70
+ dim_t: 256
71
+ num_overlap: 4
configs/config_vocals_scnet_unofficial.yaml ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 264600
3
+ num_channels: 2
4
+ sample_rate: 44100
5
+ min_mean_abs: 0.000
6
+
7
+ model:
8
+ dims: [4, 32, 64, 128]
9
+ bandsplit_ratios: [.175, .392, .433]
10
+ downsample_strides: [1, 4, 16]
11
+ n_conv_modules: [3, 2, 1]
12
+ n_rnn_layers: 6
13
+ rnn_hidden_dim: 128
14
+ n_sources: 2
15
+
16
+ n_fft: 4096
17
+ hop_length: 1024
18
+ win_length: 4096
19
+ stft_normalized: false
20
+
21
+ use_mamba: false
22
+ d_state: 16
23
+ d_conv: 4
24
+ d_expand: 2
25
+
26
+ training:
27
+ batch_size: 10
28
+ gradient_accumulation_steps: 2
29
+ grad_clip: 0
30
+ instruments:
31
+ - vocals
32
+ - other
33
+ lr: 5.0e-04
34
+ patience: 2
35
+ reduce_factor: 0.95
36
+ target_instrument: null
37
+ num_epochs: 1000
38
+ num_steps: 1000
39
+ q: 0.95
40
+ coarse_loss_clip: true
41
+ ema_momentum: 0.999
42
+ optimizer: adam
43
+ other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
44
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
45
+
46
+ augmentations:
47
+ enable: true # enable or disable all augmentations (a quick way to turn them all off)
48
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
49
+ loudness_min: 0.5
50
+ loudness_max: 1.5
51
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
52
+ mixup_probs:
53
+ !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
54
+ - 0.2
55
+ - 0.02
56
+ mixup_loudness_min: 0.5
57
+ mixup_loudness_max: 1.5
58
+
59
+ inference:
60
+ batch_size: 8
61
+ dim_t: 256
62
+ num_overlap: 4
configs/config_vocals_segm_models.yaml ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 261632
3
+ dim_f: 4096
4
+ dim_t: 512
5
+ hop_length: 512
6
+ n_fft: 8192
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.001
10
+
11
+ model:
12
+ encoder_name: tu-maxvit_large_tf_512 # look here for possibilities: https://github.com/qubvel/segmentation_models.pytorch#encoders-
13
+ decoder_type: unet # unet, fpn
14
+ act: gelu
15
+ num_channels: 128
16
+ num_subbands: 8
17
+
18
+ loss_multistft:
19
+ fft_sizes:
20
+ - 1024
21
+ - 2048
22
+ - 4096
23
+ hop_sizes:
24
+ - 512
25
+ - 1024
26
+ - 2048
27
+ win_lengths:
28
+ - 1024
29
+ - 2048
30
+ - 4096
31
+ window: "hann_window"
32
+ scale: "mel"
33
+ n_bins: 128
34
+ sample_rate: 44100
35
+ perceptual_weighting: true
36
+ w_sc: 1.0
37
+ w_log_mag: 1.0
38
+ w_lin_mag: 0.0
39
+ w_phs: 0.0
40
+ mag_distance: "L1"
41
+
42
+
43
+ training:
44
+ batch_size: 8
45
+ gradient_accumulation_steps: 1
46
+ grad_clip: 0
47
+ instruments:
48
+ - vocals
49
+ - other
50
+ lr: 5.0e-05
51
+ patience: 2
52
+ reduce_factor: 0.95
53
+ target_instrument: null
54
+ num_epochs: 1000
55
+ num_steps: 2000
56
+ q: 0.95
57
+ coarse_loss_clip: true
58
+ ema_momentum: 0.999
59
+ optimizer: adamw
60
+ other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
61
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
62
+
63
+ augmentations:
64
+ enable: true # enable or disable all augmentations (a quick way to turn them all off)
65
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
66
+ loudness_min: 0.5
67
+ loudness_max: 1.5
68
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
69
+ mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
70
+ - 0.2
71
+ - 0.02
72
+ mixup_loudness_min: 0.5
73
+ mixup_loudness_max: 1.5
74
+
75
+ inference:
76
+ batch_size: 1
77
+ dim_t: 512
78
+ num_overlap: 4
configs/config_vocals_swin_upernet.yaml ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 261632
3
+ dim_f: 4096
4
+ dim_t: 512
5
+ hop_length: 512
6
+ n_fft: 8192
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.001
10
+
11
+ model:
12
+ act: gelu
13
+ num_channels: 16
14
+ num_subbands: 8
15
+
16
+ training:
17
+ batch_size: 14
18
+ gradient_accumulation_steps: 4
19
+ grad_clip: 0
20
+ instruments:
21
+ - vocals
22
+ - other
23
+ lr: 3.0e-05
24
+ patience: 2
25
+ reduce_factor: 0.95
26
+ target_instrument: null
27
+ num_epochs: 1000
28
+ num_steps: 1000
29
+ q: 0.95
30
+ coarse_loss_clip: true
31
+ ema_momentum: 0.999
32
+ optimizer: adamw
33
+ other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
34
+
35
+ augmentations:
36
+ enable: true # enable or disable all augmentations (a quick way to turn them all off)
37
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
38
+ loudness_min: 0.5
39
+ loudness_max: 1.5
40
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
41
+ mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
42
+ - 0.2
43
+ - 0.02
44
+ mixup_loudness_min: 0.5
45
+ mixup_loudness_max: 1.5
46
+
47
+ inference:
48
+ batch_size: 1
49
+ dim_t: 512
50
+ num_overlap: 4
configs/config_vocals_torchseg.yaml ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 261632
3
+ dim_f: 4096
4
+ dim_t: 512
5
+ hop_length: 512
6
+ n_fft: 8192
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.000
10
+
11
+ model:
12
+ encoder_name: maxvit_tiny_tf_512 # look with torchseg.list_encoders(). Currently 858 available
13
+ decoder_type: unet # unet, fpn
14
+ act: gelu
15
+ num_channels: 128
16
+ num_subbands: 8
17
+
18
+ training:
19
+ batch_size: 18
20
+ gradient_accumulation_steps: 1
21
+ grad_clip: 1.0
22
+ instruments:
23
+ - vocals
24
+ - other
25
+ lr: 1.0e-04
26
+ patience: 2
27
+ reduce_factor: 0.95
28
+ target_instrument: null
29
+ num_epochs: 1000
30
+ num_steps: 1000
31
+ q: 0.95
32
+ coarse_loss_clip: true
33
+ ema_momentum: 0.999
34
+ optimizer: adam
35
+ other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
36
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
37
+
38
+ augmentations:
39
+ enable: false # enable or disable all augmentations (a quick way to turn them all off)
40
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
41
+ loudness_min: 0.5
42
+ loudness_max: 1.5
43
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
44
+ mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
45
+ - 0.2
46
+ - 0.02
47
+ mixup_loudness_min: 0.5
48
+ mixup_loudness_max: 1.5
49
+
50
+ all:
51
+ channel_shuffle: 0.5 # Set 0 or lower to disable
52
+ random_inverse: 0.1 # inverse track (better lower probability)
53
+ random_polarity: 0.5 # polarity change (multiply waveform by -1)
54
+
55
+ inference:
56
+ batch_size: 8
57
+ dim_t: 512
58
+ num_overlap: 2
configs/viperx/model_bs_roformer_ep_317_sdr_12.9755.yaml ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 352800
3
+ dim_f: 1024
4
+ dim_t: 801 # has no effect in this section (the model's own settings are used)
5
+ hop_length: 441 # has no effect in this section (the model's own settings are used)
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.000
10
+
11
+ model:
12
+ dim: 512
13
+ depth: 12
14
+ stereo: true
15
+ num_stems: 1
16
+ time_transformer_depth: 1
17
+ freq_transformer_depth: 1
18
+ linear_transformer_depth: 0
19
+ freqs_per_bands: !!python/tuple
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 2
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 4
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 12
63
+ - 12
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 24
71
+ - 24
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 48
79
+ - 48
80
+ - 128
81
+ - 129
82
+ dim_head: 64
83
+ heads: 8
84
+ attn_dropout: 0.1
85
+ ff_dropout: 0.1
86
+ flash_attn: true
87
+ dim_freqs_in: 1025
88
+ stft_n_fft: 2048
89
+ stft_hop_length: 441
90
+ stft_win_length: 2048
91
+ stft_normalized: false
92
+ mask_estimator_depth: 2
93
+ multi_stft_resolution_loss_weight: 1.0
94
+ multi_stft_resolutions_window_sizes: !!python/tuple
95
+ - 4096
96
+ - 2048
97
+ - 1024
98
+ - 512
99
+ - 256
100
+ multi_stft_hop_size: 147
101
+ multi_stft_normalized: False
102
+
103
+ training:
104
+ batch_size: 2
105
+ gradient_accumulation_steps: 1
106
+ grad_clip: 0
107
+ instruments:
108
+ - vocals
109
+ - other
110
+ lr: 1.0e-05
111
+ patience: 2
112
+ reduce_factor: 0.95
113
+ target_instrument: vocals
114
+ num_epochs: 1000
115
+ num_steps: 1000
116
+ q: 0.95
117
+ coarse_loss_clip: true
118
+ ema_momentum: 0.999
119
+ optimizer: adam
120
+ other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
121
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
122
+
123
+ inference:
124
+ batch_size: 4
125
+ dim_t: 801
126
+ num_overlap: 2
configs/viperx/model_bs_roformer_ep_937_sdr_10.5309.yaml ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 131584
3
+ dim_f: 1024
4
+ dim_t: 256
5
+ hop_length: 512
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.001
10
+
11
+ model:
12
+ dim: 384
13
+ depth: 12
14
+ stereo: true
15
+ num_stems: 1
16
+ time_transformer_depth: 1
17
+ freq_transformer_depth: 1
18
+ linear_transformer_depth: 0
19
+ freqs_per_bands: !!python/tuple
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 2
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 4
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 12
63
+ - 12
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 24
71
+ - 24
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 48
79
+ - 48
80
+ - 128
81
+ - 129
82
+ dim_head: 64
83
+ heads: 8
84
+ attn_dropout: 0.1
85
+ ff_dropout: 0.1
86
+ flash_attn: true
87
+ dim_freqs_in: 1025
88
+ stft_n_fft: 2048
89
+ stft_hop_length: 512
90
+ stft_win_length: 2048
91
+ stft_normalized: false
92
+ mask_estimator_depth: 2
93
+ multi_stft_resolution_loss_weight: 1.0
94
+ multi_stft_resolutions_window_sizes: !!python/tuple
95
+ - 4096
96
+ - 2048
97
+ - 1024
98
+ - 512
99
+ - 256
100
+ multi_stft_hop_size: 147
101
+ multi_stft_normalized: False
102
+
103
+ training:
104
+ batch_size: 4
105
+ gradient_accumulation_steps: 1
106
+ grad_clip: 0
107
+ instruments:
108
+ - vocals
109
+ - other
110
+ lr: 5.0e-05
111
+ patience: 2
112
+ reduce_factor: 0.95
113
+ target_instrument: other
114
+ num_epochs: 1000
115
+ num_steps: 1000
116
+ q: 0.95
117
+ coarse_loss_clip: true
118
+ ema_momentum: 0.999
119
+ optimizer: adam
120
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
121
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
122
+
123
+ augmentations:
124
+ enable: true # enable or disable all augmentations (a quick way to turn them all off)
125
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
126
+ loudness_min: 0.5
127
+ loudness_max: 1.5
128
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
129
+ mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
130
+ - 0.2
131
+ - 0.02
132
+ mixup_loudness_min: 0.5
133
+ mixup_loudness_max: 1.5
134
+
135
+ inference:
136
+ batch_size: 8
137
+ dim_t: 512
138
+ num_overlap: 2
configs/viperx/model_mel_band_roformer_ep_3005_sdr_11.4360.yaml ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 352800
3
+ dim_f: 1024
4
+ dim_t: 801 # has no effect in this section (the model's own settings are used)
5
+ hop_length: 441 # has no effect in this section (the model's own settings are used)
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.000
10
+
11
+ model:
12
+ dim: 384
13
+ depth: 12
14
+ stereo: true
15
+ num_stems: 1
16
+ time_transformer_depth: 1
17
+ freq_transformer_depth: 1
18
+ linear_transformer_depth: 0
19
+ num_bands: 60
20
+ dim_head: 64
21
+ heads: 8
22
+ attn_dropout: 0.1
23
+ ff_dropout: 0.1
24
+ flash_attn: True
25
+ dim_freqs_in: 1025
26
+ sample_rate: 44100 # needed for mel filter bank from librosa
27
+ stft_n_fft: 2048
28
+ stft_hop_length: 441
29
+ stft_win_length: 2048
30
+ stft_normalized: False
31
+ mask_estimator_depth: 2
32
+ multi_stft_resolution_loss_weight: 1.0
33
+ multi_stft_resolutions_window_sizes: !!python/tuple
34
+ - 4096
35
+ - 2048
36
+ - 1024
37
+ - 512
38
+ - 256
39
+ multi_stft_hop_size: 147
40
+ multi_stft_normalized: False
41
+
42
+ training:
43
+ batch_size: 1
44
+ gradient_accumulation_steps: 8
45
+ grad_clip: 0
46
+ instruments:
47
+ - vocals
48
+ - other
49
+ lr: 4.0e-05
50
+ patience: 2
51
+ reduce_factor: 0.95
52
+ target_instrument: vocals
53
+ num_epochs: 1000
54
+ num_steps: 1000
55
+ q: 0.95
56
+ coarse_loss_clip: true
57
+ ema_momentum: 0.999
58
+ optimizer: adam
59
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
60
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
61
+
62
+ inference:
63
+ batch_size: 4
64
+ dim_t: 801
65
+ num_overlap: 2