annapurnapadmaprema-ji's picture
Upload 278 files
2631d60 verified
# @package __global__
# Hydra defaults list: the entries are composed in order, with this file's
# own values (`_self_`) applied last.
defaults:
  - /solver/default
  - /augmentations/default
  - override /dset: audio/example
  - _self_
solver: watermarking  # standard name to load the solver using builders
# `???` marks a mandatory value: it is not set here and must be provided
# by another config or on the command line.
sample_rate: ???
channels: ???
# all the defaults from compression
# One entry per loss term, keyed by loss name.
# NOTE(review): values look like per-term weights (0.0 = term disabled) —
# confirm against the solver.
losses:
  adv: 4.
  feat: 4.
  l1: 0.1
  mel: 0.0
  msspec: 2.0
  sisnr: 0.0
  wm_detection: 1.0  # loss for first 2 bits cannot be 0
  wm_mb: 1.0  # loss for the rest of the bits (wm message)
  tf_loudnessratio: 10.0
# Settings for the loss balancer.
# NOTE(review): exact semantics of each field live in the balancer
# implementation, which is not visible from this file.
balancer:
  balance_grads: true
  ema_decay: 0.999
  per_batch_item: true
  total_norm: 1.
# Crop / shuffle / pad settings. The three probabilities below must not
# sum to more than 1 (0.4 + 0.2 + 0.2 = 0.8 here).
crop:
  prob: 0.4
  shuffle_prob: 0.2
  pad_prob: 0.2  # shuffle_prob + pad_prob + prob <= 1
  size: 0.5
  max_n_windows: 5
# Adversarial training setup. `adversaries` lists which adversary configs
# to use; `msstftd` matches the top-level key of the same name in this file.
adversarial:
  every: 1
  adversaries: [msstftd]
  adv_loss: hinge
  feat_loss: l1
# Hyperparameters of the `tf_loudnessratio` loss listed under `losses`.
# `sample_rate` is interpolated from the top-level `sample_rate` value.
tf_loudnessratio:
  sample_rate: ${sample_rate}
  segment: 0.5
  overlap: 0.5
  n_bands: 16
  temperature: 1.0
# watermarking: audioseal
# losses hyperparameters
# `l1` and `l2` take no hyperparameters (explicit empty mappings).
l1: {}
l2: {}
# Hyperparameters of the watermark detection loss.
# NOTE(review): p_weight / n_weight presumably weight the positive and
# negative classes — confirm against the loss implementation.
wm_detection:
  p_weight: 1
  n_weight: 1
# Hyperparameters of the watermark message (multi-bit) loss.
wm_mb:
  loss_type: bce  # loss between decoded and original
  temperature: 0.1  # decoded is divided by temperature before loss computation
# Hyperparameters for `spec_range`: an FFT size and a frequency band.
# NOTE(review): frequencies are presumably in Hz — confirm.
spec_range:
  n_fft: 2048
  min_frequency: 300.0
  max_frequency: 15000.0
  sample_rate: ${sample_rate}
# Hyperparameters for `spec_entropy_range`; same values as `spec_range`.
# NOTE(review): frequencies are presumably in Hz — confirm.
spec_entropy_range:
  n_fft: 2048
  min_frequency: 300.0
  max_frequency: 15000.0
  sample_rate: ${sample_rate}
# Hyperparameters for `mrstft`: two factors and a normalization switch.
mrstft:
  factor_sc: 0.5
  factor_mag: 0.5
  normalized: false
# Hyperparameters of the `mel` loss listed under `losses`.
mel:
  sample_rate: ${sample_rate}
  n_fft: 1024
  hop_length: 256
  win_length: 1024
  n_mels: 64
  f_min: 64
  f_max: null
  normalized: false
  # Explicit mantissa dot so YAML 1.1 loaders also read this as a float.
  floor_level: 1.0e-5
# Hyperparameters of the `sisnr` loss listed under `losses`.
# NOTE(review): `segment` is presumably a duration in seconds — confirm.
sisnr:
  sample_rate: ${sample_rate}
  segment: 5.
# Hyperparameters of the `msspec` loss listed under `losses`.
# NOTE(review): range_start / range_end presumably select the scales
# used — confirm against the loss implementation.
msspec:
  sample_rate: ${sample_rate}
  range_start: 6
  range_end: 11
  n_mels: 64
  f_min: 64
  f_max: null
  normalized: true
  alphas: false
  # Explicit mantissa dot so YAML 1.1 loaders also read this as a float.
  floor_level: 1.0e-5
# metrics
# Per-metric settings; only ViSQOL is configured here.
metrics:
  visqol:
    mode: audio
    bin: null  # path to visqol install
    model: tcdaudio14_aacvopus_coresv_svrnsim_n.68_g.01_c1.model  # visqol v3
# adversaries hyperparameters
# Settings for the `msstftd` adversary, the one selected in
# `adversarial.adversaries`. The three lists below have one entry per
# STFT resolution and must stay the same length (five each here).
msstftd:
  in_channels: 1
  out_channels: 1
  filters: 32
  norm: weight_norm
  n_ffts: [1024, 2048, 512, 256, 128]
  hop_lengths: [256, 512, 128, 64, 32]
  win_lengths: [1024, 2048, 512, 256, 128]
  activation: LeakyReLU
  activation_params: {negative_slope: 0.3}
# Settings for the `msd` adversary. It is not listed in
# `adversarial.adversaries` in this file.
msd:
  in_channels: 1
  out_channels: 1
  scale_norms: [spectral_norm, weight_norm, weight_norm]
  kernel_sizes: [5, 3]
  filters: 16
  max_filters: 1024
  downsample_scales: [4, 4, 4, 4]
  inner_kernel_sizes: null
  groups: [4, 4, 4, 4]
  strides: null
  paddings: null
  activation: LeakyReLU
  activation_params: {negative_slope: 0.3}
# Settings for the `mpd` adversary. It is not listed in
# `adversarial.adversaries` in this file.
mpd:
  in_channels: 1
  out_channels: 1
  periods: [2, 3, 5, 7, 11]
  n_layers: 5
  kernel_size: 5
  stride: 3
  filters: 8
  filter_scales: 4
  max_filters: 1024
  activation: LeakyReLU
  activation_params: {negative_slope: 0.3}
  norm: weight_norm
# data hyperparameters
# Data loading settings. The first three keys are shared values; the
# per-split sub-mappings (train / valid / evaluate / generate) set
# split-specific values.
# NOTE(review): segment_duration is presumably in seconds — confirm.
dataset:
  batch_size: 16
  num_workers: 10
  segment_duration: 1
  train:
    num_samples: 500000
  valid:
    num_samples: 10000
  evaluate:
    batch_size: 16
    num_samples: 10000
    segment_duration: 10
  generate:
    batch_size: 16
    num_samples: 50
    segment_duration: 30
# solver hyperparameters
# Evaluation stage settings; `metrics` switches individual metrics on/off.
# NOTE(review): `every` is presumably counted in epochs — confirm.
evaluate:
  every: 10
  num_workers: 5
  metrics:
    visqol: false
    sisnr: true
# Generation stage settings; generated audio uses the top-level sample rate.
# NOTE(review): `every` is presumably counted in epochs — confirm.
generate:
  every: 10
  num_workers: 5
  audio:
    sample_rate: ${sample_rate}
# checkpointing schedule
# Checkpoint retention settings.
# NOTE(review): save_every / keep_last are presumably counted in epochs
# and checkpoints respectively — confirm against the solver.
checkpoint:
  save_last: true
  save_every: 25
  keep_last: 10
  keep_every_states: null
# optimization hyperparameters
# Optimizer settings. `optimizer` names the optimizer; the `adam`
# sub-mapping holds that optimizer's own parameters.
optim:
  epochs: 300
  updates_per_epoch: 2000
  # Explicit mantissa dot so YAML 1.1 loaders also read this as a float.
  lr: 5.0e-5
  max_norm: 3.0
  optimizer: adam
  adam:
    betas: [0.5, 0.9]
    weight_decay: 0.
  ema:
    use: true  # whether to use EMA or not
    updates: 1  # update at every step
    device: ${device}  # device for EMA, can be put on GPU if more frequent updates
    decay: 0.99  # EMA decay value, if null, no EMA is used
# Learning-rate schedule. `lr_scheduler` names the scheduler; the `cosine`
# sub-mapping holds that scheduler's own parameters.
# NOTE(review): `schedule` is placed at top level (not under `optim`), and
# `warmup` is presumably counted in optimizer steps — confirm both.
schedule:
  lr_scheduler: "cosine"
  cosine:
    warmup: 4000
    lr_min_ratio: 0.0
    cycle_length: 1.0