# @package __global__
defaults:
  - /solver/default
  - /conditioner: none
  - _self_
  - /solver/musicgen/evaluation: none
  - override /dset: audio/default
autocast: true
autocast_dtype: float16
solver: musicgen
sample_rate: ???
channels: ???
compression_model_checkpoint: ???
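# Note: `???` marks mandatory values (OmegaConf) that must be supplied at launch,
# for instance via an override such as
# `compression_model_checkpoint=//pretrained/facebook/encodec_32khz` (illustrative
# value; point this at the compression model you actually trained or downloaded).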
# The following sets the number of codebooks on the underlying compression
# model. It may differ from the actual n_q given to the transformer when the
# model output is post-processed, for instance for stereo channels. If not
# provided, the compression model's default value is used.
compression_model_n_q: null
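# Illustrative example (assumption): with a 4-codebook compression model and stereo
# codebook interleaving enabled below, the transformer would model 2 * 4 = 8 codebook
# streams while compression_model_n_q remains 4.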
tokens:
  padding_with_special_token: false
interleave_stereo_codebooks:
  use: false
  per_timestep: false
cache:
  path:
  write: false
  write_shard: 0
  write_num_shards: 1
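# Note (assumed behaviour): the cache above is meant to write pre-computed tokens to
# disk, sharded across `write_num_shards` parallel writers; leaving `path` empty
# disables it.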
dataset:
  batch_size: 128
  num_workers: 10
  segment_duration: 30
  min_segment_ratio: 0.8  # lower values such as 0.5 result in generations with a lot of silence.
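  # e.g. with segment_duration: 30 and min_segment_ratio: 0.8, each sampled segment
  # contains at least 0.8 * 30 = 24 s of actual audio, the remainder being padding.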
  return_info: true
  train:
    num_samples: 1000000  # needs an arbitrarily large number here for AudioDataset
  valid:
    num_samples: 10000
  generate:
    num_samples: 50
metrics:
  fad:
    use_gt: false
    model: tf
    tf:
      bin: null  # path to local frechet_audio_distance code
      model_path: //reference/fad/vggish_model.ckpt
  kld:
    use_gt: false
    model: passt
    passt:
      pretrained_length: 20
  text_consistency:
    use_gt: false
    model: clap
    clap:
      model_path: //reference/clap/music_audioset_epoch_15_esc_90.14.pt
      model_arch: 'HTSAT-base'
      enable_fusion: false
  chroma_cosine:
    use_gt: false
    model: chroma_base
    chroma_base:
      sample_rate: ${sample_rate}
      n_chroma: 12
      radix2_exp: 14
      argmax: true
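# Note: the settings above only configure the metric implementations; whether each
# metric is actually computed is toggled under evaluate.metrics below.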
generate:
  every: 25
  num_workers: 5
  path: samples
  audio:
    format: wav
    strategy: loudness
    sample_rate: ${sample_rate}
    loudness_headroom_db: 14
  lm:
    prompted_samples: true
    unprompted_samples: true
    gen_gt_samples: false
    prompt_duration: null  # if not set, will use dataset.generate.segment_duration / 4
    gen_duration: null  # if not set, will use dataset.generate.segment_duration
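    # e.g. with dataset.segment_duration: 30, the defaults above resolve to a 7.5 s
    # prompt and a 30 s generation (illustrative arithmetic).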
    remove_prompts: false
    # generation params
    use_sampling: false
    temp: 1.0
    top_k: 0
    top_p: 0.0
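    # With use_sampling: false decoding is greedy (argmax); top_k: 0 and top_p: 0.0
    # are presumably the "disabled" values for top-k / nucleus filtering when
    # sampling is enabled.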
evaluate:
  every: 25
  num_workers: 5
  metrics:
    base: false
    fad: false
    kld: false
    text_consistency: false
    chroma_cosine: false
checkpoint:
  save_last: true
  save_every: 50
  keep_last: 10
  keep_every_states: null
optim:
  epochs: 200
  updates_per_epoch: 2000
  lr: 1e-4
  optimizer: adamw
  max_norm: 1.0
  eager_sync: true
  adam:
    betas: [0.9, 0.95]
    weight_decay: 0.1
    eps: 1e-8
schedule:
  lr_scheduler: null
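  # A scheduler can be selected here, e.g. `lr_scheduler: cosine` together with a
  # matching `cosine:` section (warmup, lr_min_ratio, cycle_length), as assumed from
  # the model-specific MusicGen configs.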