---
# Dataset settings.
dataset:
  # No entries are configured for these three lists in this file.
  training: []
  validation: []
  noise: []

  # Python lambda source stored as a plain string.
  # NOTE(review): presumably evaluated by the consumer to derive a speaker
  # name from a path's parent directories - confirm, and only load this file
  # from a trusted source.
  speaker_name_getter: "lambda p: f'{p.parts[-3]}_{p.parts[-2]}'"

  use_hdf5: true
  use_metadata: true
  hdf5_flag: r
  validate: true

  workers: 4
  cache: true

  # Two-element pairs; presumably [min, max] bounds - TODO confirm.
  phones_range: [4, 256]
  duration_range: [1.0, 16.0]

  random_utterance: 1.0
  max_prompts: 3
  prompt_duration: 3.0

  sample_type: speaker

  # Only "tts" is active. A commented-out tail in the original also named
  # "ns", "sr", "tse", "cse" and "nse".
  tasks_list: ["tts"]
# Model definitions.
models:
  _prom_levels: 4
  _max_levels: 8

  # NOTE(review): the extent of the list item (name .. training) was
  # reconstructed from a source whose indentation had been lost - confirm
  # against the consumer's schema.
  _models:
    - name: "ar+nar"
      size: "double"
      resp_levels: 4
      prom_levels: 4
      tasks: 8
      arch_type: "retnet"
      training: true
# Optimisation hyperparameters.
hyperparameters:
  batch_size: 8
  gradient_accumulation_steps: 1
  gradient_clipping: 100

  optimizer: AdamW
  # Keep the decimal point and the signed exponent: YAML 1.1 parsers such as
  # PyYAML read a spelling like `1e-5` as a string rather than a float.
  learning_rate: 1.0e-5

  # Explicit empty string (not null).
  scheduler_type: ""
# Evaluation settings.
evaluation:
  batch_size: 16
  frequency: 500
  size: 16

  steps: 300
  ar_temperature: 0.95
  nar_temperature: 0.25
  # Set to true here; the `trainer` section carries a key of the same name.
  load_disabled_engines: true
# Trainer settings.
trainer:
  # One million. Written without `_` digit separators, which only YAML 1.1
  # parsers resolve as an integer.
  iterations: 1000000

  save_tag: step
  save_on_oom: true
  save_on_quit: true
  save_frequency: 500
  export_on_save: true

  keep_last_checkpoints: 4

  aggressive_optimizations: false
  load_disabled_engines: false

  load_state_dict: true

  # NOTE(review): a plain `None` is the string "None" in YAML, not a null
  # (null is spelled `null` or `~`). Left unchanged because the consumer's
  # expectation is not visible here - confirm.
  gc_mode: None  # "global_step"

  weight_dtype: float32
  amp: false

  backend: local
  # NOTE(review): nesting of `deepspeed` under `trainer` was reconstructed
  # from a source whose indentation had been lost - confirm.
  deepspeed:
    zero_optimization_level: 0
    use_compression_training: true
# Inference settings.
inference:
  weight_dtype: float32
  amp: false

  use_vocos: true
  normalize: false

  recurrent_chunk_size: 0
  recurrent_forward: false
# bitsandbytes settings. `enabled` is false in this file; the remaining
# flags are still declared.
bitsandbytes:
  enabled: false
  injects: true
  linear: true
  embedding: true
# NOTE(review): placed at top level; the flattened source does not show
# which mapping this key belongs to - confirm.
device: cpu