# vall-e / models/config.retnet.yaml
# ecker's picture
# Update models/config.retnet.yaml
# 494a301 verified
# Global audio settings.
# NOTE(review): sample_rate is presumably in Hz - confirm against the consumer.
sample_rate: 24_000
# Name of the audio backend to use.
audio_backend: vocos
# Model definitions: a list holding a single entry named "ar+nar".
# All keys from `size` through `experimental` belong to that entry; they are
# nested under it so that `size` and `training` do not clash with the keys of
# the same name that appear again later in this file.
models:
- name: "ar+nar"
  size: "full"
  resp_levels: 8
  prom_levels: 8
  tasks: 8
  langs: 2
  tones: 1
  arch_type: retnet
  training: False
  version: 2
  dropout: 0.1
  capabilities: ["ar", "nar"]
  # Optional feature toggles for this model entry.
  experimental:
    audio_embedding_sums: True
#loras:
#- name : "lora"
#  rank: 128
#  alpha: 128
#  training: True
#  rvq_levels: []
# Training hyperparameters: batching, gradient handling, optimizer, scheduler.
hyperparameters:
  batch_size: 32
  gradient_accumulation_steps: 8
  gradient_clipping: 1.0
  warmup_steps: 10

  optimizer: Prodigy
  learning_rate: 1.0
  torch_optimizer: True

  # The scheduler name is an empty string here; the trailing note presumably
  # names an alternative value - confirm before changing.
  scheduler: ""  # ScheduleFree
  torch_scheduler: True
# Evaluation settings. `batch_size` and `size` here are scoped to evaluation
# and are independent of the same-named keys elsewhere in this file.
evaluation:
  batch_size: 4
  frequency: 250
  size: 4

  steps: 500
  # Temperatures for the "ar" and "nar" capabilities respectively.
  ar_temperature: 1.0
  nar_temperature: 0.0
# Trainer settings: run length, checkpointing, precision and backend.
trainer:
  iterations: 1_000_000
  save_frequency: 250
  keep_last_checkpoints: 4

  resize_modules: True
  gradient_checkpointing: True

  weight_dtype: bfloat16
  amp: True

  backend: deepspeed
  # Options specific to the deepspeed backend. Its `amp` flag is a separate
  # key from `trainer.amp` above, which is why it must stay nested here.
  deepspeed:
    inferencing: False
    amp: False
# Inference settings; these mirror the trainer's backend/precision keys but
# are scoped to inference only.
inference:
  backend: local
  weight_dtype: bfloat16
  amp: True
# Optimization toggles; every value is a boolean switch.
optimizations:
  injects: False
  replace: True

  linear: False
  embedding: False
  optimizers: True

  bitsandbytes: False
  dadaptation: False
  bitnet: False
  fp8: False
# Dataset settings: storage format, loading, sampling and the data lists.
dataset:
  use_hdf5: True
  hdf5_flag: r

  use_metadata: True
  validate: True

  workers: 1
  cache: True

  # NOTE(review): ranges are presumably [min, max] durations in seconds - confirm.
  duration_range: [3.0, 12.0]

  prompt_max_samples: 1
  prompt_duration_range: [3.0, 3.0]

  resps_max_samples: 1

  sample_type: path  # path # speaker
  sample_order: duration
  sample_max_duration_batch: 300
  sample_shuffle: False

  tasks_list: ["tts", "stt"]

  # Data lists, all empty in this config. `training` here is a list scoped to
  # the dataset and is distinct from the boolean `training` flag on the model.
  training: []
  validation: []
  noise: []