File size: 1,830 Bytes
09f258a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1ea585e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# Top-level audio settings.
# The sample rate is written with plain digits: the `24_000` spelling is only
# resolved to an integer by YAML 1.1 loaders (e.g. PyYAML); a YAML 1.2
# core-schema loader would read it as the string "24_000".
sample_rate: 24000
audio_backend: "vocos"

# Model definitions: a list holding a single entry.
models:
  - name: "ar+nar-tts+stt"
    size: "full"
    resp_levels: 8
    prom_levels: 8
    tasks: 9
    langs: 4
    tones: 1
    arch_type: llama
    training: false
    version: 5
    attention: auto
    dropout: 0.1
    # Disabled loss_factors override, kept for reference:
    # loss_factors:
    #   text: 0.01
    #   prom: 0.5
    #   resp: 1.0
    capabilities:
      - "ar"
      - "nar"
    experimental:
      # These flags modify the model architecture.
      audio_embedding_sums: true
      unified_position_ids: false
      split_classifiers: true

#loras:
#- name : "lora"
#  rank: 128
#  alpha: 128
#  training: True
#  rvq_levels: []

# Training hyperparameters.
hyperparameters:
  # Batch sizing, gradient handling and warmup.
  batch_size: 32
  gradient_accumulation_steps: 8
  gradient_clipping: 1.0
  warmup_steps: 10

  # Optimizer selection and its learning rate.
  optimizer: Prodigy
  learning_rate: 1.0
  torch_optimizer: true

  # The scheduler name is an empty string here; the trailing note records
  # `ScheduleFree`, presumably an alternative value (TODO confirm).
  scheduler: ""  # ScheduleFree
  torch_scheduler: true

# Evaluation settings.
evaluation:
  batch_size: 4
  frequency: 250
  size: 4

  # Step count and the AR / NAR temperatures used for evaluation.
  steps: 500
  ar_temperature: 1.0
  nar_temperature: 0.0

# Trainer settings.
trainer:
  # Written with plain digits: the `1_000_000` spelling is only an integer
  # under YAML 1.1 loaders; YAML 1.2 core-schema loaders read it as a string.
  iterations: 1000000
  save_frequency: 250
  keep_last_checkpoints: 4

  resize_modules: true
  gradient_checkpointing: true

  # Weight dtype and AMP toggle at the trainer level.
  weight_dtype: bfloat16
  amp: true

  # Selected backend, followed by the options nested under its own key.
  backend: deepspeed
  deepspeed:
    inferencing: false
    amp: false

# Inference settings: backend, weight dtype and AMP toggle.
inference:
  backend: local
  weight_dtype: bfloat16
  amp: true

# Optimization toggles. In this file only `replace` and `optimizers` are
# enabled; every other flag is off.
optimizations:
  injects: false
  replace: true

  linear: false
  embedding: false
  optimizers: true

  bitsandbytes: false
  dadaptation: false
  bitnet: false
  fp8: false

# Dataset settings.
dataset:
  # HDF5 usage and the flag passed along with it.
  use_hdf5: true
  hdf5_flag: r

  use_metadata: true
  validate: true

  workers: 1
  cache: true

  # Two-element ranges, written as [low, high].
  duration_range: [3.0, 12.0]

  prompt_max_samples: 1
  prompt_duration_range: [3.0, 3.0]

  resps_max_samples: 1

  # Sampling behaviour. The original note lists `path` and `speaker` next to
  # sample_type, presumably the accepted values (TODO confirm).
  sample_type: path
  sample_order: duration
  sample_max_duration_batch: 300
  sample_shuffle: false

  tasks_list: ["tts", "stt"]

  # These three lists are empty in this file.
  training: []
  validation: []
  noise: []