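# Training configuration for the Jais 256M bilingual model on Cerebras CSX
# (4 systems, per runconfig.num_csx and the Num_CSX job label). The layout
# appears to follow the Cerebras ModelZoo params.yaml format; the paths
# below point at a rel-2.2.1 ModelZoo checkout.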
train_input:
  batch_size: 256
  data_processor: GptHDF5MapDataProcessor
  data_dir: /cra-406/datasets/jais_instruction_datasets/v12p2/tokenized_mlv2_2k/
  num_workers: 1
  persistent_workers: true
  prefetch_factor: 10
  repeat: true
  shuffle: false
  shuffle_seed: 1
  use_worker_cache: false
  vocab_size: 84992
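# At batch_size 256 and 2,048-token sequences (per the *_2k data dir and
# max_position_embeddings below), each step consumes 256 * 2048 = 524,288
# tokens; 24,690 steps works out to ~12.9B tokens, assuming full packing.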
eval_input:
  batch_size: 32
  data_dir: /cb/customers/g42/datasets/multilingual_v2/pile_val_correct_packed
  data_processor: GptHDF5MapDataProcessor
  num_workers: 1
  repeat: false
  shuffle: false
  use_worker_cache: false
  vocab_size: 84992
model:
  mixed_precision: true
  fp16_type: cbfloat16
  boundary_casting: false
  lora_params: null
  vocab_size: 84992
  embedding_layer_norm: false
  embedding_dropout_rate: 0.0
  share_embedding_weights: true
  position_embedding_type: alibi
  max_position_embeddings: 2048
  position_embedding_offset: 0
  num_relative_attention_buckets: 32
  rotary_dim: null
  rope_theta: 10000
  pad_rope: false
  alibi_trainable_slopes: false
  pos_scaling_factor: 1.0
  hidden_size: 1088
  num_hidden_layers: 14
  dropout_rate: 0.0
  norm_type: layernorm
  layer_norm_epsilon: 1.0e-05
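  # 1088 hidden units / 17 heads = 64-dim attention heads.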
  num_heads: 17
  attention_module: aiayn_attention
  extra_attention_params: {}
  attention_type: scaled_dot_product
  attention_dropout_rate: 0.0
  use_projection_bias_in_attention: true
  use_ffn_bias_in_attention: true
  attention_softmax_fp32: false
  attention_kernel: optimized_beta
  attention_sliding_window_length: null
  scale_qk_dot_by_layer_idx: false
  fixed_sparse_attention: null
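  # filter_size 2912 is close to (8/3) * 1088 = 2901.3, the common SwiGLU
  # FFN sizing, rounded up to a multiple of 32 (2912 = 32 * 91). That
  # reading is an inference, not stated in this file.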
  filter_size: 2912
  nonlinearity: swiglu
  use_ffn_bias: true
  use_bias_in_output: false
  loss_scaling: num_tokens
  loss_weight: 1.0
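  # embeddings_scale / scale_qk_dot_by_d / output_logits_scale together are
  # characteristic of muP-style (maximal update parameterization) scaling.
  # Reading them that way is an inference from the key names, not something
  # stated in this file.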
  embeddings_scale: 9.1705785388303
  scale_qk_dot_by_d: true
  output_logits_scale: 0.2576902348606329
  initializer:
    name: truncated_normal
    mean: 0.0
    std: 0.04203434605680388
    a: -0.08406869211360776
    b: 0.08406869211360776
    nonlinearity: null
    mode: null
    scale: null
    distribution: null
  initializer_range: 0.02
  embedding_initializer:
    name: truncated_normal
    mean: 0.0
    std: 0.0866560243479838
    a: -0.1733120486959676
    b: 0.1733120486959676
    nonlinearity: null
    mode: null
    scale: null
    distribution: null
  output_layer_initializer:
    name: truncated_normal
    mean: 0.0
    std: 0.007943744727823684
    a: -0.015887489455647368
    b: 0.015887489455647368
    nonlinearity: null
    mode: null
    scale: null
    distribution: null
  compute_eval_metrics: true
sparsity: null
optimizer:
  optimizer_type: AdamW
  weight_decay: 0.1
  log_summaries: true
  loss_scaling_factor: dynamic
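  # Two-phase schedule: linear warmup 0 -> 1.6e-3 over 695 steps, then
  # linear decay 1.6e-3 -> 1.6e-4 over 23,995 steps.
  # 695 + 23,995 = 24,690 = runconfig.max_steps, so the schedule spans the
  # full run.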
  learning_rate:
  - end_learning_rate: 0.0016
    initial_learning_rate: 0.0
    scheduler: Linear
    total_iters: 695
  - end_learning_rate: 0.00016
    initial_learning_rate: 0.0016
    scheduler: Linear
    total_iters: 23995
  max_gradient_norm: 1.0
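  # decoder_kernel = 0.23529411764705882 = 4/17 = 256/1088. This is
  # consistent with a muP-style per-layer LR correction against a 256-wide
  # base model, though that base width is an inference, not stated here.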
  adjust_learning_rate:
    decoder_kernel: 0.23529411764705882
  betas:
  - 0.9
  - 0.95
  correct_bias: true
  eps: 1.0e-08
runconfig:
  steps_per_epoch: null
  max_steps: 24690
  mgmt_address: null
  mount_dirs:
  - /cra-406
  num_epochs: null
  python_paths:
  - /cra-406/workdirs/modelzoos/rel-2.2.1/modelzoo/src
  compile_dir: null
  checkpoint_path: /cra-406/workdirs/240209_Jais_series_v3/artifacts/model_dir_256M/checkpoint_240320.mdl
  credentials_path: null
  debug_args_path: null
  retrace_every_iteration: null
  eval_steps: 5219
  init_method: env://
  job_time_sec: null
  job_labels:
  - Name=Neha_Sengupta
  - Organization=Core42
  - Model=Jais_256M
  - Mode=Train
  - Num_CSX=4
  - Language=Bilingual
  - Type=Train
  - Dataset=v12p2
  job_priority: p3
  seed: 1
  mgmt_namespace: null
  load_checkpoint_states: model
  target_device: CSX
  mode: train
  wsc_log_level: null
  autoload_last_checkpoint: true
  check_loss_values: true
  disable_strict_checkpoint_loading: null
  dist_addr: localhost:8888
  dist_backend: nccl
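  # checkpoint_steps is roughly max_steps / 3 (24,690 / 3 = 8,230), i.e.
  # about three evenly spaced checkpoints over the run.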
  checkpoint_steps: 8231
  disable_version_check: true
  drop_data: false
  enable_distributed: false
  model_dir: artifacts/jais_256M_v12p2_gbs256
  save_initial_checkpoint: false
  precision_opt_level: 1
  num_workers_per_csx: 0
  validate_only: null
  logging: null
  sync_batchnorm: false
  compile_only: null
  log_steps: 1
  num_steps: null
  transfer_processes: null
  num_wgt_servers: null
  num_csx: 4
  num_act_servers: null
  eval_frequency: null
  execute_crd_memory_gi: null
  compile_crd_memory_gi: null
  op_profiler_config: null
  dump_activations: false
  log_input_summaries: false
  main_process_id: 0
  max_checkpoints: 100000
  summary_dir: null
  lazy_initialization: true
  use_cstorch_optimizer_step: false
  wrk_memory_gi: null
  act_memory_gi: null
  cmd_memory_gi: null
  wgt_memory_gi: null
  experimental: {}
  ini: null
  debug_args: null
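# A params file like this is typically passed to the ModelZoo run script.
# A sketch of the launch command, assuming the standard rel-2.2.x CLI
# (flag names per the Cerebras ModelZoo docs; verify against your release):
#
#   python run.py CSX \
#       --params params.yaml \
#       --mode train \
#       --num_csx 4 \
#       --mount_dirs /cra-406 \
#       --python_paths /cra-406/workdirs/modelzoos/rel-2.2.1/modelzoo/src \
#       --model_dir artifacts/jais_256M_v12p2_gbs256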