File size: 7,212 Bytes
b7bdf95
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
# Training input pipeline: batch size, data processor, and a weighted mixture
# of dataset directories.
train_input:
  batch_size: 976
  data_processor: GptHDF5MapDataProcessor
  # 20 dataset directories, each paired with a weight. The weights sum to 1.0
  # (within floating-point rounding).
  # NOTE(review): presumably these are per-source sampling proportions;
  # confirm against the data processor.
  mixture:
  # Pile and GitHub directories.
  - data_dir: /cra-406/datasets/pile/multilingual_v2/train_correct/
    weight: 0.6510508179774476
  - data_dir: /cra-406/datasets/github/multilingual_v2/packed_2k/train
    weight: 0.055087602323960365
  # books3_arabic directory.
  - data_dir: /cra-406/datasets/books3_arabic/multilingual_v2/packed_2k/books_3_arabic_train_correct_packed
    weight: 0.031560734650858936
  # AraV5 directories (14 entries).
  - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/alkhair_train_packed
    weight: 0.0008441127388845985
  - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/aranews_train_packed
    weight: 0.00015702987060793174
  - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/baai_train_packed
    weight: 0.02652363386071335
  - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/C4_train_packed
    weight: 0.04370135940994404
  - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/ccnews_train_packed
    weight: 0.006820988629070355
  - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/common_crawl_train_packed
    weight: 0.16413286051785408
  - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/daypop_train_packed
    weight: 0.001772579714458703
  - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/en2ar_wikipedia_train_packed
    weight: 0.006335165657431352
  - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/en_wikipedia_train_packed
    weight: 0.0035095904892209306
  - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/maktabah_train_packed
    weight: 0.002642036817637927
  - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/misc_train_packed
    weight: 6.954077746676907e-05
  - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/osian_train_packed
    weight: 0.0006243331144143421
  - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/un_train_packed
    weight: 0.001005513115682201
  - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/wikipedia_train_packed
    weight: 0.00034892678537459647
  # uae_news (ar / en) and itc directories.
  - data_dir: /cra-406/datasets/multilingual_v2/uae_news_sep_2023_12b_split/ar/train_shuffled
    weight: 0.0012430476743474177
  - data_dir: /cra-406/datasets/multilingual_v2/itc_663m_split/train_shuffled
    weight: 0.0013597894242614768
  - data_dir: /cra-406/datasets/multilingual_v2/uae_news_sep_2023_12b_split/en/train_shuffled
    weight: 0.0012103364503629503
  num_workers: 1
  persistent_workers: true
  prefetch_factor: 10
  repeat: true
  # NOTE(review): shuffling is off even though shuffle_seed is set; some
  # directory names end in `train_shuffled`, so the data may have been
  # shuffled offline — confirm.
  shuffle: false
  shuffle_seed: 1
  use_worker_cache: false
  # Same value as eval_input.vocab_size and model.vocab_size.
  vocab_size: 84992
# Evaluation input pipeline: a single dataset directory, no repeat or shuffle.
eval_input:
  batch_size: 32
  # NOTE(review): this path is under /cb/, while runconfig.mount_dirs lists
  # only /cra-406 — confirm the directory is reachable before running eval.
  data_dir: /cb/customers/g42/datasets/multilingual_v2/pile_val_correct_packed
  data_processor: GptHDF5MapDataProcessor
  num_workers: 1
  repeat: false
  shuffle: false
  use_worker_cache: false
  # Same value as train_input.vocab_size and model.vocab_size.
  vocab_size: 84992
# Model architecture, precision, scaling factors, and weight initializers.
model:
  mixed_precision: true
  fp16_type: cbfloat16
  boundary_casting: false
  lora_params: null
  # Same value as train_input.vocab_size and eval_input.vocab_size.
  vocab_size: 84992
  embedding_layer_norm: false
  embedding_dropout_rate: 0.0
  share_embedding_weights: true
  # ALiBi position encoding is selected.
  # NOTE(review): the relative-bucket and rotary/rope keys below are
  # presumably unused with this setting — confirm.
  position_embedding_type: alibi
  # Same length as the `packed_2k` suffix in most training data paths suggests
  # (2048) — TODO confirm the datasets were packed to this length.
  max_position_embeddings: 2048
  position_embedding_offset: 0
  num_relative_attention_buckets: 32
  rotary_dim: null
  rope_theta: 10000
  pad_rope: false
  alibi_trainable_slopes: false
  pos_scaling_factor: 1.0
  hidden_size: 1088
  num_hidden_layers: 14
  dropout_rate: 0.0
  norm_type: layernorm
  layer_norm_epsilon: 1.0e-05
  # 1088 / 17 = 64, assuming hidden_size is split evenly across heads.
  num_heads: 17
  attention_module: aiayn_attention
  extra_attention_params: {}
  attention_type: scaled_dot_product
  attention_dropout_rate: 0.0
  use_projection_bias_in_attention: true
  use_ffn_bias_in_attention: true
  attention_softmax_fp32: false
  attention_kernel: optimized_beta
  attention_sliding_window_length: null
  scale_qk_dot_by_layer_idx: false
  fixed_sparse_attention: null
  # 2912 / 1088 is roughly 2.68 times hidden_size.
  filter_size: 2912
  nonlinearity: swiglu
  use_ffn_bias: true
  use_bias_in_output: false
  loss_scaling: num_tokens
  loss_weight: 1.0
  # NOTE(review): embeddings_scale, scale_qk_dot_by_d and output_logits_scale
  # look like width-dependent (muP-style) scaling settings — confirm how they
  # were derived before changing hidden_size.
  embeddings_scale: 9.1705785388303
  scale_qk_dot_by_d: true
  output_logits_scale: 0.2576902348606329
  # Default weight initializer: truncated normal, with a and b equal to
  # -2*std and +2*std.
  initializer:
    name: truncated_normal
    mean: 0.0
    std: 0.04203434605680388
    a: -0.08406869211360776
    b: 0.08406869211360776
    nonlinearity: null
    mode: null
    scale: null
    distribution: null
  # NOTE(review): explicit initializers are given above and below; confirm
  # whether initializer_range is still consulted anywhere.
  initializer_range: 0.02
  # Embedding initializer: truncated normal, with a and b equal to -2*std and
  # +2*std.
  embedding_initializer:
    name: truncated_normal
    mean: 0.0
    std: 0.0866560243479838
    a: -0.1733120486959676
    b: 0.1733120486959676
    nonlinearity: null
    mode: null
    scale: null
    distribution: null
  # Output-layer initializer: truncated normal, with a and b equal to -2*std
  # and +2*std.
  output_layer_initializer:
    name: truncated_normal
    mean: 0.0
    std: 0.007943744727823684
    a: -0.015887489455647368
    b: 0.015887489455647368
    nonlinearity: null
    mode: null
    scale: null
    distribution: null
  compute_eval_metrics: true
# No sparsity configuration is set for this run.
sparsity: null
# Optimizer settings: AdamW with a two-phase linear learning-rate schedule.
optimizer:
  optimizer_type: AdamW
  weight_decay: 0.1
  log_summaries: true
  loss_scaling_factor: dynamic
  # Two linear phases, applied in order:
  #   1. 0.0 -> 0.015625 (= 1/64) over 187 iterations,
  #   2. 0.015625 -> 1.9196e-05 over 240133 iterations.
  # 187 + 240133 = 240320, which equals runconfig.max_steps.
  learning_rate:
  - end_learning_rate: 0.015625
    initial_learning_rate: 0.0
    scheduler: Linear
    total_iters: 187
  - end_learning_rate: 1.9196e-05
    initial_learning_rate: 0.015625
    scheduler: Linear
    total_iters: 240133
  max_gradient_norm: 1.0
  # 0.23529411764705882 = 4/17 = 256/1088, where 1088 is model.hidden_size.
  # NOTE(review): presumably a learning-rate multiplier for parameters
  # matching `decoder_kernel`, with 256 as a base width — confirm.
  adjust_learning_rate:
    decoder_kernel: 0.23529411764705882
  betas:
  - 0.9
  - 0.95
  correct_bias: true
  eps: 1.0e-08
# Run settings: step counts, checkpointing, paths, job labels, and device
# options. Keys set to null are left unset in this file.
runconfig:
  steps_per_epoch: null
  # Equals the sum of total_iters across the optimizer.learning_rate phases
  # (187 + 240133).
  max_steps: 240320
  mgmt_address: null
  # The training data paths and python_paths below all live under this
  # directory.
  mount_dirs:
  - /cra-406
  num_epochs: null
  python_paths:
  - /cra-406/workdirs/modelzoos/rel-2.2.1/modelzoo/src
  compile_dir: null
  checkpoint_path: null
  credentials_path: null
  debug_args_path: null
  retrace_every_iteration: null
  eval_steps: 5219
  init_method: env://
  job_time_sec: null
  # Free-form key=value labels attached to the job. Num_CSX=8 agrees with
  # num_csx below, and Model=Jais_256M agrees with the model_dir name.
  job_labels:
  - Name=Neha_Sengupta
  - Organization=Inception
  - Model=Jais_256M
  - Mode=Train
  - Num_CSX=8
  - Language=Bilingual
  - Type=Train
  - Dataset=AraV5_Pile_Github_Books_UAE_ITC
  job_priority: p2
  seed: 1
  mgmt_namespace: cra-406
  load_checkpoint_states: model,optimizer,global_step,dataloader,lr_scheduler
  target_device: CSX
  mode: train
  wsc_log_level: null
  autoload_last_checkpoint: true
  check_loss_values: true
  disable_strict_checkpoint_loading: null
  # NOTE(review): enable_distributed is false and target_device is CSX, so
  # dist_addr / dist_backend / init_method are presumably unused — confirm.
  dist_addr: localhost:8888
  dist_backend: nccl
  # 24032 = max_steps / 10.
  checkpoint_steps: 24032
  disable_version_check: null
  drop_data: false
  enable_distributed: false
  model_dir: artifacts/model_dir_256M
  save_initial_checkpoint: false
  precision_opt_level: 1
  num_workers_per_csx: 0
  validate_only: null
  logging: null
  sync_batchnorm: false
  compile_only: null
  log_steps: 1
  num_steps: null
  transfer_processes: null
  num_wgt_servers: null
  num_csx: 8
  num_act_servers: null
  eval_frequency: null
  execute_crd_memory_gi: null
  compile_crd_memory_gi: null
  op_profiler_config: null
  dump_activations: false
  log_input_summaries: false
  main_process_id: 0
  max_checkpoints: 100000
  summary_dir: null
  lazy_initialization: true
  use_cstorch_optimizer_step: false
  wrk_memory_gi: null
  act_memory_gi: null
  cmd_memory_gi: null
  wgt_memory_gi: null
  experimental: {}
  ini:
    ws_opt_speculate_optimizer: true
  debug_args: null