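# Cerebras ModelZoo (rel-2.2.1) training configuration for Jais_256M, a bilingual
# (Arabic/English) GPT-style decoder-only model trained on 8 CS-X systems
# (see runconfig.job_labels below).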
train_input:
  batch_size: 976
  data_processor: GptHDF5MapDataProcessor
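  # Each mixture entry pairs a packed HDF5 dataset directory with a sampling weight;
  # the weights below sum to 1.0 (up to floating-point rounding).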
  mixture:
    - data_dir: /cra-406/datasets/pile/multilingual_v2/train_correct/
      weight: 0.6510508179774476
    - data_dir: /cra-406/datasets/github/multilingual_v2/packed_2k/train
      weight: 0.055087602323960365
    - data_dir: /cra-406/datasets/books3_arabic/multilingual_v2/packed_2k/books_3_arabic_train_correct_packed
      weight: 0.031560734650858936
    - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/alkhair_train_packed
      weight: 0.0008441127388845985
    - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/aranews_train_packed
      weight: 0.00015702987060793174
    - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/baai_train_packed
      weight: 0.02652363386071335
    - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/C4_train_packed
      weight: 0.04370135940994404
    - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/ccnews_train_packed
      weight: 0.006820988629070355
    - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/common_crawl_train_packed
      weight: 0.16413286051785408
    - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/daypop_train_packed
      weight: 0.001772579714458703
    - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/en2ar_wikipedia_train_packed
      weight: 0.006335165657431352
    - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/en_wikipedia_train_packed
      weight: 0.0035095904892209306
    - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/maktabah_train_packed
      weight: 0.002642036817637927
    - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/misc_train_packed
      weight: 6.954077746676907e-05
    - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/osian_train_packed
      weight: 0.0006243331144143421
    - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/un_train_packed
      weight: 0.001005513115682201
    - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/wikipedia_train_packed
      weight: 0.00034892678537459647
    - data_dir: /cra-406/datasets/multilingual_v2/uae_news_sep_2023_12b_split/ar/train_shuffled
      weight: 0.0012430476743474177
    - data_dir: /cra-406/datasets/multilingual_v2/itc_663m_split/train_shuffled
      weight: 0.0013597894242614768
    - data_dir: /cra-406/datasets/multilingual_v2/uae_news_sep_2023_12b_split/en/train_shuffled
      weight: 0.0012103364503629503
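  # Dataloader settings: repeat: true re-cycles the mixture, so run length is governed by
  # runconfig.max_steps rather than epochs; shuffling is disabled, presumably because the
  # packed HDF5 shards are already shuffled on disk (an assumption, not stated in this file).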
  num_workers: 1
  persistent_workers: true
  prefetch_factor: 10
  repeat: true
  shuffle: false
  shuffle_seed: 1
  use_worker_cache: false
  vocab_size: 84992
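# Validation input: a held-out packed Pile validation split, evaluated with a smaller batch size.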
eval_input:
  batch_size: 32
  data_dir: /cb/customers/g42/datasets/multilingual_v2/pile_val_correct_packed
  data_processor: GptHDF5MapDataProcessor
  num_workers: 1
  repeat: false
  shuffle: false
  use_worker_cache: false
  vocab_size: 84992
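# Model architecture: decoder-only transformer with 14 layers, hidden size 1088,
# 17 attention heads (64 dims each), SwiGLU FFN of width 2912, ALiBi position
# embeddings, a 2048-token context, and cbfloat16 mixed precision.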
model:
  mixed_precision: true
  fp16_type: cbfloat16
  boundary_casting: false
  lora_params: null
  vocab_size: 84992
  embedding_layer_norm: false
  embedding_dropout_rate: 0.0
  share_embedding_weights: true
  position_embedding_type: alibi
  max_position_embeddings: 2048
  position_embedding_offset: 0
  num_relative_attention_buckets: 32
  rotary_dim: null
  rope_theta: 10000
  pad_rope: false
  alibi_trainable_slopes: false
  pos_scaling_factor: 1.0
  hidden_size: 1088
  num_hidden_layers: 14
  dropout_rate: 0.0
  norm_type: layernorm
  layer_norm_epsilon: 1.0e-05
  num_heads: 17
  attention_module: aiayn_attention
  extra_attention_params: {}
  attention_type: scaled_dot_product
  attention_dropout_rate: 0.0
  use_projection_bias_in_attention: true
  use_ffn_bias_in_attention: true
  attention_softmax_fp32: false
  attention_kernel: optimized_beta
  attention_sliding_window_length: null
  scale_qk_dot_by_layer_idx: false
  fixed_sparse_attention: null
  filter_size: 2912
  nonlinearity: swiglu
  use_ffn_bias: true
  use_bias_in_output: false
  loss_scaling: num_tokens
  loss_weight: 1.0
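  # The embedding/output scaling factors below, together with scale_qk_dot_by_d and
  # optimizer.adjust_learning_rate, resemble maximal-update-parameterization (muP) style
  # width scaling; treat that interpretation as an assumption, it is not stated in this file.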
  embeddings_scale: 9.1705785388303
  scale_qk_dot_by_d: true
  output_logits_scale: 0.2576902348606329
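  # All three initializers are truncated normals with truncation bounds a/b set at
  # +/- 2 standard deviations around the mean.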
  initializer:
    name: truncated_normal
    mean: 0.0
    std: 0.04203434605680388
    a: -0.08406869211360776
    b: 0.08406869211360776
    nonlinearity: null
    mode: null
    scale: null
    distribution: null
  initializer_range: 0.02
  embedding_initializer:
    name: truncated_normal
    mean: 0.0
    std: 0.0866560243479838
    a: -0.1733120486959676
    b: 0.1733120486959676
    nonlinearity: null
    mode: null
    scale: null
    distribution: null
  output_layer_initializer:
    name: truncated_normal
    mean: 0.0
    std: 0.007943744727823684
    a: -0.015887489455647368
    b: 0.015887489455647368
    nonlinearity: null
    mode: null
    scale: null
    distribution: null
  compute_eval_metrics: true
  sparsity: null
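# Optimizer: AdamW with betas (0.9, 0.95), weight decay 0.1, global gradient-norm
# clipping at 1.0, and dynamic loss scaling for the cbfloat16 mixed-precision run.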
optimizer:
  optimizer_type: AdamW
  weight_decay: 0.1
  log_summaries: true
  loss_scaling_factor: dynamic
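  # Two-phase schedule: linear warmup from 0 to 1.5625e-2 over 187 steps, then linear
  # decay to 1.9196e-05 over 240,133 steps (187 + 240,133 = 240,320 = runconfig.max_steps).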
  learning_rate:
    - end_learning_rate: 0.015625
      initial_learning_rate: 0.0
      scheduler: Linear
      total_iters: 187
    - end_learning_rate: 1.9196e-05
      initial_learning_rate: 0.015625
      scheduler: Linear
      total_iters: 240133
  max_gradient_norm: 1.0
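  # Per-layer LR multiplier for decoder kernel weights; 0.23529... equals 4/17, which is
  # consistent with muP-style scaling by a base-width/hidden-width ratio (assumption).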
  adjust_learning_rate:
    decoder_kernel: 0.23529411764705882
  betas:
    - 0.9
    - 0.95
  correct_bias: true
  eps: 1.0e-08
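# Run configuration: trains for 240,320 steps on 8 CS-X systems, checkpointing every
# 24,032 steps and running 5,219 evaluation steps.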
runconfig:
  steps_per_epoch: null
  max_steps: 240320
  mgmt_address: null
  mount_dirs:
    - /cra-406
  num_epochs: null
  python_paths:
    - /cra-406/workdirs/modelzoos/rel-2.2.1/modelzoo/src
  compile_dir: null
  checkpoint_path: null
  credentials_path: null
  debug_args_path: null
  retrace_every_iteration: null
  eval_steps: 5219
  init_method: env://
  job_time_sec: null
  job_labels:
    - Name=Neha_Sengupta
    - Organization=Inception
    - Model=Jais_256M
    - Mode=Train
    - Num_CSX=8
    - Language=Bilingual
    - Type=Train
    - Dataset=AraV5_Pile_Github_Books_UAE_ITC
  job_priority: p2
  seed: 1
  mgmt_namespace: cra-406
  load_checkpoint_states: model,optimizer,global_step,dataloader,lr_scheduler
  target_device: CSX
  mode: train
  wsc_log_level: null
  autoload_last_checkpoint: true
  check_loss_values: true
  disable_strict_checkpoint_loading: null
  dist_addr: localhost:8888
  dist_backend: nccl
  checkpoint_steps: 24032
  disable_version_check: null
  drop_data: false
  enable_distributed: false
  model_dir: artifacts/model_dir_256M
  save_initial_checkpoint: false
  precision_opt_level: 1
  num_workers_per_csx: 0
  validate_only: null
  logging: null
  sync_batchnorm: false
  compile_only: null
  log_steps: 1
  num_steps: null
  transfer_processes: null
  num_wgt_servers: null
  num_csx: 8
  num_act_servers: null
  eval_frequency: null
  execute_crd_memory_gi: null
  compile_crd_memory_gi: null
  op_profiler_config: null
  dump_activations: false
  log_input_summaries: false
  main_process_id: 0
  max_checkpoints: 100000
  summary_dir: null
  lazy_initialization: true
  use_cstorch_optimizer_step: false
  wrk_memory_gi: null
  act_memory_gi: null
  cmd_memory_gi: null
  wgt_memory_gi: null
  experimental: {}
  ini:
    ws_opt_speculate_optimizer: true
  debug_args: null