---
# Training configuration (Jais 256M, Cerebras CSX) — see job_labels in runconfig.
train_input:
  batch_size: 256
  data_processor: GptHDF5MapDataProcessor
  data_dir: /cra-406/datasets/jais_instruction_datasets/v12p2/tokenized_mlv2_2k/
  num_workers: 1
  persistent_workers: true
  prefetch_factor: 10
  repeat: true
  shuffle: false
  shuffle_seed: 1
  use_worker_cache: false
  vocab_size: 84992
eval_input:
  batch_size: 32
  data_dir: /cb/customers/g42/datasets/multilingual_v2/pile_val_correct_packed
  data_processor: GptHDF5MapDataProcessor
  num_workers: 1
  repeat: false
  shuffle: false
  use_worker_cache: false
  vocab_size: 84992
model:
  mixed_precision: true
  fp16_type: cbfloat16
  boundary_casting: false
  lora_params: null
  vocab_size: 84992
  embedding_layer_norm: false
  embedding_dropout_rate: 0.0
  share_embedding_weights: true
  position_embedding_type: alibi
  max_position_embeddings: 2048
  position_embedding_offset: 0
  num_relative_attention_buckets: 32
  rotary_dim: null
  rope_theta: 10000
  pad_rope: false
  alibi_trainable_slopes: false
  pos_scaling_factor: 1.0
  hidden_size: 1088
  num_hidden_layers: 14
  dropout_rate: 0.0
  norm_type: layernorm
  layer_norm_epsilon: 1.0e-05
  num_heads: 17
  attention_module: aiayn_attention
  extra_attention_params: {}
  attention_type: scaled_dot_product
  attention_dropout_rate: 0.0
  use_projection_bias_in_attention: true
  use_ffn_bias_in_attention: true
  attention_softmax_fp32: false
  attention_kernel: optimized_beta
  attention_sliding_window_length: null
  scale_qk_dot_by_layer_idx: false
  fixed_sparse_attention: null
  filter_size: 2912
  nonlinearity: swiglu
  use_ffn_bias: true
  use_bias_in_output: false
  loss_scaling: num_tokens
  loss_weight: 1.0
  embeddings_scale: 9.1705785388303
  scale_qk_dot_by_d: true
  output_logits_scale: 0.2576902348606329
  initializer:
    name: truncated_normal
    mean: 0.0
    std: 0.04203434605680388
    a: -0.08406869211360776
    b: 0.08406869211360776
    nonlinearity: null
    mode: null
    scale: null
    distribution: null
  # NOTE(review): initializer_range is not a field of the initializer maps
  # (its siblings above/below lack it), so it is placed at model level —
  # confirm against the consuming schema.
  initializer_range: 0.02
  embedding_initializer:
    name: truncated_normal
    mean: 0.0
    std: 0.0866560243479838
    a: -0.1733120486959676
    b: 0.1733120486959676
    nonlinearity: null
    mode: null
    scale: null
    distribution: null
  output_layer_initializer:
    name: truncated_normal
    mean: 0.0
    std: 0.007943744727823684
    a: -0.015887489455647368
    b: 0.015887489455647368
    nonlinearity: null
    mode: null
    scale: null
    distribution: null
  compute_eval_metrics: true
  sparsity: null
optimizer:
  optimizer_type: AdamW
  weight_decay: 0.1
  log_summaries: true
  loss_scaling_factor: dynamic
  # Two-phase schedule: linear warmup to 1.6e-3, then linear decay to 1.6e-4.
  learning_rate:
    - end_learning_rate: 0.0016
      initial_learning_rate: 0.0
      scheduler: Linear
      total_iters: 695
    - end_learning_rate: 0.00016
      initial_learning_rate: 0.0016
      scheduler: Linear
      total_iters: 23995
  max_gradient_norm: 1.0
  adjust_learning_rate:
    decoder_kernel: 0.23529411764705882
  betas:
    - 0.9
    - 0.95
  correct_bias: true
  eps: 1.0e-08
runconfig:
  steps_per_epoch: null
  max_steps: 24690
  mgmt_address: null
  mount_dirs:
    - /cra-406
  num_epochs: null
  python_paths:
    - /cra-406/workdirs/modelzoos/rel-2.2.1/modelzoo/src
  compile_dir: null
  checkpoint_path: /cra-406/workdirs/240209_Jais_series_v3/artifacts/model_dir_256M/checkpoint_240320.mdl
  credentials_path: null
  debug_args_path: null
  retrace_every_iteration: null
  eval_steps: 5219
  init_method: env://
  job_time_sec: null
  job_labels:
    - Name=Neha_Sengupta
    - Organization=Core42
    - Model=Jais_256M
    - Mode=Train
    - Num_CSX=4
    - Language=Bilingual
    - Type=Train
    - Dataset=v12p2
  job_priority: p3
  seed: 1
  mgmt_namespace: null
  load_checkpoint_states: model
  target_device: CSX
  mode: train
  wsc_log_level: null
  autoload_last_checkpoint: true
  check_loss_values: true
  disable_strict_checkpoint_loading: null
  dist_addr: localhost:8888
  dist_backend: nccl
  checkpoint_steps: 8231
  disable_version_check: true
  drop_data: false
  enable_distributed: false
  model_dir: artifacts/jais_256M_v12p2_gbs256
  save_initial_checkpoint: false
  precision_opt_level: 1
  num_workers_per_csx: 0
  validate_only: null
  logging: null
  sync_batchnorm: false
  compile_only: null
  log_steps: 1
  num_steps: null
  transfer_processes: null
  num_wgt_servers: null
  num_csx: 4
  num_act_servers: null
  eval_frequency: null
  execute_crd_memory_gi: null
  compile_crd_memory_gi: null
  op_profiler_config: null
  dump_activations: false
  log_input_summaries: false
  main_process_id: 0
  max_checkpoints: 100000
  summary_dir: null
  lazy_initialization: true
  use_cstorch_optimizer_step: false
  wrk_memory_gi: null
  act_memory_gi: null
  cmd_memory_gi: null
  wgt_memory_gi: null
  experimental: {}
  ini: null
  debug_args: null