|
train_input: |
|
batch_size: 976 |
|
data_processor: GptHDF5MapDataProcessor |
|
mixture: |
|
- data_dir: /cra-406/datasets/pile/multilingual_v2/train_correct/ |
|
weight: 0.6510508179774476 |
|
- data_dir: /cra-406/datasets/github/multilingual_v2/packed_2k/train |
|
weight: 0.055087602323960365 |
|
- data_dir: /cra-406/datasets/books3_arabic/multilingual_v2/packed_2k/books_3_arabic_train_correct_packed |
|
weight: 0.031560734650858936 |
|
- data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/alkhair_train_packed |
|
weight: 0.0008441127388845985 |
|
- data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/aranews_train_packed |
|
weight: 0.00015702987060793174 |
|
- data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/baai_train_packed |
|
weight: 0.02652363386071335 |
|
- data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/C4_train_packed |
|
weight: 0.04370135940994404 |
|
- data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/ccnews_train_packed |
|
weight: 0.006820988629070355 |
|
- data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/common_crawl_train_packed |
|
weight: 0.16413286051785408 |
|
- data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/daypop_train_packed |
|
weight: 0.001772579714458703 |
|
- data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/en2ar_wikipedia_train_packed |
|
weight: 0.006335165657431352 |
|
- data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/en_wikipedia_train_packed |
|
weight: 0.0035095904892209306 |
|
- data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/maktabah_train_packed |
|
weight: 0.002642036817637927 |
|
- data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/misc_train_packed |
|
weight: 6.954077746676907e-05 |
|
- data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/osian_train_packed |
|
weight: 0.0006243331144143421 |
|
- data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/un_train_packed |
|
weight: 0.001005513115682201 |
|
- data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/wikipedia_train_packed |
|
weight: 0.00034892678537459647 |
|
- data_dir: /cra-406/datasets/multilingual_v2/uae_news_sep_2023_12b_split/ar/train_shuffled |
|
weight: 0.0012430476743474177 |
|
- data_dir: /cra-406/datasets/multilingual_v2/itc_663m_split/train_shuffled |
|
weight: 0.0013597894242614768 |
|
- data_dir: /cra-406/datasets/multilingual_v2/uae_news_sep_2023_12b_split/en/train_shuffled |
|
weight: 0.0012103364503629503 |
|
num_workers: 1 |
|
persistent_workers: true |
|
prefetch_factor: 10 |
|
repeat: true |
|
shuffle: false |
|
shuffle_seed: 1 |
|
use_worker_cache: false |
|
vocab_size: 84992 |
|
eval_input: |
|
batch_size: 32 |
|
data_dir: /cb/customers/g42/datasets/multilingual_v2/pile_val_correct_packed |
|
data_processor: GptHDF5MapDataProcessor |
|
num_workers: 1 |
|
repeat: false |
|
shuffle: false |
|
use_worker_cache: false |
|
vocab_size: 84992 |
|
model: |
|
mixed_precision: true |
|
fp16_type: cbfloat16 |
|
boundary_casting: false |
|
lora_params: null |
|
vocab_size: 84992 |
|
embedding_layer_norm: false |
|
embedding_dropout_rate: 0.0 |
|
share_embedding_weights: true |
|
position_embedding_type: alibi |
|
max_position_embeddings: 2048 |
|
position_embedding_offset: 0 |
|
num_relative_attention_buckets: 32 |
|
rotary_dim: null |
|
rope_theta: 10000 |
|
pad_rope: false |
|
alibi_trainable_slopes: false |
|
pos_scaling_factor: 1.0 |
|
hidden_size: 1088 |
|
num_hidden_layers: 14 |
|
dropout_rate: 0.0 |
|
norm_type: layernorm |
|
layer_norm_epsilon: 1.0e-05 |
|
num_heads: 17 |
|
attention_module: aiayn_attention |
|
extra_attention_params: {} |
|
attention_type: scaled_dot_product |
|
attention_dropout_rate: 0.0 |
|
use_projection_bias_in_attention: true |
|
use_ffn_bias_in_attention: true |
|
attention_softmax_fp32: false |
|
attention_kernel: optimized_beta |
|
attention_sliding_window_length: null |
|
scale_qk_dot_by_layer_idx: false |
|
fixed_sparse_attention: null |
|
filter_size: 2912 |
|
nonlinearity: swiglu |
|
use_ffn_bias: true |
|
use_bias_in_output: false |
|
loss_scaling: num_tokens |
|
loss_weight: 1.0 |
|
embeddings_scale: 9.1705785388303 |
|
scale_qk_dot_by_d: true |
|
output_logits_scale: 0.2576902348606329 |
|
initializer: |
|
name: truncated_normal |
|
mean: 0.0 |
|
std: 0.04203434605680388 |
|
a: -0.08406869211360776 |
|
b: 0.08406869211360776 |
|
nonlinearity: null |
|
mode: null |
|
scale: null |
|
distribution: null |
|
initializer_range: 0.02 |
|
embedding_initializer: |
|
name: truncated_normal |
|
mean: 0.0 |
|
std: 0.0866560243479838 |
|
a: -0.1733120486959676 |
|
b: 0.1733120486959676 |
|
nonlinearity: null |
|
mode: null |
|
scale: null |
|
distribution: null |
|
output_layer_initializer: |
|
name: truncated_normal |
|
mean: 0.0 |
|
std: 0.007943744727823684 |
|
a: -0.015887489455647368 |
|
b: 0.015887489455647368 |
|
nonlinearity: null |
|
mode: null |
|
scale: null |
|
distribution: null |
|
compute_eval_metrics: true |
|
sparsity: null |
|
optimizer: |
|
optimizer_type: AdamW |
|
weight_decay: 0.1 |
|
log_summaries: true |
|
loss_scaling_factor: dynamic |
|
learning_rate: |
|
- end_learning_rate: 0.015625 |
|
initial_learning_rate: 0.0 |
|
scheduler: Linear |
|
total_iters: 187 |
|
- end_learning_rate: 1.9196e-05 |
|
initial_learning_rate: 0.015625 |
|
scheduler: Linear |
|
total_iters: 240133 |
|
max_gradient_norm: 1.0 |
|
adjust_learning_rate: |
|
decoder_kernel: 0.23529411764705882 |
|
betas: |
|
- 0.9 |
|
- 0.95 |
|
correct_bias: true |
|
eps: 1.0e-08 |
|
runconfig: |
|
steps_per_epoch: null |
|
max_steps: 240320 |
|
mgmt_address: null |
|
mount_dirs: |
|
- /cra-406 |
|
num_epochs: null |
|
python_paths: |
|
- /cra-406/workdirs/modelzoos/rel-2.2.1/modelzoo/src |
|
compile_dir: null |
|
checkpoint_path: null |
|
credentials_path: null |
|
debug_args_path: null |
|
retrace_every_iteration: null |
|
eval_steps: 5219 |
|
init_method: env:// |
|
job_time_sec: null |
|
job_labels: |
|
- Name=Neha_Sengupta |
|
- Organization=Inception |
|
- Model=Jais_256M |
|
- Mode=Train |
|
- Num_CSX=8 |
|
- Language=Bilingual |
|
- Type=Train |
|
- Dataset=AraV5_Pile_Github_Books_UAE_ITC |
|
job_priority: p2 |
|
seed: 1 |
|
mgmt_namespace: cra-406 |
|
load_checkpoint_states: model,optimizer,global_step,dataloader,lr_scheduler |
|
target_device: CSX |
|
mode: train |
|
wsc_log_level: null |
|
autoload_last_checkpoint: true |
|
check_loss_values: true |
|
disable_strict_checkpoint_loading: null |
|
dist_addr: localhost:8888 |
|
dist_backend: nccl |
|
checkpoint_steps: 24032 |
|
disable_version_check: null |
|
drop_data: false |
|
enable_distributed: false |
|
model_dir: artifacts/model_dir_256M |
|
save_initial_checkpoint: false |
|
precision_opt_level: 1 |
|
num_workers_per_csx: 0 |
|
validate_only: null |
|
logging: null |
|
sync_batchnorm: false |
|
compile_only: null |
|
log_steps: 1 |
|
num_steps: null |
|
transfer_processes: null |
|
num_wgt_servers: null |
|
num_csx: 8 |
|
num_act_servers: null |
|
eval_frequency: null |
|
execute_crd_memory_gi: null |
|
compile_crd_memory_gi: null |
|
op_profiler_config: null |
|
dump_activations: false |
|
log_input_summaries: false |
|
main_process_id: 0 |
|
max_checkpoints: 100000 |
|
summary_dir: null |
|
lazy_initialization: true |
|
use_cstorch_optimizer_step: false |
|
wrk_memory_gi: null |
|
act_memory_gi: null |
|
cmd_memory_gi: null |
|
wgt_memory_gi: null |
|
experimental: {} |
|
ini: |
|
ws_opt_speculate_optimizer: true |
|
debug_args: null |
|
|