# Jais-family-256m / params_train.yaml
# Uploader: onkarpandit-g42
# Upload message: "Upload params_train.yaml with huggingface_hub"
# Revision: b7bdf95 (verified)
# Training input pipeline: batch size, data processor and the weighted
# dataset mixture that batches are drawn from.
train_input:
  batch_size: 976
  data_processor: GptHDF5MapDataProcessor
  # 20 dataset directories, each paired with a sampling weight.
  # The weights below add up to ~1.0.
  mixture:
    - data_dir: /cra-406/datasets/pile/multilingual_v2/train_correct/
      weight: 0.6510508179774476
    - data_dir: /cra-406/datasets/github/multilingual_v2/packed_2k/train
      weight: 0.055087602323960365
    - data_dir: /cra-406/datasets/books3_arabic/multilingual_v2/packed_2k/books_3_arabic_train_correct_packed
      weight: 0.031560734650858936
    - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/alkhair_train_packed
      weight: 0.0008441127388845985
    - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/aranews_train_packed
      weight: 0.00015702987060793174
    - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/baai_train_packed
      weight: 0.02652363386071335
    - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/C4_train_packed
      weight: 0.04370135940994404
    - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/ccnews_train_packed
      weight: 0.006820988629070355
    - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/common_crawl_train_packed
      weight: 0.16413286051785408
    - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/daypop_train_packed
      weight: 0.001772579714458703
    - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/en2ar_wikipedia_train_packed
      weight: 0.006335165657431352
    - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/en_wikipedia_train_packed
      weight: 0.0035095904892209306
    - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/maktabah_train_packed
      weight: 0.002642036817637927
    - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/misc_train_packed
      weight: 6.954077746676907e-05
    - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/osian_train_packed
      weight: 0.0006243331144143421
    - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/un_train_packed
      weight: 0.001005513115682201
    - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/wikipedia_train_packed
      weight: 0.00034892678537459647
    - data_dir: /cra-406/datasets/multilingual_v2/uae_news_sep_2023_12b_split/ar/train_shuffled
      weight: 0.0012430476743474177
    - data_dir: /cra-406/datasets/multilingual_v2/itc_663m_split/train_shuffled
      weight: 0.0013597894242614768
    - data_dir: /cra-406/datasets/multilingual_v2/uae_news_sep_2023_12b_split/en/train_shuffled
      weight: 0.0012103364503629503
  num_workers: 1
  persistent_workers: true
  prefetch_factor: 10
  repeat: true
  # Shuffling is disabled here; directory names such as "train_shuffled"
  # suggest the data was shuffled offline -- TODO confirm.
  shuffle: false
  shuffle_seed: 1
  use_worker_cache: false
  # Same value as eval_input.vocab_size and model.vocab_size in this file.
  vocab_size: 84992
# Evaluation input pipeline: a single dataset directory, read once
# (repeat: false) and in order (shuffle: false).
eval_input:
  batch_size: 32
  # NOTE(review): this path lives under /cb/customers/..., while every
  # train_input directory and runconfig.mount_dirs use /cra-406 -- confirm
  # the directory is reachable from the job before running evaluation.
  data_dir: /cb/customers/g42/datasets/multilingual_v2/pile_val_correct_packed
  data_processor: GptHDF5MapDataProcessor
  num_workers: 1
  repeat: false
  shuffle: false
  use_worker_cache: false
  # Same value as train_input.vocab_size and model.vocab_size in this file.
  vocab_size: 84992
# Model architecture, numerics and weight initialization.
model:
  # --- Numerics ---
  mixed_precision: true
  fp16_type: cbfloat16
  boundary_casting: false
  lora_params: null

  # --- Embeddings ---
  # Same value as train_input.vocab_size and eval_input.vocab_size.
  vocab_size: 84992
  embedding_layer_norm: false
  embedding_dropout_rate: 0.0
  share_embedding_weights: true
  position_embedding_type: alibi
  max_position_embeddings: 2048
  position_embedding_offset: 0
  num_relative_attention_buckets: 32
  # rotary_dim is null; the rope_* keys presumably have no effect while
  # position_embedding_type is alibi -- TODO confirm.
  rotary_dim: null
  rope_theta: 10000
  pad_rope: false
  alibi_trainable_slopes: false
  pos_scaling_factor: 1.0

  # --- Decoder stack ---
  hidden_size: 1088
  num_hidden_layers: 14
  dropout_rate: 0.0
  norm_type: layernorm
  layer_norm_epsilon: 1.0e-05

  # --- Attention ---
  # hidden_size / num_heads = 1088 / 17 = 64.
  num_heads: 17
  attention_module: aiayn_attention
  extra_attention_params: {}
  attention_type: scaled_dot_product
  attention_dropout_rate: 0.0
  use_projection_bias_in_attention: true
  use_ffn_bias_in_attention: true
  attention_softmax_fp32: false
  attention_kernel: optimized_beta
  attention_sliding_window_length: null
  scale_qk_dot_by_layer_idx: false
  fixed_sparse_attention: null

  # --- Feed-forward ---
  filter_size: 2912
  nonlinearity: swiglu
  use_ffn_bias: true
  use_bias_in_output: false

  # --- Loss and scaling ---
  loss_scaling: num_tokens
  loss_weight: 1.0
  # NOTE(review): embeddings_scale, scale_qk_dot_by_d and
  # output_logits_scale look like muP-style scaling knobs -- confirm
  # against the consuming model code before changing any of them.
  embeddings_scale: 9.1705785388303
  scale_qk_dot_by_d: true
  output_logits_scale: 0.2576902348606329

  # --- Initializers ---
  # In all three initializers below the truncation bounds are a = -2 * std
  # and b = +2 * std, with mean 0.0.
  initializer:
    name: truncated_normal
    mean: 0.0
    std: 0.04203434605680388
    a: -0.08406869211360776
    b: 0.08406869211360776
    nonlinearity: null
    mode: null
    scale: null
    distribution: null
  initializer_range: 0.02
  embedding_initializer:
    name: truncated_normal
    mean: 0.0
    std: 0.0866560243479838
    a: -0.1733120486959676
    b: 0.1733120486959676
    nonlinearity: null
    mode: null
    scale: null
    distribution: null
  # std here equals initializer.std / sqrt(2 * num_hidden_layers)
  # (0.04203... / sqrt(28) ~= 0.00794).
  output_layer_initializer:
    name: truncated_normal
    mean: 0.0
    std: 0.007943744727823684
    a: -0.015887489455647368
    b: 0.015887489455647368
    nonlinearity: null
    mode: null
    scale: null
    distribution: null

  compute_eval_metrics: true
# Sparsity is disabled (null).
# NOTE(review): assumed to be a top-level section rather than a key of
# `model` -- confirm against the schema of the tool that reads this file.
sparsity: null
# Optimizer and learning-rate schedule.
optimizer:
  optimizer_type: AdamW
  weight_decay: 0.1
  log_summaries: true
  loss_scaling_factor: dynamic
  # Two consecutive linear stages:
  #   1. ramp from 0.0 up to 0.015625 over 187 iterations;
  #   2. ramp from 0.015625 down to 1.9196e-05 over 240133 iterations.
  # 187 + 240133 = 240320, which equals runconfig.max_steps.
  learning_rate:
    - end_learning_rate: 0.015625
      initial_learning_rate: 0.0
      scheduler: Linear
      total_iters: 187
    - end_learning_rate: 1.9196e-05
      initial_learning_rate: 0.015625
      scheduler: Linear
      total_iters: 240133
  max_gradient_norm: 1.0
  # 0.23529411764705882 = 4/17 = 256/1088 (1088 is model.hidden_size).
  # NOTE(review): presumably a per-parameter-group learning-rate multiplier
  # for the decoder kernels -- confirm against the optimizer code.
  adjust_learning_rate:
    decoder_kernel: 0.23529411764705882
  betas:
    - 0.9
    - 0.95
  correct_bias: true
  eps: 1.0e-08
# Run-time settings: step counts, checkpointing, cluster resources and
# job metadata.
runconfig:
  # --- Step budget ---
  steps_per_epoch: null
  # Equals the sum of the optimizer.learning_rate total_iters
  # (187 + 240133).
  max_steps: 240320
  mgmt_address: null
  mount_dirs:
    - /cra-406
  num_epochs: null
  python_paths:
    - /cra-406/workdirs/modelzoos/rel-2.2.1/modelzoo/src
  compile_dir: null
  checkpoint_path: null
  credentials_path: null
  debug_args_path: null
  retrace_every_iteration: null
  eval_steps: 5219
  init_method: env://
  job_time_sec: null

  # --- Job metadata ---
  # Num_CSX=8 agrees with num_csx below.
  job_labels:
    - Name=Neha_Sengupta
    - Organization=Inception
    - Model=Jais_256M
    - Mode=Train
    - Num_CSX=8
    - Language=Bilingual
    - Type=Train
    - Dataset=AraV5_Pile_Github_Books_UAE_ITC
  job_priority: p2
  seed: 1
  mgmt_namespace: cra-406

  # --- Checkpointing and execution mode ---
  load_checkpoint_states: model,optimizer,global_step,dataloader,lr_scheduler
  target_device: CSX
  mode: train
  wsc_log_level: null
  autoload_last_checkpoint: true
  check_loss_values: true
  disable_strict_checkpoint_loading: null
  dist_addr: localhost:8888
  dist_backend: nccl
  # 24032 = max_steps / 10.
  checkpoint_steps: 24032
  disable_version_check: null
  drop_data: false
  enable_distributed: false
  model_dir: artifacts/model_dir_256M
  save_initial_checkpoint: false
  precision_opt_level: 1
  num_workers_per_csx: 0
  validate_only: null
  logging: null
  sync_batchnorm: false
  compile_only: null
  log_steps: 1
  num_steps: null

  # --- Cluster resources ---
  transfer_processes: null
  num_wgt_servers: null
  num_csx: 8
  num_act_servers: null
  eval_frequency: null
  execute_crd_memory_gi: null
  compile_crd_memory_gi: null
  op_profiler_config: null
  dump_activations: false
  log_input_summaries: false
  main_process_id: 0
  max_checkpoints: 100000
  summary_dir: null
  lazy_initialization: true
  use_cstorch_optimizer_step: false
  wrk_memory_gi: null
  act_memory_gi: null
  cmd_memory_gi: null
  wgt_memory_gi: null

  # --- Debug / experimental ---
  experimental: {}
  # NOTE(review): `ini` is assumed to be a mapping holding the single flag
  # below, with `debug_args` as its sibling -- confirm against the schema.
  ini:
    ws_opt_speculate_optimizer: true
  debug_args: null