---
# Training configuration for a 14-layer, hidden-size-1088 decoder model
# (labelled Jais_256M in runconfig.job_labels) targeting 8 CSX systems.
#
# NOTE(review): this file was reconstructed from a whitespace-collapsed
# source. Keys, key order, and values are unchanged; nesting was inferred
# from key semantics. Spots where the nesting was ambiguous carry their own
# NOTE(review) comment below.

# Training data: a weighted mixture of pre-packed HDF5 datasets.
train_input:
  batch_size: 976
  data_processor: GptHDF5MapDataProcessor
  # The 20 weights below sum to ~1.0.
  mixture:
    - data_dir: /cra-406/datasets/pile/multilingual_v2/train_correct/
      weight: 0.6510508179774476
    - data_dir: /cra-406/datasets/github/multilingual_v2/packed_2k/train
      weight: 0.055087602323960365
    - data_dir: /cra-406/datasets/books3_arabic/multilingual_v2/packed_2k/books_3_arabic_train_correct_packed
      weight: 0.031560734650858936
    - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/alkhair_train_packed
      weight: 0.0008441127388845985
    - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/aranews_train_packed
      weight: 0.00015702987060793174
    - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/baai_train_packed
      weight: 0.02652363386071335
    - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/C4_train_packed
      weight: 0.04370135940994404
    - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/ccnews_train_packed
      weight: 0.006820988629070355
    - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/common_crawl_train_packed
      weight: 0.16413286051785408
    - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/daypop_train_packed
      weight: 0.001772579714458703
    - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/en2ar_wikipedia_train_packed
      weight: 0.006335165657431352
    - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/en_wikipedia_train_packed
      weight: 0.0035095904892209306
    - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/maktabah_train_packed
      weight: 0.002642036817637927
    - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/misc_train_packed
      weight: 6.954077746676907e-05
    - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/osian_train_packed
      weight: 0.0006243331144143421
    - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/un_train_packed
      weight: 0.001005513115682201
    - data_dir: /cra-406/datasets/AraV5/multilingual_v2/packed_2k/train/wikipedia_train_packed
      weight: 0.00034892678537459647
    - data_dir: /cra-406/datasets/multilingual_v2/uae_news_sep_2023_12b_split/ar/train_shuffled
      weight: 0.0012430476743474177
    - data_dir: /cra-406/datasets/multilingual_v2/itc_663m_split/train_shuffled
      weight: 0.0013597894242614768
    - data_dir: /cra-406/datasets/multilingual_v2/uae_news_sep_2023_12b_split/en/train_shuffled
      weight: 0.0012103364503629503
  num_workers: 1
  persistent_workers: true
  prefetch_factor: 10
  repeat: true
  shuffle: false
  shuffle_seed: 1
  use_worker_cache: false
  vocab_size: 84992

# Evaluation data: a single dataset directory, read once (repeat: false).
eval_input:
  batch_size: 32
  # NOTE(review): this path is outside the /cra-406 tree, which is the only
  # entry in runconfig.mount_dirs - confirm it is reachable at eval time.
  data_dir: /cb/customers/g42/datasets/multilingual_v2/pile_val_correct_packed
  data_processor: GptHDF5MapDataProcessor
  num_workers: 1
  repeat: false
  shuffle: false
  use_worker_cache: false
  vocab_size: 84992

# Model architecture and initialization.
model:
  mixed_precision: true
  fp16_type: cbfloat16
  boundary_casting: false
  lora_params: null
  vocab_size: 84992
  embedding_layer_norm: false
  embedding_dropout_rate: 0.0
  share_embedding_weights: true
  position_embedding_type: alibi
  max_position_embeddings: 2048
  position_embedding_offset: 0
  num_relative_attention_buckets: 32
  rotary_dim: null
  rope_theta: 10000
  pad_rope: false
  alibi_trainable_slopes: false
  pos_scaling_factor: 1.0
  # hidden_size / num_heads = 1088 / 17 = 64.
  hidden_size: 1088
  num_hidden_layers: 14
  dropout_rate: 0.0
  norm_type: layernorm
  layer_norm_epsilon: 1.0e-05
  num_heads: 17
  attention_module: aiayn_attention
  extra_attention_params: {}
  attention_type: scaled_dot_product
  attention_dropout_rate: 0.0
  use_projection_bias_in_attention: true
  use_ffn_bias_in_attention: true
  attention_softmax_fp32: false
  attention_kernel: optimized_beta
  attention_sliding_window_length: null
  scale_qk_dot_by_layer_idx: false
  fixed_sparse_attention: null
  filter_size: 2912
  nonlinearity: swiglu
  use_ffn_bias: true
  use_bias_in_output: false
  loss_scaling: num_tokens
  loss_weight: 1.0
  embeddings_scale: 9.1705785388303
  scale_qk_dot_by_d: true
  output_logits_scale: 0.2576902348606329
  # In all three initializers below, the bounds a / b equal -2*std / +2*std.
  initializer:
    name: truncated_normal
    mean: 0.0
    std: 0.04203434605680388
    a: -0.08406869211360776
    b: 0.08406869211360776
    nonlinearity: null
    mode: null
    scale: null
    distribution: null
  initializer_range: 0.02
  embedding_initializer:
    name: truncated_normal
    mean: 0.0
    std: 0.0866560243479838
    a: -0.1733120486959676
    b: 0.1733120486959676
    nonlinearity: null
    mode: null
    scale: null
    distribution: null
  output_layer_initializer:
    name: truncated_normal
    mean: 0.0
    std: 0.007943744727823684
    a: -0.015887489455647368
    b: 0.015887489455647368
    nonlinearity: null
    mode: null
    scale: null
    distribution: null
  compute_eval_metrics: true

# NOTE(review): placed at top level (between model and optimizer); the
# flattened source does not disambiguate this from a model.sparsity key.
sparsity: null

# Optimizer and learning-rate schedule.
optimizer:
  optimizer_type: AdamW
  weight_decay: 0.1
  log_summaries: true
  loss_scaling_factor: dynamic
  # Two linear phases: 187 + 240133 iterations = 240320 = runconfig.max_steps.
  learning_rate:
    - end_learning_rate: 0.015625
      initial_learning_rate: 0.0
      scheduler: Linear
      total_iters: 187
    - end_learning_rate: 1.9196e-05
      initial_learning_rate: 0.015625
      scheduler: Linear
      total_iters: 240133
  max_gradient_norm: 1.0
  adjust_learning_rate:
    # Numerically 256 / 1088 (= 4 / 17), i.e. 256 / model.hidden_size.
    # Presumably a width-based LR multiplier - confirm against the recipe.
    decoder_kernel: 0.23529411764705882
  betas:
    - 0.9
    - 0.95
  correct_bias: true
  eps: 1.0e-08

# Run / cluster settings.
runconfig:
  steps_per_epoch: null
  max_steps: 240320
  mgmt_address: null
  mount_dirs:
    - /cra-406
  num_epochs: null
  python_paths:
    - /cra-406/workdirs/modelzoos/rel-2.2.1/modelzoo/src
  compile_dir: null
  checkpoint_path: null
  credentials_path: null
  debug_args_path: null
  retrace_every_iteration: null
  eval_steps: 5219
  # Quoted defensively; the value is the same string as the unquoted form.
  init_method: 'env://'
  job_time_sec: null
  job_labels:
    - Name=Neha_Sengupta
    - Organization=Inception
    - Model=Jais_256M
    - Mode=Train
    - Num_CSX=8
    - Language=Bilingual
    - Type=Train
    - Dataset=AraV5_Pile_Github_Books_UAE_ITC
  job_priority: p2
  seed: 1
  mgmt_namespace: cra-406
  load_checkpoint_states: model,optimizer,global_step,dataloader,lr_scheduler
  target_device: CSX
  mode: train
  wsc_log_level: null
  autoload_last_checkpoint: true
  check_loss_values: true
  disable_strict_checkpoint_loading: null
  # Quoted defensively; the value is the same string as the unquoted form.
  dist_addr: 'localhost:8888'
  dist_backend: nccl
  # 24032 * 10 = 240320 = max_steps, i.e. ten evenly spaced checkpoints.
  checkpoint_steps: 24032
  disable_version_check: null
  drop_data: false
  enable_distributed: false
  model_dir: artifacts/model_dir_256M
  save_initial_checkpoint: false
  precision_opt_level: 1
  num_workers_per_csx: 0
  validate_only: null
  logging: null
  sync_batchnorm: false
  compile_only: null
  log_steps: 1
  num_steps: null
  transfer_processes: null
  num_wgt_servers: null
  num_csx: 8
  num_act_servers: null
  eval_frequency: null
  execute_crd_memory_gi: null
  compile_crd_memory_gi: null
  op_profiler_config: null
  dump_activations: false
  log_input_summaries: false
  main_process_id: 0
  max_checkpoints: 100000
  summary_dir: null
  lazy_initialization: true
  use_cstorch_optimizer_step: false
  wrk_memory_gi: null
  act_memory_gi: null
  cmd_memory_gi: null
  wgt_memory_gi: null
  # NOTE(review): experimental / ini / debug_args are kept under runconfig,
  # with debug_args as a sibling of ini rather than a child - confirm.
  experimental: {}
  ini:
    ws_opt_speculate_optimizer: true
  debug_args: null