---
# Training / evaluation run configuration with five top-level sections:
# train_input, eval_input, model, optimizer, runconfig (plus `sparsity`).
#
# NOTE(review): this file was restored to block style from a copy whose
# line breaks and indentation had been collapsed. Nesting was reconstructed
# from key order; all keys and values are unchanged. Confirm the placement
# of `sparsity` (top level here) against the consuming schema.

# Training data input. vocab_size here matches eval_input.vocab_size and
# model.vocab_size (all 84992).
train_input:
  batch_size: 256
  data_processor: GptHDF5MapDataProcessor
  data_dir: /cra-406/datasets/jais_instruction_datasets/v12p2/tokenized_mlv2_2k/
  num_workers: 1
  persistent_workers: true
  prefetch_factor: 10
  repeat: true
  shuffle: false
  shuffle_seed: 1
  use_worker_cache: false
  vocab_size: 84992

# Evaluation data input.
# NOTE(review): data_dir lives under /cb/..., which is not listed in
# runconfig.mount_dirs (only /cra-406 is) — confirm it is reachable.
eval_input:
  batch_size: 32
  data_dir: /cb/customers/g42/datasets/multilingual_v2/pile_val_correct_packed
  data_processor: GptHDF5MapDataProcessor
  num_workers: 1
  repeat: false
  shuffle: false
  use_worker_cache: false
  vocab_size: 84992

model:
  mixed_precision: true
  fp16_type: cbfloat16
  boundary_casting: false
  lora_params: null
  vocab_size: 84992
  embedding_layer_norm: false
  embedding_dropout_rate: 0.0
  share_embedding_weights: true
  position_embedding_type: alibi
  max_position_embeddings: 2048
  position_embedding_offset: 0
  num_relative_attention_buckets: 32
  rotary_dim: null
  rope_theta: 10000
  pad_rope: false
  alibi_trainable_slopes: false
  pos_scaling_factor: 1.0
  # hidden_size / num_heads = 1088 / 17 = 64.
  hidden_size: 1088
  num_hidden_layers: 14
  dropout_rate: 0.0
  norm_type: layernorm
  layer_norm_epsilon: 1.0e-05
  num_heads: 17
  attention_module: aiayn_attention
  extra_attention_params: {}
  attention_type: scaled_dot_product
  attention_dropout_rate: 0.0
  use_projection_bias_in_attention: true
  use_ffn_bias_in_attention: true
  attention_softmax_fp32: false
  attention_kernel: optimized_beta
  attention_sliding_window_length: null
  scale_qk_dot_by_layer_idx: false
  fixed_sparse_attention: null
  filter_size: 2912
  nonlinearity: swiglu
  use_ffn_bias: true
  use_bias_in_output: false
  loss_scaling: num_tokens
  loss_weight: 1.0
  embeddings_scale: 9.1705785388303
  scale_qk_dot_by_d: true
  output_logits_scale: 0.2576902348606329
  # In each initializer below, the bounds a / b equal -2 * std / +2 * std.
  initializer:
    name: truncated_normal
    mean: 0.0
    std: 0.04203434605680388
    a: -0.08406869211360776
    b: 0.08406869211360776
    nonlinearity: null
    mode: null
    scale: null
    distribution: null
  initializer_range: 0.02
  embedding_initializer:
    name: truncated_normal
    mean: 0.0
    std: 0.0866560243479838
    a: -0.1733120486959676
    b: 0.1733120486959676
    nonlinearity: null
    mode: null
    scale: null
    distribution: null
  output_layer_initializer:
    name: truncated_normal
    mean: 0.0
    std: 0.007943744727823684
    a: -0.015887489455647368
    b: 0.015887489455647368
    nonlinearity: null
    mode: null
    scale: null
    distribution: null
  compute_eval_metrics: true

# NOTE(review): assumed to be a top-level key (it sits between the model
# and optimizer sections in the original key order) — confirm.
sparsity: null

optimizer:
  optimizer_type: AdamW
  weight_decay: 0.1
  log_summaries: true
  loss_scaling_factor: dynamic
  # Two Linear phases: 0.0 -> 0.0016 over 695 iterations, then
  # 0.0016 -> 0.00016 over 23995 iterations.
  # 695 + 23995 = 24690, which equals runconfig.max_steps.
  learning_rate:
    - end_learning_rate: 0.0016
      initial_learning_rate: 0.0
      scheduler: Linear
      total_iters: 695
    - end_learning_rate: 0.00016
      initial_learning_rate: 0.0016
      scheduler: Linear
      total_iters: 23995
  max_gradient_norm: 1.0
  adjust_learning_rate:
    # Numerically 4 / 17 (= 256 / model.hidden_size).
    decoder_kernel: 0.23529411764705882
  betas:
    - 0.9
    - 0.95
  correct_bias: true
  eps: 1.0e-08

runconfig:
  steps_per_epoch: null
  max_steps: 24690
  mgmt_address: null
  mount_dirs:
    - /cra-406
  num_epochs: null
  python_paths:
    - /cra-406/workdirs/modelzoos/rel-2.2.1/modelzoo/src
  compile_dir: null
  checkpoint_path: /cra-406/workdirs/240209_Jais_series_v3/artifacts/model_dir_256M/checkpoint_240320.mdl
  credentials_path: null
  debug_args_path: null
  retrace_every_iteration: null
  eval_steps: 5219
  # Quoted so the embedded ':' can never be read as a mapping indicator.
  init_method: 'env://'
  job_time_sec: null
  job_labels:
    - Name=Neha_Sengupta
    - Organization=Core42
    - Model=Jais_256M
    - Mode=Train
    - Num_CSX=4
    - Language=Bilingual
    - Type=Train
    - Dataset=v12p2
  job_priority: p3
  seed: 1
  mgmt_namespace: null
  load_checkpoint_states: model
  target_device: CSX
  mode: train
  wsc_log_level: null
  autoload_last_checkpoint: true
  check_loss_values: true
  disable_strict_checkpoint_loading: null
  dist_addr: 'localhost:8888'
  dist_backend: nccl
  # NOTE(review): 3 * 8231 = 24693 > max_steps (24690), so the third
  # periodic checkpoint interval is not reached — confirm a final
  # checkpoint is still written at the end of the run.
  checkpoint_steps: 8231
  disable_version_check: true
  drop_data: false
  enable_distributed: false
  model_dir: artifacts/jais_256M_v12p2_gbs256
  save_initial_checkpoint: false
  precision_opt_level: 1
  num_workers_per_csx: 0
  validate_only: null
  logging: null
  sync_batchnorm: false
  compile_only: null
  log_steps: 1
  num_steps: null
  transfer_processes: null
  num_wgt_servers: null
  # Matches the Num_CSX=4 entry in job_labels.
  num_csx: 4
  num_act_servers: null
  eval_frequency: null
  execute_crd_memory_gi: null
  compile_crd_memory_gi: null
  op_profiler_config: null
  dump_activations: false
  log_input_summaries: false
  main_process_id: 0
  max_checkpoints: 100000
  summary_dir: null
  lazy_initialization: true
  use_cstorch_optimizer_step: false
  wrk_memory_gi: null
  act_memory_gi: null
  cmd_memory_gi: null
  wgt_memory_gi: null
  experimental: {}
  ini: null
  debug_args: null