# Training configuration for the run named below (wandb group: SFT).
# Values of the form ${key} are interpolations that reference another
# top-level key of this file.

run_name: AMD-OLMo-1B-SFT-1st-phase
seed: 6198
dry_run: false

# Experiment tracking.
wandb:
  name: ${run_name}
  project: AMD-OLMo
  group: SFT

# Model architecture.
model:
  d_model: 2048
  n_heads: 16
  n_layers: 16
  mlp_ratio: 8
  weight_tying: true
  alibi: false
  rope: true
  flash_attention: false
  attention_dropout: 0.0
  attention_layer_norm: false
  multi_query_attention: false
  include_bias: false
  block_type: sequential
  layer_norm_type: default
  layer_norm_with_affine: false
  bias_for_layer_norm: false
  attention_layer_norm_with_affine: false
  activation_type: swiglu
  residual_dropout: 0.0
  embedding_dropout: 0.0
  max_sequence_length: 2048
  vocab_size: 50280
  # NOTE(review): larger than vocab_size; presumably padded on purpose -
  # confirm before changing either value.
  embedding_size: 50304
  eos_token_id: 50279
  pad_token_id: 1
  init_device: meta
  init_fn: mitchell

compile:
  fullgraph: false

optimizer:
  name: adamw
  learning_rate: 2.0e-5
  weight_decay: 0
  betas:
    - 0.9
    - 0.95
  metrics_log_interval: 10

scheduler:
  name: linear_with_warmup
  t_warmup: 200
  alpha_f: 0.001

tokenizer:
  identifier: tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json
  truncate_direction: right

# Checkpointing. Output goes to a folder derived from run_name.
save_folder: ./outputs/${run_name}/
save_overwrite: true
# Sharded checkpoints (best for restarts)
save_interval: 1000
save_num_checkpoints_to_keep: -1
# Unsharded checkpoints (for final storage)
save_interval_unsharded: 10000
save_num_unsharded_checkpoints_to_keep: -1

# Starting checkpoint. The load_path value looks like a placeholder -
# replace it with the real path to the unsharded pretraining checkpoint.
load_path: path_to_unsharded_pretrain_checkpoint
reset_trainer_state: true

max_duration: 3ep  # train 3 epochs
global_train_batch_size: 128
device_train_microbatch_size: 8

precision: amp_bf16

fsdp:
  wrapping_strategy: null
  precision: mixed

max_grad_norm: 1.0
max_grad_norm_ratio: null

speed_monitor:
  window_size: 20

# Evaluation. The interval and the per-device batch size reuse the
# save_interval and device_train_microbatch_size values above.
eval_interval: ${save_interval}
eval_subset_num_batches: -1
device_eval_batch_size: ${device_train_microbatch_size}
evaluators:
  - label: piqa
    type: downstream

  - label: hellaswag
    type: downstream

  - label: winogrande
    type: downstream

  - label: openbook_qa
    type: downstream

  # Disabled: requires implementation of the pmi_dc matrix.
  # - label: boolq
  #   type: downstream

  - label: sciq
    type: downstream

  - label: arc_easy
    type: downstream

  # Disabled: requires implementation of the pmi_dc matrix.
  # - label: arc_challenge
  #   type: downstream

  - label: copa
    type: downstream

  - label: rte
    type: downstream

  - label: commitment_bank
    type: downstream

  - label: mrpc
    type: downstream

  - label: sst2
    type: downstream

# Training data and data-loader settings.
data:
  pad_direction: right
  num_workers: 0
  drop_last: true
  pin_memory: true
  prefetch_factor: 1
  persistent_workers: true
  timeout: 0
  generate_attention_mask: true
  paths:
    - ./datasets/tulu/input_ids.npy
  # NOTE(review): presumably one label mask per entry in `paths`, in the
  # same order - confirm against the data loader.
  label_mask_paths:
    - ./datasets/tulu/label_mask.npy