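# Second-phase supervised fine-tuning (SFT) config for AMD-OLMo-1B, in the OLMo
# trainer's YAML format: it resumes from the unsharded phase-1 SFT checkpoint
# (load_path below) and trains for 3 more epochs on the instruction mix listed
# under data.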
run_name: AMD-OLMo-1B-SFT-2nd-phase
seed: 6198
dry_run: false
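
# Weights & Biases logging; ${run_name} interpolates the top-level run_name.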
wandb:
  name: ${run_name}
  project: AMD-OLMo
  group: SFT
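
# Model architecture: a ~1.2B-parameter OLMo-style decoder (16 layers, 16 heads,
# d_model 2048, SwiGLU MLPs, RoPE, tied embeddings). embedding_size pads
# vocab_size (50280) up to 50304, a multiple of 128, for better GPU throughput.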
model:
  d_model: 2048
  n_heads: 16
  n_layers: 16
  mlp_ratio: 8
  weight_tying: true
  alibi: false
  rope: true
  flash_attention: false
  attention_dropout: 0.0
  attention_layer_norm: false
  multi_query_attention: false
  include_bias: false
  block_type: sequential
  layer_norm_type: default
  layer_norm_with_affine: false
  bias_for_layer_norm: false
  attention_layer_norm_with_affine: false
  activation_type: swiglu
  residual_dropout: 0.0
  embedding_dropout: 0.0
  max_sequence_length: 2048
  vocab_size: 50280
  embedding_size: 50304
  eos_token_id: 50279
  pad_token_id: 1
  init_device: meta
  init_fn: mitchell
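
# torch.compile options; fullgraph: false tolerates graph breaks instead of
# erroring on code paths that cannot be compiled.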
compile:
  fullgraph: false
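
# Optimizer: AdamW at a conservative SFT learning rate with no weight decay;
# detailed optimizer metrics are collected every 10 steps.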
optimizer:
  name: adamw
  learning_rate: 2.0e-5
  weight_decay: 0
  betas:
  - 0.9
  - 0.95
  metrics_log_interval: 10
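
# LR schedule: linear warmup over the first 200 steps, then linear decay toward
# a final LR of alpha_f * learning_rate = 0.001 * 2.0e-5 = 2.0e-8.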
scheduler:
  name: linear_with_warmup
  t_warmup: 200
  alpha_f: 0.001
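
# Tokenizer: the GPT-NeoX-20B tokenizer variant with PII special tokens that
# OLMo ships in-repo (matches vocab_size/eos_token_id in the model section).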
tokenizer:
  identifier: tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json
  truncate_direction: right
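
# Checkpoints land in ./outputs/<run_name>/; the -1 retention values below mean
# no old checkpoints are pruned.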
save_folder: ./outputs/${run_name}/
save_overwrite: true
# Sharded checkpoints (best for restarts)
save_interval: 1000
save_num_checkpoints_to_keep: -1
# Unsharded checkpoints (for final storage)
save_interval_unsharded: 10000
save_num_unsharded_checkpoints_to_keep: -1
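
# Warm-start weights from the unsharded phase-1 SFT checkpoint; trainer state
# (optimizer, scheduler, step count) is reset rather than restored.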
load_path: path_to_unsharded_1st_phase_SFT_checkpoint
reset_trainer_state: true
max_duration: 3ep # train 3 epochs
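
# Batch bookkeeping: each rank runs microbatches of 8 sequences and accumulates
# gradients up to the global batch of 512, e.g. on 8 GPUs that is
# 512 / (8 * 8) = 8 microbatches per optimizer step.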
global_train_batch_size: 512
device_train_microbatch_size: 8
precision: amp_bf16
fsdp:
  wrapping_strategy: null
  precision: mixed
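
# Clip the global gradient norm at 1.0; adaptive clipping (norm ratio) is off.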
max_grad_norm: 1.0
max_grad_norm_ratio: null
speed_monitor:
  window_size: 20
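
# Run in-loop evaluation every ${save_interval} (1000) steps over the full eval
# sets (-1 means no subset cap), reusing the train microbatch size per device.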
eval_interval: ${save_interval}
eval_subset_num_batches: -1
device_eval_batch_size: ${device_train_microbatch_size}
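
# Zero-shot downstream tasks scored during training; boolq and arc_challenge
# remain commented out below pending pmi_dc support.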
evaluators:
  - label: piqa
    type: downstream
  - label: hellaswag
    type: downstream
  - label: winogrande
    type: downstream
  - label: openbook_qa
    type: downstream
  # - label: boolq  # requires implementation of the pmi_dc matrix
  #   type: downstream
  - label: sciq
    type: downstream
  - label: arc_easy
    type: downstream
  # - label: arc_challenge  # requires implementation of the pmi_dc matrix
  #   type: downstream
  - label: copa
    type: downstream
  - label: rte
    type: downstream
  - label: commitment_bank
    type: downstream
  - label: mrpc
    type: downstream
  - label: sst2
    type: downstream
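
# Training data: pre-tokenized examples memory-mapped from .npy files; the
# label mask flags which tokens count toward the loss (typically masking
# prompts so SFT loss falls only on responses).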
data:
  pad_direction: right
  num_workers: 0
  drop_last: true
  pin_memory: true
  prefetch_factor: 1
  persistent_workers: true
  timeout: 0
  generate_attention_mask: true
  paths:
  - ./datasets/OpenHermes_WebInstructSub_CodeFeedBack/input_ids.npy
  label_mask_paths:
  - ./datasets/OpenHermes_WebInstructSub_CodeFeedBack/label_mask.npy
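
# A typical launch, assuming the OLMo repo's scripts/train.py entry point and a
# single 8-GPU node (config path and GPU count are illustrative):
#   torchrun --nproc_per_node=8 scripts/train.py configs/AMD-OLMo-1B-SFT-2nd-phase.yaml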