run_name: AMD-OLMo-1B-SFT-2nd-phase
seed: 6198
dry_run: false

wandb:
  name: ${run_name}
  project: AMD-OLMo
  group: SFT
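
# ~1.2B-parameter OLMo-style decoder: 16 layers, 16 heads, d_model 2048, RoPE
# positions, SwiGLU MLPs, non-affine LayerNorm, and tied input/output embeddings.
# The 50280-token vocab is padded to an embedding_size of 50304 (a multiple of
# 128), which typically improves GPU matmul throughput.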
model:
  d_model: 2048
  n_heads: 16
  n_layers: 16
  mlp_ratio: 8
  weight_tying: true
  alibi: false
  rope: true
  flash_attention: false
  attention_dropout: 0.0
  attention_layer_norm: false
  multi_query_attention: false
  include_bias: false
  block_type: sequential
  layer_norm_type: default
  layer_norm_with_affine: false
  bias_for_layer_norm: false
  attention_layer_norm_with_affine: false
  activation_type: swiglu
  residual_dropout: 0.0
  embedding_dropout: 0.0
  max_sequence_length: 2048
  vocab_size: 50280
  embedding_size: 50304
  eos_token_id: 50279
  pad_token_id: 1
  init_device: meta
  init_fn: mitchell
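
# Compile the model with torch.compile; fullgraph: false permits graph breaks.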
compile:
  fullgraph: false

optimizer:
  name: adamw
  learning_rate: 2.0e-5
  weight_decay: 0
  betas:
    - 0.9
    - 0.95
  metrics_log_interval: 10
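
# Warm up for 200 steps, then decay linearly toward alpha_f * peak LR
# (0.001 * 2.0e-5 = 2.0e-8) by the end of training.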
scheduler:
  name: linear_with_warmup
  t_warmup: 200
  alpha_f: 0.001
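
# GPT-NeoX-20B tokenizer variant with PII special tokens; its 50280-token
# vocab matches vocab_size in the model block above.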
tokenizer:
  identifier: tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json
  truncate_direction: right
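
# Checkpointing: sharded checkpoints every 1000 steps, unsharded every 10000;
# -1 for the *_to_keep options means no old checkpoints are deleted.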
save_folder: ./outputs/${run_name}/
save_overwrite: true

save_interval: 1000
save_num_checkpoints_to_keep: -1

save_interval_unsharded: 10000
save_num_unsharded_checkpoints_to_keep: -1
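
# Start from the unsharded checkpoint of the 1st-phase SFT run (replace the
# placeholder below) and reset optimizer/scheduler/step state for this phase.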
load_path: path_to_unsharded_1st_phase_SFT_checkpoint
reset_trainer_state: true
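
# 3 epochs over the SFT data; 512 sequences per optimizer step globally,
# processed in per-device micro-batches of 8, with gradient accumulation
# making up the difference depending on world size.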
max_duration: 3ep
global_train_batch_size: 512
device_train_microbatch_size: 8

precision: amp_bf16
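
# FSDP sharding; wrapping_strategy: null wraps the whole model as one FSDP
# unit, and precision: mixed keeps gradient reduction in fp32 while computing
# in bf16 (per OLMo's FSDPPrecision).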
fsdp:
  wrapping_strategy: null
  precision: mixed
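
# Clip gradients to a global L2 norm of 1.0; ratio-based clipping is disabled.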
max_grad_norm: 1.0
max_grad_norm_ratio: null

speed_monitor:
  window_size: 20
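
# In-loop downstream evals every ${save_interval} steps, over the full eval
# set for each task (eval_subset_num_batches: -1).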
eval_interval: ${save_interval}
eval_subset_num_batches: -1
device_eval_batch_size: ${device_train_microbatch_size}
evaluators:
  - label: piqa
    type: downstream

  - label: hellaswag
    type: downstream

  - label: winogrande
    type: downstream

  - label: openbook_qa
    type: downstream

  - label: sciq
    type: downstream

  - label: arc_easy
    type: downstream

  - label: copa
    type: downstream

  - label: rte
    type: downstream

  - label: commitment_bank
    type: downstream

  - label: mrpc
    type: downstream

  - label: sst2
    type: downstream
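
# Pre-tokenized SFT data: input_ids.npy holds the token stream and
# label_mask.npy marks which tokens contribute to the loss (prompt tokens are
# masked out). Note that prefetch_factor and persistent_workers only take
# effect when num_workers > 0.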
data:
  pad_direction: right
  num_workers: 0
  drop_last: true
  pin_memory: true
  prefetch_factor: 1
  persistent_workers: true
  timeout: 0
  generate_attention_mask: true
  paths:
    - ./datasets/OpenHermes_WebInstructSub_CodeFeedBack/input_ids.npy
  label_mask_paths:
    - ./datasets/OpenHermes_WebInstructSub_CodeFeedBack/label_mask.npy