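# Training configuration: selects config groups in `defaults`, then sets
# task, trainer, datamodule, optimisation, evaluation and checkpoint options.
# NOTE(review): no `@package` header directive is visible here; confirm
# whether this file needs one.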
# Config-group selections. Each entry replaces the option chosen for that
# group; `/model: null` selects no model here (presumably it is supplied by a
# config that builds on this one -- confirm).
defaults:
  - override /trainer: default
  - override /model: null
  - override /datamodule: openwebtext
  - override /optimizer: adamw-apex
  - override /scheduler: linear-warmup
  - override /callbacks: [default, norm-monitor]
  - override /metrics: [perplexity, num-tokens]
  - override /logger: wandb

# Dotted import path of the task class; `_target_` is the key read by
# Hydra-style object instantiation.
task:
  _target_: src.tasks.seq.SequenceLMModel

# Seed value for the run (integer).
seed: 1111

# Trainer settings: 8 GPU devices on a single node, limited by step count.
trainer:
  accelerator: gpu
  devices: 8
  num_nodes: 1
  # Global batch size divided by (devices * per-device batch size * nodes).
  # With the values in this file: 512 / (8 * 16 * 1) = 4.
  # `div_up` and `eval` are custom resolvers that are not defined in this file
  # (presumably ceiling division and Python expression evaluation -- confirm).
  accumulate_grad_batches: ${div_up:${train.global_batch_size}, ${eval:${trainer.devices} * ${datamodule.batch_size} * ${trainer.num_nodes}}}
  max_steps: 400000
  # 1000 times the accumulation factor above (sibling-relative interpolation).
  val_check_interval: ${eval:1000 * ${.accumulate_grad_batches}}
  check_val_every_n_epoch: null
  precision: 16
  gradient_clip_val: 1.0
  strategy: null

# Data loading settings.
datamodule:
  # Per-device batch size: `trainer.accumulate_grad_batches` multiplies this
  # value by the device and node counts.
  batch_size: 16
  # Evaluation reuses the training batch size (sibling-relative interpolation).
  batch_size_eval: ${.batch_size}
  max_length: 1024
  fault_tolerant: True
  # Evaluates to true when more than one device is configured.
  ddp: ${eval:"${trainer.devices} > 1"}

# Optimisation settings: batch size, optimizer, LR schedule and loss function.
train:
  # Runs `nvidia-smi` for GPU index 0 when this value is resolved, then divides
  # the reported total memory by 1000 and rounds it. Resolution fails if
  # `nvidia-smi` is not available on the host.
  gpu_mem: ${eval:"round(float(__import__('subprocess').check_output('nvidia-smi -i 0 --query-gpu=memory.total --format=csv,noheader,nounits', shell=True).strip().decode()) / 1000)"}
  # Target batch size across all devices; consumed by
  # `trainer.accumulate_grad_batches`.
  global_batch_size: 512
  optimizer:
    lr: 6e-4
    weight_decay: 0.1
  optimizer_param_grouping:
    bias_weight_decay: False
    normalization_weight_decay: False
  scheduler:
    # 1% of `trainer.max_steps`.
    num_warmup_steps: ${eval:0.01 * ${trainer.max_steps}}
    num_training_steps: ${trainer.max_steps}
  loss_fn:
    _target_: flash_attn.losses.cross_entropy.CrossEntropyLoss
    inplace_backward: True

# Evaluation settings.
eval:
  log_on_step: True

# Callback settings: two checkpoint callbacks plus the early-stopping entry.
callbacks:
  # Checkpoint every 1000 training steps, ranked by the lowest `val/loss`;
  # keeps the top 3 and the last one.
  model_checkpoint:
    monitor: val/loss
    mode: min
    save_top_k: 3
    save_last: True
    every_n_train_steps: 1000
    # `oc.select` falls back to '' when `name` is not set.
    dirpath: ${work_dir}/checkpoints/${oc.select:name,''}
    filename: step_{step}
    auto_insert_metric_name: False
  # Second checkpoint callback, every 50000 training steps, written to the
  # same directory as `model_checkpoint`.
  model_checkpoint_progress:
    _target_: src.callbacks.model_checkpoint.ModelCheckpointMine
    fault_tolerant: True
    every_n_train_steps: 50000
    save_last: False
    # NOTE(review): -1 presumably keeps every checkpoint, as in Lightning's
    # ModelCheckpoint -- confirm for ModelCheckpointMine.
    save_top_k: -1
    dirpath: ${..model_checkpoint.dirpath}
    filename: progress_step_{step}
    auto_insert_metric_name: False
  # NOTE(review): presumably disables an early-stopping callback from the
  # `default` callbacks group -- confirm.
  early_stopping: null
|