checkpointing:
  checkpoints_dir: checkpoints
  evaluation:
    eval_results_dir: eval_results
  fabric_checkpoint_dir: fabric_state
  fabric_checkpoint_filename: checkpoint.pt
  hf_checkpoint:
    collection_slug: null
    repo_id: pico-lm/pico-decoder-tiny
  learning_dynamics:
    batch_size: 256
    eval_data: pico-lm/pretokenized-paloma-tinsy
    layer_suffixes:
    - attention.v_proj
    - attention.o_proj
    - swiglu.w_2
    sequence_idx: -1
  learning_dynamics_dir: learning_dynamics
  logs_dir: logs
  run_name: pico-decoder-tiny-1
  runs_dir: runs
  save_every_n_steps: 1000
  save_to_hf: true
  training:
    auto_resume: true
data:
  dataloader:
    batch_size: 1024
  dataset:
    name: pico-lm/pretokenized-dolma
  tokenizer:
    name: allenai/OLMo-7B-0724-hf
    vocab_size: 50304
evaluation:
  metrics:
  - paloma
  paloma:
    batch_size: 32
    dataset_name: pico-lm/pretokenized-paloma-tinsy
    dataset_split: val
    max_length: 2048
model:
  activation_hidden_dim: 384
  attention_n_heads: 12
  attention_n_kv_heads: 4
  batch_size: 1024
  d_model: 96
  max_seq_len: 2048
  model_type: pico_decoder
  n_layers: 12
  norm_eps: 1.0e-06
  position_emb_theta: 10000.0
  vocab_size: 50304
monitoring:
  logging:
    log_every_n_steps: 100
    log_level: INFO
  save_to_wandb: true
  wandb:
    entity: pico-lm
    project: pico-decoder
training:
  fabric:
    accelerator: cuda
    num_devices: 4
    num_nodes: 4
    precision: bf16-mixed
  max_steps: 200000
  optimization:
    gradient_accumulation_steps: 4
    lr: 0.0003
    lr_scheduler: linear_with_warmup
    lr_warmup_steps: 2500
    optimizer: adamw
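
For reference, a minimal sketch of loading and inspecting this config with PyYAML. The filename training_config.yaml is an assumption (the file is not named above), and the derived quantities follow the usual multi-head / grouped-query attention conventions rather than anything stated in the config itself.

# Minimal sketch: load the config and derive a few architecture numbers.
# Assumes the file is saved locally as "training_config.yaml" (filename not given above).
import yaml

with open("training_config.yaml") as f:
    cfg = yaml.safe_load(f)

model_cfg = cfg["model"]
d_model = model_cfg["d_model"]                    # 96
n_heads = model_cfg["attention_n_heads"]          # 12
n_kv_heads = model_cfg["attention_n_kv_heads"]    # 4

# Standard conventions (an assumption, not stated in the config):
head_dim = d_model // n_heads                     # 96 / 12 = 8
gqa_group = n_heads // n_kv_heads                 # 3 query heads per KV head

print(f"{model_cfg['n_layers']} layers, d_model={d_model}, head_dim={head_dim}, "
      f"{n_kv_heads} KV heads (group size {gqa_group}), vocab={model_cfg['vocab_size']}")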
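
The checkpointing.hf_checkpoint block publishes checkpoints to pico-lm/pico-decoder-tiny, and data.tokenizer names the allenai/OLMo-7B-0724-hf tokenizer. A hedged sketch of pulling that checkpoint with transformers follows; it assumes the repository ships a transformers-compatible implementation of the custom pico_decoder model type (hence trust_remote_code=True). If it does not, the model would need to be loaded through the pico training codebase instead.

# Hedged sketch: load the published checkpoint and the tokenizer named in this config.
# Assumes the repo exposes remote code for the custom "pico_decoder" model type.
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-7B-0724-hf")
model = AutoModelForCausalLM.from_pretrained(
    "pico-lm/pico-decoder-tiny",
    trust_remote_code=True,  # model_type: pico_decoder is not a built-in architecture
)

# Sanity check: count parameters of the loaded module.
n_params = sum(p.numel() for p in model.parameters())
print(f"loaded pico-decoder-tiny with {n_params:,} parameters")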
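
The checkpointing.learning_dynamics.layer_suffixes list selects submodules whose qualified names end in attention.v_proj, attention.o_proj, or swiglu.w_2 for activation capture. Below is an illustrative sketch of how such suffix matching is typically done with PyTorch forward hooks; it is not the pico codebase's own implementation, and works with any torch.nn.Module whose submodule names match the suffixes.

# Illustrative sketch of suffix-based activation capture, in the spirit of
# checkpointing.learning_dynamics.layer_suffixes. Not the pico implementation.
import torch
import torch.nn as nn

LAYER_SUFFIXES = ("attention.v_proj", "attention.o_proj", "swiglu.w_2")

def capture_activations(model: nn.Module) -> dict[str, torch.Tensor]:
    """Register forward hooks on every submodule whose name ends with a suffix."""
    store: dict[str, torch.Tensor] = {}

    def make_hook(name: str):
        def hook(_module, _inputs, output):
            store[name] = output.detach()
        return hook

    for name, module in model.named_modules():
        if name.endswith(LAYER_SUFFIXES):
            module.register_forward_hook(make_hook(name))
    return store

Captured tensors can then be compared across the checkpoints saved every save_every_n_steps steps, for example on a batch from the pico-lm/pretokenized-paloma-tinsy data named in eval_data, to study learning dynamics.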