checkpointing:
  checkpoints_dir: checkpoints
  evaluation:
    eval_results_dir: eval_results
  fabric_checkpoint_dir: fabric_state
  fabric_checkpoint_filename: checkpoint.pt
  hf_checkpoint:
    collection_slug: null
    repo_id: pico-lm/pico-decoder-large
  learning_dynamics:
    batch_size: 128
    eval_data: pico-lm/pretokenized-paloma-tinsy
    layer_suffixes:
    - attention.v_proj
    - attention.o_proj
    - swiglu.w_2
    sequence_idx: -1
  learning_dynamics_dir: learning_dynamics
  logs_dir: logs
  run_name: pico-decoder-large-1
  runs_dir: runs
  save_every_n_steps: 1000
  save_to_hf: true
training:
  auto_resume: true
  data:
    dataloader:
      batch_size: 1024
    dataset:
      name: pico-lm/pretokenized-dolma
    tokenizer:
      name: allenai/OLMo-7B-0724-hf
      vocab_size: 50304
  evaluation:
    metrics:
    - paloma
    paloma:
      batch_size: 16
      dataset_name: pico-lm/pretokenized-paloma-tinsy
      dataset_split: val
      max_length: 2048
  model:
    activation_hidden_dim: 6144
    attention_n_heads: 12
    attention_n_kv_heads: 4
    batch_size: 1024
    d_model: 1536
    max_seq_len: 2048
    model_type: pico_decoder
    n_layers: 12
    norm_eps: 1.0e-06
    position_emb_theta: 10000.0
    vocab_size: 50304
  monitoring:
    logging:
      log_every_n_steps: 100
      log_level: INFO
    save_to_wandb: true
    wandb:
      entity: pico-lm
      project: pico-decoder
  training:
    fabric:
      accelerator: cuda
      num_devices: 4
      num_nodes: 4
      precision: bf16-mixed
    max_steps: 200000
    optimization:
      gradient_accumulation_steps: 8
      lr: 0.0003
      lr_scheduler: linear_with_warmup
      lr_warmup_steps: 2500
      optimizer: adamw
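For reference, a minimal sketch of reading this config with plain PyYAML, assuming it has been saved to a file named `training_config.yaml` (the filename and the dict-style access are illustrative; pico-train's own config loading is not shown here):

```python
# Minimal sketch: load the YAML above and read a few fields with PyYAML.
# Assumptions: the config is stored as "training_config.yaml" (hypothetical name)
# and is accessed as a plain nested dict rather than via pico-train's config objects.
import yaml

with open("training_config.yaml") as f:
    cfg = yaml.safe_load(f)

model_cfg = cfg["training"]["model"]
fabric_cfg = cfg["training"]["training"]["fabric"]

# Under Lightning Fabric conventions, num_devices is per node,
# so 4 nodes x 4 devices = 16 processes.
world_size = fabric_cfg["num_nodes"] * fabric_cfg["num_devices"]

print(cfg["checkpointing"]["run_name"])             # pico-decoder-large-1
print(model_cfg["d_model"], model_cfg["n_layers"])  # 1536 12
print(world_size)                                   # 16
```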