# pico-decoder-large / training_config.yaml
# Run: pico-decoder-large-1, trained to 50k steps (commit 6bfde12)
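# Checkpointing: output locations for run artifacts (model checkpoints, Fabric state,
# logs, eval results, learning-dynamics tensors), the save interval, and the
# Hugging Face repo that checkpoints are pushed to.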
checkpointing:
  checkpoints_dir: checkpoints
  evaluation:
    eval_results_dir: eval_results
  fabric_checkpoint_dir: fabric_state
  fabric_checkpoint_filename: checkpoint.pt
  hf_checkpoint:
    collection_slug: null
    repo_id: pico-lm/pico-decoder-large
  learning_dynamics:
    batch_size: 128
    eval_data: pico-lm/pretokenized-paloma-tinsy
    layer_suffixes:
      - attention.v_proj
      - attention.o_proj
      - swiglu.w_2
    sequence_idx: -1
  learning_dynamics_dir: learning_dynamics
  logs_dir: logs
  run_name: pico-decoder-large-1
  runs_dir: runs
  save_every_n_steps: 1000
  save_to_hf: true
  training:
    auto_resume: true
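# Data: pretokenized Dolma, tokenized with the allenai/OLMo-7B-0724-hf tokenizer
# (vocab size 50304); 1024 sequences per dataloader batch.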
data:
  dataloader:
    batch_size: 1024
  dataset:
    name: pico-lm/pretokenized-dolma
  tokenizer:
    name: allenai/OLMo-7B-0724-hf
    vocab_size: 50304
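# Evaluation: Paloma perplexity on the val split of pico-lm/pretokenized-paloma-tinsy,
# scored on sequences of up to 2048 tokens.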
evaluation:
  metrics:
    - paloma
  paloma:
    batch_size: 16
    dataset_name: pico-lm/pretokenized-paloma-tinsy
    dataset_split: val
    max_length: 2048
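# Model: pico_decoder, a 12-layer decoder-only transformer with d_model 1536,
# 12 query heads and 4 KV heads (grouped-query attention), a SwiGLU feed-forward
# block with hidden dim 6144 (4 x d_model), rotary position embeddings
# (theta 10000), and norm eps 1e-6.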
model:
  activation_hidden_dim: 6144
  attention_n_heads: 12
  attention_n_kv_heads: 4
  batch_size: 1024
  d_model: 1536
  max_seq_len: 2048
  model_type: pico_decoder
  n_layers: 12
  norm_eps: 1.0e-06
  position_emb_theta: 10000.0
  vocab_size: 50304
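# Monitoring: log every 100 steps at INFO level and mirror metrics to the
# pico-lm/pico-decoder project on Weights & Biases.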
monitoring:
  logging:
    log_every_n_steps: 100
    log_level: INFO
  save_to_wandb: true
  wandb:
    entity: pico-lm
    project: pico-decoder
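# Training: 16 GPUs (4 nodes x 4 devices) with bf16 mixed precision, AdamW at a
# peak LR of 3e-4 with linear warmup over 2500 steps, for 200k steps. Assuming the
# batch_size of 1024 above is the global batch per optimizer step, each step covers
# 1024 x 2048 ~= 2.1M tokens (~420B tokens over 200k steps), and with
# gradient_accumulation_steps 8 the per-device micro-batch is 1024 / (16 x 8) = 8.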
training:
  fabric:
    accelerator: cuda
    num_devices: 4
    num_nodes: 4
    precision: bf16-mixed
  max_steps: 200000
  optimization:
    gradient_accumulation_steps: 8
    lr: 0.0003
    lr_scheduler: linear_with_warmup
    lr_warmup_steps: 2500
    optimizer: adamw