|
accum_freq: 1
attn_activation: None
attn_name: auto
attn_seq_scalar: None
attn_seq_scalar_alpha: None
average: None
average_coefficients: None
beta1: 0.9
beta2: 0.95
checkpoint_path: logs/448/c4_original-open_lm_1b-4.0/checkpoints
copy_codebase: False
data_key: txt
dataset_manifest: None
dataset_resampled: False
dataset_type: auto
ddp_static_graph: False
debug: False
delete_previous_checkpoint: True
device: cuda:0
disable_buffer: False
dist_backend: nccl
dist_url: env://
distill_model: None
distill_pretrained: None
distributed: True
epochs: 5
epochs_cooldown: None
eps: 1e-08
experimental_meta_device: False
ffn_type: swiglu
force_distributed: False
force_min_lr: 0.0
fsdp: False
fsdp_amp: False
fsdp_backward_prefetch: False
fsdp_checkpoint: False
fsdp_cpu_offload: False
fsdp_hybrid: False
fsdp_hybrid_o2: False
fsdp_limit_all_gathers: False
fsdp_pure_bf16: False
fsdp_use_orig_params: False
global_batch_size: 128
global_val_batch_size: 128
grad_checkpointing: False
grad_clip_norm: 1.0
hf_fsdp_block: None
hf_model: None
hf_seq_len: None
ignore_parse_errors: False
load_pretrained_state: False
local_rank: 0
log_every_n_steps: 20
log_level: 20
log_local: False
log_logit_mean: False
log_path: logs/448/c4_original-open_lm_1b-4.0/out.log
logs: logs/448
lr: 0.003
lr_cooldown_end: 3e-05
lr_cooldown_power: 1.0
lr_scheduler: cosine
model: open_lm_1b
model_norm: gain_only_lp_layer_norm
moe_capacity_factor: 1.25
moe_expert_model_parallelism: False
moe_freq: 0
moe_loss_weight: 0.1
moe_num_experts: None
moe_top_k: 2
moe_weight_parallelism: False
multiple_data_passes: False
name: c4_original-open_lm_1b-4.0
no_set_device_rank: False
optimizer: adamw
per_gpu_batch_size: 16
per_gpu_val_batch_size: 16
positional_embedding_type: rotary
precision: amp_bfloat16
pretrained: None
qk_norm: True
rank: 0
remote_sync: s3://dcnlp-west/dcnlp_experiments_v3
remote_sync_frequency: 300
remote_sync_protocol: s3
report_to:
resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-open_lm_1b-4.0/checkpoints/epoch_6.pt
save_frequency: 1
save_most_recent: False
seed: 124
seq_len: 2048
skip_scheduler: False
squash_mask_left: True
target_mask_individual: 50400
target_mask_left: 50300
tensorboard: False
tensorboard_path:
torchcompile: False
torchscript: False
trace: False
train_data: None
train_data_mix_weights: None
train_data_upsampling_factors: None
train_num_samples: None
use_bn_sync: False
use_bnb_linear: None
val_data: ['training/eval_data/val_tok_mult/de-en/val_de-en_000.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_010.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_020.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_030.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_040.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_050.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_060.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_070.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_080.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_090.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_100.tar']
val_data_key: ['json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz']
val_frequency: 5
val_iter_ci: 10000
val_max_pop_ci: 300000
val_num_samples: None
val_seq_ci: True
val_tok_ci: True
vocab_size: 50432
wandb: False
wandb_notes:
wandb_project_name: open-lm
warmup: 5000
wd: 0.033
workers: 2
world_size: 8
z_loss_coefficient: 0.0001
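The parallelism and batch settings above are mutually consistent: 8 ranks, 16 sequences per GPU, and no gradient accumulation (accum_freq: 1) give the listed global batch of 128 sequences, i.e. 128 x 2048 = 262,144 tokens per optimizer step. A minimal arithmetic check in plain Python (illustrative only, not open_lm code):

```python
world_size = 8            # ranks / GPUs
per_gpu_batch_size = 16   # sequences per rank per step
accum_freq = 1            # gradient-accumulation steps
seq_len = 2048            # tokens per sequence

global_batch_size = world_size * per_gpu_batch_size * accum_freq
tokens_per_step = global_batch_size * seq_len

assert global_batch_size == 128   # matches global_batch_size above
print(tokens_per_step)            # 262144 tokens consumed per optimizer step
```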
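The optimizer fields (optimizer: adamw, beta1, beta2, eps, wd, grad_clip_norm) and schedule fields (lr, warmup, lr_scheduler: cosine, lr_cooldown_end) correspond to AdamW with linear warmup followed by cosine decay. A minimal sketch of what these values imply, using standard PyTorch; the placeholder `model` and `total_steps` are not taken from the dump, and this is not open_lm's exact scheduler implementation:

```python
import math
import torch

def cosine_lr(step, total_steps, base_lr=3e-3, end_lr=3e-5, warmup=5000):
    """Linear warmup to lr, then cosine decay down to lr_cooldown_end."""
    if step < warmup:
        return base_lr * (step + 1) / warmup
    progress = min(1.0, (step - warmup) / max(1, total_steps - warmup))
    return end_lr + 0.5 * (base_lr - end_lr) * (1.0 + math.cos(math.pi * progress))

model = torch.nn.Linear(8, 8)  # placeholder module, only to construct the optimizer
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=3e-3,                # lr
    betas=(0.9, 0.95),      # beta1, beta2
    eps=1e-8,               # eps
    weight_decay=0.033,     # wd
)
# Each step: set group["lr"] = cosine_lr(step, total_steps) for every param group,
# clip gradients to grad_clip_norm=1.0 via torch.nn.utils.clip_grad_norm_,
# then call optimizer.step().
```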
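z_loss_coefficient: 0.0001 enables an auxiliary z-loss that penalizes the squared log-partition of the output softmax, keeping the logits well scaled. A sketch of that term under the usual formulation, not necessarily open_lm's exact code:

```python
import torch
import torch.nn.functional as F

def lm_loss_with_z_term(logits, targets, z_coef=1e-4):
    """Next-token cross-entropy plus z_coef * mean(logsumexp(logits)**2)."""
    ce = F.cross_entropy(logits.reshape(-1, logits.size(-1)), targets.reshape(-1))
    log_z = torch.logsumexp(logits, dim=-1)      # log of the softmax normalizer
    return ce + z_coef * (log_z ** 2).mean()     # z_loss_coefficient from the config
```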
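ffn_type: swiglu selects a gated feed-forward block for the transformer layers. A generic SwiGLU sketch for reference; the hidden widths of open_lm_1b are not stated in this dump, so the dimensions below are placeholders:

```python
import torch.nn as nn
import torch.nn.functional as F

class SwiGLU(nn.Module):
    """Gated feed-forward: (SiLU(x W_gate) * (x W_up)) W_down."""
    def __init__(self, dim, hidden_dim):
        super().__init__()
        self.w_gate = nn.Linear(dim, hidden_dim, bias=False)
        self.w_up = nn.Linear(dim, hidden_dim, bias=False)
        self.w_down = nn.Linear(hidden_dim, dim, bias=False)

    def forward(self, x):
        return self.w_down(F.silu(self.w_gate(x)) * self.w_up(x))
```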
|
|