checkpoints:
  checkpoint_interval: 1000
  checkpoints_path: /scratch/loubna/checkpoints/smollm-big-run-1p81G-smollm-1.7B-8T-seed-0-
  checkpoints_path_is_shared_file_system: false
  overwrite_datastage: false
  resume_checkpoint_path: /scratch/loubna/checkpoints/smollm-big-run-1p81G-smollm-1.7B-8T-seed-0-/3074000
  save_initial_state: true
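# Note on checkpoints: a checkpoint is written every 1,000 optimizer steps, which at the
# global batch size implied by the parallelism/tokens sections below (~2.1M tokens per
# step, see the note after tokens) is roughly one checkpoint every ~2.1B tokens. This run
# resumes from the step-3074000 checkpoint and ships checkpoints to S3 (see s3_upload).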
data_stages:
- data:
    dataset:
      dataloader_type: cyclic
      dataset_max_tokens: null
      dataset_weights:
      - 1
      datasets:
      - filename_pattern: .*\.ds$
        folder: /fsx/loubna/tokenized_for_exps/phase2_mixture
        seed: 0
        shuffle: true
        skip_tokens: 0
      pad_samples_to_global_batch_size: false
      skip_in_stream: true
    num_loading_workers: 0
    seed: 0
  name: stable
  start_training_step: 1
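# Note on data_stages: a single "stable" stage is defined, active from training step 1. It
# streams pre-tokenized .ds files from one folder (weight 1), shuffled with seed 0, with the
# cyclic dataloader, which, as the name suggests, re-iterates the files rather than stopping
# after a single pass.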
experiment_logger:
  tensorboard_logger:
    push_to_hub_interval: 50
    repo_id: HuggingFaceTB/smollm-big-run
    repo_public: false
    tensorboard_dir: /scratch/loubna/tensorboard-cosmo-smollm-big-run
  wandb_logger:
    wandb_entity: loubnabnl
    wandb_project: smollm-big-run
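# Note on experiment_logger: metrics are logged twice, to a local TensorBoard directory that
# is pushed periodically (push_to_hub_interval: 50) to the private HuggingFaceTB/smollm-big-run
# hub repo, and to the loubnabnl/smollm-big-run Weights & Biases project.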
general:
  benchmark_csv_path: null
  consumed_train_samples: 3927040000
  ignore_sanity_checks: true
  project: smollm-big-run
  run: smollm-big-run-1p81G-smollm-1.7B-8T-seed-0-
  seed: 42
  step: 3835000
kill_switch_path: null
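# Note on general: consumed_train_samples / step = 3,927,040,000 / 3,835,000 = 1,024 samples
# per step, which matches the global batch size implied below (parallelism.dp 256 x
# micro_batch_size 4 x batch_accumulation_per_replica 1, assuming nanotron's usual formula).
# At a sequence length of 2,048 tokens that is ~8.0T tokens consumed at this step, consistent
# with the "8T" in the run name.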
lighteval:
  batch_size: 16
  checkpoints_path: null
  generation: null
  logging:
    hub_repo_details: null
    hub_repo_results: HuggingFaceTB/smollm-big-run
    hub_repo_tensorboard: HuggingFaceTB/smollm-big-run
    local_output_path: /scratch/loubna/lighteval/smollm-big-run-1p81G-smollm-1.7B-8T-seed-0-
    push_details_to_hub: false
    push_results_to_hub: true
    push_results_to_tensorboard: true
    tensorboard_metric_prefix: e
  parallelism:
    dp: 8
    expert_parallel_size: 1
    pp: 1
    pp_engine: 1f1b
    tp: 1
    tp_linear_async_communication: false
    tp_mode: ALL_REDUCE
  slurm_script_dir: /fsx/loubna/logs/smollmv2/eval-scripts
  slurm_template: /fsx/loubna/projects/brrr/examples/loubna/eval_1b.slurm.jinja
  tasks:
    custom_tasks: brrr.lighteval.evaluation_tasks
    dataset_loading_processes: 8
    max_samples: 1000
    multichoice_continuations_start_space: null
    no_multichoice_continuations_start_space: null
    num_fewshot_seeds: null
    tasks: early-signal
  wandb:
    wandb_entity: loubnabnl
    wandb_project: smollm-big-run
    wandb_run_name: smollm-big-run-1p81G-smollm-1.7B-8T-seed-0-_evals
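# Note on lighteval: in-training evaluations are launched as separate Slurm jobs (see
# slurm_template) on a much smaller layout than training (dp 8 vs. dp 256), running the
# "early-signal" task set from brrr.lighteval.evaluation_tasks with at most 1,000 samples per
# task; aggregate results are pushed to the hub and TensorBoard, per-sample details are not.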
logging:
  iteration_step_info_interval: 1
  log_level: info
  log_level_replica: info
model:
  ddp_bucket_cap_mb: 25
  dtype: bfloat16
  init_method:
    std: 0.02
  make_vocab_size_divisible_by: 1
  model_config:
    bos_token_id: 0
    eos_token_id: 0
    hidden_act: silu
    hidden_size: 2048
    initializer_range: 0.02
    intermediate_size: 8192
    is_llama_config: true
    max_position_embeddings: 2048
    num_attention_heads: 32
    num_hidden_layers: 24
    num_key_value_heads: 32
    pad_token_id: null
    pretraining_tp: 1
    rms_norm_eps: 1.0e-05
    rope_scaling: null
    tie_word_embeddings: true
    use_cache: true
    vocab_size: 49152
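# Note on model_config: a rough parameter count for this Llama-style config, counting only the
# large matrices: embeddings 49,152 x 2,048 ~= 0.10B (tied with the LM head), plus 24 layers x
# (4 x 2048^2 attention + 3 x 2048 x 8192 gated MLP) ~= 1.61B, for ~1.7B parameters in total,
# matching the "1.7B" in the run name.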
optimizer:
  accumulate_grad_in_fp32: true
  clip_grad: 1.0
  learning_rate_scheduler:
    learning_rate: 0.0005
    lr_decay_starting_step: 3870000
    lr_decay_steps: 430000
    lr_decay_style: linear
    lr_warmup_steps: 2000
    lr_warmup_style: linear
    min_decay_lr: 0
  optimizer_factory:
    adam_beta1: 0.9
    adam_beta2: 0.95
    adam_eps: 1.0e-08
    name: adamW
    torch_adam_is_fused: true
  weight_decay: 0.01
  zero_stage: 0
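# Note on optimizer: the schedule is effectively warmup-stable-decay: linear warmup over 2,000
# steps to 5.0e-4, held flat until step 3,870,000, then linear decay over 430,000 steps to 0;
# 3,870,000 + 430,000 = 4,300,000 = tokens.train_steps, so the decay ends exactly at the final
# step. zero_stage: 0 means optimizer states are not sharded across data-parallel ranks, and
# gradients are accumulated in fp32 on top of the bf16 parameters.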
parallelism:
  dp: 256
  expert_parallel_size: 1
  pp: 1
  pp_engine: 1f1b
  tp: 1
  tp_linear_async_communication: true
  tp_mode: REDUCE_SCATTER
profiler: null
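# Note on parallelism: world size = dp x tp x pp = 256 x 1 x 1 = 256 GPUs, i.e. pure data
# parallelism with no tensor or pipeline splitting (the tp_* settings are effectively unused
# at tp: 1).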
s3_upload:
  remove_after_upload: true
  s5cmd_concurrency: 5
  s5cmd_numworkers: 16
  s5cmd_path: /admin/home/loubna/miniconda3/envs/nanotron/bin/s5cmd
  upload_s3_path: s3://synthetic-project-models/big-run-5T/smollm-big-run-1p81G-smollm-1.7B-8T-seed-0-
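# Note on s3_upload: checkpoints are copied to S3 with s5cmd (16 workers, concurrency 5) and
# removed from the local scratch path after upload (remove_after_upload: true).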
tokenizer:
  tokenizer_max_length: null
  tokenizer_name_or_path: HuggingFaceTB/cosmo2-tokenizer
  tokenizer_revision: null
tokens:
  batch_accumulation_per_replica: 1
  limit_test_batches: 0
  limit_val_batches: 0
  micro_batch_size: 4
  sequence_length: 2048
  train_steps: 4300000
  val_check_interval: 100
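# Note on tokens: each optimizer step processes dp 256 x micro_batch_size 4 x
# batch_accumulation_per_replica 1 = 1,024 sequences of 2,048 tokens, i.e. ~2.1M tokens per
# step (assuming the usual global-batch-size formula); train_steps: 4300000 is the total step
# budget that the linear decay above is aligned to.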