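# Nanotron training configuration for the SmolLM 1.7B pretraining run
# smollm-big-run-1p81G-smollm-1.7B-8T-seed-0- (file revision f6ae2e8).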
checkpoints:
  checkpoint_interval: 1000
  checkpoints_path: /scratch/loubna/checkpoints/smollm-big-run-1p81G-smollm-1.7B-8T-seed-0-
  checkpoints_path_is_shared_file_system: false
  overwrite_datastage: false
  resume_checkpoint_path: /scratch/loubna/checkpoints/smollm-big-run-1p81G-smollm-1.7B-8T-seed-0-/3074000
  save_initial_state: true
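# Single "stable" data stage: a cyclic dataloader over the phase-2 tokenized
# mixture, shuffled with seed 0; with only one dataset, its weight of 1 makes
# it the sole source of training tokens.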
data_stages:
- data:
    dataset:
      dataloader_type: cyclic
      dataset_max_tokens: null
      dataset_weights:
      - 1
      datasets:
      - filename_pattern: .*\.ds$
        folder: /fsx/loubna/tokenized_for_exps/phase2_mixture
        seed: 0
        shuffle: true
        skip_tokens: 0
      pad_samples_to_global_batch_size: false
      skip_in_stream: true
    num_loading_workers: 0
    seed: 0
  name: stable
  start_training_step: 1
experiment_logger:
  tensorboard_logger:
    push_to_hub_interval: 50
    repo_id: HuggingFaceTB/smollm-big-run
    repo_public: false
    tensorboard_dir: /scratch/loubna/tensorboard-cosmo-smollm-big-run
  wandb_logger:
    wandb_entity: loubnabnl
    wandb_project: smollm-big-run
general:
  benchmark_csv_path: null
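  # 3,927,040,000 samples consumed x 2048 tokens/sample ~= 8.0T tokens at step
  # 3,835,000, i.e. exactly 1,024 samples per step (dp 256 x micro_batch_size 4
  # x batch_accumulation_per_replica 1).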
  consumed_train_samples: 3927040000
  ignore_sanity_checks: true
  project: smollm-big-run
  run: smollm-big-run-1p81G-smollm-1.7B-8T-seed-0-
  seed: 42
  step: 3835000
kill_switch_path: null
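# Evaluation: lighteval jobs are launched through SLURM (template below) on 8
# data-parallel ranks, running the "early-signal" task set capped at 1,000
# samples per task.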
lighteval:
  batch_size: 16
  checkpoints_path: null
  generation: null
  logging:
    hub_repo_details: null
    hub_repo_results: HuggingFaceTB/smollm-big-run
    hub_repo_tensorboard: HuggingFaceTB/smollm-big-run
    local_output_path: /scratch/loubna/lighteval/smollm-big-run-1p81G-smollm-1.7B-8T-seed-0-
    push_details_to_hub: false
    push_results_to_hub: true
    push_results_to_tensorboard: true
    tensorboard_metric_prefix: e
  parallelism:
    dp: 8
    expert_parallel_size: 1
    pp: 1
    pp_engine: 1f1b
    tp: 1
    tp_linear_async_communication: false
    tp_mode: ALL_REDUCE
  slurm_script_dir: /fsx/loubna/logs/smollmv2/eval-scripts
  slurm_template: /fsx/loubna/projects/brrr/examples/loubna/eval_1b.slurm.jinja
  tasks:
    custom_tasks: brrr.lighteval.evaluation_tasks
    dataset_loading_processes: 8
    max_samples: 1000
    multichoice_continuations_start_space: null
    no_multichoice_continuations_start_space: null
    num_fewshot_seeds: null
    tasks: early-signal
  wandb:
    wandb_entity: loubnabnl
    wandb_project: smollm-big-run
    wandb_run_name: smollm-big-run-1p81G-smollm-1.7B-8T-seed-0-_evals
logging:
  iteration_step_info_interval: 1
  log_level: info
  log_level_replica: info
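# Llama-style architecture: 24 layers, hidden size 2048, 32 attention heads with
# 32 KV heads (standard multi-head attention, no GQA), SiLU-gated MLP of width
# 8192, and tied input/output embeddings over a 49,152-token vocabulary,
# for roughly 1.7B parameters.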
model:
  ddp_bucket_cap_mb: 25
  dtype: bfloat16
  init_method:
    std: 0.02
  make_vocab_size_divisible_by: 1
  model_config:
    bos_token_id: 0
    eos_token_id: 0
    hidden_act: silu
    hidden_size: 2048
    initializer_range: 0.02
    intermediate_size: 8192
    is_llama_config: true
    max_position_embeddings: 2048
    num_attention_heads: 32
    num_hidden_layers: 24
    num_key_value_heads: 32
    pad_token_id: null
    pretraining_tp: 1
    rms_norm_eps: 1.0e-05
    rope_scaling: null
    tie_word_embeddings: true
    use_cache: true
    vocab_size: 49152
optimizer:
  accumulate_grad_in_fp32: true
  clip_grad: 1.0
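  # Warmup-stable-decay schedule: linear warmup over 2,000 steps to 5e-4, held
  # constant until step 3,870,000, then linear decay to 0 over the final
  # 430,000 steps (3,870,000 + 430,000 = 4,300,000 = train_steps).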
  learning_rate_scheduler:
    learning_rate: 0.0005
    lr_decay_starting_step: 3870000
    lr_decay_steps: 430000
    lr_decay_style: linear
    lr_warmup_steps: 2000
    lr_warmup_style: linear
    min_decay_lr: 0
  optimizer_factory:
    adam_beta1: 0.9
    adam_beta2: 0.95
    adam_eps: 1.0e-08
    name: adamW
    torch_adam_is_fused: true
  weight_decay: 0.01
  zero_stage: 0
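# Training parallelism: pure data parallelism across 256 replicas, with no
# tensor or pipeline parallelism (tp: 1, pp: 1); zero_stage 0 above means
# optimizer state is fully replicated rather than sharded.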
parallelism:
  dp: 256
  expert_parallel_size: 1
  pp: 1
  pp_engine: 1f1b
  tp: 1
  tp_linear_async_communication: true
  tp_mode: REDUCE_SCATTER
profiler: null
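# Checkpoints are uploaded to S3 with s5cmd and removed from local scratch
# after a successful upload.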
s3_upload:
  remove_after_upload: true
  s5cmd_concurrency: 5
  s5cmd_numworkers: 16
  s5cmd_path: /admin/home/loubna/miniconda3/envs/nanotron/bin/s5cmd
  upload_s3_path: s3://synthetic-project-models/big-run-5T/smollm-big-run-1p81G-smollm-1.7B-8T-seed-0-
tokenizer:
  tokenizer_max_length: null
  tokenizer_name_or_path: HuggingFaceTB/cosmo2-tokenizer
  tokenizer_revision: null
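# Global batch: 256 dp x 4 micro_batch_size x 1 accumulation = 1,024 sequences
# of 2,048 tokens ~= 2.1M tokens per training step.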
tokens:
  batch_accumulation_per_replica: 1
  limit_test_batches: 0
  limit_val_batches: 0
  micro_batch_size: 4
  sequence_length: 2048
  train_steps: 4300000
  val_check_interval: 100