3outeille (HF Staff) committed
Commit e34ee2a · verified · 1 Parent(s): ac79bed

Upload llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-256

llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-256/bench.slurm ADDED
@@ -0,0 +1,111 @@
+ #!/bin/bash
+
+ #SBATCH --job-name=bench_cluster
+ #SBATCH --time=02:00:00
+ #SBATCH --partition=hopper-prod
+ #SBATCH --nodes=1
+ #SBATCH --gres=gpu:8
+ #SBATCH --qos=normal
+ #SBATCH --ntasks-per-node=1
+ #SBATCH --cpus-per-task=96
+ #SBATCH --exclusive
+ #SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-256/log.out
+ #SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-256/log.out
+
+ # Function to update status based on squeue output
+ update_status() {
+     job_id=$1
+     status_file=$2
+     # For unknown reasons, it doesn't update the status for pending jobs; it only works for running ones.
+     while true; do
+         job_status=$(squeue --job $job_id --noheader --format=%T)
+         echo "Job status: $job_status"
+         if [ -z "$job_status" ]; then
+             # Job has finished or is not found
+             break
+         elif [ "$job_status" = "RUNNING" ]; then
+             printf "running" > $status_file
+             break
+         fi
+         sleep 10
+     done
+ }
+
+ # Misc initializations.
+ echo "========================"
+ echo "START TIME: $(date)"
+ source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh
+ conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster
+ echo python3 version = $(python3 --version)
+ echo "========================"
+
+ # Slurm stuff
+ export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
+ export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
+ export MASTER_PORT=$((1024 + RANDOM % 64511))
+
+ export TMPDIR=/scratch
+ export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache"
+ export CUBLAS_WORKSPACE_CONFIG=":4096:8"
+ export CUDA_DEVICE_MAX_CONNECTIONS="1"
+
+ huggingface-cli login --token $HUGGINGFACE_TOKEN
+
+
+ NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron"
+ CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-256/config.yaml"
+
+ LAUNCHER="torchrun \
+     --nproc_per_node 8 \
+     --nnodes 1 \
+     --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \
+     --rdzv_backend c10d \
+     --max_restarts 0 \
+     --tee 3 \
+     --node_rank ${SLURM_PROCID}"
+
+ # Checkout the bench_cluster branch
+ cd $NANOTRON_REPO
+ git checkout bench_cluster
+ cd ..
+ # Get the current job ID
+ job_id=${SLURM_JOB_ID}
+
+ # Update status to "pending" or "running" in the background
+ update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-256/status.txt &
+
+ # Run the main command
+ srun -u $LAUNCHER $CMD
+ exit_status=$?
+
+ # Update status based on the exit status of `srun`
+ if [ $exit_status -eq 0 ]; then
+     printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-256/status.txt
+ else
+     if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-256/log.out; then
+         printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-256/status.txt
+     elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-256/log.out; then
+         printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-256/status.txt
+     elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-256/log.out; then
+         printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-256/status.txt
+     else
+         printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-256/status.txt
+     fi
+ fi
+
+ # Run the report script if the job completed successfully
+ if [ $exit_status -eq 0 ]; then
+     python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-256 --is_logs
+     python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-256 --is_profiler
+ fi
+
+
+ # Push the results folder to the Hub using huggingface-cli
+ huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-256 llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-256 --commit-message "Upload llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-256"
+
+ # Verify the upload
+ if [ $? -eq 0 ]; then
+     echo "Uploading to Huggingface Hub successful"
+ else
+     echo "Failed to upload to Huggingface Hub"
+ fi
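
For context, a minimal usage sketch of how this script could be submitted and its status file polled. This is not part of the uploaded files, and the actual bench_cluster driver may orchestrate jobs differently; the directory path is simply the one used throughout the script above, and placing bench.slurm inside it is an assumption.

RESULTS_DIR=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-256
job_id=$(sbatch --parsable "$RESULTS_DIR/bench.slurm")   # submit the benchmark job
echo "Submitted Slurm job $job_id"
# bench.slurm writes "running" while the job runs, then one of completed / oom / timeout / fail after srun exits.
until grep -qE "completed|oom|timeout|fail" "$RESULTS_DIR/status.txt" 2>/dev/null; do
    sleep 30
done
echo "Final status: $(cat "$RESULTS_DIR/status.txt")"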
llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-256/config.yaml ADDED
@@ -0,0 +1,90 @@
+ general:
+   project: bench_cluster
+   seed: 42
+ model:
+   ddp_bucket_cap_mb: 25
+   dtype: bfloat16
+   init_method:
+     std: 0.025
+   make_vocab_size_divisible_by: 1
+   model_config:
+     bos_token_id: 1
+     eos_token_id: 2
+     hidden_act: silu
+     hidden_size: 2048
+     initializer_range: 0.02
+     intermediate_size: 4096
+     is_llama_config: true
+     max_position_embeddings: 4096
+     num_attention_heads: 32
+     num_hidden_layers: 24
+     num_key_value_heads: 32
+     pad_token_id: null
+     pretraining_tp: 1
+     rms_norm_eps: 1.0e-05
+     rope_scaling: null
+     rope_theta: 10000.0
+     tie_word_embeddings: true
+     use_cache: true
+     vocab_size: 50257
+ optimizer:
+   accumulate_grad_in_fp32: true
+   clip_grad: 1.0
+   learning_rate_scheduler:
+     learning_rate: 0.0001
+     lr_decay_style: linear
+     lr_warmup_style: linear
+     lr_warmup_steps: 1
+     min_decay_lr: 1.0e-05
+   optimizer_factory:
+     adam_beta1: 0.9
+     adam_beta2: 0.95
+     adam_eps: 1.0e-08
+     name: adamW
+     torch_adam_is_fused: true
+   weight_decay: 0.01
+   zero_stage: 1
+ parallelism:
+   dp: 4
+   expert_parallel_size: 1
+   pp: 1
+   pp_engine: 1f1b
+   tp: 2
+   tp_linear_async_communication: false
+   tp_mode: REDUCE_SCATTER
+ profiler:
+   profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-256
+ tokenizer:
+   tokenizer_max_length: null
+   tokenizer_name_or_path: openai-community/gpt2
+   tokenizer_revision: null
+ data_stages:
+ - name: Training Stage
+   start_training_step: 1
+   data:
+     dataset:
+       dataset_overwrite_cache: false
+       dataset_processing_num_proc_per_process: 64
+       hf_dataset_config_name: null
+       hf_dataset_or_datasets: roneneldan/TinyStories
+       hf_dataset_splits: train
+       text_column_name: text
+     num_loading_workers: 0
+     seed: 42
+ lighteval: null
+ tokens:
+   train_steps: 20
+   val_check_interval: -1
+   batch_accumulation_per_replica: 1
+   limit_test_batches: 0
+   limit_val_batches: 0
+   micro_batch_size: 256
+   sequence_length: 4096
+ logging:
+   iteration_step_info_interval: 1
+   log_level: info
+   log_level_replica: info
+ checkpoints:
+   checkpoint_interval: 100000
+   checkpoints_path: /dev/null
+   resume_checkpoint_path: null
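
As a quick sanity check on this configuration (a derived calculation, not part of the uploaded file): with dp: 4, micro_batch_size: 256 and batch_accumulation_per_replica: 1, each optimizer step consumes 4 * 256 * 1 = 1024 sequences of 4096 tokens, roughly 4.19M tokens per step, which matches the global_batch_size of 1024 reported in log.out below. The same arithmetic as a small bash sketch:

dp=4; mbs=256; grad_accum=1; seq_len=4096        # values taken from the config above
gbs=$((dp * mbs * grad_accum))                   # 1024 sequences per optimizer step
tokens_per_step=$((gbs * seq_len))               # 4194304 tokens per step
echo "global_batch_size=$gbs tokens_per_step=$tokens_per_step"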
llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-256/log.out ADDED
@@ -0,0 +1,304 @@
+ ========================
+ START TIME: Wed Jul 3 22:46:56 UTC 2024
+ python3 version = Python 3.10.14
+ ========================
+ The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
+ Token is valid (permission: write).
+ Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token
+ Login successful
+ Already on 'bench_cluster'
+ M examples/config_tiny_llama.py
+ M examples/config_tiny_llama.yaml
+ M examples/train_tiny_llama.sh
+ M src/nanotron/models/llama.py
+ M src/nanotron/trainer.py
+ Your branch is up to date with 'origin/bench_cluster'.
+ Job status: RUNNING
+ W0703 22:46:59.466000 139920708081472 torch/distributed/run.py:757]
+ W0703 22:46:59.466000 139920708081472 torch/distributed/run.py:757] *****************************************
+ W0703 22:46:59.466000 139920708081472 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+ W0703 22:46:59.466000 139920708081472 torch/distributed/run.py:757] *****************************************
+ [default0]:07/03/2024 22:47:15 [WARNING|DP=0|PP=0|TP=0|ip-26-0-161-178]: [Vocab Size Padding] Padded vocab (size: 50257) with 1 dummy tokens (new size: 50258)
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: Config:
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: Config(general=GeneralArgs(project='bench_cluster',
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: run='%date_%jobid',
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: seed=42,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: step=None,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: consumed_train_samples=None,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: benchmark_csv_path=None,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: ignore_sanity_checks=True),
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: parallelism=ParallelismArgs(dp=4,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: pp=1,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: tp=2,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: pp_engine=<nanotron.parallel.pipeline_parallel.engine.OneForwardOneBackwardPipelineEngine object at 0x7fd644bdc940>,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: tp_mode=<TensorParallelLinearMode.REDUCE_SCATTER: 2>,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: tp_linear_async_communication=False,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: expert_parallel_size=1),
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: eos_token_id=2,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: hidden_act='silu',
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: hidden_size=2048,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: initializer_range=0.02,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: intermediate_size=4096,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: is_llama_config=True,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: max_position_embeddings=4096,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: num_attention_heads=32,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: num_hidden_layers=24,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: num_key_value_heads=32,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: pad_token_id=None,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: pretraining_tp=1,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: rms_norm_eps=1e-05,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: rope_scaling=None,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: rope_theta=10000.0,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: tie_word_embeddings=True,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: use_cache=True,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: vocab_size=50258),
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: init_method=RandomInit(std=0.025),
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: dtype=torch.bfloat16,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: make_vocab_size_divisible_by=1,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: ddp_bucket_cap_mb=25),
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2',
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: tokenizer_revision=None,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: tokenizer_max_length=None),
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'),
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: checkpoint_interval=100000,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: save_initial_state=False,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: resume_checkpoint_path=None,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: checkpoints_path_is_shared_file_system=False),
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: logging=LoggingArgs(log_level='info',
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: log_level_replica='info',
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: iteration_step_info_interval=1),
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: tokens=TokensArgs(sequence_length=4096,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: train_steps=20,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: micro_batch_size=256,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: batch_accumulation_per_replica=1,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: val_check_interval=-1,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: limit_val_batches=0,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: limit_test_batches=0),
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: adam_beta1=0.9,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: adam_beta2=0.95,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: torch_adam_is_fused=True,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: name='adamW'),
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: zero_stage=1,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: weight_decay=0.01,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: clip_grad=1.0,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: accumulate_grad_in_fp32=True,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: lr_warmup_steps=1,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: lr_warmup_style='linear',
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: lr_decay_style='linear',
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: lr_decay_steps=19,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: lr_decay_starting_step=None,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: min_decay_lr=1e-05)),
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: data_stages=[DatasetStageArgs(name='Training Stage',
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: start_training_step=1,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories',
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: hf_dataset_splits='train',
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: hf_dataset_config_name=None,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: dataset_processing_num_proc_per_process=64,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: dataset_overwrite_cache=False,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: text_column_name='text'),
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: seed=42,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: num_loading_workers=0))],
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-256')),
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: lighteval=None)
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: Model Config:
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: LlamaConfig(bos_token_id=1,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: eos_token_id=2,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: hidden_act='silu',
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: hidden_size=2048,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: initializer_range=0.02,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: intermediate_size=4096,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: is_llama_config=True,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: max_position_embeddings=4096,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: num_attention_heads=32,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: num_hidden_layers=24,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: num_key_value_heads=32,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: pad_token_id=None,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: pretraining_tp=1,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: rms_norm_eps=1e-05,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: rope_scaling=None,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: rope_theta=10000.0,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: tie_word_embeddings=True,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: use_cache=True,
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: vocab_size=50258)
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: Building model..
+ [default0]:07/03/2024 22:47:15 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: Setting PP block ranks...
+ [default0]:07/03/2024 22:47:25 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: Total number of parameters: 1.11G (2116.70MiB)
+ [default0]:07/03/2024 22:47:25 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: Local number of parameters: 555M (1058.35MiB)
+ [default0]:07/03/2024 22:47:25 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: [After model building] Memory usage: 1082.37MiB. Peak allocated: 1182.56MiB Peak reserved: 1200.00MiB
+ [default0]:07/03/2024 22:47:25 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: No checkpoint path provided.
+ [default0]:07/03/2024 22:47:25 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: Parametrizing model parameters using StandardParametrizator
+ [default1]:07/03/2024 22:47:25 [INFO|DP=0|PP=0|TP=1|ip-26-0-161-178]: Local number of parameters: 555M (1058.35MiB)
+ [default1]:07/03/2024 22:47:25 [INFO|DP=0|PP=0|TP=1|ip-26-0-161-178]: [After model building] Memory usage: 1082.37MiB. Peak allocated: 1182.56MiB Peak reserved: 1200.00MiB
+ [default1]:07/03/2024 22:47:25 [INFO|DP=0|PP=0|TP=1|ip-26-0-161-178]: No checkpoint path provided.
+ [default3]:07/03/2024 22:47:26 [INFO|DP=1|PP=0|TP=1|ip-26-0-161-178]: No checkpoint path provided.
+ [default2]:07/03/2024 22:47:26 [INFO|DP=1|PP=0|TP=0|ip-26-0-161-178]: No checkpoint path provided.
+ [default7]:07/03/2024 22:47:26 [INFO|DP=3|PP=0|TP=1|ip-26-0-161-178]: No checkpoint path provided.
+ [default4]:07/03/2024 22:47:26 [INFO|DP=2|PP=0|TP=0|ip-26-0-161-178]: No checkpoint path provided.
+ [default6]:07/03/2024 22:47:26 [INFO|DP=3|PP=0|TP=0|ip-26-0-161-178]: No checkpoint path provided.
+ [default5]:07/03/2024 22:47:26 [INFO|DP=2|PP=0|TP=1|ip-26-0-161-178]: No checkpoint path provided.
+ [default0]:07/03/2024 22:47:30 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: [Optimizer Building] Using LearningRateForSP as learning rate
+ [default0]:07/03/2024 22:47:30 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: [ZeRO sharding] Size of optimizer params per rank:
+ [default0]:07/03/2024 22:47:30 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: [ZeRO sharding] DP Rank 0 has 139M out of 555M (25.00%) params' optimizer states
+ [default0]:07/03/2024 22:47:30 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: [ZeRO sharding] DP Rank 1 has 139M out of 555M (25.00%) params' optimizer states
+ [default0]:07/03/2024 22:47:30 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: [ZeRO sharding] DP Rank 2 has 139M out of 555M (25.00%) params' optimizer states
+ [default0]:07/03/2024 22:47:30 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: [ZeRO sharding] DP Rank 3 has 139M out of 555M (25.00%) params' optimizer states
+ [default0]:07/03/2024 22:47:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples
+ [default0]:07/03/2024 22:47:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: Using `datasets` library
+ [default0]:07/03/2024 22:47:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4')
+ [default0]:07/03/2024 22:47:31 [WARNING|DP=0|PP=0|TP=0|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty.
+ [default0]:Repo card metadata block was not found. Setting CardData to empty.
+ [default0]:07/03/2024 22:47:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: [Training Plan] There are 1 training stages
+ [default0]:07/03/2024 22:47:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: [Stage Training Stage] start from step 1
+ [default0]:07/03/2024 22:47:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]:
+ [default0]:07/03/2024 22:47:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: [Start training] datetime: 2024-07-03 22:47:32.630581 | mbs: 256 | grad_accum: 1 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0
+ [default0]:07/03/2024 22:47:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps
+ [default0]:07/03/2024 22:47:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: Memory usage: 3729.08MiB. Peak allocated 3729.08MiB. Peak reserved: 3848.00MiB
+ [default3]:07/03/2024 22:47:32 [WARNING|DP=1|PP=0|TP=1|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty.
+ [default2]:07/03/2024 22:47:32 [WARNING|DP=1|PP=0|TP=0|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty.
+ [default3]:Repo card metadata block was not found. Setting CardData to empty.
+ [default5]:07/03/2024 22:47:32 [WARNING|DP=2|PP=0|TP=1|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty.
+ [default5]:Repo card metadata block was not found. Setting CardData to empty.
+ [default2]:Repo card metadata block was not found. Setting CardData to empty.
+ [default7]:07/03/2024 22:47:32 [WARNING|DP=3|PP=0|TP=1|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty.
+ [default4]:07/03/2024 22:47:32 [WARNING|DP=2|PP=0|TP=0|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty.
+ [default6]:07/03/2024 22:47:32 [WARNING|DP=3|PP=0|TP=0|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty.
+ [default1]:Repo card metadata block was not found. Setting CardData to empty.
+ [default4]:Repo card metadata block was not found. Setting CardData to empty.
+ [default6]:Repo card metadata block was not found. Setting CardData to empty.
+ [default7]:Repo card metadata block was not found. Setting CardData to empty.
+ [default1]:07/03/2024 22:47:32 [WARNING|DP=0|PP=0|TP=1|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty.
+ [default3]:[rank3]: Traceback (most recent call last):
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
+ [default3]:[rank3]: trainer.train(dataloader)
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
+ [default3]:[rank3]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
+ [default3]:[rank3]: outputs = self.pipeline_engine.train_batch_iter(
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
+ [default3]:[rank3]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
+ [default3]:[rank3]: output = model(**micro_batch)
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
+ [default3]:[rank3]: sharded_logits = self.model(
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
+ [default3]:[rank3]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
+ [default3]:[rank3]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
+ [default3]:[rank3]: output = self.pp_block(**new_kwargs)
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward
+ [default3]:[rank3]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask)
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 389, in forward
+ [default3]:[rank3]: .contiguous()
+ [default3]:[rank3]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 6.00 GiB. GPU  has a total capacity of 79.33 GiB of which 2.13 GiB is free. Including non-PyTorch memory, this process has 77.19 GiB memory in use. Of the allocated memory 59.77 GiB is allocated by PyTorch, and 5.96 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+ [default2]:[rank2]: Traceback (most recent call last):
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
+ [default2]:[rank2]: trainer.train(dataloader)
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
+ [default2]:[rank2]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
+ [default2]:[rank2]: outputs = self.pipeline_engine.train_batch_iter(
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
+ [default2]:[rank2]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
+ [default2]:[rank2]: output = model(**micro_batch)
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
+ [default2]:[rank2]: sharded_logits = self.model(
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
+ [default2]:[rank2]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
+ [default2]:[rank2]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
+ [default2]:[rank2]: output = self.pp_block(**new_kwargs)
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward
+ [default2]:[rank2]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask)
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 389, in forward
+ [default2]:[rank2]: .contiguous()
+ [default2]:[rank2]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 6.00 GiB. GPU  has a total capacity of 79.33 GiB of which 2.13 GiB is free. Including non-PyTorch memory, this process has 77.19 GiB memory in use. Of the allocated memory 59.77 GiB is allocated by PyTorch, and 5.96 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+ W0703 22:47:39.670000 139920708081472 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1030915 closing signal SIGTERM
+ W0703 22:47:39.671000 139920708081472 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1030916 closing signal SIGTERM
+ W0703 22:47:39.671000 139920708081472 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1030919 closing signal SIGTERM
+ W0703 22:47:39.672000 139920708081472 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1030920 closing signal SIGTERM
+ W0703 22:47:39.672000 139920708081472 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1030921 closing signal SIGTERM
+ W0703 22:47:39.672000 139920708081472 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1030922 closing signal SIGTERM
+ E0703 22:47:41.189000 139920708081472 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 2 (pid: 1030917) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
+ Traceback (most recent call last):
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in <module>
+ sys.exit(main())
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
+ return f(*args, **kwargs)
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main
+ run(args)
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run
+ elastic_launch(
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
+ return launch_agent(self._config, self._entrypoint, list(args))
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent
+ raise ChildFailedError(
+ torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
+ ============================================================
+ /fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED
+ ------------------------------------------------------------
+ Failures:
+ [1]:
+ time : 2024-07-03_22:47:39
+ host : ip-26-0-161-178.ec2.internal
+ rank : 3 (local_rank: 3)
+ exitcode : 1 (pid: 1030918)
+ error_file: <N/A>
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+ ------------------------------------------------------------
+ Root Cause (first observed failure):
+ [0]:
+ time : 2024-07-03_22:47:39
+ host : ip-26-0-161-178.ec2.internal
+ rank : 2 (local_rank: 2)
+ exitcode : 1 (pid: 1030917)
+ error_file: <N/A>
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+ ============================================================
+ srun: error: ip-26-0-161-178: task 0: Exited with exit code 1
+ Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details.
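
The run above hits a CUDA out-of-memory error in the attention block with micro_batch_size 256 at sequence length 4096, so the error handling in bench.slurm writes "oom" to status.txt (next file). A minimal sketch of the two mitigations the log itself points to; both lines are illustrative assumptions for a rerun, not commands taken from the repository:

# Allocator hint quoted verbatim in the OOM message, to reduce fragmentation on a retry.
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
# Or rerun this point with a smaller micro batch size (128 is an arbitrary example value);
# config_mbz-128.yaml is a hypothetical output file name.
sed 's/micro_batch_size: 256/micro_batch_size: 128/' config.yaml > config_mbz-128.yaml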
llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-256/status.txt ADDED
@@ -0,0 +1 @@
+ oom