Commit 24f0f01 · verified · 1 parent: 26c504b
3outeille (HF staff) committed:

Upload llama-1B/16_GPUS/dp-4_tp-4_pp-1_mbz-32

llama-1B/16_GPUS/dp-4_tp-4_pp-1_mbz-32/bench.slurm ADDED
@@ -0,0 +1,111 @@
+ #!/bin/bash
+
+ #SBATCH --job-name=bench_cluster
+ #SBATCH --time=00:59:00
+ #SBATCH --partition=hopper-prod
+ #SBATCH --nodes=2
+ #SBATCH --gres=gpu:8
+ #SBATCH --qos=high
+ #SBATCH --ntasks-per-node=1
+ #SBATCH --cpus-per-task=96
+ #SBATCH --exclusive
+ #SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-4_tp-4_pp-1_mbz-32/log.out
+ #SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-4_tp-4_pp-1_mbz-32/log.out
+
+ # Function to update the status file based on squeue output
+ update_status() {
+     job_id=$1
+     status_file=$2
+     # For unknown reasons, it doesn't update the status for pending jobs; it only works for running ones.
+     while true; do
+         job_status=$(squeue --job $job_id --noheader --format=%T)
+         echo "Job status: $job_status"
+         if [ -z "$job_status" ]; then
+             # Job has finished or is not found
+             break
+         elif [ "$job_status" = "RUNNING" ]; then
+             printf "running" > $status_file
+             break
+         fi
+         sleep 10
+     done
+ }
+
+ # Misc initializations.
+ echo "========================"
+ echo "START TIME: $(date)"
+ source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh
+ conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster
+ echo python3 version = $(python3 --version)
+ echo "========================"
+
+ # Slurm stuff
+ export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
+ export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
+ export MASTER_PORT=$((1024 + RANDOM % 64511))
+
+ export TMPDIR=/scratch
+ export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache"
+ export CUBLAS_WORKSPACE_CONFIG=":4096:8"
+ export CUDA_DEVICE_MAX_CONNECTIONS="1"
+
+ huggingface-cli login --token $HUGGINGFACE_TOKEN
+
+
+ NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron"
+ CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-4_tp-4_pp-1_mbz-32/config.yaml"
+
+ LAUNCHER="torchrun \
+     --nproc_per_node 8 \
+     --nnodes 2 \
+     --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \
+     --rdzv_backend c10d \
+     --max_restarts 0 \
+     --tee 3 \
+     --node_rank ${SLURM_PROCID}"
+
+ # Check out the bench_cluster branch
+ cd $NANOTRON_REPO
+ git checkout bench_cluster
+ cd ..
+ # Get the current job ID
+ job_id=${SLURM_JOB_ID}
+
+ # Update the status to "pending" or "running" in the background
+ update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-4_tp-4_pp-1_mbz-32/status.txt &
+
+ # Run the main command
+ srun -u $LAUNCHER $CMD
+ exit_status=$?
+
+ # Update the status based on the exit status of `srun`
+ if [ $exit_status -eq 0 ]; then
+     printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-4_tp-4_pp-1_mbz-32/status.txt
+ else
+     if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-4_tp-4_pp-1_mbz-32/log.out; then
+         printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-4_tp-4_pp-1_mbz-32/status.txt
+     elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-4_tp-4_pp-1_mbz-32/log.out; then
+         printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-4_tp-4_pp-1_mbz-32/status.txt
+     elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-4_tp-4_pp-1_mbz-32/log.out; then
+         printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-4_tp-4_pp-1_mbz-32/status.txt
+     else
+         printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-4_tp-4_pp-1_mbz-32/status.txt
+     fi
+ fi
+
+ # Run the report script if the job completed successfully
+ if [ $exit_status -eq 0 ]; then
+     python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-4_tp-4_pp-1_mbz-32 --is_logs
+     python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-4_tp-4_pp-1_mbz-32 --is_profiler
+ fi
+
+
+ # Push the folder to the hub using huggingface-cli
+ huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-4_tp-4_pp-1_mbz-32 llama-1B/16_GPUS/dp-4_tp-4_pp-1_mbz-32 --commit-message "Upload llama-1B/16_GPUS/dp-4_tp-4_pp-1_mbz-32"
+
+ # Verify the upload
+ if [ $? -eq 0 ]; then
+     echo "Upload to the Hugging Face Hub successful"
+ else
+     echo "Failed to upload to the Hugging Face Hub"
+ fi
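
A note on the MASTER_PORT line in the script above: bash's RANDOM expands to a 15-bit value in 0-32767, so the modulo by 64511 never wraps and $((1024 + RANDOM % 64511)) always picks a rendezvous port in 1024-33791 rather than the full unprivileged range. That is harmless for torchrun's c10d rendezvous on an exclusive node, but a minimal sketch of one way to cover the whole range (assuming GNU coreutils' shuf is available on the compute nodes) would be:

    # Hypothetical alternative: draw the rendezvous port uniformly from 1024-65534.
    export MASTER_PORT=$(shuf -i 1024-65534 -n 1)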
llama-1B/16_GPUS/dp-4_tp-4_pp-1_mbz-32/config.yaml ADDED
@@ -0,0 +1,90 @@
+ general:
+   project: bench_cluster
+   seed: 42
+ model:
+   ddp_bucket_cap_mb: 25
+   dtype: bfloat16
+   init_method:
+     std: 0.025
+   make_vocab_size_divisible_by: 1
+   model_config:
+     bos_token_id: 1
+     eos_token_id: 2
+     hidden_act: silu
+     hidden_size: 2048
+     initializer_range: 0.02
+     intermediate_size: 4096
+     is_llama_config: true
+     max_position_embeddings: 4096
+     num_attention_heads: 32
+     num_hidden_layers: 24
+     num_key_value_heads: 32
+     pad_token_id: null
+     pretraining_tp: 1
+     rms_norm_eps: 1.0e-05
+     rope_scaling: null
+     rope_theta: 10000.0
+     tie_word_embeddings: true
+     use_cache: true
+     vocab_size: 50257
+ optimizer:
+   accumulate_grad_in_fp32: true
+   clip_grad: 1.0
+   learning_rate_scheduler:
+     learning_rate: 0.0001
+     lr_decay_style: linear
+     lr_warmup_style: linear
+     lr_warmup_steps: 1
+     min_decay_lr: 1.0e-05
+   optimizer_factory:
+     adam_beta1: 0.9
+     adam_beta2: 0.95
+     adam_eps: 1.0e-08
+     name: adamW
+     torch_adam_is_fused: true
+   weight_decay: 0.01
+   zero_stage: 1
+ parallelism:
+   dp: 4
+   expert_parallel_size: 1
+   pp: 1
+   pp_engine: 1f1b
+   tp: 4
+   tp_linear_async_communication: false
+   tp_mode: REDUCE_SCATTER
+ profiler:
+   profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-4_tp-4_pp-1_mbz-32
+ tokenizer:
+   tokenizer_max_length: null
+   tokenizer_name_or_path: openai-community/gpt2
+   tokenizer_revision: null
+ data_stages:
+ - name: Training Stage
+   start_training_step: 1
+   data:
+     dataset:
+       dataset_overwrite_cache: false
+       dataset_processing_num_proc_per_process: 64
+       hf_dataset_config_name: null
+       hf_dataset_or_datasets: roneneldan/TinyStories
+       hf_dataset_splits: train
+       text_column_name: text
+     num_loading_workers: 32
+     seed: 42
+ lighteval: null
+ tokens:
+   train_steps: 20
+   val_check_interval: -1
+   batch_accumulation_per_replica: 8
+   limit_test_batches: 0
+   limit_val_batches: 0
+   micro_batch_size: 32
+   sequence_length: 4096
+ logging:
+   iteration_step_info_interval: 1
+   log_level: info
+   log_level_replica: info
+ checkpoints:
+   checkpoint_interval: 100000
+   checkpoints_path: /dev/null
+   resume_checkpoint_path: null
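
Taken together, the parallelism and tokens sections above pin down the effective batch each training step consumes; the arithmetic below is just the product of the configured values and matches the global_batch_size: 1024 reported at training start in log.out:

    global_batch_size = dp × micro_batch_size × batch_accumulation_per_replica
                      = 4 × 32 × 8 = 1024 sequences per step
    tokens per step   = 1024 × 4096 = 4,194,304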
llama-1B/16_GPUS/dp-4_tp-4_pp-1_mbz-32/log.out ADDED
@@ -0,0 +1,779 @@
1
+ ========================
2
+ START TIME: Tue Jul 2 19:50:07 UTC 2024
3
+ python3 version = Python 3.10.14
4
+ ========================
5
+ The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
6
+ Token is valid (permission: write).
7
+ Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token
8
+ Login successful
9
+ Already on 'bench_cluster'
10
+ M examples/config_tiny_llama.py
11
+ M examples/config_tiny_llama.yaml
12
+ M examples/train_tiny_llama.sh
13
+ M src/nanotron/models/llama.py
14
+ M src/nanotron/trainer.py
15
+ Your branch is up to date with 'origin/bench_cluster'.
16
+ Job status: RUNNING
17
+ W0702 19:50:13.980000 140190952634176 torch/distributed/run.py:757]
18
+ W0702 19:50:13.980000 140190952634176 torch/distributed/run.py:757] *****************************************
19
+ W0702 19:50:13.980000 140190952634176 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
20
+ W0702 19:50:13.980000 140190952634176 torch/distributed/run.py:757] *****************************************
21
+ W0702 19:50:13.997000 140213654959936 torch/distributed/run.py:757]
22
+ W0702 19:50:13.997000 140213654959936 torch/distributed/run.py:757] *****************************************
23
+ W0702 19:50:13.997000 140213654959936 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
24
+ W0702 19:50:13.997000 140213654959936 torch/distributed/run.py:757] *****************************************
25
+ [default0]:07/02/2024 19:50:37 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Vocab Size Padding] Padded vocab (size: 50257) with 3 dummy tokens (new size: 50260)
26
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Config:
27
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Config(general=GeneralArgs(project='bench_cluster',
28
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: run='%date_%jobid',
29
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: seed=42,
30
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: step=None,
31
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: consumed_train_samples=None,
32
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: benchmark_csv_path=None,
33
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: ignore_sanity_checks=True),
34
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: parallelism=ParallelismArgs(dp=4,
35
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pp=1,
36
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp=4,
37
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pp_engine=<nanotron.parallel.pipeline_parallel.engine.OneForwardOneBackwardPipelineEngine object at 0x7f096239c910>,
38
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp_mode=<TensorParallelLinearMode.REDUCE_SCATTER: 2>,
39
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp_linear_async_communication=False,
40
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: expert_parallel_size=1),
41
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1,
42
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: eos_token_id=2,
43
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_act='silu',
44
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_size=2048,
45
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: initializer_range=0.02,
46
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: intermediate_size=4096,
47
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: is_llama_config=True,
48
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: max_position_embeddings=4096,
49
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_attention_heads=32,
50
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_hidden_layers=24,
51
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_key_value_heads=32,
52
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pad_token_id=None,
53
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pretraining_tp=1,
54
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rms_norm_eps=1e-05,
55
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_scaling=None,
56
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_theta=10000.0,
57
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tie_word_embeddings=True,
58
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: use_cache=True,
59
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: vocab_size=50260),
60
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: init_method=RandomInit(std=0.025),
61
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dtype=torch.bfloat16,
62
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: make_vocab_size_divisible_by=1,
63
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: ddp_bucket_cap_mb=25),
64
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2',
65
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer_revision=None,
66
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer_max_length=None),
67
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'),
68
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoint_interval=100000,
69
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: save_initial_state=False,
70
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: resume_checkpoint_path=None,
71
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoints_path_is_shared_file_system=False),
72
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: logging=LoggingArgs(log_level='info',
73
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: log_level_replica='info',
74
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: iteration_step_info_interval=1),
75
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokens=TokensArgs(sequence_length=4096,
76
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: train_steps=20,
77
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: micro_batch_size=32,
78
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: batch_accumulation_per_replica=8,
79
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: val_check_interval=-1,
80
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: limit_val_batches=0,
81
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: limit_test_batches=0),
82
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08,
83
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: adam_beta1=0.9,
84
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: adam_beta2=0.95,
85
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: torch_adam_is_fused=True,
86
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: name='adamW'),
87
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: zero_stage=1,
88
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: weight_decay=0.01,
89
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: clip_grad=1.0,
90
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: accumulate_grad_in_fp32=True,
91
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001,
92
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_warmup_steps=1,
93
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_warmup_style='linear',
94
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_style='linear',
95
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_steps=19,
96
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_starting_step=None,
97
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: min_decay_lr=1e-05)),
98
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: data_stages=[DatasetStageArgs(name='Training Stage',
99
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: start_training_step=1,
100
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories',
101
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hf_dataset_splits='train',
102
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hf_dataset_config_name=None,
103
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dataset_processing_num_proc_per_process=64,
104
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dataset_overwrite_cache=False,
105
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: text_column_name='text'),
106
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: seed=42,
107
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_loading_workers=32))],
108
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-4_tp-4_pp-1_mbz-32')),
109
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lighteval=None)
110
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Model Config:
111
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: LlamaConfig(bos_token_id=1,
112
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: eos_token_id=2,
113
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_act='silu',
114
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_size=2048,
115
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: initializer_range=0.02,
116
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: intermediate_size=4096,
117
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: is_llama_config=True,
118
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: max_position_embeddings=4096,
119
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_attention_heads=32,
120
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_hidden_layers=24,
121
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_key_value_heads=32,
122
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pad_token_id=None,
123
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pretraining_tp=1,
124
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rms_norm_eps=1e-05,
125
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_scaling=None,
126
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_theta=10000.0,
127
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tie_word_embeddings=True,
128
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: use_cache=True,
129
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: vocab_size=50260)
130
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Building model..
131
+ [default0]:07/02/2024 19:50:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Setting PP block ranks...
132
+ [default0]:07/02/2024 19:50:51 [INFO|DP=2|PP=0|TP=0|ip-26-0-171-102]: No checkpoint path provided.
133
+ [default1]:07/02/2024 19:50:51 [INFO|DP=2|PP=0|TP=1|ip-26-0-171-102]: No checkpoint path provided.
134
+ [default2]:07/02/2024 19:50:51 [INFO|DP=2|PP=0|TP=2|ip-26-0-171-102]: No checkpoint path provided.
135
+ [default3]:07/02/2024 19:50:51 [INFO|DP=2|PP=0|TP=3|ip-26-0-171-102]: No checkpoint path provided.
136
+ [default4]:07/02/2024 19:50:51 [INFO|DP=3|PP=0|TP=0|ip-26-0-171-102]: No checkpoint path provided.
137
+ [default6]:07/02/2024 19:50:51 [INFO|DP=3|PP=0|TP=2|ip-26-0-171-102]: No checkpoint path provided.
138
+ [default7]:07/02/2024 19:50:51 [INFO|DP=3|PP=0|TP=3|ip-26-0-171-102]: No checkpoint path provided.
139
+ [default3]:07/02/2024 19:50:51 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-225]: Local number of parameters: 277M (529.27MiB)
140
+ [default1]:07/02/2024 19:50:51 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-225]: Local number of parameters: 277M (529.27MiB)
141
+ [default1]:07/02/2024 19:50:51 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-225]: [After model building] Memory usage: 554.21MiB. Peak allocated: 606.24MiB Peak reserved: 608.00MiB
142
+ [default1]:07/02/2024 19:50:51 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-225]: No checkpoint path provided.
143
+ [default3]:07/02/2024 19:50:51 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-225]: [After model building] Memory usage: 554.21MiB. Peak allocated: 606.24MiB Peak reserved: 608.00MiB
144
+ [default3]:07/02/2024 19:50:51 [INFO|DP=0|PP=0|TP=3|ip-26-0-160-225]: No checkpoint path provided.
145
+ [default2]:07/02/2024 19:50:51 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-225]: Local number of parameters: 277M (529.27MiB)
146
+ [default2]:07/02/2024 19:50:51 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-225]: [After model building] Memory usage: 554.21MiB. Peak allocated: 606.24MiB Peak reserved: 608.00MiB
147
+ [default2]:07/02/2024 19:50:51 [INFO|DP=0|PP=0|TP=2|ip-26-0-160-225]: No checkpoint path provided.
148
+ [default0]:07/02/2024 19:50:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Total number of parameters: 1.11G (2117.09MiB)
149
+ [default0]:07/02/2024 19:50:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Local number of parameters: 277M (529.27MiB)
150
+ [default0]:07/02/2024 19:50:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [After model building] Memory usage: 554.21MiB. Peak allocated: 606.24MiB Peak reserved: 608.00MiB
151
+ [default0]:07/02/2024 19:50:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: No checkpoint path provided.
152
+ [default0]:07/02/2024 19:50:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Parametrizing model parameters using StandardParametrizator
153
+ [default5]:07/02/2024 19:50:51 [INFO|DP=3|PP=0|TP=1|ip-26-0-171-102]: No checkpoint path provided.
154
+ [default4]:07/02/2024 19:50:51 [INFO|DP=1|PP=0|TP=0|ip-26-0-160-225]: No checkpoint path provided.
155
+ [default5]:07/02/2024 19:50:51 [INFO|DP=1|PP=0|TP=1|ip-26-0-160-225]: No checkpoint path provided.
156
+ [default7]:07/02/2024 19:50:51 [INFO|DP=1|PP=0|TP=3|ip-26-0-160-225]: No checkpoint path provided.
157
+ [default6]:07/02/2024 19:50:51 [INFO|DP=1|PP=0|TP=2|ip-26-0-160-225]: No checkpoint path provided.
158
+ [default0]:07/02/2024 19:50:53 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Optimizer Building] Using LearningRateForSP as learning rate
159
+ [default0]:07/02/2024 19:50:53 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [ZeRO sharding] Size of optimizer params per rank:
160
+ [default0]:07/02/2024 19:50:53 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [ZeRO sharding] DP Rank 0 has 69.4M out of 277M (25.00%) params' optimizer states
161
+ [default0]:07/02/2024 19:50:53 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [ZeRO sharding] DP Rank 1 has 69.4M out of 277M (25.00%) params' optimizer states
162
+ [default0]:07/02/2024 19:50:53 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [ZeRO sharding] DP Rank 2 has 69.4M out of 277M (25.00%) params' optimizer states
163
+ [default0]:07/02/2024 19:50:53 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [ZeRO sharding] DP Rank 3 has 69.4M out of 277M (25.00%) params' optimizer states
164
+ [default0]:07/02/2024 19:50:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples
165
+ [default0]:07/02/2024 19:50:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Using `datasets` library
166
+ [default0]:07/02/2024 19:50:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4')
167
+ [default0]:07/02/2024 19:50:56 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty.
168
+ [default0]:Repo card metadata block was not found. Setting CardData to empty.
169
+ [default0]:07/02/2024 19:50:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Training Plan] There are 1 training stages
170
+ [default0]:07/02/2024 19:50:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Stage Training Stage] start from step 1
171
+ [default0]:07/02/2024 19:50:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]:
172
+ [default0]:07/02/2024 19:50:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Start training] datetime: 2024-07-02 19:50:57.481225 | mbs: 32 | grad_accum: 8 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0
173
+ [default0]:07/02/2024 19:50:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps
174
+ [default0]:07/02/2024 19:50:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Memory usage: 1877.40MiB. Peak allocated 1877.40MiB. Peak reserved: 1934.00MiB
175
+ [default0]:07/02/2024 19:50:57 [WARNING|DP=2|PP=0|TP=0|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty.
176
+ [default1]:07/02/2024 19:50:57 [WARNING|DP=2|PP=0|TP=1|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty.
177
+ [default2]:07/02/2024 19:50:57 [WARNING|DP=2|PP=0|TP=2|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty.
178
+ [default4]:07/02/2024 19:50:57 [WARNING|DP=3|PP=0|TP=0|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty.
179
+ [default5]:Repo card metadata block was not found. Setting CardData to empty.
180
+ [default6]:07/02/2024 19:50:57 [WARNING|DP=3|PP=0|TP=2|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty.
181
+ [default3]:07/02/2024 19:50:57 [WARNING|DP=2|PP=0|TP=3|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty.
182
+ [default3]:Repo card metadata block was not found. Setting CardData to empty.
183
+ [default6]:Repo card metadata block was not found. Setting CardData to empty.
184
+ [default4]:Repo card metadata block was not found. Setting CardData to empty.
185
+ [default7]:07/02/2024 19:50:57 [WARNING|DP=3|PP=0|TP=3|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty.
186
+ [default2]:Repo card metadata block was not found. Setting CardData to empty.
187
+ [default2]:07/02/2024 19:50:57 [WARNING|DP=0|PP=0|TP=2|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty.
188
+ [default7]:07/02/2024 19:50:57 [WARNING|DP=1|PP=0|TP=3|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty.
189
+ [default1]:07/02/2024 19:50:57 [WARNING|DP=0|PP=0|TP=1|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty.
190
+ [default3]:07/02/2024 19:50:57 [WARNING|DP=0|PP=0|TP=3|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty.
191
+ [default5]:07/02/2024 19:50:57 [WARNING|DP=1|PP=0|TP=1|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty.
192
+ [default5]:07/02/2024 19:50:57 [WARNING|DP=3|PP=0|TP=1|ip-26-0-171-102]: Repo card metadata block was not found. Setting CardData to empty.
193
+ [default5]:Repo card metadata block was not found. Setting CardData to empty.
194
+ [default2]:Repo card metadata block was not found. Setting CardData to empty.
195
+ [default3]:Repo card metadata block was not found. Setting CardData to empty.
196
+ [default1]:Repo card metadata block was not found. Setting CardData to empty.
197
+ [default0]:Repo card metadata block was not found. Setting CardData to empty.
198
+ [default7]:Repo card metadata block was not found. Setting CardData to empty.
199
+ [default1]:Repo card metadata block was not found. Setting CardData to empty.
200
+ [default7]:Repo card metadata block was not found. Setting CardData to empty.
201
+ [default6]:07/02/2024 19:50:57 [WARNING|DP=1|PP=0|TP=2|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty.
202
+ [default4]:07/02/2024 19:50:57 [WARNING|DP=1|PP=0|TP=0|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty.
203
+ [default4]:Repo card metadata block was not found. Setting CardData to empty.
204
+ [default6]:Repo card metadata block was not found. Setting CardData to empty.
205
+ [default4]:[rank4]: Traceback (most recent call last):
206
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
207
+ [default4]:[rank4]: trainer.train(dataloader)
208
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
209
+ [default4]:[rank4]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
210
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
211
+ [default4]:[rank4]: outputs = self.pipeline_engine.train_batch_iter(
212
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
213
+ [default4]:[rank4]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
214
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
215
+ [default4]:[rank4]: output = model(**micro_batch)
216
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
217
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
218
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
219
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
220
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
221
+ [default4]:[rank4]: sharded_logits = self.model(
222
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
223
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
224
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
225
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
226
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
227
+ [default4]:[rank4]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
228
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
229
+ [default4]:[rank4]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
230
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
231
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
232
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
233
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
234
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
235
+ [default4]:[rank4]: output = self.pp_block(**new_kwargs)
236
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
237
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
238
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
239
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
240
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
241
+ [default4]:[rank4]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
242
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
243
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
244
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
245
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
246
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
247
+ [default4]:[rank4]: hidden_states = self.down_proj(self.split_silu_mul(merged_states))
248
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
249
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
250
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
251
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
252
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward
253
+ [default4]:[rank4]: return row_linear(
254
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear
255
+ [default4]:[rank4]: out = F.linear(input, weight, bias)
256
+ [default4]:[rank4]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 512.00 MiB. GPU  has a total capacity of 79.33 GiB of which 289.94 MiB is free. Including non-PyTorch memory, this process has 79.03 GiB memory in use. Of the allocated memory 70.97 GiB is allocated by PyTorch, and 12.70 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
257
+ [default6]:[rank6]: Traceback (most recent call last):
258
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
259
+ [default6]:[rank6]: trainer.train(dataloader)
260
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
261
+ [default6]:[rank6]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
262
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
263
+ [default6]:[rank6]: outputs = self.pipeline_engine.train_batch_iter(
264
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
265
+ [default6]:[rank6]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
266
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
267
+ [default6]:[rank6]: output = model(**micro_batch)
268
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
269
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
270
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
271
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
272
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
273
+ [default6]:[rank6]: sharded_logits = self.model(
274
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
275
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
276
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
277
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
278
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
279
+ [default6]:[rank6]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
280
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
281
+ [default6]:[rank6]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
282
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
283
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
284
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
285
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
286
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
287
+ [default6]:[rank6]: output = self.pp_block(**new_kwargs)
288
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
289
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
290
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
291
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
292
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
293
+ [default6]:[rank6]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
294
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
295
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
296
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
297
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
298
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
299
+ [default6]:[rank6]: hidden_states = self.down_proj(self.split_silu_mul(merged_states))
300
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
301
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
302
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
303
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
304
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward
305
+ [default6]:[rank6]: return row_linear(
306
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear
307
+ [default6]:[rank6]: out = F.linear(input, weight, bias)
308
+ [default6]:[rank6]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 512.00 MiB. GPU  has a total capacity of 79.33 GiB of which 65.94 MiB is free. Including non-PyTorch memory, this process has 79.25 GiB memory in use. Of the allocated memory 70.97 GiB is allocated by PyTorch, and 12.70 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
309
+ [default7]:[rank7]: Traceback (most recent call last):
310
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
311
+ [default7]:[rank7]: trainer.train(dataloader)
312
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
313
+ [default7]:[rank7]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
314
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
315
+ [default7]:[rank7]: outputs = self.pipeline_engine.train_batch_iter(
316
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
317
+ [default7]:[rank7]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
318
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
319
+ [default7]:[rank7]: output = model(**micro_batch)
320
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
321
+ [default7]:[rank7]: return self._call_impl(*args, **kwargs)
322
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
323
+ [default7]:[rank7]: return forward_call(*args, **kwargs)
324
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
325
+ [default7]:[rank7]: sharded_logits = self.model(
326
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
327
+ [default7]:[rank7]: return self._call_impl(*args, **kwargs)
328
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
329
+ [default7]:[rank7]: return forward_call(*args, **kwargs)
330
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
331
+ [default7]:[rank7]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
332
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
333
+ [default7]:[rank7]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
334
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
335
+ [default7]:[rank7]: return self._call_impl(*args, **kwargs)
336
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
337
+ [default7]:[rank7]: return forward_call(*args, **kwargs)
338
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
339
+ [default7]:[rank7]: output = self.pp_block(**new_kwargs)
340
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
341
+ [default7]:[rank7]: return self._call_impl(*args, **kwargs)
342
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
343
+ [default7]:[rank7]: return forward_call(*args, **kwargs)
344
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
345
+ [default7]:[rank7]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
346
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
347
+ [default7]:[rank7]: return self._call_impl(*args, **kwargs)
348
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
349
+ [default7]:[rank7]: return forward_call(*args, **kwargs)
350
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
351
+ [default7]:[rank7]: hidden_states = self.down_proj(self.split_silu_mul(merged_states))
352
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
353
+ [default7]:[rank7]: return self._call_impl(*args, **kwargs)
354
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
355
+ [default7]:[rank7]: return forward_call(*args, **kwargs)
356
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward
357
+ [default7]:[rank7]: return row_linear(
358
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 479, in row_linear
359
+ [default7]:[rank7]: out = differentiable_reduce_scatter_sum(out, group=group)
360
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 145, in differentiable_reduce_scatter_sum
361
+ [default7]:[rank7]: return DifferentiableReduceScatterSum.apply(tensor, group)
362
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 598, in apply
363
+ [default7]:[rank7]: return super().apply(*args, **kwargs) # type: ignore[misc]
364
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 111, in forward
365
+ [default7]:[rank7]: sharded_tensor = torch.empty(
366
+ [default7]:[rank7]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU  has a total capacity of 79.33 GiB of which 17.94 MiB is free. Including non-PyTorch memory, this process has 79.30 GiB memory in use. Of the allocated memory 71.47 GiB is allocated by PyTorch, and 12.70 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
367
+ [default5]:[rank5]: Traceback (most recent call last):
368
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
369
+ [default5]:[rank5]: trainer.train(dataloader)
370
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
371
+ [default5]:[rank5]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
372
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
373
+ [default5]:[rank5]: outputs = self.pipeline_engine.train_batch_iter(
374
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
375
+ [default5]:[rank5]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
376
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
377
+ [default5]:[rank5]: output = model(**micro_batch)
378
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
379
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
380
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
381
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
382
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
383
+ [default5]:[rank5]: sharded_logits = self.model(
384
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
385
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
386
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
387
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
388
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
389
+ [default5]:[rank5]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
390
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
391
+ [default5]:[rank5]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
392
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
393
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
394
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
395
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
396
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
397
+ [default5]:[rank5]: output = self.pp_block(**new_kwargs)
398
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
399
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
400
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
401
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
402
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
403
+ [default5]:[rank5]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
404
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
405
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
406
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
407
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
408
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
409
+ [default5]:[rank5]: hidden_states = self.down_proj(self.split_silu_mul(merged_states))
410
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward
+ [default5]:[rank5]: return row_linear(
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear
+ [default5]:[rank5]: out = F.linear(input, weight, bias)
+ [default5]:[rank5]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 512.00 MiB. GPU  has a total capacity of 79.33 GiB of which 41.94 MiB is free. Including non-PyTorch memory, this process has 79.28 GiB memory in use. Of the allocated memory 70.97 GiB is allocated by PyTorch, and 12.70 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
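The allocator hint in the message above is the one actionable lead in these traces, though note that only 12.70 MiB is reserved-but-unallocated here, so fragmentation is unlikely to be the real culprit for this run. Where fragmentation does dominate, the suggested setting can be applied as in this minimal sketch, assuming it executes before the first CUDA allocation; placing it at the top of run_train.py is hypothetical, not how the benchmark actually configures the job:

import os

# PYTORCH_CUDA_ALLOC_CONF is read when the CUDA caching allocator starts up,
# so this must run before any tensor is moved to a GPU.
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

Since 70.97 GiB of the 79.33 GiB card is live PyTorch allocations, the more dependable remedy is likely a smaller micro batch than the mbz-32 this configuration uses.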
+ [default0]:[rank0]: Traceback (most recent call last):
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
+ [default0]:[rank0]: trainer.train(dataloader)
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
+ [default0]:[rank0]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
+ [default0]:[rank0]: outputs = self.pipeline_engine.train_batch_iter(
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
+ [default0]:[rank0]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
+ [default0]:[rank0]: output = model(**micro_batch)
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
+ [default0]:[rank0]: sharded_logits = self.model(
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
+ [default0]:[rank0]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
+ [default0]:[rank0]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
+ [default0]:[rank0]: output = self.pp_block(**new_kwargs)
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward
+ [default0]:[rank0]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask)
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 565, in forward
+ [default0]:[rank0]: key_value_states = key_value_states.permute(1, 2, 0, 3, 4).contiguous()
+ [default0]:[rank0]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 256.00 MiB. GPU
+ W0702 19:51:06.212000 140213654959936 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1653252 closing signal SIGTERM
+ W0702 19:51:06.213000 140213654959936 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1653253 closing signal SIGTERM
+ W0702 19:51:06.220000 140213654959936 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1653254 closing signal SIGTERM
+ W0702 19:51:06.223000 140213654959936 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1653255 closing signal SIGTERM
+ [default3]:[rank11]: Traceback (most recent call last):
+ [default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
+ [default3]:[rank11]: trainer.train(dataloader)
+ [default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
+ [default3]:[rank11]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
+ [default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
+ [default3]:[rank11]: outputs = self.pipeline_engine.train_batch_iter(
+ [default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
+ [default3]:[rank11]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
+ [default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
+ [default3]:[rank11]: output = model(**micro_batch)
+ [default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default3]:[rank11]: return self._call_impl(*args, **kwargs)
+ [default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default3]:[rank11]: return forward_call(*args, **kwargs)
+ [default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
+ [default3]:[rank11]: sharded_logits = self.model(
+ [default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default3]:[rank11]: return self._call_impl(*args, **kwargs)
+ [default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default3]:[rank11]: return forward_call(*args, **kwargs)
+ [default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
+ [default3]:[rank11]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
+ [default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
+ [default3]:[rank11]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
+ [default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default3]:[rank11]: return self._call_impl(*args, **kwargs)
+ [default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default3]:[rank11]: return forward_call(*args, **kwargs)
+ [default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
+ [default3]:[rank11]: output = self.pp_block(**new_kwargs)
+ [default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default3]:[rank11]: return self._call_impl(*args, **kwargs)
+ [default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default3]:[rank11]: return forward_call(*args, **kwargs)
+ [default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
+ [default3]:[rank11]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
+ [default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default3]:[rank11]: return self._call_impl(*args, **kwargs)
+ [default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default3]:[rank11]: return forward_call(*args, **kwargs)
+ [default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
+ [default3]:[rank11]: hidden_states = self.down_proj(self.split_silu_mul(merged_states))
+ [default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default3]:[rank11]: return self._call_impl(*args, **kwargs)
+ [default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default3]:[rank11]: return forward_call(*args, **kwargs)
+ [default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward
+ [default3]:[rank11]: return row_linear(
+ [default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear
+ [default3]:[rank11]: out = F.linear(input, weight, bias)
+ [default3]:[rank11]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 512.00 MiB. GPU  has a total capacity of 79.33 GiB of which 481.94 MiB is free. Including non-PyTorch memory, this process has 78.85 GiB memory in use. Of the allocated memory 70.97 GiB is allocated by PyTorch, and 12.70 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+ [default0]:[rank8]: Traceback (most recent call last):
+ [default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
+ [default0]:[rank8]: trainer.train(dataloader)
+ [default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
+ [default0]:[rank8]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
+ [default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
+ [default0]:[rank8]: outputs = self.pipeline_engine.train_batch_iter(
+ [default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
+ [default0]:[rank8]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
+ [default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
+ [default0]:[rank8]: output = model(**micro_batch)
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default0]:[rank8]: return self._call_impl(*args, **kwargs)
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default0]:[rank8]: return forward_call(*args, **kwargs)
+ [default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
+ [default0]:[rank8]: sharded_logits = self.model(
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default0]:[rank8]: return self._call_impl(*args, **kwargs)
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default0]:[rank8]: return forward_call(*args, **kwargs)
+ [default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
+ [default0]:[rank8]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
+ [default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
+ [default0]:[rank8]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default0]:[rank8]: return self._call_impl(*args, **kwargs)
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default0]:[rank8]: return forward_call(*args, **kwargs)
+ [default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
+ [default0]:[rank8]: output = self.pp_block(**new_kwargs)
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default0]:[rank8]: return self._call_impl(*args, **kwargs)
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default0]:[rank8]: return forward_call(*args, **kwargs)
+ [default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
+ [default0]:[rank8]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default0]:[rank8]: return self._call_impl(*args, **kwargs)
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default0]:[rank8]: return forward_call(*args, **kwargs)
+ [default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
+ [default0]:[rank8]: hidden_states = self.down_proj(self.split_silu_mul(merged_states))
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default0]:[rank8]: return self._call_impl(*args, **kwargs)
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default0]:[rank8]: return forward_call(*args, **kwargs)
+ [default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward
+ [default0]:[rank8]: return row_linear(
+ [default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 479, in row_linear
+ [default0]:[rank8]: out = differentiable_reduce_scatter_sum(out, group=group)
+ [default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 145, in differentiable_reduce_scatter_sum
+ [default1]:[rank9]: Traceback (most recent call last):
+ [default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
+ [default1]:[rank9]: trainer.train(dataloader)
+ [default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
+ [default1]:[rank9]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
+ [default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
+ [default1]:[rank9]: outputs = self.pipeline_engine.train_batch_iter(
+ [default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
+ [default1]:[rank9]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
+ [default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
+ [default1]:[rank9]: output = model(**micro_batch)
+ [default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default1]:[rank9]: return self._call_impl(*args, **kwargs)
+ [default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default1]:[rank9]: return forward_call(*args, **kwargs)
+ [default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
+ [default1]:[rank9]: sharded_logits = self.model(
+ [default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default1]:[rank9]: return self._call_impl(*args, **kwargs)
+ [default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default1]:[rank9]: return forward_call(*args, **kwargs)
+ [default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
+ [default1]:[rank9]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
+ [default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
+ [default1]:[rank9]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
+ [default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default1]:[rank9]: return self._call_impl(*args, **kwargs)
+ [default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default1]:[rank9]: return forward_call(*args, **kwargs)
+ [default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
+ [default1]:[rank9]: output = self.pp_block(**new_kwargs)
+ [default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default1]:[rank9]: return self._call_impl(*args, **kwargs)
+ [default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default1]:[rank9]: return forward_call(*args, **kwargs)
+ [default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
+ [default1]:[rank9]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
+ [default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default1]:[rank9]: return self._call_impl(*args, **kwargs)
+ [default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default1]:[rank9]: return forward_call(*args, **kwargs)
+ [default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
+ [default1]:[rank9]: hidden_states = self.down_proj(self.split_silu_mul(merged_states))
+ [default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default1]:[rank9]: return self._call_impl(*args, **kwargs)
+ [default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default1]:[rank9]: return forward_call(*args, **kwargs)
+ [default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward
+ [default1]:[rank9]: return row_linear(
+ [default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear
+ [default1]:[rank9]: out = F.linear(input, weight, bias)
+ [default1]:[rank9]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 512.00 MiB. GPU  has a total capacity of 79.33 GiB of which 309.94 MiB is free. Including non-PyTorch memory, this process has 79.01 GiB memory in use. Of the allocated memory 70.97 GiB is allocated by PyTorch, and 12.70 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+ [default0]:[rank8]: return DifferentiableReduceScatterSum.apply(tensor, group)
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 598, in apply
+ [default0]:[rank8]: return super().apply(*args, **kwargs) # type: ignore[misc]
+ [default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py", line 111, in forward
+ [default0]:[rank8]: sharded_tensor = torch.empty(
+ [default0]:[rank8]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU
+ [default2]:[rank10]: Traceback (most recent call last):
+ [default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
+ [default2]:[rank10]: trainer.train(dataloader)
+ [default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
+ [default2]:[rank10]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
+ [default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
+ [default2]:[rank10]: outputs = self.pipeline_engine.train_batch_iter(
+ [default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
+ [default2]:[rank10]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
+ [default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
+ [default2]:[rank10]: output = model(**micro_batch)
+ [default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default2]:[rank10]: return self._call_impl(*args, **kwargs)
+ [default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default2]:[rank10]: return forward_call(*args, **kwargs)
+ [default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
+ [default2]:[rank10]: sharded_logits = self.model(
+ [default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default2]:[rank10]: return self._call_impl(*args, **kwargs)
+ [default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default2]:[rank10]: return forward_call(*args, **kwargs)
+ [default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
+ [default2]:[rank10]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
+ [default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
+ [default2]:[rank10]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
+ [default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default2]:[rank10]: return self._call_impl(*args, **kwargs)
+ [default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default2]:[rank10]: return forward_call(*args, **kwargs)
+ [default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
+ [default2]:[rank10]: output = self.pp_block(**new_kwargs)
+ [default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default2]:[rank10]: return self._call_impl(*args, **kwargs)
+ [default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default2]:[rank10]: return forward_call(*args, **kwargs)
+ [default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
+ [default2]:[rank10]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
+ [default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default2]:[rank10]: return self._call_impl(*args, **kwargs)
+ [default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default2]:[rank10]: return forward_call(*args, **kwargs)
+ [default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
+ [default2]:[rank10]: hidden_states = self.down_proj(self.split_silu_mul(merged_states))
+ [default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default2]:[rank10]: return self._call_impl(*args, **kwargs)
+ [default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default2]:[rank10]: return forward_call(*args, **kwargs)
+ [default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward
+ [default2]:[rank10]: return row_linear(
+ [default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear
+ [default2]:[rank10]: out = F.linear(input, weight, bias)
+ [default2]:[rank10]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 512.00 MiB. GPU  has a total capacity of 79.33 GiB of which 223.94 MiB is free. Including non-PyTorch memory, this process has 79.10 GiB memory in use. Of the allocated memory 70.97 GiB is allocated by PyTorch, and 12.70 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+ E0702 19:51:08.750000 140213654959936 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 4 (pid: 1653256) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
+ Traceback (most recent call last):
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in <module>
+ sys.exit(main())
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
+ return f(*args, **kwargs)
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main
+ run(args)
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run
+ elastic_launch(
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
+ return launch_agent(self._config, self._entrypoint, list(args))
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent
+ raise ChildFailedError(
+ torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
+ ============================================================
+ /fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED
+ ------------------------------------------------------------
+ Failures:
+ [1]:
+ time : 2024-07-02_19:51:06
+ host : ip-26-0-160-225.ec2.internal
+ rank : 5 (local_rank: 5)
+ exitcode : 1 (pid: 1653257)
+ error_file: <N/A>
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+ [2]:
+ time : 2024-07-02_19:51:06
+ host : ip-26-0-160-225.ec2.internal
+ rank : 6 (local_rank: 6)
+ exitcode : 1 (pid: 1653258)
+ error_file: <N/A>
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+ [3]:
+ time : 2024-07-02_19:51:06
+ host : ip-26-0-160-225.ec2.internal
+ rank : 7 (local_rank: 7)
+ exitcode : 1 (pid: 1653259)
+ error_file: <N/A>
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+ ------------------------------------------------------------
+ Root Cause (first observed failure):
+ [0]:
+ time : 2024-07-02_19:51:06
+ host : ip-26-0-160-225.ec2.internal
+ rank : 4 (local_rank: 4)
+ exitcode : 1 (pid: 1653256)
+ error_file: <N/A>
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+ ============================================================
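Every failure entry above ends with error_file: <N/A> plus a pointer to the elastic errors page; the mechanism that page documents for populating the error file is the @record decorator around the training entrypoint. A minimal sketch of that pattern, where the main function shown is illustrative rather than the actual run_train.py structure:

from torch.distributed.elastic.multiprocessing.errors import record

@record
def main() -> None:
    # Training entrypoint: any exception raised here is serialized into the
    # per-rank error_file that torchrun then reports instead of <N/A>.
    ...

if __name__ == "__main__":
    main()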
+ srun: error: ip-26-0-160-225: task 0: Exited with exit code 1
+ W0702 19:51:10.255000 140185285814016 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-171-102.ec2.internal_3646767_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError.
+ W0702 19:51:11.232000 140190952634176 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3646841 closing signal SIGTERM
+ W0702 19:51:11.232000 140190952634176 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3646842 closing signal SIGTERM
+ W0702 19:51:11.233000 140190952634176 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3646843 closing signal SIGTERM
+ W0702 19:51:11.233000 140190952634176 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3646844 closing signal SIGTERM
+ W0702 19:51:11.233000 140190952634176 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3646845 closing signal SIGTERM
+ W0702 19:51:11.238000 140190952634176 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3646846 closing signal SIGTERM
+ W0702 19:51:11.238000 140190952634176 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3646847 closing signal SIGTERM
+ W0702 19:51:11.244000 140190952634176 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3646848 closing signal SIGTERM
+ W0702 19:51:13.786000 140190952634176 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-102.ec2.internal_3646767_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError.
+ W0702 19:51:13.797000 140190952634176 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-102.ec2.internal_3646767_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError.
+ Traceback (most recent call last):
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store
+ return getattr(self._store, store_op)(*args, **kwargs)
+ torch.distributed.DistNetworkError: Broken pipe
+
+ The above exception was the direct cause of the following exception:
+
+ Traceback (most recent call last):
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in <module>
+ sys.exit(main())
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
+ return f(*args, **kwargs)
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main
+ run(args)
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run
+ elastic_launch(
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
+ return launch_agent(self._config, self._entrypoint, list(args))
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent
+ result = agent.run()
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper
+ result = f(*args, **kwargs)
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run
+ result = self._invoke_run(role)
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run
+ num_nodes_waiting = rdzv_handler.num_nodes_waiting()
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting
+ self._state_holder.sync()
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync
+ get_response = self._backend.get_state()
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state
+ base64_state: bytes = self._call_store("get", self._key)
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store
+ raise RendezvousConnectionError(
+ torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details.
+ srun: error: ip-26-0-171-102: task 1: Exited with exit code 1
+ Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details.
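The hf_transfer note refers to huggingface_hub's opt-in Rust transfer backend, enabled through the documented HF_HUB_ENABLE_HF_TRANSFER variable. A minimal sketch of enabling it for a log upload like this one, assuming pip install hf_transfer has been run in the same environment; the repo_id shown is hypothetical:

import os

# Must be set before huggingface_hub performs the transfer.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

from huggingface_hub import HfApi

HfApi().upload_file(
    path_or_fileobj="log.out",
    path_in_repo="llama-1B/16_GPUS/dp-4_tp-4_pp-1_mbz-32/log.out",
    repo_id="your-username/bench-cluster-results",  # hypothetical target repo
)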
llama-1B/16_GPUS/dp-4_tp-4_pp-1_mbz-32/status.txt ADDED
@@ -0,0 +1 @@
 
 
+ oom