3outeille HF staff committed on
Commit
e56f6b4
·
verified ·
1 Parent(s): ddd0ecd

Upload llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-32

Browse files
llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-32/bench.slurm ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ #SBATCH --job-name=bench_cluster
4
+ #SBATCH --time=02:00:00
5
+ #SBATCH --partition=hopper-prod
6
+ #SBATCH --nodes=1
7
+ #SBATCH --gres=gpu:8
8
+ #SBATCH --qos=normal
9
+ #SBATCH --ntasks-per-node=1
10
+ #SBATCH --cpus-per-task=96
11
+ #SBATCH --exclusive
12
+ #SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-32/log.out
13
+ #SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-32/log.out
14
+
15
+ # Function to update status based on squeue output
16
+ update_status() {
17
+ job_id=$1
18
+ status_file=$2
19
+ # For unknown reasons, it doenst update status for pending. It only works for running
20
+ while true; do
21
+ job_status=$(squeue --job $job_id --noheader --format=%T)
22
+ echo "Job status: $job_status"
23
+ if [ -z "$job_status" ]; then
24
+ # Job has finished or is not found
25
+ break
26
+ elif [ "$job_status" = "RUNNING" ]; then
27
+ printf "running" > $status_file
28
+ break
29
+ fi
30
+ sleep 10
31
+ done
32
+ }
33
+
34
+ # Misc initializations.
35
+ echo "========================"
36
+ echo "START TIME: $(date)"
37
+ source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh
38
+ conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster
39
+ echo python3 version = $(python3 --version)
40
+ echo "========================"
41
+
42
+ # Slurm stuff
43
+ export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
44
+ export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
45
+ export MASTER_PORT=$((1024 + RANDOM % 64511))
46
+
47
+ export TMPDIR=/scratch
48
+ export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache"
49
+ export CUBLAS_WORKSPACE_CONFIG=":4096:8"
50
+ export CUDA_DEVICE_MAX_CONNECTIONS="1"
51
+
52
+ huggingface-cli login --token $HUGGINGFACE_TOKEN
53
+
54
+
55
+ NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron"
56
+ CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-32/config.yaml"
57
+
58
+ LAUNCHER="torchrun \
59
+ --nproc_per_node 8 \
60
+ --nnodes 1 \
61
+ --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \
62
+ --rdzv_backend c10d \
63
+ --max_restarts 0 \
64
+ --tee 3 \
65
+ --node_rank ${SLURM_PROCID}"
66
+
67
+ # Checkout the bench_cluster branch
68
+ cd $NANOTRON_REPO
69
+ git checkout bench_cluster
70
+ cd ..
71
+ # Get the current job ID
72
+ job_id=${SLURM_JOB_ID}
73
+
74
+ # Update status to "pending" or "running" in the background
75
+ update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-32/status.txt &
76
+
77
+ # Run the main command
78
+ srun -u $LAUNCHER $CMD
79
+ exit_status=$?
80
+
81
+ # Update status based on the exit status of `srun`
82
+ if [ $exit_status -eq 0 ]; then
83
+ printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-32/status.txt
84
+ else
85
+ if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-32/log.out; then
86
+ printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-32/status.txt
87
+ elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-32/log.out; then
88
+ printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-32/status.txt
89
+ elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-32/log.out; then
90
+ printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-32/status.txt
91
+ else
92
+ printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-32/status.txt
93
+ fi
94
+ fi
95
+
96
+ # Run the report script if the job completed successfully
97
+ if [ $exit_status -eq 0 ]; then
98
+ python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-32 --is_logs
99
+ python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-32 --is_profiler
100
+ fi
101
+
102
+
103
+ # Push to hub the folder using huggingface_cli
104
+ huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-32 llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-32 --commit-message "Upload llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-32"
105
+
106
+ # Verify the upload
107
+ if [ $? -eq 0 ]; then
108
+ echo "Uploading to Huggingface Hub successful"
109
+ else
110
+ echo "Failed to upload to Huggingface Hub"
111
+ fi
llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-32/config.yaml ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ general:
2
+ project: bench_cluster
3
+ seed: 42
4
+ model:
5
+ ddp_bucket_cap_mb: 25
6
+ dtype: bfloat16
7
+ init_method:
8
+ std: 0.025
9
+ make_vocab_size_divisible_by: 1
10
+ model_config:
11
+ bos_token_id: 1
12
+ eos_token_id: 2
13
+ hidden_act: silu
14
+ hidden_size: 2048
15
+ initializer_range: 0.02
16
+ intermediate_size: 4096
17
+ is_llama_config: true
18
+ max_position_embeddings: 4096
19
+ num_attention_heads: 32
20
+ num_hidden_layers: 24
21
+ num_key_value_heads: 32
22
+ pad_token_id: null
23
+ pretraining_tp: 1
24
+ rms_norm_eps: 1.0e-05
25
+ rope_scaling: null
26
+ rope_theta: 10000.0
27
+ tie_word_embeddings: true
28
+ use_cache: true
29
+ vocab_size: 50257
30
+ optimizer:
31
+ accumulate_grad_in_fp32: true
32
+ clip_grad: 1.0
33
+ learning_rate_scheduler:
34
+ learning_rate: 0.0001
35
+ lr_decay_style: linear
36
+ lr_warmup_style: linear
37
+ lr_warmup_steps: 1
38
+ min_decay_lr: 1.0e-05
39
+ optimizer_factory:
40
+ adam_beta1: 0.9
41
+ adam_beta2: 0.95
42
+ adam_eps: 1.0e-08
43
+ name: adamW
44
+ torch_adam_is_fused: true
45
+ weight_decay: 0.01
46
+ zero_stage: 1
47
+ parallelism:
48
+ dp: 2
49
+ expert_parallel_size: 1
50
+ pp: 1
51
+ pp_engine: 1f1b
52
+ tp: 4
53
+ tp_linear_async_communication: false
54
+ tp_mode: REDUCE_SCATTER
55
+ profiler:
56
+ profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-32
57
+ tokenizer:
58
+ tokenizer_max_length: null
59
+ tokenizer_name_or_path: openai-community/gpt2
60
+ tokenizer_revision: null
61
+ data_stages:
62
+ - name: Training Stage
63
+ start_training_step: 1
64
+ data:
65
+ dataset:
66
+ dataset_overwrite_cache: false
67
+ dataset_processing_num_proc_per_process: 64
68
+ hf_dataset_config_name: null
69
+ hf_dataset_or_datasets: roneneldan/TinyStories
70
+ hf_dataset_splits: train
71
+ text_column_name: text
72
+ num_loading_workers: 0
73
+ seed: 42
74
+ lighteval: null
75
+ tokens:
76
+ train_steps: 20
77
+ val_check_interval: -1
78
+ batch_accumulation_per_replica: 16
79
+ limit_test_batches: 0
80
+ limit_val_batches: 0
81
+ micro_batch_size: 32
82
+ sequence_length: 4096
83
+ logging:
84
+ iteration_step_info_interval: 1
85
+ log_level: info
86
+ log_level_replica: info
87
+ checkpoints:
88
+ checkpoint_interval: 100000
89
+ checkpoints_path: /dev/null
90
+ resume_checkpoint_path: null
llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-32/log.out ADDED
@@ -0,0 +1,664 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ========================
2
+ START TIME: Wed Jul 3 23:45:41 UTC 2024
3
+ python3 version = Python 3.10.14
4
+ ========================
5
+ The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
6
+ Token is valid (permission: write).
7
+ Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token
8
+ Login successful
9
+ fatal: Unable to create '/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/.git/index.lock': File exists.
10
+
11
+ Another git process seems to be running in this repository, e.g.
12
+ an editor opened by 'git commit'. Please make sure all processes
13
+ are terminated then try again. If it still fails, a git process
14
+ may have crashed in this repository earlier:
15
+ remove the file manually to continue.
16
+ Job status: RUNNING
17
+ W0703 23:45:49.311000 139891705947968 torch/distributed/run.py:757]
18
+ W0703 23:45:49.311000 139891705947968 torch/distributed/run.py:757] *****************************************
19
+ W0703 23:45:49.311000 139891705947968 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
20
+ W0703 23:45:49.311000 139891705947968 torch/distributed/run.py:757] *****************************************
21
+ [default0]:07/03/2024 23:46:10 [WARNING|DP=0|PP=0|TP=0|ip-26-0-164-187]: [Vocab Size Padding] Padded vocab (size: 50257) with 3 dummy tokens (new size: 50260)
22
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: Config:
23
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: Config(general=GeneralArgs(project='bench_cluster',
24
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: run='%date_%jobid',
25
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: seed=42,
26
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: step=None,
27
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: consumed_train_samples=None,
28
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: benchmark_csv_path=None,
29
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: ignore_sanity_checks=True),
30
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: parallelism=ParallelismArgs(dp=2,
31
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: pp=1,
32
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: tp=4,
33
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: pp_engine=<nanotron.parallel.pipeline_parallel.engine.OneForwardOneBackwardPipelineEngine object at 0x7f147f838880>,
34
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: tp_mode=<TensorParallelLinearMode.REDUCE_SCATTER: 2>,
35
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: tp_linear_async_communication=False,
36
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: expert_parallel_size=1),
37
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1,
38
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: eos_token_id=2,
39
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: hidden_act='silu',
40
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: hidden_size=2048,
41
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: initializer_range=0.02,
42
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: intermediate_size=4096,
43
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: is_llama_config=True,
44
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: max_position_embeddings=4096,
45
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: num_attention_heads=32,
46
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: num_hidden_layers=24,
47
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: num_key_value_heads=32,
48
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: pad_token_id=None,
49
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: pretraining_tp=1,
50
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: rms_norm_eps=1e-05,
51
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: rope_scaling=None,
52
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: rope_theta=10000.0,
53
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: tie_word_embeddings=True,
54
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: use_cache=True,
55
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: vocab_size=50260),
56
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: init_method=RandomInit(std=0.025),
57
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: dtype=torch.bfloat16,
58
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: make_vocab_size_divisible_by=1,
59
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: ddp_bucket_cap_mb=25),
60
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2',
61
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: tokenizer_revision=None,
62
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: tokenizer_max_length=None),
63
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'),
64
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: checkpoint_interval=100000,
65
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: save_initial_state=False,
66
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: resume_checkpoint_path=None,
67
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: checkpoints_path_is_shared_file_system=False),
68
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: logging=LoggingArgs(log_level='info',
69
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: log_level_replica='info',
70
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: iteration_step_info_interval=1),
71
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: tokens=TokensArgs(sequence_length=4096,
72
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: train_steps=20,
73
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: micro_batch_size=32,
74
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: batch_accumulation_per_replica=16,
75
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: val_check_interval=-1,
76
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: limit_val_batches=0,
77
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: limit_test_batches=0),
78
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08,
79
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: adam_beta1=0.9,
80
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: adam_beta2=0.95,
81
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: torch_adam_is_fused=True,
82
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: name='adamW'),
83
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: zero_stage=1,
84
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: weight_decay=0.01,
85
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: clip_grad=1.0,
86
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: accumulate_grad_in_fp32=True,
87
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001,
88
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: lr_warmup_steps=1,
89
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: lr_warmup_style='linear',
90
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: lr_decay_style='linear',
91
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: lr_decay_steps=19,
92
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: lr_decay_starting_step=None,
93
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: min_decay_lr=1e-05)),
94
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: data_stages=[DatasetStageArgs(name='Training Stage',
95
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: start_training_step=1,
96
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories',
97
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: hf_dataset_splits='train',
98
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: hf_dataset_config_name=None,
99
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: dataset_processing_num_proc_per_process=64,
100
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: dataset_overwrite_cache=False,
101
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: text_column_name='text'),
102
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: seed=42,
103
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: num_loading_workers=0))],
104
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-32')),
105
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: lighteval=None)
106
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: Model Config:
107
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: LlamaConfig(bos_token_id=1,
108
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: eos_token_id=2,
109
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: hidden_act='silu',
110
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: hidden_size=2048,
111
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: initializer_range=0.02,
112
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: intermediate_size=4096,
113
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: is_llama_config=True,
114
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: max_position_embeddings=4096,
115
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: num_attention_heads=32,
116
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: num_hidden_layers=24,
117
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: num_key_value_heads=32,
118
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: pad_token_id=None,
119
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: pretraining_tp=1,
120
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: rms_norm_eps=1e-05,
121
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: rope_scaling=None,
122
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: rope_theta=10000.0,
123
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: tie_word_embeddings=True,
124
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: use_cache=True,
125
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: vocab_size=50260)
126
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: Building model..
127
+ [default0]:07/03/2024 23:46:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: Setting PP block ranks...
128
+ [default0]:07/03/2024 23:46:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: Total number of parameters: 1.11G (2117.09MiB)
129
+ [default0]:07/03/2024 23:46:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: Local number of parameters: 277M (529.27MiB)
130
+ [default0]:07/03/2024 23:46:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: [After model building] Memory usage: 554.21MiB. Peak allocated: 606.24MiB Peak reserved: 608.00MiB
131
+ [default0]:07/03/2024 23:46:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: No checkpoint path provided.
132
+ [default0]:07/03/2024 23:46:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: Parametrizing model parameters using StandardParametrizator
133
+ [default2]:07/03/2024 23:46:23 [INFO|DP=0|PP=0|TP=2|ip-26-0-164-187]: Local number of parameters: 277M (529.27MiB)
134
+ [default2]:07/03/2024 23:46:23 [INFO|DP=0|PP=0|TP=2|ip-26-0-164-187]: [After model building] Memory usage: 554.21MiB. Peak allocated: 606.24MiB Peak reserved: 608.00MiB
135
+ [default2]:07/03/2024 23:46:23 [INFO|DP=0|PP=0|TP=2|ip-26-0-164-187]: No checkpoint path provided.
136
+ [default1]:07/03/2024 23:46:23 [INFO|DP=0|PP=0|TP=1|ip-26-0-164-187]: Local number of parameters: 277M (529.27MiB)
137
+ [default1]:07/03/2024 23:46:23 [INFO|DP=0|PP=0|TP=1|ip-26-0-164-187]: [After model building] Memory usage: 554.21MiB. Peak allocated: 606.24MiB Peak reserved: 608.00MiB
138
+ [default1]:07/03/2024 23:46:23 [INFO|DP=0|PP=0|TP=1|ip-26-0-164-187]: No checkpoint path provided.
139
+ [default3]:07/03/2024 23:46:23 [INFO|DP=0|PP=0|TP=3|ip-26-0-164-187]: Local number of parameters: 277M (529.27MiB)
140
+ [default3]:07/03/2024 23:46:23 [INFO|DP=0|PP=0|TP=3|ip-26-0-164-187]: [After model building] Memory usage: 554.21MiB. Peak allocated: 606.24MiB Peak reserved: 608.00MiB
141
+ [default3]:07/03/2024 23:46:23 [INFO|DP=0|PP=0|TP=3|ip-26-0-164-187]: No checkpoint path provided.
142
+ [default4]:07/03/2024 23:46:23 [INFO|DP=1|PP=0|TP=0|ip-26-0-164-187]: No checkpoint path provided.
143
+ [default6]:07/03/2024 23:46:23 [INFO|DP=1|PP=0|TP=2|ip-26-0-164-187]: No checkpoint path provided.
144
+ [default7]:07/03/2024 23:46:23 [INFO|DP=1|PP=0|TP=3|ip-26-0-164-187]: No checkpoint path provided.
145
+ [default5]:07/03/2024 23:46:23 [INFO|DP=1|PP=0|TP=1|ip-26-0-164-187]: No checkpoint path provided.
146
+ [default0]:07/03/2024 23:46:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: [Optimizer Building] Using LearningRateForSP as learning rate
147
+ [default0]:07/03/2024 23:46:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: [ZeRO sharding] Size of optimizer params per rank:
148
+ [default0]:07/03/2024 23:46:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: [ZeRO sharding] DP Rank 0 has 139M out of 277M (50.00%) params' optimizer states
149
+ [default0]:07/03/2024 23:46:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: [ZeRO sharding] DP Rank 1 has 139M out of 277M (50.00%) params' optimizer states
150
+ [default0]:07/03/2024 23:46:27 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples
151
+ [default0]:07/03/2024 23:46:27 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: Using `datasets` library
152
+ [default0]:07/03/2024 23:46:27 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4')
153
+ [default0]:07/03/2024 23:46:27 [WARNING|DP=0|PP=0|TP=0|ip-26-0-164-187]: Repo card metadata block was not found. Setting CardData to empty.
154
+ [default0]:Repo card metadata block was not found. Setting CardData to empty.
155
+ [default0]:07/03/2024 23:46:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: [Training Plan] There are 1 training stages
156
+ [default0]:07/03/2024 23:46:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: [Stage Training Stage] start from step 1
157
+ [default0]:07/03/2024 23:46:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]:
158
+ [default0]:07/03/2024 23:46:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: [Start training] datetime: 2024-07-03 23:46:29.945526 | mbs: 32 | grad_accum: 16 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0
159
+ [default0]:07/03/2024 23:46:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps
160
+ [default0]:07/03/2024 23:46:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: Memory usage: 2142.76MiB. Peak allocated 2142.76MiB. Peak reserved: 2198.00MiB
161
+ [default2]:07/03/2024 23:46:30 [WARNING|DP=0|PP=0|TP=2|ip-26-0-164-187]: Repo card metadata block was not found. Setting CardData to empty.
162
+ [default5]:Repo card metadata block was not found. Setting CardData to empty.
163
+ [default1]:Repo card metadata block was not found. Setting CardData to empty.
164
+ [default4]:07/03/2024 23:46:30 [WARNING|DP=1|PP=0|TP=0|ip-26-0-164-187]: Repo card metadata block was not found. Setting CardData to empty.
165
+ [default1]:07/03/2024 23:46:30 [WARNING|DP=0|PP=0|TP=1|ip-26-0-164-187]: Repo card metadata block was not found. Setting CardData to empty.
166
+ [default6]:07/03/2024 23:46:30 [WARNING|DP=1|PP=0|TP=2|ip-26-0-164-187]: Repo card metadata block was not found. Setting CardData to empty.
167
+ [default2]:Repo card metadata block was not found. Setting CardData to empty.
168
+ [default7]:07/03/2024 23:46:30 [WARNING|DP=1|PP=0|TP=3|ip-26-0-164-187]: Repo card metadata block was not found. Setting CardData to empty.
169
+ [default3]:07/03/2024 23:46:30 [WARNING|DP=0|PP=0|TP=3|ip-26-0-164-187]: Repo card metadata block was not found. Setting CardData to empty.
170
+ [default5]:07/03/2024 23:46:30 [WARNING|DP=1|PP=0|TP=1|ip-26-0-164-187]: Repo card metadata block was not found. Setting CardData to empty.
171
+ [default7]:Repo card metadata block was not found. Setting CardData to empty.
172
+ [default3]:Repo card metadata block was not found. Setting CardData to empty.
173
+ [default6]:Repo card metadata block was not found. Setting CardData to empty.
174
+ [default4]:Repo card metadata block was not found. Setting CardData to empty.
175
+ [default2]:[rank2]: Traceback (most recent call last):
176
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
177
+ [default2]:[rank2]: trainer.train(dataloader)
178
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
179
+ [default2]:[rank2]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
180
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
181
+ [default2]:[rank2]: outputs = self.pipeline_engine.train_batch_iter(
182
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
183
+ [default2]:[rank2]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
184
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
185
+ [default2]:[rank2]: output = model(**micro_batch)
186
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
187
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
188
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
189
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
190
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
191
+ [default2]:[rank2]: sharded_logits = self.model(
192
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
193
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
194
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
195
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
196
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
197
+ [default2]:[rank2]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
198
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
199
+ [default2]:[rank2]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
200
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
201
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
202
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
203
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
204
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
205
+ [default2]:[rank2]: output = self.pp_block(**new_kwargs)
206
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
207
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
208
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
209
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
210
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
211
+ [default2]:[rank2]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
212
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
213
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
214
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
215
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
216
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 171, in forward
217
+ [default2]:[rank2]: merged_states = self.gate_up_proj(hidden_states)
218
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
219
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
220
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
221
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
222
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 87, in forward
223
+ [default2]:[rank2]: return column_linear(
224
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 359, in column_linear
225
+ [default2]:[rank2]: return F.linear(input, weight, bias)
226
+ [default2]:[rank2]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 512.00 MiB. GPU  has a total capacity of 79.33 GiB of which 505.94 MiB is free. Including non-PyTorch memory, this process has 78.82 GiB memory in use. Of the allocated memory 67.10 GiB is allocated by PyTorch, and 272.09 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
227
+ [default1]:[rank1]: Traceback (most recent call last):
228
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
229
+ [default1]:[rank1]: trainer.train(dataloader)
230
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
231
+ [default1]:[rank1]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
232
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
233
+ [default1]:[rank1]: outputs = self.pipeline_engine.train_batch_iter(
234
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
235
+ [default1]:[rank1]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
236
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
237
+ [default1]:[rank1]: output = model(**micro_batch)
238
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
239
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
240
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
241
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
242
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
243
+ [default1]:[rank1]: sharded_logits = self.model(
244
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
245
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
246
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
247
+ [default0]:[rank0]: Traceback (most recent call last):
248
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
249
+ [default0]:[rank0]: trainer.train(dataloader)
250
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
251
+ [default0]:[rank0]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
252
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
253
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
254
+ [default1]:[rank1]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
255
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
256
+ [default1]:[rank1]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
257
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
258
+ [default0]:[rank0]: outputs = self.pipeline_engine.train_batch_iter(
259
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
260
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
261
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
262
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
263
+ [default0]:[rank0]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
264
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
265
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
266
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
267
+ [default1]:[rank1]: output = self.pp_block(**new_kwargs)
268
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
269
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
270
+ [default0]:[rank0]: output = model(**micro_batch)
271
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
272
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
273
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
274
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
275
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
276
+ [default1]:[rank1]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
277
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
278
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
279
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
280
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
281
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
282
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
283
+ [default0]:[rank0]: sharded_logits = self.model(
284
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
285
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
286
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
287
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
288
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 171, in forward
289
+ [default1]:[rank1]: merged_states = self.gate_up_proj(hidden_states)
290
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
291
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
292
+ [default3]:[rank3]: Traceback (most recent call last):
293
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
294
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
295
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
296
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
297
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
298
+ [default0]:[rank0]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
299
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 87, in forward
300
+ [default1]:[rank1]: return column_linear(
301
+ [default3]:[rank3]: trainer.train(dataloader)
302
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 359, in column_linear
303
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
304
+ [default0]:[rank0]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
305
+ [default1]:[rank1]: return F.linear(input, weight, bias)
306
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
307
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
308
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
309
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
310
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
311
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
312
+ [default0]:[rank0]: output = self.pp_block(**new_kwargs)
313
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
314
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
315
+ [default1]:[rank1]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 512.00 MiB. GPU  has a total capacity of 79.33 GiB of which 505.94 MiB is free. Including non-PyTorch memory, this process has 78.82 GiB memory in use. Of the allocated memory 67.10 GiB is allocated by PyTorch, and 272.09 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
316
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
317
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
318
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
319
+ [default0]:[rank0]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
320
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
321
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
322
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
323
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
324
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
325
+ [default0]:[rank0]: hidden_states = self.down_proj(self.split_silu_mul(merged_states))
326
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
327
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
328
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
329
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
330
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 128, in forward
331
+ [default0]:[rank0]: return self.act(gate_states) * up_states
332
+ [default0]:[rank0]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 256.00 MiB. GPU
333
+ [default3]:[rank3]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
334
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
335
+ [default3]:[rank3]: outputs = self.pipeline_engine.train_batch_iter(
336
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
337
+ [default3]:[rank3]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
338
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
339
+ [default3]:[rank3]: output = model(**micro_batch)
340
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
341
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
342
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
343
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
344
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
345
+ [default3]:[rank3]: sharded_logits = self.model(
346
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
347
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
348
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
349
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
350
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
351
+ [default3]:[rank3]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
352
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
353
+ [default3]:[rank3]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
354
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
355
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
356
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
357
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
358
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
359
+ [default3]:[rank3]: output = self.pp_block(**new_kwargs)
360
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
361
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
362
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
363
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
364
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
365
+ [default3]:[rank3]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
366
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
367
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
368
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
369
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
370
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
371
+ [default3]:[rank3]: hidden_states = self.down_proj(self.split_silu_mul(merged_states))
372
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
373
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
374
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
375
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
376
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 128, in forward
377
+ [default3]:[rank3]: return self.act(gate_states) * up_states
378
+ [default3]:[rank3]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 256.00 MiB. GPU  has a total capacity of 79.33 GiB of which 233.94 MiB is free. Including non-PyTorch memory, this process has 79.09 GiB memory in use. Of the allocated memory 67.85 GiB is allocated by PyTorch, and 16.09 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
379
+ [default5]:[rank5]: Traceback (most recent call last):
380
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
381
+ [default5]:[rank5]: trainer.train(dataloader)
382
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
383
+ [default5]:[rank5]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
384
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
385
+ [default5]:[rank5]: outputs = self.pipeline_engine.train_batch_iter(
386
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
387
+ [default5]:[rank5]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
388
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
389
+ [default5]:[rank5]: output = model(**micro_batch)
390
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
391
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
392
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
393
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
394
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
395
+ [default5]:[rank5]: sharded_logits = self.model(
396
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
397
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
398
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
399
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
400
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
401
+ [default5]:[rank5]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
402
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
403
+ [default5]:[rank5]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
404
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
405
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
406
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
407
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
408
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
409
+ [default5]:[rank5]: output = self.pp_block(**new_kwargs)
410
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
411
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
412
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
413
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
414
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
415
+ [default5]:[rank5]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
416
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
417
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
418
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
419
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
420
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 171, in forward
421
+ [default5]:[rank5]: merged_states = self.gate_up_proj(hidden_states)
422
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
423
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
424
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
425
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
426
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 87, in forward
427
+ [default5]:[rank5]: return column_linear(
428
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 359, in column_linear
429
+ [default5]:[rank5]: return F.linear(input, weight, bias)
430
+ [default5]:[rank5]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 512.00 MiB. GPU  has a total capacity of 79.33 GiB of which 505.94 MiB is free. Including non-PyTorch memory, this process has 78.82 GiB memory in use. Of the allocated memory 67.10 GiB is allocated by PyTorch, and 272.09 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
431
+ [default6]:[rank6]: Traceback (most recent call last):
432
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
433
+ [default6]:[rank6]: trainer.train(dataloader)
434
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
435
+ [default6]:[rank6]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
436
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
437
+ [default6]:[rank6]: outputs = self.pipeline_engine.train_batch_iter(
438
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
439
+ [default6]:[rank6]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
440
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
441
+ [default6]:[rank6]: output = model(**micro_batch)
442
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
443
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
444
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
445
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
446
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
447
+ [default6]:[rank6]: sharded_logits = self.model(
448
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
449
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
450
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
451
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
452
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
453
+ [default6]:[rank6]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
454
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
455
+ [default6]:[rank6]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
456
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
457
+ [default7]:[rank7]: Traceback (most recent call last):
458
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
459
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
460
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
461
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
462
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
463
+ [default6]:[rank6]: output = self.pp_block(**new_kwargs)
464
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
465
+ [default7]:[rank7]: trainer.train(dataloader)
466
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
467
+ [default7]:[rank7]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
468
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
469
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
470
+ [default7]:[rank7]: outputs = self.pipeline_engine.train_batch_iter(
471
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
472
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
473
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
474
+ [default6]:[rank6]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
475
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
476
+ [default7]:[rank7]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
477
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
478
+ [default7]:[rank7]: output = model(**micro_batch)
479
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
480
+ [default7]:[rank7]: return self._call_impl(*args, **kwargs)
481
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
482
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
483
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
484
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
485
+ [default7]:[rank7]: return forward_call(*args, **kwargs)
486
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
487
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
488
+ [default7]:[rank7]: sharded_logits = self.model(
489
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
490
+ [default7]:[rank7]: return self._call_impl(*args, **kwargs)
491
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 171, in forward
492
+ [default6]:[rank6]: merged_states = self.gate_up_proj(hidden_states)
493
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
494
+ [default7]:[rank7]: return forward_call(*args, **kwargs)
495
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
496
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
497
+ [default7]:[rank7]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
498
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
499
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
500
+ [default7]:[rank7]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
501
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
502
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
503
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
504
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 87, in forward
505
+ [default7]:[rank7]: return self._call_impl(*args, **kwargs)
506
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
507
+ [default7]:[rank7]: return forward_call(*args, **kwargs)
508
+ [default6]:[rank6]: return column_linear(
509
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
510
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 359, in column_linear
511
+ [default6]:[rank6]: return F.linear(input, weight, bias)
512
+ [default7]:[rank7]: output = self.pp_block(**new_kwargs)
513
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
514
+ [default6]:[rank6]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 512.00 MiB. GPU  has a total capacity of 79.33 GiB of which 505.94 MiB is free. Including non-PyTorch memory, this process has 78.82 GiB memory in use. Of the allocated memory 67.10 GiB is allocated by PyTorch, and 272.09 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
515
+ [default7]:[rank7]: return self._call_impl(*args, **kwargs)
516
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
517
+ [default7]:[rank7]: return forward_call(*args, **kwargs)
518
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
519
+ [default7]:[rank7]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
520
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
521
+ [default7]:[rank7]: return self._call_impl(*args, **kwargs)
522
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
523
+ [default7]:[rank7]: return forward_call(*args, **kwargs)
524
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
525
+ [default7]:[rank7]: hidden_states = self.down_proj(self.split_silu_mul(merged_states))
526
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
527
+ [default7]:[rank7]: return self._call_impl(*args, **kwargs)
528
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
529
+ [default7]:[rank7]: return forward_call(*args, **kwargs)
530
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward
531
+ [default7]:[rank7]: return row_linear(
532
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear
533
+ [default7]:[rank7]: out = F.linear(input, weight, bias)
534
+ [default7]:[rank7]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 512.00 MiB. GPU  has a total capacity of 79.33 GiB of which 457.94 MiB is free. Including non-PyTorch memory, this process has 78.87 GiB memory in use. Of the allocated memory 68.10 GiB is allocated by PyTorch, and 16.09 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
535
+ [default4]:[rank4]: Traceback (most recent call last):
536
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
537
+ [default4]:[rank4]: trainer.train(dataloader)
538
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
539
+ [default4]:[rank4]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
540
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
541
+ [default4]:[rank4]: outputs = self.pipeline_engine.train_batch_iter(
542
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
543
+ [default4]:[rank4]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
544
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
545
+ [default4]:[rank4]: output = model(**micro_batch)
546
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
547
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
548
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
549
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
550
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
551
+ [default4]:[rank4]: sharded_logits = self.model(
552
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
553
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
554
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
555
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
556
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
557
+ [default4]:[rank4]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
558
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
559
+ [default4]:[rank4]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
560
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
561
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
562
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
563
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
564
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
565
+ [default4]:[rank4]: output = self.pp_block(**new_kwargs)
566
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
567
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
568
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
569
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
570
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
571
+ [default4]:[rank4]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
572
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
573
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
574
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
575
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
576
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
577
+ [default4]:[rank4]: hidden_states = self.down_proj(self.split_silu_mul(merged_states))
578
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
579
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
580
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
581
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
582
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 128, in forward
583
+ [default4]:[rank4]: return self.act(gate_states) * up_states
584
+ [default4]:[rank4]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 256.00 MiB. GPU  has a total capacity of 79.33 GiB of which 41.94 MiB is free. Including non-PyTorch memory, this process has 79.28 GiB memory in use. Of the allocated memory 67.85 GiB is allocated by PyTorch, and 16.09 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
585
+ E0703 23:46:44.821000 139891705947968 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 34122) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
586
+ Traceback (most recent call last):
587
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in <module>
588
+ sys.exit(main())
589
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
590
+ return f(*args, **kwargs)
591
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main
592
+ run(args)
593
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run
594
+ elastic_launch(
595
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
596
+ return launch_agent(self._config, self._entrypoint, list(args))
597
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent
598
+ raise ChildFailedError(
599
+ torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
600
+ ============================================================
601
+ /fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED
602
+ ------------------------------------------------------------
603
+ Failures:
604
+ [1]:
605
+ time : 2024-07-03_23:46:44
606
+ host : ip-26-0-164-187.ec2.internal
607
+ rank : 1 (local_rank: 1)
608
+ exitcode : 1 (pid: 34123)
609
+ error_file: <N/A>
610
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
611
+ [2]:
612
+ time : 2024-07-03_23:46:44
613
+ host : ip-26-0-164-187.ec2.internal
614
+ rank : 2 (local_rank: 2)
615
+ exitcode : 1 (pid: 34124)
616
+ error_file: <N/A>
617
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
618
+ [3]:
619
+ time : 2024-07-03_23:46:44
620
+ host : ip-26-0-164-187.ec2.internal
621
+ rank : 3 (local_rank: 3)
622
+ exitcode : 1 (pid: 34125)
623
+ error_file: <N/A>
624
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
625
+ [4]:
626
+ time : 2024-07-03_23:46:44
627
+ host : ip-26-0-164-187.ec2.internal
628
+ rank : 4 (local_rank: 4)
629
+ exitcode : 1 (pid: 34126)
630
+ error_file: <N/A>
631
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
632
+ [5]:
633
+ time : 2024-07-03_23:46:44
634
+ host : ip-26-0-164-187.ec2.internal
635
+ rank : 5 (local_rank: 5)
636
+ exitcode : 1 (pid: 34127)
637
+ error_file: <N/A>
638
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
639
+ [6]:
640
+ time : 2024-07-03_23:46:44
641
+ host : ip-26-0-164-187.ec2.internal
642
+ rank : 6 (local_rank: 6)
643
+ exitcode : 1 (pid: 34128)
644
+ error_file: <N/A>
645
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
646
+ [7]:
647
+ time : 2024-07-03_23:46:44
648
+ host : ip-26-0-164-187.ec2.internal
649
+ rank : 7 (local_rank: 7)
650
+ exitcode : 1 (pid: 34129)
651
+ error_file: <N/A>
652
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
653
+ ------------------------------------------------------------
654
+ Root Cause (first observed failure):
655
+ [0]:
656
+ time : 2024-07-03_23:46:44
657
+ host : ip-26-0-164-187.ec2.internal
658
+ rank : 0 (local_rank: 0)
659
+ exitcode : 1 (pid: 34122)
660
+ error_file: <N/A>
661
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
662
+ ============================================================
663
+ srun: error: ip-26-0-164-187: task 0: Exited with exit code 1
664
+ Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details.
llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-32/status.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ oom