3outeille HF Staff committed on
Commit
85d405f
·
verified ·
1 Parent(s): b7b1765

Upload llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-32

Browse files
llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-32/bench.slurm ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ #SBATCH --job-name=bench_cluster
4
+ #SBATCH --time=02:00:00
5
+ #SBATCH --partition=hopper-prod
6
+ #SBATCH --nodes=1
7
+ #SBATCH --gres=gpu:8
8
+ #SBATCH --qos=normal
9
+ #SBATCH --ntasks-per-node=1
10
+ #SBATCH --cpus-per-task=96
11
+ #SBATCH --exclusive
12
+ #SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-32/log.out
13
+ #SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-32/log.out
14
+
15
+ # Function to update status based on squeue output
16
+ update_status() {
17
+ job_id=$1
18
+ status_file=$2
19
+ # For unknown reasons, it doenst update status for pending. It only works for running
20
+ while true; do
21
+ job_status=$(squeue --job $job_id --noheader --format=%T)
22
+ echo "Job status: $job_status"
23
+ if [ -z "$job_status" ]; then
24
+ # Job has finished or is not found
25
+ break
26
+ elif [ "$job_status" = "RUNNING" ]; then
27
+ printf "running" > $status_file
28
+ break
29
+ fi
30
+ sleep 10
31
+ done
32
+ }
33
+
34
+ # Misc initializations.
35
+ echo "========================"
36
+ echo "START TIME: $(date)"
37
+ source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh
38
+ conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster
39
+ echo python3 version = $(python3 --version)
40
+ echo "========================"
41
+
42
+ # Slurm stuff
43
+ export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
44
+ export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
45
+ export MASTER_PORT=$((1024 + RANDOM % 64511))
46
+
47
+ export TMPDIR=/scratch
48
+ export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache"
49
+ export CUBLAS_WORKSPACE_CONFIG=":4096:8"
50
+ export CUDA_DEVICE_MAX_CONNECTIONS="1"
51
+
52
+ huggingface-cli login --token $HUGGINGFACE_TOKEN
53
+
54
+
55
+ NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron"
56
+ CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-32/config.yaml"
57
+
58
+ LAUNCHER="torchrun \
59
+ --nproc_per_node 8 \
60
+ --nnodes 1 \
61
+ --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \
62
+ --rdzv_backend c10d \
63
+ --max_restarts 0 \
64
+ --tee 3 \
65
+ --node_rank ${SLURM_PROCID}"
66
+
67
+ # Checkout the bench_cluster branch
68
+ cd $NANOTRON_REPO
69
+ git checkout bench_cluster
70
+ cd ..
71
+ # Get the current job ID
72
+ job_id=${SLURM_JOB_ID}
73
+
74
+ # Update status to "pending" or "running" in the background
75
+ update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-32/status.txt &
76
+
77
+ # Run the main command
78
+ srun -u $LAUNCHER $CMD
79
+ exit_status=$?
80
+
81
+ # Update status based on the exit status of `srun`
82
+ if [ $exit_status -eq 0 ]; then
83
+ printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-32/status.txt
84
+ else
85
+ if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-32/log.out; then
86
+ printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-32/status.txt
87
+ elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-32/log.out; then
88
+ printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-32/status.txt
89
+ elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-32/log.out; then
90
+ printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-32/status.txt
91
+ else
92
+ printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-32/status.txt
93
+ fi
94
+ fi
95
+
96
+ # Run the report script if the job completed successfully
97
+ if [ $exit_status -eq 0 ]; then
98
+ python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-32 --is_logs
99
+ python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-32 --is_profiler
100
+ fi
101
+
102
+
103
+ # Push to hub the folder using huggingface_cli
104
+ huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-32 llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-32 --commit-message "Upload llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-32"
105
+
106
+ # Verify the upload
107
+ if [ $? -eq 0 ]; then
108
+ echo "Uploading to Huggingface Hub successful"
109
+ else
110
+ echo "Failed to upload to Huggingface Hub"
111
+ fi
llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-32/config.yaml ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Nanotron benchmark config: llama-1B on 8 GPUs, dp=4 / tp=2 / pp=1, micro-batch 32.
# Effective global batch = dp * micro_batch_size * batch_accumulation_per_replica
#                        = 4 * 32 * 8 = 1024 sequences of 4096 tokens per step.
general:
  project: bench_cluster
  seed: 42
model:
  ddp_bucket_cap_mb: 25
  dtype: bfloat16
  init_method:
    std: 0.025
  make_vocab_size_divisible_by: 1
  model_config:
    bos_token_id: 1
    eos_token_id: 2
    hidden_act: silu
    hidden_size: 2048
    initializer_range: 0.02
    intermediate_size: 4096
    is_llama_config: true
    max_position_embeddings: 4096
    num_attention_heads: 32
    num_hidden_layers: 24
    # Equal to num_attention_heads -> plain multi-head attention (no GQA).
    num_key_value_heads: 32
    pad_token_id: null
    pretraining_tp: 1
    rms_norm_eps: 1.0e-05
    rope_scaling: null
    rope_theta: 10000.0
    tie_word_embeddings: true
    use_cache: true
    # GPT-2 tokenizer vocabulary (see tokenizer section below).
    vocab_size: 50257
optimizer:
  accumulate_grad_in_fp32: true
  clip_grad: 1.0
  learning_rate_scheduler:
    learning_rate: 0.0001
    lr_decay_style: linear
    lr_warmup_style: linear
    lr_warmup_steps: 1
    min_decay_lr: 1.0e-05
  optimizer_factory:
    adam_beta1: 0.9
    adam_beta2: 0.95
    adam_eps: 1.0e-08
    name: adamW
    torch_adam_is_fused: true
  weight_decay: 0.01
  # ZeRO stage 1: optimizer states sharded across the 4 DP ranks.
  zero_stage: 1
parallelism:
  dp: 4
  expert_parallel_size: 1
  pp: 1
  pp_engine: 1f1b
  tp: 2
  tp_linear_async_communication: false
  tp_mode: REDUCE_SCATTER
profiler:
  # Profiler traces are written next to the logs for this run.
  profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-32
tokenizer:
  tokenizer_max_length: null
  tokenizer_name_or_path: openai-community/gpt2
  tokenizer_revision: null
data_stages:
- name: Training Stage
  start_training_step: 1
  data:
    dataset:
      dataset_overwrite_cache: false
      dataset_processing_num_proc_per_process: 64
      hf_dataset_config_name: null
      hf_dataset_or_datasets: roneneldan/TinyStories
      hf_dataset_splits: train
      text_column_name: text
    num_loading_workers: 0
    seed: 42
lighteval: null
tokens:
  # Short benchmark run: only 20 optimizer steps, no validation/testing.
  train_steps: 20
  val_check_interval: -1
  batch_accumulation_per_replica: 8
  limit_test_batches: 0
  limit_val_batches: 0
  micro_batch_size: 32
  sequence_length: 4096
logging:
  iteration_step_info_interval: 1
  log_level: info
  log_level_replica: info
checkpoints:
  # Interval far beyond train_steps and path /dev/null: checkpoints are
  # intentionally discarded for this benchmark.
  checkpoint_interval: 100000
  checkpoints_path: /dev/null
  resume_checkpoint_path: null
llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-32/log.out ADDED
@@ -0,0 +1,600 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ========================
2
+ START TIME: Wed Jul 3 23:40:50 UTC 2024
3
+ python3 version = Python 3.10.14
4
+ ========================
5
+ The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
6
+ Token is valid (permission: write).
7
+ Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token
8
+ Login successful
9
+ Already on 'bench_cluster'
10
+ M examples/config_tiny_llama.py
11
+ M examples/config_tiny_llama.yaml
12
+ M examples/train_tiny_llama.sh
13
+ M src/nanotron/models/llama.py
14
+ M src/nanotron/trainer.py
15
+ Your branch is up to date with 'origin/bench_cluster'.
16
+ Job status: RUNNING
17
+ W0703 23:40:57.107000 139670796371776 torch/distributed/run.py:757]
18
+ W0703 23:40:57.107000 139670796371776 torch/distributed/run.py:757] *****************************************
19
+ W0703 23:40:57.107000 139670796371776 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
20
+ W0703 23:40:57.107000 139670796371776 torch/distributed/run.py:757] *****************************************
21
+ [default0]:07/03/2024 23:41:18 [WARNING|DP=0|PP=0|TP=0|ip-26-0-169-86]: [Vocab Size Padding] Padded vocab (size: 50257) with 1 dummy tokens (new size: 50258)
22
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: Config:
23
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: Config(general=GeneralArgs(project='bench_cluster',
24
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: run='%date_%jobid',
25
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: seed=42,
26
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: step=None,
27
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: consumed_train_samples=None,
28
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: benchmark_csv_path=None,
29
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: ignore_sanity_checks=True),
30
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: parallelism=ParallelismArgs(dp=4,
31
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: pp=1,
32
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: tp=2,
33
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: pp_engine=<nanotron.parallel.pipeline_parallel.engine.OneForwardOneBackwardPipelineEngine object at 0x7faf63ad5090>,
34
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: tp_mode=<TensorParallelLinearMode.REDUCE_SCATTER: 2>,
35
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: tp_linear_async_communication=False,
36
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: expert_parallel_size=1),
37
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1,
38
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: eos_token_id=2,
39
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: hidden_act='silu',
40
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: hidden_size=2048,
41
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: initializer_range=0.02,
42
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: intermediate_size=4096,
43
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: is_llama_config=True,
44
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: max_position_embeddings=4096,
45
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: num_attention_heads=32,
46
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: num_hidden_layers=24,
47
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: num_key_value_heads=32,
48
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: pad_token_id=None,
49
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: pretraining_tp=1,
50
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: rms_norm_eps=1e-05,
51
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: rope_scaling=None,
52
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: rope_theta=10000.0,
53
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: tie_word_embeddings=True,
54
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: use_cache=True,
55
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: vocab_size=50258),
56
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: init_method=RandomInit(std=0.025),
57
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: dtype=torch.bfloat16,
58
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: make_vocab_size_divisible_by=1,
59
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: ddp_bucket_cap_mb=25),
60
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2',
61
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: tokenizer_revision=None,
62
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: tokenizer_max_length=None),
63
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'),
64
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: checkpoint_interval=100000,
65
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: save_initial_state=False,
66
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: resume_checkpoint_path=None,
67
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: checkpoints_path_is_shared_file_system=False),
68
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: logging=LoggingArgs(log_level='info',
69
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: log_level_replica='info',
70
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: iteration_step_info_interval=1),
71
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: tokens=TokensArgs(sequence_length=4096,
72
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: train_steps=20,
73
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: micro_batch_size=32,
74
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: batch_accumulation_per_replica=8,
75
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: val_check_interval=-1,
76
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: limit_val_batches=0,
77
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: limit_test_batches=0),
78
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08,
79
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: adam_beta1=0.9,
80
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: adam_beta2=0.95,
81
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: torch_adam_is_fused=True,
82
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: name='adamW'),
83
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: zero_stage=1,
84
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: weight_decay=0.01,
85
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: clip_grad=1.0,
86
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: accumulate_grad_in_fp32=True,
87
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001,
88
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: lr_warmup_steps=1,
89
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: lr_warmup_style='linear',
90
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: lr_decay_style='linear',
91
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: lr_decay_steps=19,
92
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: lr_decay_starting_step=None,
93
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: min_decay_lr=1e-05)),
94
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: data_stages=[DatasetStageArgs(name='Training Stage',
95
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: start_training_step=1,
96
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories',
97
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: hf_dataset_splits='train',
98
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: hf_dataset_config_name=None,
99
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: dataset_processing_num_proc_per_process=64,
100
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: dataset_overwrite_cache=False,
101
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: text_column_name='text'),
102
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: seed=42,
103
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: num_loading_workers=0))],
104
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-32')),
105
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: lighteval=None)
106
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: Model Config:
107
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: LlamaConfig(bos_token_id=1,
108
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: eos_token_id=2,
109
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: hidden_act='silu',
110
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: hidden_size=2048,
111
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: initializer_range=0.02,
112
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: intermediate_size=4096,
113
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: is_llama_config=True,
114
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: max_position_embeddings=4096,
115
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: num_attention_heads=32,
116
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: num_hidden_layers=24,
117
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: num_key_value_heads=32,
118
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: pad_token_id=None,
119
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: pretraining_tp=1,
120
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: rms_norm_eps=1e-05,
121
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: rope_scaling=None,
122
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: rope_theta=10000.0,
123
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: tie_word_embeddings=True,
124
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: use_cache=True,
125
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: vocab_size=50258)
126
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: Building model..
127
+ [default0]:07/03/2024 23:41:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: Setting PP block ranks...
128
+ [default1]:07/03/2024 23:41:29 [INFO|DP=0|PP=0|TP=1|ip-26-0-169-86]: Local number of parameters: 555M (1058.35MiB)
129
+ [default1]:07/03/2024 23:41:29 [INFO|DP=0|PP=0|TP=1|ip-26-0-169-86]: [After model building] Memory usage: 1082.37MiB. Peak allocated: 1182.56MiB Peak reserved: 1200.00MiB
130
+ [default1]:07/03/2024 23:41:29 [INFO|DP=0|PP=0|TP=1|ip-26-0-169-86]: No checkpoint path provided.
131
+ [default0]:07/03/2024 23:41:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: Total number of parameters: 1.11G (2116.70MiB)
132
+ [default0]:07/03/2024 23:41:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: Local number of parameters: 555M (1058.35MiB)
133
+ [default0]:07/03/2024 23:41:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: [After model building] Memory usage: 1082.37MiB. Peak allocated: 1182.56MiB Peak reserved: 1200.00MiB
134
+ [default0]:07/03/2024 23:41:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: No checkpoint path provided.
135
+ [default0]:07/03/2024 23:41:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: Parametrizing model parameters using StandardParametrizator
136
+ [default3]:07/03/2024 23:41:29 [INFO|DP=1|PP=0|TP=1|ip-26-0-169-86]: No checkpoint path provided.
137
+ [default2]:07/03/2024 23:41:29 [INFO|DP=1|PP=0|TP=0|ip-26-0-169-86]: No checkpoint path provided.
138
+ [default4]:07/03/2024 23:41:29 [INFO|DP=2|PP=0|TP=0|ip-26-0-169-86]: No checkpoint path provided.
139
+ [default5]:07/03/2024 23:41:29 [INFO|DP=2|PP=0|TP=1|ip-26-0-169-86]: No checkpoint path provided.
140
+ [default6]:07/03/2024 23:41:29 [INFO|DP=3|PP=0|TP=0|ip-26-0-169-86]: No checkpoint path provided.
141
+ [default7]:07/03/2024 23:41:29 [INFO|DP=3|PP=0|TP=1|ip-26-0-169-86]: No checkpoint path provided.
142
+ [default0]:07/03/2024 23:41:34 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: [Optimizer Building] Using LearningRateForSP as learning rate
143
+ [default0]:07/03/2024 23:41:34 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: [ZeRO sharding] Size of optimizer params per rank:
144
+ [default0]:07/03/2024 23:41:34 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: [ZeRO sharding] DP Rank 0 has 139M out of 555M (25.00%) params' optimizer states
145
+ [default0]:07/03/2024 23:41:34 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: [ZeRO sharding] DP Rank 1 has 139M out of 555M (25.00%) params' optimizer states
146
+ [default0]:07/03/2024 23:41:34 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: [ZeRO sharding] DP Rank 2 has 139M out of 555M (25.00%) params' optimizer states
147
+ [default0]:07/03/2024 23:41:34 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: [ZeRO sharding] DP Rank 3 has 139M out of 555M (25.00%) params' optimizer states
148
+ [default0]:07/03/2024 23:41:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples
149
+ [default0]:07/03/2024 23:41:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: Using `datasets` library
150
+ [default0]:07/03/2024 23:41:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4')
151
+ [default0]:07/03/2024 23:41:36 [WARNING|DP=0|PP=0|TP=0|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty.
152
+ [default0]:Repo card metadata block was not found. Setting CardData to empty.
153
+ [default0]:07/03/2024 23:41:38 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: [Training Plan] There are 1 training stages
154
+ [default0]:07/03/2024 23:41:38 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: [Stage Training Stage] start from step 1
155
+ [default0]:07/03/2024 23:41:38 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]:
156
+ [default0]:07/03/2024 23:41:38 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: [Start training] datetime: 2024-07-03 23:41:38.199155 | mbs: 32 | grad_accum: 8 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0
157
+ [default0]:07/03/2024 23:41:38 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps
158
+ [default0]:07/03/2024 23:41:38 [INFO|DP=0|PP=0|TP=0|ip-26-0-169-86]: Memory usage: 3729.08MiB. Peak allocated 3729.08MiB. Peak reserved: 3848.00MiB
159
+ [default5]:07/03/2024 23:41:38 [WARNING|DP=2|PP=0|TP=1|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty.
160
+ [default2]:07/03/2024 23:41:38 [WARNING|DP=1|PP=0|TP=0|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty.
161
+ [default1]:07/03/2024 23:41:38 [WARNING|DP=0|PP=0|TP=1|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty.
162
+ [default4]:07/03/2024 23:41:38 [WARNING|DP=2|PP=0|TP=0|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty.
163
+ [default5]:Repo card metadata block was not found. Setting CardData to empty.
164
+ [default3]:07/03/2024 23:41:38 [WARNING|DP=1|PP=0|TP=1|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty.
165
+ [default7]:Repo card metadata block was not found. Setting CardData to empty.
166
+ [default7]:07/03/2024 23:41:38 [WARNING|DP=3|PP=0|TP=1|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty.
167
+ [default6]:07/03/2024 23:41:38 [WARNING|DP=3|PP=0|TP=0|ip-26-0-169-86]: Repo card metadata block was not found. Setting CardData to empty.
168
+ [default1]:Repo card metadata block was not found. Setting CardData to empty.
169
+ [default3]:Repo card metadata block was not found. Setting CardData to empty.
170
+ [default2]:Repo card metadata block was not found. Setting CardData to empty.
171
+ [default4]:Repo card metadata block was not found. Setting CardData to empty.
172
+ [default6]:Repo card metadata block was not found. Setting CardData to empty.
173
+ [default0]:[rank0]: Traceback (most recent call last):
174
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
175
+ [default0]:[rank0]: trainer.train(dataloader)
176
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
177
+ [default0]:[rank0]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
178
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
179
+ [default0]:[rank0]: outputs = self.pipeline_engine.train_batch_iter(
180
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
181
+ [default0]:[rank0]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
182
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
183
+ [default0]:[rank0]: output = model(**micro_batch)
184
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
185
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
186
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
187
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
188
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
189
+ [default0]:[rank0]: sharded_logits = self.model(
190
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
191
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
192
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
193
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
194
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
195
+ [default0]:[rank0]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
196
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
197
+ [default0]:[rank0]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
198
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
199
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
200
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
201
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
202
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
203
+ [default0]:[rank0]: output = self.pp_block(**new_kwargs)
204
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
205
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
206
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
207
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
208
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
209
+ [default0]:[rank0]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
210
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
211
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
212
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
213
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
214
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
215
+ [default0]:[rank0]: hidden_states = self.down_proj(self.split_silu_mul(merged_states))
216
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
217
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
218
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
219
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
220
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward
221
+ [default0]:[rank0]: return row_linear(
222
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear
223
+ [default0]:[rank0]: out = F.linear(input, weight, bias)
224
+ [default0]:[rank0]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 512.00 MiB. GPU
225
+ [default1]:[rank1]: Traceback (most recent call last):
226
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
227
+ [default1]:[rank1]: trainer.train(dataloader)
228
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
229
+ [default1]:[rank1]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
230
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
231
+ [default1]:[rank1]: outputs = self.pipeline_engine.train_batch_iter(
232
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
233
+ [default1]:[rank1]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
234
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
235
+ [default1]:[rank1]: output = model(**micro_batch)
236
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
237
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
238
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
239
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
240
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
241
+ [default1]:[rank1]: sharded_logits = self.model(
242
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
243
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
244
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
245
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
246
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
247
+ [default1]:[rank1]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
248
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
249
+ [default1]:[rank1]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
250
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
251
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
252
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
253
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
254
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
255
+ [default1]:[rank1]: output = self.pp_block(**new_kwargs)
256
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
257
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
258
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
259
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
260
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
261
+ [default1]:[rank1]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
262
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
263
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
264
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
265
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
266
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
267
+ [default1]:[rank1]: hidden_states = self.down_proj(self.split_silu_mul(merged_states))
268
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
269
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
270
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
271
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
272
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 128, in forward
273
+ [default1]:[rank1]: return self.act(gate_states) * up_states
274
+ [default1]:[rank1]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 512.00 MiB. GPU  has a total capacity of 79.33 GiB of which 461.94 MiB is free. Including non-PyTorch memory, this process has 78.87 GiB memory in use. Of the allocated memory 66.78 GiB is allocated by PyTorch, and 689.53 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
275
+ [default6]:[rank6]: Traceback (most recent call last):
276
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
277
+ [default6]:[rank6]: trainer.train(dataloader)
278
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
279
+ [default6]:[rank6]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
280
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
281
+ [default6]:[rank6]: outputs = self.pipeline_engine.train_batch_iter(
282
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
283
+ [default6]:[rank6]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
284
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
285
+ [default6]:[rank6]: output = model(**micro_batch)
286
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
287
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
288
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
289
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
290
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
291
+ [default6]:[rank6]: sharded_logits = self.model(
292
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
293
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
294
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
295
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
296
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
297
+ [default6]:[rank6]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
298
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
299
+ [default6]:[rank6]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
300
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
301
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
302
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
303
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
304
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
305
+ [default6]:[rank6]: output = self.pp_block(**new_kwargs)
306
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
307
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
308
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
309
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
310
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
311
+ [default6]:[rank6]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
312
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
313
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
314
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
315
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
316
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
317
+ [default6]:[rank6]: hidden_states = self.down_proj(self.split_silu_mul(merged_states))
318
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
319
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
320
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
321
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
322
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward
323
+ [default6]:[rank6]: return row_linear(
324
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear
325
+ [default6]:[rank6]: out = F.linear(input, weight, bias)
326
+ [default6]:[rank6]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 512.00 MiB. GPU  has a total capacity of 79.33 GiB of which 141.94 MiB is free. Including non-PyTorch memory, this process has 79.18 GiB memory in use. Of the allocated memory 67.28 GiB is allocated by PyTorch, and 689.53 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
327
+ [default5]:[rank5]: Traceback (most recent call last):
328
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
329
+ [default5]:[rank5]: trainer.train(dataloader)
330
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
331
+ [default5]:[rank5]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
332
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
333
+ [default5]:[rank5]: outputs = self.pipeline_engine.train_batch_iter(
334
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
335
+ [default5]:[rank5]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
336
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
337
+ [default5]:[rank5]: output = model(**micro_batch)
338
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
339
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
340
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
341
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
342
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
343
+ [default5]:[rank5]: sharded_logits = self.model(
344
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
345
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
346
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
347
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
348
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
349
+ [default5]:[rank5]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
350
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
351
+ [default5]:[rank5]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
352
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
353
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
354
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
355
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
356
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
357
+ [default5]:[rank5]: output = self.pp_block(**new_kwargs)
358
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
359
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
360
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
361
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
362
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
363
+ [default5]:[rank5]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
364
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
365
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
366
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
367
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
368
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
369
+ [default5]:[rank5]: hidden_states = self.down_proj(self.split_silu_mul(merged_states))
370
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
371
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
372
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
373
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
374
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 128, in forward
375
+ [default5]:[rank5]: return self.act(gate_states) * up_states
376
+ [default5]:[rank5]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 512.00 MiB. GPU  has a total capacity of 79.33 GiB of which 413.94 MiB is free. Including non-PyTorch memory, this process has 78.91 GiB memory in use. Of the allocated memory 66.78 GiB is allocated by PyTorch, and 689.53 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
377
+ [default4]:[rank4]: Traceback (most recent call last):
378
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
379
+ [default4]:[rank4]: trainer.train(dataloader)
380
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
381
+ [default4]:[rank4]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
382
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
383
+ [default4]:[rank4]: outputs = self.pipeline_engine.train_batch_iter(
384
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
385
+ [default4]:[rank4]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
386
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
387
+ [default4]:[rank4]: output = model(**micro_batch)
388
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
389
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
390
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
391
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
392
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
393
+ [default4]:[rank4]: sharded_logits = self.model(
394
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
395
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
396
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
397
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
398
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
399
+ [default4]:[rank4]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
400
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
401
+ [default4]:[rank4]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
402
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
403
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
404
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
405
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
406
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
407
+ [default4]:[rank4]: output = self.pp_block(**new_kwargs)
408
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
409
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
410
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
411
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
412
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
413
+ [default4]:[rank4]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
414
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
415
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
416
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
417
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
418
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
419
+ [default4]:[rank4]: hidden_states = self.down_proj(self.split_silu_mul(merged_states))
420
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
421
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
422
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
423
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
424
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 128, in forward
425
+ [default4]:[rank4]: return self.act(gate_states) * up_states
426
+ [default4]:[rank4]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 512.00 MiB. GPU  has a total capacity of 79.33 GiB of which 413.94 MiB is free. Including non-PyTorch memory, this process has 78.91 GiB memory in use. Of the allocated memory 66.78 GiB is allocated by PyTorch, and 689.53 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
427
+ [default2]:[rank2]: Traceback (most recent call last):
428
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
429
+ [default2]:[rank2]: trainer.train(dataloader)
430
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
431
+ [default3]:[rank3]: Traceback (most recent call last):
432
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
433
+ [default3]:[rank3]: trainer.train(dataloader)
434
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
435
+ [default3]:[rank3]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
436
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
437
+ [default3]:[rank3]: outputs = self.pipeline_engine.train_batch_iter(
438
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
439
+ [default2]:[rank2]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
440
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
441
+ [default3]:[rank3]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
442
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
443
+ [default2]:[rank2]: outputs = self.pipeline_engine.train_batch_iter(
444
+ [default3]:[rank3]: output = model(**micro_batch)
445
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
446
+ [default2]:[rank2]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
447
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
448
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
449
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
450
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
451
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
452
+ [default3]:[rank3]: sharded_logits = self.model(
453
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
454
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
455
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
456
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
457
+ [default2]:[rank2]: output = model(**micro_batch)
458
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
459
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
460
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
461
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
462
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
463
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
464
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
465
+ [default2]:[rank2]: sharded_logits = self.model(
466
+ [default3]:[rank3]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
467
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
468
+ [default3]:[rank3]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
469
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
470
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
471
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
472
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
473
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
474
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
475
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
476
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
477
+ [default3]:[rank3]: output = self.pp_block(**new_kwargs)
478
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
479
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
480
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
481
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
482
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
483
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
484
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
485
+ [default3]:[rank3]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
486
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
487
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
488
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
489
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
490
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
491
+ [default3]:[rank3]: hidden_states = self.down_proj(self.split_silu_mul(merged_states))
492
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
493
+ [default2]:[rank2]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
494
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
495
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
496
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
497
+ [default2]:[rank2]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
498
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
499
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 128, in forward
500
+ [default3]:[rank3]: return self.act(gate_states) * up_states
501
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
502
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
503
+ [default3]:[rank3]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 512.00 MiB. GPU  has a total capacity of 79.33 GiB of which 413.94 MiB is free. Including non-PyTorch memory, this process has 78.91 GiB memory in use. Of the allocated memory 66.78 GiB is allocated by PyTorch, and 689.53 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
504
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
505
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
506
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
507
+ [default2]:[rank2]: output = self.pp_block(**new_kwargs)
508
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
509
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
510
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
511
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
512
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
513
+ [default2]:[rank2]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
514
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
515
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
516
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
517
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
518
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
519
+ [default2]:[rank2]: hidden_states = self.down_proj(self.split_silu_mul(merged_states))
520
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
521
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
522
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
523
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
524
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 128, in forward
525
+ [default2]:[rank2]: return self.act(gate_states) * up_states
526
+ [default2]:[rank2]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 512.00 MiB. GPU  has a total capacity of 79.33 GiB of which 413.94 MiB is free. Including non-PyTorch memory, this process has 78.91 GiB memory in use. Of the allocated memory 66.78 GiB is allocated by PyTorch, and 689.53 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
527
+ W0703 23:41:47.490000 139670796371776 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2276671 closing signal SIGTERM
528
+ E0703 23:41:48.312000 139670796371776 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 2276664) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
529
+ Traceback (most recent call last):
530
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in <module>
531
+ sys.exit(main())
532
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
533
+ return f(*args, **kwargs)
534
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main
535
+ run(args)
536
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run
537
+ elastic_launch(
538
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
539
+ return launch_agent(self._config, self._entrypoint, list(args))
540
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent
541
+ raise ChildFailedError(
542
+ torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
543
+ ============================================================
544
+ /fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED
545
+ ------------------------------------------------------------
546
+ Failures:
547
+ [1]:
548
+ time : 2024-07-03_23:41:47
549
+ host : ip-26-0-169-86.ec2.internal
550
+ rank : 1 (local_rank: 1)
551
+ exitcode : 1 (pid: 2276665)
552
+ error_file: <N/A>
553
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
554
+ [2]:
555
+ time : 2024-07-03_23:41:47
556
+ host : ip-26-0-169-86.ec2.internal
557
+ rank : 2 (local_rank: 2)
558
+ exitcode : 1 (pid: 2276666)
559
+ error_file: <N/A>
560
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
561
+ [3]:
562
+ time : 2024-07-03_23:41:47
563
+ host : ip-26-0-169-86.ec2.internal
564
+ rank : 3 (local_rank: 3)
565
+ exitcode : 1 (pid: 2276667)
566
+ error_file: <N/A>
567
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
568
+ [4]:
569
+ time : 2024-07-03_23:41:47
570
+ host : ip-26-0-169-86.ec2.internal
571
+ rank : 4 (local_rank: 4)
572
+ exitcode : 1 (pid: 2276668)
573
+ error_file: <N/A>
574
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
575
+ [5]:
576
+ time : 2024-07-03_23:41:47
577
+ host : ip-26-0-169-86.ec2.internal
578
+ rank : 5 (local_rank: 5)
579
+ exitcode : 1 (pid: 2276669)
580
+ error_file: <N/A>
581
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
582
+ [6]:
583
+ time : 2024-07-03_23:41:47
584
+ host : ip-26-0-169-86.ec2.internal
585
+ rank : 6 (local_rank: 6)
586
+ exitcode : 1 (pid: 2276670)
587
+ error_file: <N/A>
588
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
589
+ ------------------------------------------------------------
590
+ Root Cause (first observed failure):
591
+ [0]:
592
+ time : 2024-07-03_23:41:47
593
+ host : ip-26-0-169-86.ec2.internal
594
+ rank : 0 (local_rank: 0)
595
+ exitcode : 1 (pid: 2276664)
596
+ error_file: <N/A>
597
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
598
+ ============================================================
599
+ srun: error: ip-26-0-169-86: task 0: Exited with exit code 1
600
+ Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details.
llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-32/status.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ oom