3outeille HF Staff commited on
Commit
3457378
·
verified ·
1 Parent(s): d19d790

Upload llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-16

Browse files
llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-16/bench.slurm ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ #SBATCH --job-name=bench_cluster
4
+ #SBATCH --time=02:00:00
5
+ #SBATCH --partition=hopper-prod
6
+ #SBATCH --nodes=1
7
+ #SBATCH --gres=gpu:8
8
+ #SBATCH --qos=normal
9
+ #SBATCH --ntasks-per-node=1
10
+ #SBATCH --cpus-per-task=96
11
+ #SBATCH --exclusive
12
+ #SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-16/log.out
13
+ #SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-16/log.out
14
+
15
+ # Function to update status based on squeue output
16
+ update_status() {
17
+ job_id=$1
18
+ status_file=$2
19
+ # For unknown reasons, it doenst update status for pending. It only works for running
20
+ while true; do
21
+ job_status=$(squeue --job $job_id --noheader --format=%T)
22
+ echo "Job status: $job_status"
23
+ if [ -z "$job_status" ]; then
24
+ # Job has finished or is not found
25
+ break
26
+ elif [ "$job_status" = "RUNNING" ]; then
27
+ printf "running" > $status_file
28
+ break
29
+ fi
30
+ sleep 10
31
+ done
32
+ }
33
+
34
+ # Misc initializations.
35
+ echo "========================"
36
+ echo "START TIME: $(date)"
37
+ source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh
38
+ conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster
39
+ echo python3 version = $(python3 --version)
40
+ echo "========================"
41
+
42
+ # Slurm stuff
43
+ export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
44
+ export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
45
+ export MASTER_PORT=$((1024 + RANDOM % 64511))
46
+
47
+ export TMPDIR=/scratch
48
+ export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache"
49
+ export CUBLAS_WORKSPACE_CONFIG=":4096:8"
50
+ export CUDA_DEVICE_MAX_CONNECTIONS="1"
51
+
52
+ huggingface-cli login --token $HUGGINGFACE_TOKEN
53
+
54
+
55
+ NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron"
56
+ CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-16/config.yaml"
57
+
58
+ LAUNCHER="torchrun \
59
+ --nproc_per_node 8 \
60
+ --nnodes 1 \
61
+ --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \
62
+ --rdzv_backend c10d \
63
+ --max_restarts 0 \
64
+ --tee 3 \
65
+ --node_rank ${SLURM_PROCID}"
66
+
67
+ # Checkout the bench_cluster branch
68
+ cd $NANOTRON_REPO
69
+ git checkout bench_cluster
70
+ cd ..
71
+ # Get the current job ID
72
+ job_id=${SLURM_JOB_ID}
73
+
74
+ # Update status to "pending" or "running" in the background
75
+ update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-16/status.txt &
76
+
77
+ # Run the main command
78
+ srun -u $LAUNCHER $CMD
79
+ exit_status=$?
80
+
81
+ # Update status based on the exit status of `srun`
82
+ if [ $exit_status -eq 0 ]; then
83
+ printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-16/status.txt
84
+ else
85
+ if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-16/log.out; then
86
+ printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-16/status.txt
87
+ elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-16/log.out; then
88
+ printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-16/status.txt
89
+ elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-16/log.out; then
90
+ printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-16/status.txt
91
+ else
92
+ printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-16/status.txt
93
+ fi
94
+ fi
95
+
96
+ # Run the report script if the job completed successfully
97
+ if [ $exit_status -eq 0 ]; then
98
+ python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-16 --is_logs
99
+ python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-16 --is_profiler
100
+ fi
101
+
102
+
103
+ # Push to hub the folder using huggingface_cli
104
+ huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-16 llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-16 --commit-message "Upload llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-16"
105
+
106
+ # Verify the upload
107
+ if [ $? -eq 0 ]; then
108
+ echo "Uploading to Huggingface Hub successful"
109
+ else
110
+ echo "Failed to upload to Huggingface Hub"
111
+ fi
llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-16/config.yaml ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ general:
2
+ project: bench_cluster
3
+ seed: 42
4
+ model:
5
+ ddp_bucket_cap_mb: 25
6
+ dtype: bfloat16
7
+ init_method:
8
+ std: 0.025
9
+ make_vocab_size_divisible_by: 1
10
+ model_config:
11
+ bos_token_id: 1
12
+ eos_token_id: 2
13
+ hidden_act: silu
14
+ hidden_size: 2048
15
+ initializer_range: 0.02
16
+ intermediate_size: 4096
17
+ is_llama_config: true
18
+ max_position_embeddings: 4096
19
+ num_attention_heads: 32
20
+ num_hidden_layers: 24
21
+ num_key_value_heads: 32
22
+ pad_token_id: null
23
+ pretraining_tp: 1
24
+ rms_norm_eps: 1.0e-05
25
+ rope_scaling: null
26
+ rope_theta: 10000.0
27
+ tie_word_embeddings: true
28
+ use_cache: true
29
+ vocab_size: 50257
30
+ optimizer:
31
+ accumulate_grad_in_fp32: true
32
+ clip_grad: 1.0
33
+ learning_rate_scheduler:
34
+ learning_rate: 0.0001
35
+ lr_decay_style: linear
36
+ lr_warmup_style: linear
37
+ lr_warmup_steps: 1
38
+ min_decay_lr: 1.0e-05
39
+ optimizer_factory:
40
+ adam_beta1: 0.9
41
+ adam_beta2: 0.95
42
+ adam_eps: 1.0e-08
43
+ name: adamW
44
+ torch_adam_is_fused: true
45
+ weight_decay: 0.01
46
+ zero_stage: 1
47
+ parallelism:
48
+ dp: 4
49
+ expert_parallel_size: 1
50
+ pp: 1
51
+ pp_engine: 1f1b
52
+ tp: 2
53
+ tp_linear_async_communication: false
54
+ tp_mode: REDUCE_SCATTER
55
+ profiler:
56
+ profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-16
57
+ tokenizer:
58
+ tokenizer_max_length: null
59
+ tokenizer_name_or_path: openai-community/gpt2
60
+ tokenizer_revision: null
61
+ data_stages:
62
+ - name: Training Stage
63
+ start_training_step: 1
64
+ data:
65
+ dataset:
66
+ dataset_overwrite_cache: false
67
+ dataset_processing_num_proc_per_process: 64
68
+ hf_dataset_config_name: null
69
+ hf_dataset_or_datasets: roneneldan/TinyStories
70
+ hf_dataset_splits: train
71
+ text_column_name: text
72
+ num_loading_workers: 0
73
+ seed: 42
74
+ lighteval: null
75
+ tokens:
76
+ train_steps: 20
77
+ val_check_interval: -1
78
+ batch_accumulation_per_replica: 16
79
+ limit_test_batches: 0
80
+ limit_val_batches: 0
81
+ micro_batch_size: 16
82
+ sequence_length: 4096
83
+ logging:
84
+ iteration_step_info_interval: 1
85
+ log_level: info
86
+ log_level_replica: info
87
+ checkpoints:
88
+ checkpoint_interval: 100000
89
+ checkpoints_path: /dev/null
90
+ resume_checkpoint_path: null
llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-16/log.out ADDED
@@ -0,0 +1,566 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ========================
2
+ START TIME: Wed Jul 3 22:42:50 UTC 2024
3
+ python3 version = Python 3.10.14
4
+ ========================
5
+ The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
6
+ Token is valid (permission: write).
7
+ Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token
8
+ Login successful
9
+ fatal: Unable to create '/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/.git/index.lock': File exists.
10
+
11
+ Another git process seems to be running in this repository, e.g.
12
+ an editor opened by 'git commit'. Please make sure all processes
13
+ are terminated then try again. If it still fails, a git process
14
+ may have crashed in this repository earlier:
15
+ remove the file manually to continue.
16
+ Job status: RUNNING
17
+ W0703 22:42:58.927000 140633245345600 torch/distributed/run.py:757]
18
+ W0703 22:42:58.927000 140633245345600 torch/distributed/run.py:757] *****************************************
19
+ W0703 22:42:58.927000 140633245345600 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
20
+ W0703 22:42:58.927000 140633245345600 torch/distributed/run.py:757] *****************************************
21
+ [default0]:07/03/2024 22:43:20 [WARNING|DP=0|PP=0|TP=0|ip-26-0-161-178]: [Vocab Size Padding] Padded vocab (size: 50257) with 1 dummy tokens (new size: 50258)
22
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: Config:
23
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: Config(general=GeneralArgs(project='bench_cluster',
24
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: run='%date_%jobid',
25
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: seed=42,
26
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: step=None,
27
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: consumed_train_samples=None,
28
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: benchmark_csv_path=None,
29
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: ignore_sanity_checks=True),
30
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: parallelism=ParallelismArgs(dp=4,
31
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: pp=1,
32
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: tp=2,
33
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: pp_engine=<nanotron.parallel.pipeline_parallel.engine.OneForwardOneBackwardPipelineEngine object at 0x7f0d9451c820>,
34
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: tp_mode=<TensorParallelLinearMode.REDUCE_SCATTER: 2>,
35
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: tp_linear_async_communication=False,
36
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: expert_parallel_size=1),
37
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1,
38
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: eos_token_id=2,
39
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: hidden_act='silu',
40
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: hidden_size=2048,
41
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: initializer_range=0.02,
42
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: intermediate_size=4096,
43
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: is_llama_config=True,
44
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: max_position_embeddings=4096,
45
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: num_attention_heads=32,
46
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: num_hidden_layers=24,
47
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: num_key_value_heads=32,
48
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: pad_token_id=None,
49
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: pretraining_tp=1,
50
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: rms_norm_eps=1e-05,
51
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: rope_scaling=None,
52
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: rope_theta=10000.0,
53
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: tie_word_embeddings=True,
54
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: use_cache=True,
55
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: vocab_size=50258),
56
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: init_method=RandomInit(std=0.025),
57
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: dtype=torch.bfloat16,
58
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: make_vocab_size_divisible_by=1,
59
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: ddp_bucket_cap_mb=25),
60
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2',
61
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: tokenizer_revision=None,
62
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: tokenizer_max_length=None),
63
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'),
64
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: checkpoint_interval=100000,
65
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: save_initial_state=False,
66
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: resume_checkpoint_path=None,
67
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: checkpoints_path_is_shared_file_system=False),
68
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: logging=LoggingArgs(log_level='info',
69
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: log_level_replica='info',
70
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: iteration_step_info_interval=1),
71
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: tokens=TokensArgs(sequence_length=4096,
72
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: train_steps=20,
73
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: micro_batch_size=16,
74
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: batch_accumulation_per_replica=16,
75
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: val_check_interval=-1,
76
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: limit_val_batches=0,
77
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: limit_test_batches=0),
78
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08,
79
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: adam_beta1=0.9,
80
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: adam_beta2=0.95,
81
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: torch_adam_is_fused=True,
82
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: name='adamW'),
83
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: zero_stage=1,
84
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: weight_decay=0.01,
85
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: clip_grad=1.0,
86
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: accumulate_grad_in_fp32=True,
87
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001,
88
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: lr_warmup_steps=1,
89
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: lr_warmup_style='linear',
90
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: lr_decay_style='linear',
91
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: lr_decay_steps=19,
92
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: lr_decay_starting_step=None,
93
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: min_decay_lr=1e-05)),
94
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: data_stages=[DatasetStageArgs(name='Training Stage',
95
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: start_training_step=1,
96
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories',
97
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: hf_dataset_splits='train',
98
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: hf_dataset_config_name=None,
99
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: dataset_processing_num_proc_per_process=64,
100
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: dataset_overwrite_cache=False,
101
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: text_column_name='text'),
102
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: seed=42,
103
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: num_loading_workers=0))],
104
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-16')),
105
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: lighteval=None)
106
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: Model Config:
107
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: LlamaConfig(bos_token_id=1,
108
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: eos_token_id=2,
109
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: hidden_act='silu',
110
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: hidden_size=2048,
111
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: initializer_range=0.02,
112
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: intermediate_size=4096,
113
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: is_llama_config=True,
114
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: max_position_embeddings=4096,
115
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: num_attention_heads=32,
116
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: num_hidden_layers=24,
117
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: num_key_value_heads=32,
118
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: pad_token_id=None,
119
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: pretraining_tp=1,
120
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: rms_norm_eps=1e-05,
121
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: rope_scaling=None,
122
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: rope_theta=10000.0,
123
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: tie_word_embeddings=True,
124
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: use_cache=True,
125
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: vocab_size=50258)
126
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: Building model..
127
+ [default0]:07/03/2024 22:43:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: Setting PP block ranks...
128
+ [default1]:07/03/2024 22:43:31 [INFO|DP=0|PP=0|TP=1|ip-26-0-161-178]: Local number of parameters: 555M (1058.35MiB)
129
+ [default1]:07/03/2024 22:43:31 [INFO|DP=0|PP=0|TP=1|ip-26-0-161-178]: [After model building] Memory usage: 1082.37MiB. Peak allocated: 1182.56MiB Peak reserved: 1200.00MiB
130
+ [default1]:07/03/2024 22:43:31 [INFO|DP=0|PP=0|TP=1|ip-26-0-161-178]: No checkpoint path provided.
131
+ [default0]:07/03/2024 22:43:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: Total number of parameters: 1.11G (2116.70MiB)
132
+ [default0]:07/03/2024 22:43:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: Local number of parameters: 555M (1058.35MiB)
133
+ [default0]:07/03/2024 22:43:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: [After model building] Memory usage: 1082.37MiB. Peak allocated: 1182.56MiB Peak reserved: 1200.00MiB
134
+ [default0]:07/03/2024 22:43:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: No checkpoint path provided.
135
+ [default0]:07/03/2024 22:43:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: Parametrizing model parameters using StandardParametrizator
136
+ [default2]:07/03/2024 22:43:31 [INFO|DP=1|PP=0|TP=0|ip-26-0-161-178]: No checkpoint path provided.
137
+ [default3]:07/03/2024 22:43:31 [INFO|DP=1|PP=0|TP=1|ip-26-0-161-178]: No checkpoint path provided.
138
+ [default5]:07/03/2024 22:43:31 [INFO|DP=2|PP=0|TP=1|ip-26-0-161-178]: No checkpoint path provided.
139
+ [default4]:07/03/2024 22:43:31 [INFO|DP=2|PP=0|TP=0|ip-26-0-161-178]: No checkpoint path provided.
140
+ [default6]:07/03/2024 22:43:31 [INFO|DP=3|PP=0|TP=0|ip-26-0-161-178]: No checkpoint path provided.
141
+ [default7]:07/03/2024 22:43:31 [INFO|DP=3|PP=0|TP=1|ip-26-0-161-178]: No checkpoint path provided.
142
+ [default0]:07/03/2024 22:43:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: [Optimizer Building] Using LearningRateForSP as learning rate
143
+ [default0]:07/03/2024 22:43:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: [ZeRO sharding] Size of optimizer params per rank:
144
+ [default0]:07/03/2024 22:43:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: [ZeRO sharding] DP Rank 0 has 139M out of 555M (25.00%) params' optimizer states
145
+ [default0]:07/03/2024 22:43:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: [ZeRO sharding] DP Rank 1 has 139M out of 555M (25.00%) params' optimizer states
146
+ [default0]:07/03/2024 22:43:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: [ZeRO sharding] DP Rank 2 has 139M out of 555M (25.00%) params' optimizer states
147
+ [default0]:07/03/2024 22:43:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: [ZeRO sharding] DP Rank 3 has 139M out of 555M (25.00%) params' optimizer states
148
+ [default0]:07/03/2024 22:43:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples
149
+ [default0]:07/03/2024 22:43:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: Using `datasets` library
150
+ [default0]:07/03/2024 22:43:37 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4')
151
+ [default0]:Repo card metadata block was not found. Setting CardData to empty.
152
+ [default0]:07/03/2024 22:43:37 [WARNING|DP=0|PP=0|TP=0|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty.
153
+ [default0]:07/03/2024 22:43:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: [Training Plan] There are 1 training stages
154
+ [default0]:07/03/2024 22:43:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: [Stage Training Stage] start from step 1
155
+ [default0]:07/03/2024 22:43:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]:
156
+ [default0]:07/03/2024 22:43:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: [Start training] datetime: 2024-07-03 22:43:39.757881 | mbs: 16 | grad_accum: 16 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0
157
+ [default0]:07/03/2024 22:43:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps
158
+ [default0]:07/03/2024 22:43:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: Memory usage: 3729.08MiB. Peak allocated 3729.08MiB. Peak reserved: 3848.00MiB
159
+ [default6]:Repo card metadata block was not found. Setting CardData to empty.
160
+ [default7]:07/03/2024 22:43:39 [WARNING|DP=3|PP=0|TP=1|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty.
161
+ [default6]:07/03/2024 22:43:39 [WARNING|DP=3|PP=0|TP=0|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty.
162
+ [default1]:07/03/2024 22:43:39 [WARNING|DP=0|PP=0|TP=1|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty.
163
+ [default5]:07/03/2024 22:43:39 [WARNING|DP=2|PP=0|TP=1|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty.
164
+ [default3]:07/03/2024 22:43:39 [WARNING|DP=1|PP=0|TP=1|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty.
165
+ [default4]:07/03/2024 22:43:39 [WARNING|DP=2|PP=0|TP=0|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty.
166
+ [default1]:Repo card metadata block was not found. Setting CardData to empty.
167
+ [default2]:07/03/2024 22:43:39 [WARNING|DP=1|PP=0|TP=0|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty.
168
+ [default3]:Repo card metadata block was not found. Setting CardData to empty.
169
+ [default5]:Repo card metadata block was not found. Setting CardData to empty.
170
+ [default2]:Repo card metadata block was not found. Setting CardData to empty.
171
+ [default7]:Repo card metadata block was not found. Setting CardData to empty.
172
+ [default4]:Repo card metadata block was not found. Setting CardData to empty.
173
+ [default0]:[rank0]: Traceback (most recent call last):
174
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
175
+ [default0]:[rank0]: trainer.train(dataloader)
176
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
177
+ [default0]:[rank0]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
178
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
179
+ [default0]:[rank0]: outputs = self.pipeline_engine.train_batch_iter(
180
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
181
+ [default1]:[rank1]: Traceback (most recent call last):
182
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
183
+ [default1]:[rank1]: trainer.train(dataloader)
184
+ [default5]:[rank5]: Traceback (most recent call last):
185
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
186
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
187
+ [default1]:[rank1]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
188
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
189
+ [default3]:[rank3]: Traceback (most recent call last):
190
+ [default5]:[rank5]: trainer.train(dataloader)
191
+ [default0]:[rank0]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
192
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
193
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
194
+ [default0]:[rank0]: output = model(**micro_batch)
195
+ [default1]:[rank1]: outputs = self.pipeline_engine.train_batch_iter(
196
+ [default3]:[rank3]: trainer.train(dataloader)
197
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
198
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
199
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
200
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
201
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
202
+ [default5]:[rank5]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
203
+ [default1]:[rank1]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
204
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
205
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
206
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
207
+ [default2]:[rank2]: Traceback (most recent call last):
208
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
209
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
210
+ [default1]:[rank1]: output = model(**micro_batch)
211
+ [default3]:[rank3]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
212
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
213
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
214
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
215
+ [default5]:[rank5]: outputs = self.pipeline_engine.train_batch_iter(
216
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
217
+ [default0]:[rank0]: sharded_logits = self.model(
218
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
219
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
220
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
221
+ [default5]:[rank5]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
222
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
223
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
224
+ [default1]:[rank1]: sharded_logits = self.model(
225
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
226
+ [default5]:[rank5]: output = model(**micro_batch)
227
+ [default3]:[rank3]: outputs = self.pipeline_engine.train_batch_iter(
228
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
229
+ [default2]:[rank2]: trainer.train(dataloader)
230
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
231
+ [default3]:[rank3]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
232
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
233
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
234
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
235
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
236
+ [default3]:[rank3]: output = model(**micro_batch)
237
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
238
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
239
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
240
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
241
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
242
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
243
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
244
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
245
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
246
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
247
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
248
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
249
+ [default5]:[rank5]: sharded_logits = self.model(
250
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
251
+ [default1]:[rank1]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
252
+ [default2]:[rank2]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
253
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
254
+ [default0]:[rank0]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
255
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
256
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
257
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 784, in forward_with_hidden_states
258
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
259
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 784, in forward_with_hidden_states
260
+ [default0]:[rank0]: sharded_logits = self.lm_head(x=hidden_states)["logits"]
261
+ [default2]:[rank2]: outputs = self.pipeline_engine.train_batch_iter(
262
+ [default1]:[rank1]: sharded_logits = self.lm_head(x=hidden_states)["logits"]
263
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
264
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
265
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
266
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
267
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
268
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
269
+ [default2]:[rank2]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
270
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
271
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
272
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
273
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
274
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
275
+ [default2]:[rank2]: output = model(**micro_batch)
276
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
277
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
278
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
279
+ [default1]:[rank1]: output = self.pp_block(**new_kwargs)
280
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
281
+ [default3]:[rank3]: sharded_logits = self.model(
282
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
283
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
284
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
285
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
286
+ [default5]:[rank5]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
287
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
288
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
289
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
290
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
291
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
292
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
293
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
294
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 784, in forward_with_hidden_states
295
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
296
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
297
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
298
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 87, in forward
299
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
300
+ [default0]:[rank0]: output = self.pp_block(**new_kwargs)
301
+ [default1]:[rank1]: return column_linear(
302
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
303
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
304
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
305
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 359, in column_linear
306
+ [default1]:[rank1]: return F.linear(input, weight, bias)
307
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
308
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
309
+ [default5]:[rank5]: sharded_logits = self.lm_head(x=hidden_states)["logits"]
310
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
311
+ [default3]:[rank3]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
312
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 784, in forward_with_hidden_states
313
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 87, in forward
314
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
315
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
316
+ [default0]:[rank0]: return column_linear(
317
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 359, in column_linear
318
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
319
+ [default2]:[rank2]: sharded_logits = self.model(
320
+ [default0]:[rank0]: return F.linear(input, weight, bias)
321
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
322
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
323
+ [default0]:[rank0]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 3.07 GiB. GPU
324
+ [default5]:[rank5]: output = self.pp_block(**new_kwargs)
325
+ [default3]:[rank3]: sharded_logits = self.lm_head(x=hidden_states)["logits"]
326
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
327
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
328
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
329
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
330
+ [default1]:[rank1]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 3.07 GiB. GPU  has a total capacity of 79.33 GiB of which 329.94 MiB is free. Including non-PyTorch memory, this process has 79.00 GiB memory in use. Of the allocated memory 67.29 GiB is allocated by PyTorch, and 305.08 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
331
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
332
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
333
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
334
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
335
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
336
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
337
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
338
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 87, in forward
339
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
340
+ [default2]:[rank2]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
341
+ [default5]:[rank5]: return column_linear(
342
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
343
+ [default3]:[rank3]: output = self.pp_block(**new_kwargs)
344
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 359, in column_linear
345
+ [default5]:[rank5]: return F.linear(input, weight, bias)
346
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 784, in forward_with_hidden_states
347
+ [default5]:[rank5]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 3.07 GiB. GPU  has a total capacity of 79.33 GiB of which 281.94 MiB is free. Including non-PyTorch memory, this process has 79.04 GiB memory in use. Of the allocated memory 67.29 GiB is allocated by PyTorch, and 305.08 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
348
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
349
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
350
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
351
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
352
+ [default2]:[rank2]: sharded_logits = self.lm_head(x=hidden_states)["logits"]
353
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 87, in forward
354
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
355
+ [default3]:[rank3]: return column_linear(
356
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 359, in column_linear
357
+ [default3]:[rank3]: return F.linear(input, weight, bias)
358
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
359
+ [default3]:[rank3]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 3.07 GiB. GPU  has a total capacity of 79.33 GiB of which 281.94 MiB is free. Including non-PyTorch memory, this process has 79.04 GiB memory in use. Of the allocated memory 67.29 GiB is allocated by PyTorch, and 305.08 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
360
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
361
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
362
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
363
+ [default2]:[rank2]: output = self.pp_block(**new_kwargs)
364
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
365
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
366
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
367
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
368
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 87, in forward
369
+ [default2]:[rank2]: return column_linear(
370
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 359, in column_linear
371
+ [default2]:[rank2]: return F.linear(input, weight, bias)
372
+ [default2]:[rank2]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 3.07 GiB. GPU  has a total capacity of 79.33 GiB of which 281.94 MiB is free. Including non-PyTorch memory, this process has 79.04 GiB memory in use. Of the allocated memory 67.29 GiB is allocated by PyTorch, and 305.08 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
373
+ [default4]:[rank4]: Traceback (most recent call last):
374
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
375
+ [default4]:[rank4]: trainer.train(dataloader)
376
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
377
+ [default4]:[rank4]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
378
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
379
+ [default4]:[rank4]: outputs = self.pipeline_engine.train_batch_iter(
380
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
381
+ [default4]:[rank4]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
382
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
383
+ [default4]:[rank4]: output = model(**micro_batch)
384
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
385
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
386
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
387
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
388
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
389
+ [default4]:[rank4]: sharded_logits = self.model(
390
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
391
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
392
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
393
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
394
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
395
+ [default4]:[rank4]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
396
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 784, in forward_with_hidden_states
397
+ [default4]:[rank4]: sharded_logits = self.lm_head(x=hidden_states)["logits"]
398
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
399
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
400
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
401
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
402
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
403
+ [default4]:[rank4]: output = self.pp_block(**new_kwargs)
404
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
405
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
406
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
407
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
408
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 87, in forward
409
+ [default4]:[rank4]: return column_linear(
410
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 359, in column_linear
411
+ [default4]:[rank4]: return F.linear(input, weight, bias)
412
+ [default4]:[rank4]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 3.07 GiB. GPU  has a total capacity of 79.33 GiB of which 281.94 MiB is free. Including non-PyTorch memory, this process has 79.04 GiB memory in use. Of the allocated memory 67.29 GiB is allocated by PyTorch, and 305.08 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+ [default6]:[rank6]: Traceback (most recent call last):
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
+ [default6]:[rank6]: trainer.train(dataloader)
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
+ [default6]:[rank6]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
+ [default6]:[rank6]: outputs = self.pipeline_engine.train_batch_iter(
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
+ [default6]:[rank6]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
+ [default6]:[rank6]: output = model(**micro_batch)
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
+ [default6]:[rank6]: sharded_logits = self.model(
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
+ [default6]:[rank6]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 784, in forward_with_hidden_states
+ [default6]:[rank6]: sharded_logits = self.lm_head(x=hidden_states)["logits"]
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
+ [default6]:[rank6]: output = self.pp_block(**new_kwargs)
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 87, in forward
+ [default6]:[rank6]: return column_linear(
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 359, in column_linear
+ [default6]:[rank6]: return F.linear(input, weight, bias)
+ [default6]:[rank6]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 3.07 GiB. GPU  has a total capacity of 79.33 GiB of which 521.94 MiB is free. Including non-PyTorch memory, this process has 78.81 GiB memory in use. Of the allocated memory 67.29 GiB is allocated by PyTorch, and 305.08 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+ [default7]:[rank7]: Traceback (most recent call last):
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
+ [default7]:[rank7]: trainer.train(dataloader)
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
+ [default7]:[rank7]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
+ [default7]:[rank7]: outputs = self.pipeline_engine.train_batch_iter(
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
+ [default7]:[rank7]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
+ [default7]:[rank7]: output = model(**micro_batch)
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default7]:[rank7]: return self._call_impl(*args, **kwargs)
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default7]:[rank7]: return forward_call(*args, **kwargs)
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
+ [default7]:[rank7]: sharded_logits = self.model(
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default7]:[rank7]: return self._call_impl(*args, **kwargs)
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default7]:[rank7]: return forward_call(*args, **kwargs)
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
+ [default7]:[rank7]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 784, in forward_with_hidden_states
+ [default7]:[rank7]: sharded_logits = self.lm_head(x=hidden_states)["logits"]
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default7]:[rank7]: return self._call_impl(*args, **kwargs)
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default7]:[rank7]: return forward_call(*args, **kwargs)
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
+ [default7]:[rank7]: output = self.pp_block(**new_kwargs)
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default7]:[rank7]: return self._call_impl(*args, **kwargs)
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default7]:[rank7]: return forward_call(*args, **kwargs)
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 87, in forward
+ [default7]:[rank7]: return column_linear(
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 359, in column_linear
+ [default7]:[rank7]: return F.linear(input, weight, bias)
+ [default7]:[rank7]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 3.07 GiB. GPU  has a total capacity of 79.33 GiB of which 1001.94 MiB is free. Including non-PyTorch memory, this process has 78.34 GiB memory in use. Of the allocated memory 67.29 GiB is allocated by PyTorch, and 305.08 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+ W0703 22:43:49.264000 140633245345600 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1028021 closing signal SIGTERM
+ E0703 22:43:49.682000 140633245345600 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1028020) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
+ Traceback (most recent call last):
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in <module>
+ sys.exit(main())
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
+ return f(*args, **kwargs)
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main
+ run(args)
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run
+ elastic_launch(
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
+ return launch_agent(self._config, self._entrypoint, list(args))
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent
+ raise ChildFailedError(
+ torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
+ ============================================================
+ /fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED
+ ------------------------------------------------------------
+ Failures:
+ [1]:
+ time : 2024-07-03_22:43:49
+ host : ip-26-0-161-178.ec2.internal
+ rank : 2 (local_rank: 2)
+ exitcode : 1 (pid: 1028022)
+ error_file: <N/A>
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+ [2]:
+ time : 2024-07-03_22:43:49
+ host : ip-26-0-161-178.ec2.internal
+ rank : 3 (local_rank: 3)
+ exitcode : 1 (pid: 1028023)
+ error_file: <N/A>
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+ [3]:
+ time : 2024-07-03_22:43:49
+ host : ip-26-0-161-178.ec2.internal
+ rank : 4 (local_rank: 4)
+ exitcode : 1 (pid: 1028024)
+ error_file: <N/A>
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+ [4]:
+ time : 2024-07-03_22:43:49
+ host : ip-26-0-161-178.ec2.internal
+ rank : 5 (local_rank: 5)
+ exitcode : 1 (pid: 1028025)
+ error_file: <N/A>
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+ [5]:
+ time : 2024-07-03_22:43:49
+ host : ip-26-0-161-178.ec2.internal
+ rank : 6 (local_rank: 6)
+ exitcode : 1 (pid: 1028026)
+ error_file: <N/A>
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+ [6]:
+ time : 2024-07-03_22:43:49
+ host : ip-26-0-161-178.ec2.internal
+ rank : 7 (local_rank: 7)
+ exitcode : 1 (pid: 1028027)
+ error_file: <N/A>
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+ ------------------------------------------------------------
+ Root Cause (first observed failure):
+ [0]:
+ time : 2024-07-03_22:43:49
+ host : ip-26-0-161-178.ec2.internal
+ rank : 0 (local_rank: 0)
+ exitcode : 1 (pid: 1028020)
+ error_file: <N/A>
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+ ============================================================
+ srun: error: ip-26-0-161-178: task 0: Exited with exit code 1
+ Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details.
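Note on the failure above: every crashing rank hits torch.cuda.OutOfMemoryError inside the tensor-parallel column_linear call of the lm_head, with about 67 GiB already allocated by PyTorch and only ~305 MiB reserved-but-unallocated, so fragmentation is minor and the allocator hint in the message is unlikely to rescue this dp-4_tp-2_pp-1_mbz-16 configuration on its own; the more direct levers are a smaller micro-batch size or a higher TP degree so the sharded logits tensor shrinks. For completeness, a minimal, untested sketch of applying the message's own suggestion from the launch script (placing the export next to the other exports in bench.slurm is an assumption, not part of the original script):

    # Hypothetical addition to bench.slurm, alongside the other exports:
    # allow the CUDA caching allocator to grow segments instead of fragmenting them.
    export PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True"

    # More direct mitigation for this run: lower the micro-batch size (mbz) in
    # config.yaml, e.g. 16 -> 8, so per-micro-batch activations and logits fit in 80 GiB.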
llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-16/status.txt ADDED
@@ -0,0 +1 @@
+ oom
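The closing warning in log.out points at hf_transfer for the results upload. A minimal, untested sketch of enabling it in the benchmark environment (where to run it, e.g. inside the env-bench-cluster conda environment before bench.slurm launches, is an assumption; see the huggingface_hub docs linked in the log for the limitations it mentions):

    # Hypothetical setup, run once in the environment used for the upload:
    pip install hf_transfer              # Rust-backed transfer backend for the Hub
    export HF_HUB_ENABLE_HF_TRANSFER=1   # tell huggingface_hub to route transfers through it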