diff --git "a/log.out" "b/log.out" new file mode 100644--- /dev/null +++ "b/log.out" @@ -0,0 +1,4253 @@ +slurm submission log: 2022-12-22 23:07:26.299599 +created following sbatch script: + +############################### + +#!/bin/bash + +#SBATCH --account=nlp +#SBATCH --cpus-per-task=2 +#SBATCH --exclude=john0,john1,john2,john3,john4,john5,john6,john7,john8,john9,john10,john11,jagupard15 +#SBATCH --gres=gpu:1 +#SBATCH --job-name=lxuechen-job-3319645 +#SBATCH --mem=32G +#SBATCH --open-mode=append +#SBATCH --output=/nlp/scr/lxuechen/human-feedback/tldr_pretrain/gpt2-xl-1e-05-3/log.out +#SBATCH --partition=sphinx +#SBATCH --time=3-0 + +# activate your desired anaconda environment +. /u/nlp/anaconda/main/anaconda3/envs/lxuechen-torch-stable/etc/profile.d/conda.sh ; conda activate lxuechen-torch-stable + +# cd to working directory +cd . + +# launch commands +srun --unbuffered run_as_child_processes 'python tldr_pretrain.py --fp16 True --half_precision_backend "cuda_amp" --model_name_or_path "gpt2-xl" --output_dir "/nlp/scr/lxuechen/human-feedback/tldr_pretrain/gpt2-xl-1e-05-3" --num_train_epochs 3 --per_device_train_batch_size 2 --per_device_eval_batch_size 4 --gradient_accumulation_steps 64 --evaluation_strategy "steps" --eval_steps 500 --save_strategy "steps" --save_steps 500 --save_total_limit 1 --learning_rate 1e-05 --warmup_ratio 0.03 --lr_scheduler_type "cosine" --logging_steps 100' + +############################### + +submission to slurm complete! + + +############################### +slurm submission output + +Submitted batch job 5410770 + + + +############################### + +/var/lib/slurm/slurmd/job5410770/slurm_script: line 15: /u/nlp/anaconda/main/anaconda3/envs/lxuechen-torch-stable/etc/profile.d/conda.sh: No such file or directory + +CommandNotFoundError: Your shell has not been properly configured to use 'conda activate'. +To initialize your shell, run + + $ conda init + +Currently supported shells are: + - bash + - fish + - tcsh + - xonsh + - zsh + - powershell + +See 'conda init --help' for more information and options. + +IMPORTANT: You may need to close and restart your shell after running 'conda init'. + + +############################### +start time: 2022-12-22 23:07:29.715547 +machine: sphinx7 +conda env: lxuechen-torch-stable +############################### +running following processes + + python tldr_pretrain.py --fp16 True --half_precision_backend "cuda_amp" --model_name_or_path "gpt2-xl" --output_dir "/nlp/scr/lxuechen/human-feedback/tldr_pretrain/gpt2-xl-1e-05-3" --num_train_epochs 3 --per_device_train_batch_size 2 --per_device_eval_batch_size 4 --gradient_accumulation_steps 64 --evaluation_strategy "steps" --eval_steps 500 --save_strategy "steps" --save_steps 500 --save_total_limit 1 --learning_rate 1e-05 --warmup_ratio 0.03 --lr_scheduler_type "cosine" --logging_steps 100 + + +############################### +command outputs: + + +Using cuda_amp half precision backend +/u/nlp/anaconda/main/anaconda3/envs/lxuechen-torch-stable/lib/python3.10/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning + warnings.warn( +***** Running training ***** + Num examples = 116722 + Num Epochs = 3 + Instantaneous batch size per device = 2 + Total train batch size (w. 
parallel, distributed & accumulation) = 128 + Gradient Accumulation steps = 64 + Total optimization steps = 2733 + Number of trainable parameters = 1557612800 +Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true" +wandb: Currently logged in as: lxuechen. Use `wandb login --relogin` to force relogin +wandb: Tracking run with wandb version 0.13.7 +wandb: Run data is saved locally in /sailhome/lxuechen/software/human-feedback/summ/wandb/run-20221222_230947-1aawy2qo +wandb: Run `wandb offline` to turn off syncing. +wandb: Syncing run /nlp/scr/lxuechen/human-feedback/tldr_pretrain/gpt2-xl-1e-05-3 +wandb: ⭐️ View project at https://wandb.ai/lxuechen/tldr_pretrain +wandb: πŸš€ View run at https://wandb.ai/lxuechen/tldr_pretrain/runs/1aawy2qo + 0%| | 0/2733 [00:00