#!/bin/bash
#SBATCH --job-name=taiyi-sd-dreambooth # create a short name for your job
#SBATCH --nodes=1 # node count
#SBATCH --ntasks-per-node=1 # number of tasks to run per node
#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks)
#SBATCH --gres=gpu:1 # number of gpus per node
#SBATCH -o %x-%j.log # output and error log file name (%x = job name, %j = job id)
#SBATCH -x dgx050

# Launcher for the taiyi-sd-dreambooth training job.
#
# Builds four groups of command-line flags (data, model, checkpoint, trainer)
# and passes them to train.py in the current directory. Run it directly for a
# local run, or submit it with sbatch (see the srun line at the bottom).
#
# Original note on the expected working directory:
#   pwd=Fengshenbang-LM/fengshen/examples/pretrain_erlangshen
# NOTE(review): that path names a different example; confirm the directory
# this script is actually meant to be started from.

# Stop on the first failing command, on use of an unset variable, and on a
# failure anywhere inside a pipeline.
set -euo pipefail

# All outputs live under ROOT_DIR, which is relative to the working directory.
ROOT_DIR=../../workspace
# export CUDA_VISIBLE_DEVICES='7'
# NOTE(review): "torch_extendsions" looks like a typo for "torch_extensions";
# the spelling is kept so the path used at runtime does not change.
export TORCH_EXTENSIONS_DIR=${ROOT_DIR}/torch_extendsions

MODEL_NAME=taiyi-sd-dreambooth
MODEL_ROOT_DIR=${ROOT_DIR}/${MODEL_NAME}
# -p creates missing parent directories and is a no-op when the directory
# already exists, so no separate existence test is needed.
mkdir -p -- "${MODEL_ROOT_DIR}"

NNODES=1
GPUS_PER_NODE=1
MICRO_BATCH_SIZE=1

INSTANCE_PROMPT="小黄鸭"
# NOTE(review): OUTPUT_DIR is not referenced anywhere in the visible script.
OUTPUT_DIR="saved_model_tinyduck"
INSTANCE_DIR="train_images_duck"

# The flag groups are Bash arrays rather than strings so that a value
# containing whitespace (e.g. a multi-word instance prompt) stays a single
# argument when it reaches train.py.
data_args=(
  --dataloader_workers 2
  --train_batchsize "${MICRO_BATCH_SIZE}"
  --val_batchsize "${MICRO_BATCH_SIZE}"
  --test_batchsize "${MICRO_BATCH_SIZE}"
  "--instance_data_dir=${INSTANCE_DIR}"
  "--instance_prompt=${INSTANCE_PROMPT}"
  --resolution=512
)

model_args=(
  --model_path "${MODEL_ROOT_DIR}/pretrain/Taiyi-Stable-Diffusion-1B-Chinese-v0.1/"
  --train_text_encoder
  --learning_rate 1e-6
  --scheduler_type constant
  --warmup_steps 100
)

model_checkpoint_args=(
  --save_ckpt_path "${MODEL_ROOT_DIR}/ckpt"
  --load_ckpt_path "${MODEL_ROOT_DIR}/ckpt/last.ckpt"
)

# num_sanity_val_steps and limit_val_batches: these two flags are what turn
# validation off.
trainer_args=(
  --max_steps 1200
  --gpus "${GPUS_PER_NODE}"
  --num_nodes "${NNODES}"
  --strategy ddp
  --log_every_n_steps 100
  --precision 32
  --default_root_dir "${MODEL_ROOT_DIR}"
  --replace_sampler_ddp False
  --num_sanity_val_steps 0
  --limit_val_batches 0
)

train_args=(
  "${data_args[@]}"
  "${model_args[@]}"
  "${model_checkpoint_args[@]}"
  "${trainer_args[@]}"
)

# The script has always exported `options` as one flat string, so that is kept
# in case a wrapper expands $options itself. Arrays cannot be exported; the
# string is the array joined with single spaces.
export options="${train_args[*]}"

# run local
python train.py "${train_args[@]}"
# run on slurm
# srun python train.py "${train_args[@]}"