# shell_scripts/run_pretrain_35k.sh
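#
# PYTHON_CMD holds the core training invocation: a 35k-step pretraining run
# launched through `accelerate` with the sdlm.run_pretrain entry point
# (bf16, gradient checkpointing, flash attention 2, FSDP auto-wrap around
# MistralDecoderLayer). Model- and run-specific flags are appended further
# down, depending on whether the run is submitted via gantry or run locally.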
PYTHON_CMD="
accelerate launch \
--mixed_precision bf16 -m sdlm.run_pretrain \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--do_train \
--do_eval \
--log_level info \
--evaluation_strategy steps \
--report_to tensorboard \
--max_seq_length 512 \
--simplex_value 5 \
--num_diffusion_steps 5000 \
--lr_scheduler_type constant_with_warmup \
--learning_rate 1e-5 \
--pad_to_max_length \
--beta_schedule squaredcos_improved_ddpm \
--top_p 0.99 \
--max_steps 35000 \
--warmup_steps 5000 \
--logging_steps 50 \
--save_total_limit 1 \
--conditional_generation ul2 \
--self_condition logits_mean \
--self_condition_mix_before_weights \
--streaming \
--bf16 \
--gradient_checkpointing \
--use_flash_attention2 \
--is_causal false \
--mask_padding_in_loss false \
--without_compute_metrics true \
--dataloader_num_workers 8 \
--remove_unused_columns false \
--dispatch_batches false \
--shuffle true \
--preprocessing_num_workers 16 \
--line_by_line false \
--model_revision 26bca36bde8333b5d7f72e9ed20ccda6a618af24 \
--fsdp auto_wrap \
--fsdp_transformer_layer_cls_to_wrap MistralDecoderLayer \
"
# Optional flags (uncomment and append to PYTHON_CMD to enable):
# --min_sample_seq_length 650 \  # filter out samples shorter than this length
# # cdcd
# Alternative FSDP wrapping class when the base model is Llama rather than Mistral:
# --fsdp auto_wrap \
# --fsdp_transformer_layer_cls_to_wrap LlamaDecoderLayer \
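# GANTRY_CMD wraps PYTHON_CMD for submission to Beaker via gantry: 7 GPUs on
# ai2/jupiter-cirrascale-2, the oe-data-default weka bucket mounted at
# /data/input, and mistralai/Mistral-7B-v0.1 trained from scratch as the base
# model, with checkpoints written to /results.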
GANTRY_CMD="
gantry run -y -n random_35k_pretrain -t random_35k_pretrain --allow-dirty \
--workspace ai2/tess2 \
--gpus 7 \
--priority normal \
--budget ai2/allennlp \
--preemptible \
--env PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python \
--env-secret HF_TOKEN=HF_TOKEN \
--beaker-image ai2/pytorch2.0.0-cuda11.8-python3.10 \
--venv base \
--pip requirements.txt \
--no-nfs \
--cluster ai2/jupiter-cirrascale-2 \
--weka oe-data-default:/data/input \
-- ${PYTHON_CMD} \
--from_scratch True \
--model_name_or_path mistralai/Mistral-7B-v0.1 \
--eval_steps 2000 \
--save_steps 2000 \
--ignore_data_skip \
--seed 101 \
--max_eval_samples 200 \
--gradient_accumulation_steps 16 \
--num_inference_diffusion_steps 100 \
--overwrite_output_dir false \
--beaker \
--output_dir /results \
"
if [ ! -z "${BEAKER}" ]; then
    # Note: both branches currently run the same command; the WEKA check is a
    # placeholder and has no effect.
    if [ ! -z "${WEKA}" ]; then
        ${GANTRY_CMD}
    else
        ${GANTRY_CMD}
    fi
else
    ${PYTHON_CMD} \
        --model_name_or_path meta-llama/Llama-3.1-8B \
        --eval_steps 10 \
        --save_steps 50 \
        --max_eval_samples 16 \
        --gradient_accumulation_steps 1 \
        --num_inference_diffusion_steps 10 \
        --output_dir outputs/test \
        --overwrite_output_dir true
fi
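# The block below appears to be an earlier single-node variant of the same
# command (2048-token sequences, fused AdamW, no flash attention, unbounded
# max_steps), kept commented out for reference.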
# accelerate launch \
# --mixed_precision bf16 -m sdlm.run_pretrain \
# --per_device_train_batch_size 1 \
# --per_device_eval_batch_size 1 \
# --do_train \
# --do_eval \
# --log_level info \
# --evaluation_strategy steps \
# --report_to tensorboard \
# --max_seq_length 2048 \
# --simplex_value 5 \
# --num_diffusion_steps 5000 \
# --lr_scheduler_type constant_with_warmup \
# --learning_rate 1e-5 \
# --pad_to_max_length \
# --beta_schedule squaredcos_improved_ddpm \
# --top_p 0.99 \
# --max_steps 10000000 \
# --warmup_steps 5000 \
# --logging_steps 50 \
# --save_total_limit 1 \
# --conditional_generation ul2 \
# --self_condition logits_mean \
# --self_condition_mix_before_weights \
# --streaming \
# --bf16 \
# --optim adamw_torch_fused \
# --gradient_checkpointing \
# --is_causal false \
# --mask_padding_in_loss false \
# --ddp_find_unused_parameters false \
# --without_compute_metrics true \
# --dataloader_num_workers 0 \
# --remove_unused_columns false \
# --dispatch_batches false \
# --shuffle true \
# --preprocessing_num_workers 16 \
# --model_revision 26bca36bde8333b5d7f72e9ed20ccda6a618af24 \
# --line_by_line false \
# --model_name_or_path mistralai/Mistral-7B-v0.1 \
# --eval_steps 2000 \
# --save_steps 2000 \
# --max_eval_samples 200 \
# --gradient_accumulation_steps 16 \
# --num_inference_diffusion_steps 100 \
# --overwrite_output_dir false \
# --output_dir testing_output \
# --use_fast_tokenizer false