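#!/usr/bin/env bash
# sdlm pretraining launcher. Intended usage (a sketch; the script name is
# illustrative):
#   BEAKER=1 bash run_pretrain.sh   # submit the job to Beaker via gantry
#   bash run_pretrain.sh            # local debug run, writes to outputs/test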
PYTHON_CMD="
accelerate launch \
--mixed_precision bf16 -m sdlm.run_pretrain \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--do_train \
--do_eval \
--log_level info \
--evaluation_strategy steps \
--report_to tensorboard \
--max_seq_length 512 \
--simplex_value 5 \
--num_diffusion_steps 5000 \
--lr_scheduler_type constant_with_warmup \
--learning_rate 1e-5 \
--pad_to_max_length \
--beta_schedule squaredcos_improved_ddpm \
--top_p 0.99 \
--max_steps 35000 \
--warmup_steps 5000 \
--logging_steps 50 \
--save_total_limit 1 \
--conditional_generation ul2 \
--self_condition logits_mean \
--self_condition_mix_before_weights \
--streaming \
--bf16 \
--gradient_checkpointing \
--use_flash_attention2 \
--is_causal false \
--mask_padding_in_loss false \
--without_compute_metrics true \
--dataloader_num_workers 8 \
--remove_unused_columns false \
--dispatch_batches false \
--shuffle true \
--preprocessing_num_workers 16 \
--line_by_line false \
--model_revision 26bca36bde8333b5d7f72e9ed20ccda6a618af24 \
--fsdp auto_wrap \
--fsdp_transformer_layer_cls_to_wrap MistralDecoderLayer \
"
# --min_sample_seq_length 650 \  # uncomment to filter samples by length
# For the cdcd (Llama) variant, swap in these FSDP flags instead:
# --fsdp auto_wrap \
# --fsdp_transformer_layer_cls_to_wrap LlamaDecoderLayer \
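# Note: squaredcos_improved_ddpm presumably implements the Improved DDPM
# cosine noise schedule (Nichol & Dhariwal, 2021), i.e. with s = 0.008 and
# T = num_diffusion_steps:
#   f(t) = cos^2(((t/T + s) / (1 + s)) * pi/2)
#   alpha_bar(t) = f(t) / f(0),  beta_t = 1 - alpha_bar(t) / alpha_bar(t-1)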
GANTRY_CMD="
gantry run -y -n random_35k_pretrain -t random_35k_pretrain --allow-dirty \
--workspace ai2/tess2 \
--gpus 7 \
--priority normal \
--budget ai2/allennlp \
--preemptible \
--env PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python \
--env-secret HF_TOKEN=HF_TOKEN \
--beaker-image ai2/pytorch2.0.0-cuda11.8-python3.10 \
--venv base \
--pip requirements.txt \
--no-nfs \
--cluster ai2/jupiter-cirrascale-2 \
--weka oe-data-default:/data/input \
-- ${PYTHON_CMD} \
--from_scratch True \
--model_name_or_path mistralai/Mistral-7B-v0.1 \
--eval_steps 2000 \
--save_steps 2000 \
--ignore_data_skip \
--seed 101 \
--max_eval_samples 200 \
--gradient_accumulation_steps 16 \
--num_inference_diffusion_steps 100 \
--overwrite_output_dir false \
--beaker \
--output_dir /results \
"
if [ ! -z "${BEAKER}" ]; then
    # the --weka mount is already baked into GANTRY_CMD above
    ${GANTRY_CMD}
else
${PYTHON_CMD} \
--model_name_or_path meta-llama/Llama-3.1-8B \
--eval_steps 10 \
--save_steps 50 \
--max_eval_samples 16 \
--gradient_accumulation_steps 1 \
--num_inference_diffusion_steps 10 \
--output_dir outputs/test \
--overwrite_output_dir true
fi
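# To monitor the local debug run (--report_to tensorboard; HF trainers
# typically write event files under the output dir):
#   tensorboard --logdir outputs/test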
# Alternate DDP config kept for reference (2048-token context, fused AdamW,
# no FSDP or flash attention):
# accelerate launch \
# --mixed_precision bf16 -m sdlm.run_pretrain \
# --per_device_train_batch_size 1 \
# --per_device_eval_batch_size 1 \
# --do_train \
# --do_eval \
# --log_level info \
# --evaluation_strategy steps \
# --report_to tensorboard \
# --max_seq_length 2048 \
# --simplex_value 5 \
# --num_diffusion_steps 5000 \
# --lr_scheduler_type constant_with_warmup \
# --learning_rate 1e-5 \
# --pad_to_max_length \
# --beta_schedule squaredcos_improved_ddpm \
# --top_p 0.99 \
# --max_steps 10000000 \
# --warmup_steps 5000 \
# --logging_steps 50 \
# --save_total_limit 1 \
# --conditional_generation ul2 \
# --self_condition logits_mean \
# --self_condition_mix_before_weights \
# --streaming \
# --bf16 \
# --optim adamw_torch_fused \
# --gradient_checkpointing \
# --is_causal false \
# --mask_padding_in_loss false \
# --ddp_find_unused_parameters false \
# --without_compute_metrics true \
# --dataloader_num_workers 0 \
# --remove_unused_columns false \
# --dispatch_batches false \
# --shuffle true \
# --preprocessing_num_workers 16 \
# --model_revision 26bca36bde8333b5d7f72e9ed20ccda6a618af24 \
# --line_by_line false \
# --model_name_or_path mistralai/Mistral-7B-v0.1 \
# --eval_steps 2000 \
# --save_steps 2000 \
# --max_eval_samples 200 \
# --gradient_accumulation_steps 16 \
# --num_inference_diffusion_steps 100 \
# --overwrite_output_dir false \
# --output_dir testing_output \
# --use_fast_tokenizer false