Spaces:
Sleeping
Sleeping
# Launch the `sdlm.run_pretrain_ar` module through `accelerate launch` with
# bf16 mixed precision. The run trains and evaluates (--do_train / --do_eval)
# starting from mistralai/Mistral-7B-v0.1 pinned to a fixed revision, streams
# the emozilla/dolma-v1_7-305B dataset, logs to TensorBoard, and writes
# checkpoints to outputs/test (overwriting any existing contents).
#
# Each continuation backslash must be the LAST character on its line; any
# trailing text after it breaks the command into separate, invalid statements.
#
# NOTE(review): --max_seq_length 1, --max_eval_samples 16, --eval_steps 10 and
# the outputs/test directory look like smoke-test settings rather than a real
# pretraining configuration -- confirm before using for an actual run.
accelerate launch \
    --mixed_precision bf16 -m sdlm.run_pretrain_ar \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --do_train \
    --do_eval \
    --log_level info \
    --evaluation_strategy steps \
    --report_to tensorboard \
    --max_seq_length 1 \
    --lr_scheduler_type constant_with_warmup \
    --learning_rate 1e-5 \
    --pad_to_max_length \
    --max_steps 10000000 \
    --warmup_steps 5000 \
    --logging_steps 50 \
    --save_total_limit 1 \
    --dataset_name emozilla/dolma-v1_7-305B \
    --streaming \
    --bf16 \
    --optim adamw_torch_fused \
    --gradient_checkpointing \
    --use_flash_attention2 \
    --ddp_find_unused_parameters false \
    --without_compute_metrics true \
    --dataloader_num_workers 8 \
    --remove_unused_columns true \
    --dispatch_batches false \
    --shuffle true \
    --preprocessing_num_workers 16 \
    --model_name_or_path mistralai/Mistral-7B-v0.1 \
    --model_revision 26bca36bde8333b5d7f72e9ed20ccda6a618af24 \
    --eval_steps 10 \
    --save_steps 50 \
    --max_eval_samples 16 \
    --gradient_accumulation_steps 1 \
    --output_dir outputs/test \
    --overwrite_output_dir true \
    --tokenizer_padding_side "left" \
    --num_diffusion_steps 0