# Serialized training configuration for the run named "dm16".
# The root tag below is Python-specific: safe loaders (e.g. yaml.safe_load)
# reject it, so whatever reads this file must be able to construct
# aether.train.train.TrainingArguments. Only load it from a trusted source.
!!python/object:aether.train.train.TrainingArguments

# --- Output location and run modes ---
output_dir: /mnt/disks/persist/data/checkpoints/dm16
overwrite_output_dir: false
do_train: false
do_eval: false
do_predict: false
# Quoted on purpose: a bare `no` is a boolean under YAML 1.1 parsers.
eval_strategy: 'no'
prediction_loss_only: false

# --- Batch sizes and accumulation ---
per_device_train_batch_size: 32
per_device_eval_batch_size: 8
per_gpu_train_batch_size: null
per_gpu_eval_batch_size: null
gradient_accumulation_steps: 1
eval_accumulation_steps: null
eval_delay: 0
torch_empty_cache_steps: null

# --- Optimizer hyperparameters and schedule ---
learning_rate: 0.001
weight_decay: 0.05
adam_beta1: 0.9
adam_beta2: 0.999
adam_epsilon: 1.0e-08
max_grad_norm: 1.0
# NOTE(review): num_train_epochs and max_steps are both set. In Hugging Face
# TrainingArguments a positive max_steps takes precedence; confirm that this
# TrainingArguments class behaves the same way.
num_train_epochs: 3.0
max_steps: 75000
lr_scheduler_type: constant
lr_scheduler_kwargs: {}
warmup_ratio: 0.0
warmup_steps: 0

# --- Logging ---
log_level: passive
log_level_replica: warning
log_on_each_node: true
logging_dir: null
logging_strategy: steps
logging_first_step: true
logging_steps: 250
logging_nan_inf_filter: true

# --- Checkpoint saving ---
save_strategy: steps
save_steps: 300
save_total_limit: null
save_safetensors: true
save_on_each_node: false
save_only_model: false
restore_callback_states_from_checkpoint: false

# --- Device selection and seeding ---
no_cuda: false
use_cpu: false
use_mps_device: false
seed: 42
data_seed: null
jit_mode_eval: false
use_ipex: false

# --- Numeric precision ---
bf16: false
fp16: false
fp16_opt_level: O1
half_precision_backend: auto
bf16_full_eval: false
fp16_full_eval: false
tf32: null

# --- Distributed / TPU ---
local_rank: -1
ddp_backend: null
tpu_num_cores: null
tpu_metrics_debug: false
debug: ''

# --- Data loading and evaluation cadence ---
dataloader_drop_last: false
eval_steps: null
dataloader_num_workers: 0
dataloader_prefetch_factor: null
past_index: -1
run_name: dm16
disable_tqdm: null
remove_unused_columns: false
label_names:
  - input_ids
load_best_model_at_end: false
metric_for_best_model: null
greater_is_better: null
ignore_data_skip: false

# --- Sharding / acceleration backends ---
fsdp: ''
fsdp_min_num_params: 0
fsdp_config: null
fsdp_transformer_layer_cls_to_wrap: null
accelerator_config: null
deepspeed: null

# --- Loss, optimizer selection, batching by length, reporting ---
label_smoothing_factor: 0.0
optim: adamw_torch
optim_args: null
adafactor: false
group_by_length: false
length_column_name: length
report_to: null
ddp_find_unused_parameters: null
ddp_bucket_cap_mb: null
ddp_broadcast_buffers: null
dataloader_pin_memory: true
dataloader_persistent_workers: false
skip_memory_metrics: true
use_legacy_prediction_loop: false

# --- Hub upload ---
# NOTE(review): push_to_hub is true while hub_token is null; presumably the
# credentials are supplied by the environment. Confirm before relying on it.
push_to_hub: true
resume_from_checkpoint: null
hub_model_id: timaeus/dm16
hub_strategy: every_save
hub_token: null
hub_private_repo: false
hub_always_push: false

# --- Miscellaneous trainer switches ---
gradient_checkpointing: false
gradient_checkpointing_kwargs: null
include_inputs_for_metrics: false
eval_do_concat_batches: true
fp16_backend: auto
evaluation_strategy: null
push_to_hub_model_id: null
push_to_hub_organization: null
push_to_hub_token: null
mp_parameters: ''
auto_find_batch_size: false
full_determinism: false
torchdynamo: null
ray_scope: last
ddp_timeout: 1800
torch_compile: false
torch_compile_backend: null
torch_compile_mode: null
dispatch_batches: null
split_batches: null
include_tokens_per_second: false
include_num_input_tokens_seen: false
neftune_noise_alpha: null
optim_target_modules: null
batch_eval_metrics: false
eval_on_start: false
use_liger_kernel: false
eval_use_gather_object: false

# --- Checkpoint storage, upload target, and experiment-tracking metadata ---
checkpoints_dir: /mnt/disks/persist/data/checkpoints
save_log_steps: 250
bucket_name: devinterp-language
s3_folder: checkpoints/dm16
delete_after_upload: false
push_to_aws: true
project_name: train_slms_pile13m
is_debug: false
group_name: dm
job_type: train
notes: null
tags: null

# Non-decreasing list of step numbers; the low values occur more than once
# and the final entry equals max_steps (75000).
# NOTE(review): presumably extra checkpoint steps on a roughly log-spaced
# schedule, with the repeats coming from integer rounding; confirm how the
# consumer treats duplicate entries.
extra_save_steps:
  - 1
  - 1
  - 1
  - 1
  - 1
  - 1
  - 1
  - 1
  - 1
  - 1
  - 1
  - 1
  - 1
  - 1
  - 1
  - 1
  - 2
  - 2
  - 2
  - 2
  - 2
  - 2
  - 2
  - 2
  - 2
  - 3
  - 3
  - 3
  - 3
  - 3
  - 3
  - 4
  - 4
  - 4
  - 4
  - 4
  - 5
  - 5
  - 5
  - 5
  - 6
  - 6
  - 6
  - 6
  - 7
  - 7
  - 7
  - 8
  - 8
  - 9
  - 9
  - 9
  - 10
  - 10
  - 11
  - 11
  - 12
  - 13
  - 13
  - 14
  - 14
  - 15
  - 16
  - 17
  - 17
  - 18
  - 19
  - 20
  - 21
  - 22
  - 23
  - 24
  - 25
  - 26
  - 28
  - 29
  - 30
  - 32
  - 33
  - 35
  - 36
  - 38
  - 40
  - 42
  - 44
  - 46
  - 48
  - 50
  - 52
  - 55
  - 57
  - 60
  - 63
  - 66
  - 69
  - 72
  - 75
  - 79
  - 82
  - 86
  - 90
  - 94
  - 99
  - 103
  - 108
  - 113
  - 118
  - 124
  - 130
  - 136
  - 142
  - 149
  - 155
  - 163
  - 170
  - 178
  - 186
  - 195
  - 204
  - 213
  - 223
  - 233
  - 244
  - 255
  - 267
  - 280
  - 293
  - 306
  - 320
  - 335
  - 350
  - 367
  - 384
  - 401
  - 420
  - 439
  - 459
  - 481
  - 503
  - 526
  - 550
  - 576
  - 602
  - 630
  - 659
  - 690
  - 721
  - 755
  - 789
  - 826
  - 864
  - 904
  - 946
  - 989
  - 1035
  - 1083
  - 1133
  - 1185
  - 1239
  - 1297
  - 1356
  - 1419
  - 1485
  - 1553
  - 1625
  - 1700
  - 1778
  - 1860
  - 1946
  - 2035
  - 2129
  - 2228
  - 2330
  - 2438
  - 2550
  - 2668
  - 2791
  - 2920
  - 3054
  - 3195
  - 3343
  - 3497
  - 3658
  - 3827
  - 4003
  - 4188
  - 4381
  - 4583
  - 4794
  - 5015
  - 5247
  - 5489
  - 5742
  - 6007
  - 6284
  - 6573
  - 6876
  - 7194
  - 7525
  - 7872
  - 8235
  - 8615
  - 9012
  - 9428
  - 9863
  - 10318
  - 10794
  - 11291
  - 11812
  - 12357
  - 12926
  - 13523
  - 14146
  - 14799
  - 15481
  - 16195
  - 16942
  - 17723
  - 18540
  - 19395
  - 20290
  - 21225
  - 22204
  - 23228
  - 24299
  - 25420
  - 26592
  - 27818
  - 29101
  - 30443
  - 31847
  - 33315
  - 34851
  - 36458
  - 38140
  - 39898
  - 41738
  - 43663
  - 45676
  - 47783
  - 49986
  - 52291
  - 54703
  - 57225
  - 59864
  - 62624
  - 65512
  - 68533
  - 71693
  - 75000