End of training
- README.md +16 -30
- adapter_config.json +4 -4
- adapter_model.bin +1 -1
- adapter_model.safetensors +1 -1
- last-checkpoint/adapter_config.json +4 -4
- last-checkpoint/adapter_model.safetensors +1 -1
- last-checkpoint/optimizer.pt +2 -2
- last-checkpoint/rng_state.pth +1 -1
- last-checkpoint/scheduler.pt +1 -1
- last-checkpoint/trainer_state.json +52 -507
- last-checkpoint/training_args.bin +1 -1
- training_args.bin +1 -1
README.md
CHANGED
````diff
@@ -6,7 +6,7 @@ tags:
 - axolotl
 - generated_from_trainer
 model-index:
-- name:
+- name: 55b120a1-a673-4ab1-9dd3-b611873580b1
   results: []
 ---
 
@@ -18,12 +18,6 @@ should probably proofread and complete it, then remove this comment. -->
 
 axolotl version: `0.4.1`
 ```yaml
-accelerate_config:
-  dynamo_backend: inductor
-  mixed_precision: bf16
-  num_machines: 1
-  num_processes: auto
-  use_cpu: false
 adapter: lora
 base_model: NousResearch/Nous-Hermes-2-SOLAR-10.7B
 bf16: auto
@@ -44,7 +38,6 @@ datasets:
   system_prompt: ''
 debug: null
 deepspeed: null
-device_map: auto
 early_stopping_patience: null
 eval_max_new_tokens: 128
 eval_table_size: null
@@ -53,14 +46,16 @@ flash_attention: false
 fp16: null
 fsdp: null
 fsdp_config: null
-gradient_accumulation_steps:
-gradient_checkpointing:
+gradient_accumulation_steps: 4
+gradient_checkpointing: false
 group_by_length: false
 hub_model_id: null
 hub_repo: null
 hub_strategy: checkpoint
 hub_token: null
-learning_rate: 0.
+learning_rate: 0.0002
+load_in_4bit: false
+load_in_8bit: false
 local_rank: null
 logging_steps: 1
 lora_alpha: 16
@@ -69,13 +64,8 @@ lora_fan_in_fan_out: null
 lora_model_dir: null
 lora_r: 8
 lora_target_linear: true
-lora_target_modules:
-- q_proj
-- v_proj
 lr_scheduler: cosine
-
-  0: 70GiB
-max_steps: 100
+max_steps: 10
 micro_batch_size: 2
 mlflow_experiment_name: /tmp/51190afaf71fd339_train_data.json
 model_type: AutoModelForCausalLM
@@ -83,9 +73,6 @@ num_epochs: 1
 optimizer: adamw_bnb_8bit
 output_dir: miner_id_24
 pad_to_sequence_len: true
-quantization_config:
-  llm_int8_enable_fp32_cpu_offload: true
-  load_in_8bit: true
 resume_from_checkpoint: null
 s2_attention: null
 sample_packing: false
@@ -94,7 +81,6 @@ sequence_len: 512
 strict: false
 tf32: false
 tokenizer_type: AutoTokenizer
-torch_compile: true
 train_on_inputs: false
 trust_remote_code: true
 val_set_size: 0.05
@@ -112,7 +98,7 @@ xformers_attention: null
 
 </details><br>
 
-#
+# 55b120a1-a673-4ab1-9dd3-b611873580b1
 
 This model is a fine-tuned version of [NousResearch/Nous-Hermes-2-SOLAR-10.7B](https://huggingface.co/NousResearch/Nous-Hermes-2-SOLAR-10.7B) on the None dataset.
 It achieves the following results on the evaluation set:
@@ -135,25 +121,25 @@ More information needed
 ### Training hyperparameters
 
 The following hyperparameters were used during training:
-- learning_rate: 0.
+- learning_rate: 0.0002
 - train_batch_size: 2
 - eval_batch_size: 2
 - seed: 42
-- gradient_accumulation_steps:
-- total_train_batch_size:
+- gradient_accumulation_steps: 4
+- total_train_batch_size: 8
 - optimizer: Use OptimizerNames.ADAMW_BNB with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
 - lr_scheduler_type: cosine
 - lr_scheduler_warmup_steps: 10
-- training_steps:
+- training_steps: 10
 
 ### Training results
 
 | Training Loss | Epoch | Step | Validation Loss |
 |:-------------:|:------:|:----:|:---------------:|
-| 0.0 | 0.
-| 0.0 | 0.
-| 0.0 | 0.
-| 0.0 | 0.
+| 0.0 | 0.0034 | 1 | nan |
+| 0.0 | 0.0101 | 3 | nan |
+| 0.0 | 0.0202 | 6 | nan |
+| 0.0 | 0.0303 | 9 | nan |
 
 
 ### Framework versions
````
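Two details in the card above are worth spelling out. The new `total_train_batch_size: 8` is plain arithmetic: `micro_batch_size` (2) × `gradient_accumulation_steps` (4) = 8 samples per optimizer step. And since this commit ships a PEFT LoRA adapter over NousResearch/Nous-Hermes-2-SOLAR-10.7B, a minimal loading sketch may help; this is an illustration under stated assumptions (a local checkout of the adapter at the placeholder path, with `transformers`, `peft`, and `accelerate` installed), not code from the commit itself.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

BASE = "NousResearch/Nous-Hermes-2-SOLAR-10.7B"
ADAPTER = "./55b120a1-a673-4ab1-9dd3-b611873580b1"  # placeholder: local clone of this repo

tokenizer = AutoTokenizer.from_pretrained(BASE)
base = AutoModelForCausalLM.from_pretrained(BASE, torch_dtype="auto", device_map="auto")
# Attach the LoRA weights from adapter_config.json / adapter_model.safetensors.
model = PeftModel.from_pretrained(base, ADAPTER)

inputs = tokenizer("Hello, world.", return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=32)[0], skip_special_tokens=True))
```

If a standalone checkpoint is preferred, `model.merge_and_unload()` folds the adapter into the base weights.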
adapter_config.json
CHANGED
```diff
@@ -20,13 +20,13 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "gate_proj",
-    "v_proj",
     "o_proj",
-    "
+    "down_proj",
+    "gate_proj",
     "q_proj",
     "up_proj",
-    "
+    "k_proj",
+    "v_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,
```
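The reordering above is cosmetic (the order of this JSON array carries no meaning); what matters is the set of modules, which covers every linear projection in the Llama-style SOLAR block, the expected outcome of `lora_target_linear: true` in the axolotl config. As a hedged sketch, an explicit `peft.LoraConfig` roughly equivalent to this file would look like the following (illustrative, not code from this repo):

```python
from peft import LoraConfig

# Explicit spelling of what `lora_target_linear: true` resolves to on a
# Llama-style architecture: all attention and MLP projections.
config = LoraConfig(
    r=8,            # lora_r in the training config
    lora_alpha=16,  # lora_alpha in the training config
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",  # attention
        "gate_proj", "up_proj", "down_proj",     # MLP
    ],
    task_type="CAUSAL_LM",
)
```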
adapter_model.bin
CHANGED
```diff
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:5e8bb7d78d7c04332ee97fa24cb7c00b737875da4fad4e7b9a6b65ff648e8d95
 size 126071114
```
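This and the following binary diffs are Git LFS pointer files, not the weights themselves: `oid sha256:` is the SHA-256 digest of the blob stored in LFS and `size` is its byte count. A small sketch for checking a fetched blob against its pointer, assuming the real file is present locally (e.g. after `git lfs pull`); the digest and size are the ones above, the rest is illustrative:

```python
import hashlib
import os

def sha256_of(path: str, chunk: int = 1 << 20) -> str:
    """Stream the file so large checkpoints need not fit in memory."""
    h = hashlib.sha256()
    with open(path, "rb") as f:
        while block := f.read(chunk):
            h.update(block)
    return h.hexdigest()

assert sha256_of("adapter_model.bin") == "5e8bb7d78d7c04332ee97fa24cb7c00b737875da4fad4e7b9a6b65ff648e8d95"
assert os.path.getsize("adapter_model.bin") == 126071114
```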
adapter_model.safetensors
CHANGED
```diff
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:39403457ec48ee4da263ff75bec875ab9649f029d3dc6760d739395885c6eae1
 size 125918320
```
last-checkpoint/adapter_config.json
CHANGED
```diff
@@ -20,13 +20,13 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "gate_proj",
-    "v_proj",
     "o_proj",
-    "
+    "down_proj",
+    "gate_proj",
     "q_proj",
     "up_proj",
-    "
+    "k_proj",
+    "v_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,
```
last-checkpoint/adapter_model.safetensors
CHANGED
```diff
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:39403457ec48ee4da263ff75bec875ab9649f029d3dc6760d739395885c6eae1
 size 125918320
```
last-checkpoint/optimizer.pt
CHANGED
```diff
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:a8ac9d41287de7debc68651490676aa50ef230d744b065699d21d04375d83588
+size 64683604
```
last-checkpoint/rng_state.pth
CHANGED
```diff
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:163020c3a5e3deb0c9f202a04693b11d0b5de442bd4ef423bf0f0a2016603d4d
 size 14244
```
last-checkpoint/scheduler.pt
CHANGED
```diff
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:bb578e75c11a81e85dda67a691f96ba4793a02960f1409fd3e1511aac873491a
 size 1064
```
last-checkpoint/trainer_state.json
CHANGED
```diff
@@ -1,576 +1,121 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch":
-  "eval_steps":
-  "global_step":
+  "epoch": 0.03367003367003367,
+  "eval_steps": 3,
+  "global_step": 10,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.
+      "epoch": 0.003367003367003367,
       "grad_norm": NaN,
-      "learning_rate":
+      "learning_rate": 2e-05,
       "loss": 0.0,
       "step": 1
     },
     {
-      "epoch": 0.
+      "epoch": 0.003367003367003367,
       "eval_loss": NaN,
-      "eval_runtime":
-      "eval_samples_per_second":
-      "eval_steps_per_second":
+      "eval_runtime": 6.8066,
+      "eval_samples_per_second": 18.364,
+      "eval_steps_per_second": 9.256,
       "step": 1
     },
     {
-      "epoch": 0.
-      "grad_norm": NaN,
-      "learning_rate": 2e-05,
-      "loss": 0.0,
-      "step": 2
-    },
-    {
-      "epoch": 0.04040404040404041,
-      "grad_norm": NaN,
-      "learning_rate": 3e-05,
-      "loss": 0.0,
-      "step": 3
-    },
-    {
-      "epoch": 0.05387205387205387,
+      "epoch": 0.006734006734006734,
       "grad_norm": NaN,
       "learning_rate": 4e-05,
       "loss": 0.0,
-      "step":
-    },
-    {
-      "epoch": 0.06734006734006734,
-      "grad_norm": NaN,
-      "learning_rate": 5e-05,
-      "loss": 0.0,
-      "step": 5
+      "step": 2
     },
     {
-      "epoch": 0.
+      "epoch": 0.010101010101010102,
       "grad_norm": NaN,
       "learning_rate": 6e-05,
       "loss": 0.0,
-      "step":
+      "step": 3
     },
     {
-      "epoch": 0.
-      "
-      "
-      "
-      "
+      "epoch": 0.010101010101010102,
+      "eval_loss": NaN,
+      "eval_runtime": 5.5633,
+      "eval_samples_per_second": 22.469,
+      "eval_steps_per_second": 11.324,
+      "step": 3
     },
     {
-      "epoch": 0.
+      "epoch": 0.013468013468013467,
       "grad_norm": NaN,
       "learning_rate": 8e-05,
       "loss": 0.0,
-      "step":
-    },
-    {
-      "epoch": 0.12121212121212122,
-      "grad_norm": NaN,
-      "learning_rate": 9e-05,
-      "loss": 0.0,
-      "step": 9
+      "step": 4
     },
     {
-      "epoch": 0.
+      "epoch": 0.016835016835016835,
       "grad_norm": NaN,
       "learning_rate": 0.0001,
       "loss": 0.0,
-      "step":
-    },
-    {
-      "epoch": 0.14814814814814814,
-      "grad_norm": NaN,
-      "learning_rate": 9.994161134161634e-05,
-      "loss": 0.0,
-      "step": 11
-    },
-    {
-      "epoch": 0.16161616161616163,
-      "grad_norm": NaN,
-      "learning_rate": 9.976658173588244e-05,
-      "loss": 0.0,
-      "step": 12
-    },
-    {
-      "epoch": 0.1750841750841751,
-      "grad_norm": NaN,
-      "learning_rate": 9.947531997255256e-05,
-      "loss": 0.0,
-      "step": 13
-    },
-    {
-      "epoch": 0.18855218855218855,
-      "grad_norm": NaN,
-      "learning_rate": 9.906850630697068e-05,
-      "loss": 0.0,
-      "step": 14
-    },
-    {
-      "epoch": 0.20202020202020202,
-      "grad_norm": NaN,
-      "learning_rate": 9.85470908713026e-05,
-      "loss": 0.0,
-      "step": 15
-    },
-    {
-      "epoch": 0.21548821548821548,
-      "grad_norm": NaN,
-      "learning_rate": 9.791229145545831e-05,
-      "loss": 0.0,
-      "step": 16
-    },
-    {
-      "epoch": 0.22895622895622897,
-      "grad_norm": NaN,
-      "learning_rate": 9.716559066288715e-05,
-      "loss": 0.0,
-      "step": 17
-    },
-    {
-      "epoch": 0.24242424242424243,
-      "grad_norm": NaN,
-      "learning_rate": 9.630873244788883e-05,
-      "loss": 0.0,
-      "step": 18
-    },
-    {
-      "epoch": 0.2558922558922559,
-      "grad_norm": NaN,
-      "learning_rate": 9.534371804252728e-05,
-      "loss": 0.0,
-      "step": 19
-    },
-    {
-      "epoch": 0.2558922558922559,
-      "eval_loss": NaN,
-      "eval_runtime": 4.1562,
-      "eval_samples_per_second": 30.076,
-      "eval_steps_per_second": 15.158,
-      "step": 19
-    },
-    {
-      "epoch": 0.26936026936026936,
-      "grad_norm": NaN,
-      "learning_rate": 9.42728012826605e-05,
-      "loss": 0.0,
-      "step": 20
-    },
-    {
-      "epoch": 0.2828282828282828,
-      "grad_norm": NaN,
-      "learning_rate": 9.309848334400246e-05,
-      "loss": 0.0,
-      "step": 21
-    },
-    {
-      "epoch": 0.2962962962962963,
-      "grad_norm": NaN,
-      "learning_rate": 9.182350690051133e-05,
-      "loss": 0.0,
-      "step": 22
-    },
-    {
-      "epoch": 0.30976430976430974,
-      "grad_norm": NaN,
-      "learning_rate": 9.045084971874738e-05,
-      "loss": 0.0,
-      "step": 23
-    },
-    {
-      "epoch": 0.32323232323232326,
-      "grad_norm": NaN,
-      "learning_rate": 8.898371770316111e-05,
-      "loss": 0.0,
-      "step": 24
-    },
-    {
-      "epoch": 0.3367003367003367,
-      "grad_norm": NaN,
-      "learning_rate": 8.742553740855506e-05,
-      "loss": 0.0,
-      "step": 25
-    },
-    {
-      "epoch": 0.3501683501683502,
-      "grad_norm": NaN,
-      "learning_rate": 8.577994803720606e-05,
-      "loss": 0.0,
-      "step": 26
-    },
-    {
-      "epoch": 0.36363636363636365,
-      "grad_norm": NaN,
-      "learning_rate": 8.405079293933986e-05,
-      "loss": 0.0,
-      "step": 27
-    },
-    {
-      "epoch": 0.3771043771043771,
-      "grad_norm": NaN,
-      "learning_rate": 8.224211063680853e-05,
-      "loss": 0.0,
-      "step": 28
-    },
-    {
-      "epoch": 0.39057239057239057,
-      "grad_norm": NaN,
-      "learning_rate": 8.035812539093557e-05,
-      "loss": 0.0,
-      "step": 29
-    },
-    {
-      "epoch": 0.40404040404040403,
-      "grad_norm": NaN,
-      "learning_rate": 7.840323733655778e-05,
-      "loss": 0.0,
-      "step": 30
-    },
-    {
-      "epoch": 0.4175084175084175,
-      "grad_norm": NaN,
-      "learning_rate": 7.638201220530665e-05,
-      "loss": 0.0,
-      "step": 31
-    },
-    {
-      "epoch": 0.43097643097643096,
-      "grad_norm": NaN,
-      "learning_rate": 7.42991706621303e-05,
-      "loss": 0.0,
-      "step": 32
-    },
-    {
-      "epoch": 0.4444444444444444,
-      "grad_norm": NaN,
-      "learning_rate": 7.215957727996207e-05,
-      "loss": 0.0,
-      "step": 33
-    },
-    {
-      "epoch": 0.45791245791245794,
-      "grad_norm": NaN,
-      "learning_rate": 6.996822917828477e-05,
-      "loss": 0.0,
-      "step": 34
-    },
-    {
-      "epoch": 0.4713804713804714,
-      "grad_norm": NaN,
-      "learning_rate": 6.773024435212678e-05,
-      "loss": 0.0,
-      "step": 35
-    },
-    {
-      "epoch": 0.48484848484848486,
-      "grad_norm": NaN,
-      "learning_rate": 6.545084971874738e-05,
-      "loss": 0.0,
-      "step": 36
-    },
-    {
-      "epoch": 0.4983164983164983,
-      "grad_norm": NaN,
-      "learning_rate": 6.313536890992935e-05,
-      "loss": 0.0,
-      "step": 37
+      "step": 5
     },
     {
-      "epoch": 0.
+      "epoch": 0.020202020202020204,
       "grad_norm": NaN,
-      "learning_rate":
+      "learning_rate": 0.00012,
       "loss": 0.0,
-      "step":
+      "step": 6
     },
     {
-      "epoch": 0.
+      "epoch": 0.020202020202020204,
       "eval_loss": NaN,
-      "eval_runtime":
-      "eval_samples_per_second":
-      "eval_steps_per_second":
-      "step":
-    },
-    {
-      "epoch": 0.5252525252525253,
-      "grad_norm": NaN,
-      "learning_rate": 5.841785206735192e-05,
-      "loss": 0.0,
-      "step": 39
-    },
-    {
-      "epoch": 0.5387205387205387,
-      "grad_norm": NaN,
-      "learning_rate": 5.602683401276615e-05,
-      "loss": 0.0,
-      "step": 40
-    },
-    {
-      "epoch": 0.5521885521885522,
-      "grad_norm": NaN,
-      "learning_rate": 5.3621740008088126e-05,
-      "loss": 0.0,
-      "step": 41
-    },
-    {
-      "epoch": 0.5656565656565656,
-      "grad_norm": NaN,
-      "learning_rate": 5.1208187261806615e-05,
-      "loss": 0.0,
-      "step": 42
-    },
-    {
-      "epoch": 0.5791245791245792,
-      "grad_norm": NaN,
-      "learning_rate": 4.87918127381934e-05,
-      "loss": 0.0,
-      "step": 43
-    },
-    {
-      "epoch": 0.5925925925925926,
-      "grad_norm": NaN,
-      "learning_rate": 4.6378259991911886e-05,
-      "loss": 0.0,
-      "step": 44
-    },
-    {
-      "epoch": 0.6060606060606061,
-      "grad_norm": NaN,
-      "learning_rate": 4.397316598723385e-05,
-      "loss": 0.0,
-      "step": 45
-    },
-    {
-      "epoch": 0.6195286195286195,
-      "grad_norm": NaN,
-      "learning_rate": 4.1582147932648074e-05,
-      "loss": 0.0,
-      "step": 46
-    },
-    {
-      "epoch": 0.632996632996633,
-      "grad_norm": NaN,
-      "learning_rate": 3.92107901616097e-05,
-      "loss": 0.0,
-      "step": 47
-    },
-    {
-      "epoch": 0.6464646464646465,
-      "grad_norm": NaN,
-      "learning_rate": 3.6864631090070655e-05,
-      "loss": 0.0,
-      "step": 48
-    },
-    {
-      "epoch": 0.6599326599326599,
-      "grad_norm": NaN,
-      "learning_rate": 3.4549150281252636e-05,
-      "loss": 0.0,
-      "step": 49
-    },
-    {
-      "epoch": 0.6734006734006734,
-      "grad_norm": NaN,
-      "learning_rate": 3.226975564787322e-05,
-      "loss": 0.0,
-      "step": 50
-    },
-    {
-      "epoch": 0.6868686868686869,
-      "grad_norm": NaN,
-      "learning_rate": 3.003177082171523e-05,
-      "loss": 0.0,
-      "step": 51
-    },
-    {
-      "epoch": 0.7003367003367004,
-      "grad_norm": NaN,
-      "learning_rate": 2.784042272003794e-05,
-      "loss": 0.0,
-      "step": 52
-    },
-    {
-      "epoch": 0.7138047138047138,
-      "grad_norm": NaN,
-      "learning_rate": 2.57008293378697e-05,
-      "loss": 0.0,
-      "step": 53
-    },
-    {
-      "epoch": 0.7272727272727273,
-      "grad_norm": NaN,
-      "learning_rate": 2.361798779469336e-05,
-      "loss": 0.0,
-      "step": 54
+      "eval_runtime": 5.6056,
+      "eval_samples_per_second": 22.299,
+      "eval_steps_per_second": 11.239,
+      "step": 6
     },
     {
-      "epoch": 0.
+      "epoch": 0.02356902356902357,
       "grad_norm": NaN,
-      "learning_rate":
+      "learning_rate": 0.00014,
       "loss": 0.0,
-      "step":
+      "step": 7
     },
     {
-      "epoch": 0.
+      "epoch": 0.026936026936026935,
       "grad_norm": NaN,
-      "learning_rate":
+      "learning_rate": 0.00016,
       "loss": 0.0,
-      "step":
+      "step": 8
     },
     {
-      "epoch": 0.
+      "epoch": 0.030303030303030304,
       "grad_norm": NaN,
-      "learning_rate":
+      "learning_rate": 0.00018,
       "loss": 0.0,
-      "step":
+      "step": 9
     },
     {
-      "epoch": 0.
+      "epoch": 0.030303030303030304,
       "eval_loss": NaN,
-      "eval_runtime":
-      "eval_samples_per_second":
-      "eval_steps_per_second":
-      "step":
-    },
-    {
-      "epoch": 0.7811447811447811,
-      "grad_norm": NaN,
-      "learning_rate": 1.5949207060660138e-05,
-      "loss": 0.0,
-      "step": 58
-    },
-    {
-      "epoch": 0.7946127946127947,
-      "grad_norm": NaN,
-      "learning_rate": 1.422005196279395e-05,
-      "loss": 0.0,
-      "step": 59
-    },
-    {
-      "epoch": 0.8080808080808081,
-      "grad_norm": NaN,
-      "learning_rate": 1.257446259144494e-05,
-      "loss": 0.0,
-      "step": 60
-    },
-    {
-      "epoch": 0.8215488215488216,
-      "grad_norm": NaN,
-      "learning_rate": 1.1016282296838887e-05,
-      "loss": 0.0,
-      "step": 61
-    },
-    {
-      "epoch": 0.835016835016835,
-      "grad_norm": NaN,
-      "learning_rate": 9.549150281252633e-06,
-      "loss": 0.0,
-      "step": 62
-    },
-    {
-      "epoch": 0.8484848484848485,
-      "grad_norm": NaN,
-      "learning_rate": 8.176493099488663e-06,
-      "loss": 0.0,
-      "step": 63
-    },
-    {
-      "epoch": 0.8619528619528619,
-      "grad_norm": NaN,
-      "learning_rate": 6.901516655997536e-06,
-      "loss": 0.0,
-      "step": 64
-    },
-    {
-      "epoch": 0.8754208754208754,
-      "grad_norm": NaN,
-      "learning_rate": 5.727198717339511e-06,
-      "loss": 0.0,
-      "step": 65
-    },
-    {
-      "epoch": 0.8888888888888888,
-      "grad_norm": NaN,
-      "learning_rate": 4.65628195747273e-06,
-      "loss": 0.0,
-      "step": 66
-    },
-    {
-      "epoch": 0.9023569023569024,
-      "grad_norm": NaN,
-      "learning_rate": 3.691267552111183e-06,
-      "loss": 0.0,
-      "step": 67
-    },
-    {
-      "epoch": 0.9158249158249159,
-      "grad_norm": NaN,
-      "learning_rate": 2.8344093371128424e-06,
-      "loss": 0.0,
-      "step": 68
-    },
-    {
-      "epoch": 0.9292929292929293,
-      "grad_norm": NaN,
-      "learning_rate": 2.087708544541689e-06,
-      "loss": 0.0,
-      "step": 69
-    },
-    {
-      "epoch": 0.9427609427609428,
-      "grad_norm": NaN,
-      "learning_rate": 1.4529091286973995e-06,
-      "loss": 0.0,
-      "step": 70
-    },
-    {
-      "epoch": 0.9562289562289562,
-      "grad_norm": NaN,
-      "learning_rate": 9.314936930293283e-07,
-      "loss": 0.0,
-      "step": 71
-    },
-    {
-      "epoch": 0.9696969696969697,
-      "grad_norm": NaN,
-      "learning_rate": 5.246800274474439e-07,
-      "loss": 0.0,
-      "step": 72
-    },
-    {
-      "epoch": 0.9831649831649831,
-      "grad_norm": NaN,
-      "learning_rate": 2.334182641175686e-07,
-      "loss": 0.0,
-      "step": 73
-    },
-    {
-      "epoch": 0.9966329966329966,
-      "grad_norm": NaN,
-      "learning_rate": 5.838865838366792e-08,
-      "loss": 0.0,
-      "step": 74
+      "eval_runtime": 5.6022,
+      "eval_samples_per_second": 22.313,
+      "eval_steps_per_second": 11.246,
+      "step": 9
     },
     {
-      "epoch":
+      "epoch": 0.03367003367003367,
       "grad_norm": NaN,
-      "learning_rate": 0.
+      "learning_rate": 0.0002,
       "loss": 0.0,
-      "step":
+      "step": 10
     }
   ],
   "logging_steps": 1,
-  "max_steps":
+  "max_steps": 10,
   "num_input_tokens_seen": 0,
-  "num_train_epochs":
-  "save_steps":
+  "num_train_epochs": 1,
+  "save_steps": 3,
   "stateful_callbacks": {
     "TrainerControl": {
       "args": {
@@ -583,7 +128,7 @@
       "attributes": {}
     }
   },
-  "total_flos":
+  "total_flos": 2612900061511680.0,
   "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null
```
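The rewritten `log_history` makes the warmup arithmetic easy to verify: with `lr_scheduler_warmup_steps: 10` and a peak `learning_rate` of 0.0002, step n should run at 0.0002 × n / 10, i.e. 2e-05 at step 1 rising to 0.0002 at step 10, exactly as logged (the cosine decay never kicks in, since `max_steps` equals the warmup length). A hedged sketch that replays this check against a local copy of the file (plain Python; `json.load` accepts the bare `NaN` tokens the Trainer writes):

```python
import json

with open("last-checkpoint/trainer_state.json") as f:
    state = json.load(f)

peak_lr, warmup = 2e-4, 10  # learning_rate and lr_scheduler_warmup_steps from the card
for entry in state["log_history"]:
    if "learning_rate" in entry:  # skip eval-only entries
        expected = peak_lr * entry["step"] / warmup
        assert abs(entry["learning_rate"] - expected) < 1e-12, entry
print(f"all {state['global_step']} logged steps match the warmup schedule")
```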
last-checkpoint/training_args.bin
CHANGED
```diff
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:efc3defe0dd503e0c0cdf0a7d2ed65766aa36a61bcd93b992b3e10bfe3993f74
 size 6776
```
training_args.bin
CHANGED
```diff
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:efc3defe0dd503e0c0cdf0a7d2ed65766aa36a61bcd93b992b3e10bfe3993f74
 size 6776
```