Training in progress, step 600, checkpoint

Browse files

Files changed (7) hide show

checkpoint-600/README.md +1 -1
checkpoint-600/adapter_config.json +9 -2
checkpoint-600/adapter_model.safetensors +2 -2
checkpoint-600/optimizer.pt +2 -2
checkpoint-600/scheduler.pt +1 -1
checkpoint-600/trainer_state.json +123 -123
checkpoint-600/training_args.bin +1 -1

checkpoint-600/README.md CHANGED Viewed

@@ -199,4 +199,4 @@ Carbon emissions can be estimated using the [Machine Learning Impact calculator]
 [More Information Needed]
 ### Framework versions
-- PEFT 0.13.2

 [More Information Needed]
 ### Framework versions
+- PEFT 0.14.0

checkpoint-600/adapter_config.json CHANGED Viewed

@@ -3,6 +3,8 @@
   "auto_mapping": null,
   "base_model_name_or_path": "unsloth/Llama-3.2-1B-Instruct",
   "bias": "none",
   "fan_in_fan_out": false,
   "inference_mode": true,
   "init_lora_weights": true,
@@ -11,6 +13,7 @@
   "layers_to_transform": null,
   "loftq_config": {},
   "lora_alpha": 16,
   "lora_dropout": 0,
   "megatron_config": null,
   "megatron_core": "megatron.core",
@@ -20,9 +23,13 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "up_proj",
     "down_proj",
-    "gate_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,

   "auto_mapping": null,
   "base_model_name_or_path": "unsloth/Llama-3.2-1B-Instruct",
   "bias": "none",
+  "eva_config": null,
+  "exclude_modules": null,
   "fan_in_fan_out": false,
   "inference_mode": true,
   "init_lora_weights": true,
   "layers_to_transform": null,
   "loftq_config": {},
   "lora_alpha": 16,
+  "lora_bias": false,
   "lora_dropout": 0,
   "megatron_config": null,
   "megatron_core": "megatron.core",
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
+    "q_proj",
+    "o_proj",
+    "v_proj",
     "down_proj",
+    "up_proj",
+    "gate_proj",
+    "k_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,

checkpoint-600/adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6d55511db55c22e9481cdb62e936cd07e378ba9c5435ba8bdc16485fefb63f2e
-size 31469800

 version https://git-lfs.github.com/spec/v1
+oid sha256:7e4abd84eb501a43e0be823bf73473d2641b4e716bfd5d73bb82757c8d193555
+size 45118424

checkpoint-600/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e90561c4d9affdace9ed78167aba765dbd349020c201815e4e1a845547f85c94
-size 16089082

 version https://git-lfs.github.com/spec/v1
+oid sha256:93326e1274960d6decb84243e3e57fd46599fd57647309c56240c686aacce6bc
+size 23159546

checkpoint-600/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ae95b990300e394bedcc204f87da7868a72ef45d2c286d15335be5fed4753224
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:8e6ab2b1312663c255ba7ae61be8a65664cb48e4e09398877c3783034333efeb
 size 1064

checkpoint-600/trainer_state.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.7317073170731707,
   "eval_steps": 500,
   "global_step": 600,
   "is_hyper_param_search": false,
@@ -9,218 +9,218 @@
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.024390243902439025,
-      "grad_norm": 0.33140748739242554,
-      "learning_rate": 0.00019631901840490797,
-      "loss": 2.6356,
       "step": 20
     },
     {
-      "epoch": 0.04878048780487805,
-      "grad_norm": 0.26923373341560364,
-      "learning_rate": 0.0001914110429447853,
-      "loss": 2.3438,
       "step": 40
     },
     {
-      "epoch": 0.07317073170731707,
-      "grad_norm": 0.259231835603714,
-      "learning_rate": 0.00018650306748466258,
-      "loss": 2.3363,
       "step": 60
     },
     {
-      "epoch": 0.0975609756097561,
-      "grad_norm": 0.2962253987789154,
-      "learning_rate": 0.00018159509202453987,
-      "loss": 2.278,
       "step": 80
     },
     {
-      "epoch": 0.12195121951219512,
-      "grad_norm": 0.2886357605457306,
-      "learning_rate": 0.0001766871165644172,
-      "loss": 2.2543,
       "step": 100
     },
     {
-      "epoch": 0.14634146341463414,
-      "grad_norm": 0.40607205033302307,
-      "learning_rate": 0.0001717791411042945,
-      "loss": 2.2058,
       "step": 120
     },
     {
-      "epoch": 0.17073170731707318,
-      "grad_norm": 0.4145870804786682,
-      "learning_rate": 0.00016687116564417177,
-      "loss": 2.3036,
       "step": 140
     },
     {
-      "epoch": 0.1951219512195122,
-      "grad_norm": 0.2872335612773895,
-      "learning_rate": 0.00016196319018404909,
-      "loss": 2.1944,
       "step": 160
     },
     {
-      "epoch": 0.21951219512195122,
-      "grad_norm": 0.4880731403827667,
-      "learning_rate": 0.0001570552147239264,
-      "loss": 2.1981,
       "step": 180
     },
     {
-      "epoch": 0.24390243902439024,
-      "grad_norm": 0.3285306394100189,
-      "learning_rate": 0.0001521472392638037,
-      "loss": 2.2533,
       "step": 200
     },
     {
-      "epoch": 0.2682926829268293,
-      "grad_norm": 0.304231196641922,
-      "learning_rate": 0.00014723926380368098,
-      "loss": 2.2148,
       "step": 220
     },
     {
-      "epoch": 0.2926829268292683,
-      "grad_norm": 0.26624172925949097,
-      "learning_rate": 0.00014233128834355828,
-      "loss": 2.204,
       "step": 240
     },
     {
-      "epoch": 0.3170731707317073,
-      "grad_norm": 0.32761627435684204,
-      "learning_rate": 0.0001374233128834356,
-      "loss": 2.1843,
       "step": 260
     },
     {
-      "epoch": 0.34146341463414637,
-      "grad_norm": 0.2757970094680786,
-      "learning_rate": 0.00013251533742331288,
-      "loss": 2.1697,
       "step": 280
     },
     {
-      "epoch": 0.36585365853658536,
-      "grad_norm": 0.31138375401496887,
-      "learning_rate": 0.00012760736196319017,
-      "loss": 2.188,
       "step": 300
     },
     {
-      "epoch": 0.3902439024390244,
-      "grad_norm": 0.31954225897789,
-      "learning_rate": 0.0001226993865030675,
-      "loss": 2.181,
       "step": 320
     },
     {
-      "epoch": 0.4146341463414634,
-      "grad_norm": 0.2957305908203125,
-      "learning_rate": 0.0001177914110429448,
-      "loss": 2.2415,
       "step": 340
     },
     {
-      "epoch": 0.43902439024390244,
-      "grad_norm": 0.30089232325553894,
-      "learning_rate": 0.00011288343558282209,
-      "loss": 2.1094,
       "step": 360
     },
     {
-      "epoch": 0.4634146341463415,
-      "grad_norm": 0.2776000201702118,
-      "learning_rate": 0.00010797546012269939,
-      "loss": 2.2362,
       "step": 380
     },
     {
-      "epoch": 0.4878048780487805,
-      "grad_norm": 0.3909365236759186,
-      "learning_rate": 0.0001030674846625767,
-      "loss": 2.2577,
       "step": 400
     },
     {
-      "epoch": 0.5121951219512195,
-      "grad_norm": 0.3422461748123169,
-      "learning_rate": 9.815950920245399e-05,
-      "loss": 2.2153,
       "step": 420
     },
     {
-      "epoch": 0.5365853658536586,
-      "grad_norm": 0.32766982913017273,
-      "learning_rate": 9.325153374233129e-05,
-      "loss": 2.1739,
       "step": 440
     },
     {
-      "epoch": 0.5609756097560976,
-      "grad_norm": 0.3100208640098572,
-      "learning_rate": 8.83435582822086e-05,
-      "loss": 2.2126,
       "step": 460
     },
     {
-      "epoch": 0.5853658536585366,
-      "grad_norm": 0.368093878030777,
-      "learning_rate": 8.343558282208588e-05,
-      "loss": 2.1664,
       "step": 480
     },
     {
-      "epoch": 0.6097560975609756,
-      "grad_norm": 0.2647826671600342,
-      "learning_rate": 7.85276073619632e-05,
-      "loss": 2.2018,
       "step": 500
     },
     {
-      "epoch": 0.6341463414634146,
-      "grad_norm": 0.3373850882053375,
-      "learning_rate": 7.361963190184049e-05,
-      "loss": 2.1439,
       "step": 520
     },
     {
-      "epoch": 0.6585365853658537,
-      "grad_norm": 0.3536331355571747,
-      "learning_rate": 6.87116564417178e-05,
-      "loss": 2.1583,
       "step": 540
     },
     {
-      "epoch": 0.6829268292682927,
-      "grad_norm": 0.3301340341567993,
-      "learning_rate": 6.380368098159509e-05,
-      "loss": 2.179,
       "step": 560
     },
     {
-      "epoch": 0.7073170731707317,
-      "grad_norm": 0.3494204878807068,
-      "learning_rate": 5.88957055214724e-05,
-      "loss": 2.1753,
       "step": 580
     },
     {
-      "epoch": 0.7317073170731707,
-      "grad_norm": 0.3503289520740509,
-      "learning_rate": 5.3987730061349695e-05,
-      "loss": 2.1724,
       "step": 600
     }
   ],
   "logging_steps": 20,
-  "max_steps": 820,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 1,
   "save_steps": 200,
@@ -236,7 +236,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 1.439777998503936e+17,
   "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null

 {
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.64,
   "eval_steps": 500,
   "global_step": 600,
   "is_hyper_param_search": false,
   "is_world_process_zero": true,
   "log_history": [
     {
+      "epoch": 0.021333333333333333,
+      "grad_norm": 0.46613356471061707,
+      "learning_rate": 0.00019678111587982831,
+      "loss": 1.2799,
       "step": 20
     },
     {
+      "epoch": 0.042666666666666665,
+      "grad_norm": 0.3468495309352875,
+      "learning_rate": 0.0001924892703862661,
+      "loss": 0.8919,
       "step": 40
     },
     {
+      "epoch": 0.064,
+      "grad_norm": 0.4602198004722595,
+      "learning_rate": 0.00018819742489270387,
+      "loss": 0.8586,
       "step": 60
     },
     {
+      "epoch": 0.08533333333333333,
+      "grad_norm": 0.480325311422348,
+      "learning_rate": 0.00018390557939914164,
+      "loss": 0.7571,
       "step": 80
     },
     {
+      "epoch": 0.10666666666666667,
+      "grad_norm": 0.30179363489151,
+      "learning_rate": 0.00017961373390557942,
+      "loss": 0.7793,
       "step": 100
     },
     {
+      "epoch": 0.128,
+      "grad_norm": 0.3483397364616394,
+      "learning_rate": 0.00017532188841201717,
+      "loss": 0.7647,
       "step": 120
     },
     {
+      "epoch": 0.14933333333333335,
+      "grad_norm": 0.29965728521347046,
+      "learning_rate": 0.00017103004291845494,
+      "loss": 0.6741,
       "step": 140
     },
     {
+      "epoch": 0.17066666666666666,
+      "grad_norm": 0.26644188165664673,
+      "learning_rate": 0.00016673819742489272,
+      "loss": 0.7586,
       "step": 160
     },
     {
+      "epoch": 0.192,
+      "grad_norm": 0.2962466776371002,
+      "learning_rate": 0.0001624463519313305,
+      "loss": 0.7364,
       "step": 180
     },
     {
+      "epoch": 0.21333333333333335,
+      "grad_norm": 0.36480244994163513,
+      "learning_rate": 0.00015815450643776824,
+      "loss": 0.7944,
       "step": 200
     },
     {
+      "epoch": 0.23466666666666666,
+      "grad_norm": 0.23963908851146698,
+      "learning_rate": 0.000153862660944206,
+      "loss": 0.7055,
       "step": 220
     },
     {
+      "epoch": 0.256,
+      "grad_norm": 0.3207215666770935,
+      "learning_rate": 0.00014957081545064377,
+      "loss": 0.7495,
       "step": 240
     },
     {
+      "epoch": 0.2773333333333333,
+      "grad_norm": 0.34940779209136963,
+      "learning_rate": 0.00014527896995708155,
+      "loss": 0.7739,
       "step": 260
     },
     {
+      "epoch": 0.2986666666666667,
+      "grad_norm": 0.2764255702495575,
+      "learning_rate": 0.00014098712446351932,
+      "loss": 0.7126,
       "step": 280
     },
     {
+      "epoch": 0.32,
+      "grad_norm": 0.27612850069999695,
+      "learning_rate": 0.0001366952789699571,
+      "loss": 0.7308,
       "step": 300
     },
     {
+      "epoch": 0.3413333333333333,
+      "grad_norm": 0.2778555154800415,
+      "learning_rate": 0.00013240343347639485,
+      "loss": 0.6974,
       "step": 320
     },
     {
+      "epoch": 0.3626666666666667,
+      "grad_norm": 0.33450472354888916,
+      "learning_rate": 0.00012811158798283262,
+      "loss": 0.7557,
       "step": 340
     },
     {
+      "epoch": 0.384,
+      "grad_norm": 0.2894728183746338,
+      "learning_rate": 0.0001238197424892704,
+      "loss": 0.7222,
       "step": 360
     },
     {
+      "epoch": 0.4053333333333333,
+      "grad_norm": 0.34458762407302856,
+      "learning_rate": 0.00011952789699570816,
+      "loss": 0.7467,
       "step": 380
     },
     {
+      "epoch": 0.4266666666666667,
+      "grad_norm": 0.42291197180747986,
+      "learning_rate": 0.00011523605150214594,
+      "loss": 0.6999,
       "step": 400
     },
     {
+      "epoch": 0.448,
+      "grad_norm": 0.3315523862838745,
+      "learning_rate": 0.0001109442060085837,
+      "loss": 0.7554,
       "step": 420
     },
     {
+      "epoch": 0.4693333333333333,
+      "grad_norm": 0.3429376482963562,
+      "learning_rate": 0.00010665236051502145,
+      "loss": 0.7347,
       "step": 440
     },
     {
+      "epoch": 0.49066666666666664,
+      "grad_norm": 0.35784032940864563,
+      "learning_rate": 0.00010236051502145923,
+      "loss": 0.7075,
       "step": 460
     },
     {
+      "epoch": 0.512,
+      "grad_norm": 0.3431866765022278,
+      "learning_rate": 9.8068669527897e-05,
+      "loss": 0.7338,
       "step": 480
     },
     {
+      "epoch": 0.5333333333333333,
+      "grad_norm": 0.34916117787361145,
+      "learning_rate": 9.377682403433476e-05,
+      "loss": 0.727,
       "step": 500
     },
     {
+      "epoch": 0.5546666666666666,
+      "grad_norm": 0.39798951148986816,
+      "learning_rate": 8.948497854077254e-05,
+      "loss": 0.7505,
       "step": 520
     },
     {
+      "epoch": 0.576,
+      "grad_norm": 0.33197829127311707,
+      "learning_rate": 8.51931330472103e-05,
+      "loss": 0.7085,
       "step": 540
     },
     {
+      "epoch": 0.5973333333333334,
+      "grad_norm": 0.31665658950805664,
+      "learning_rate": 8.090128755364808e-05,
+      "loss": 0.699,
       "step": 560
     },
     {
+      "epoch": 0.6186666666666667,
+      "grad_norm": 0.3004639744758606,
+      "learning_rate": 7.660944206008584e-05,
+      "loss": 0.7368,
       "step": 580
     },
     {
+      "epoch": 0.64,
+      "grad_norm": 0.32609978318214417,
+      "learning_rate": 7.23175965665236e-05,
+      "loss": 0.7168,
       "step": 600
     }
   ],
   "logging_steps": 20,
+  "max_steps": 937,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 1,
   "save_steps": 200,
       "attributes": {}
     }
   },
+  "total_flos": 1.444779555028992e+17,
   "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null

checkpoint-600/training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e8d1652c64d5701d7542543217cc8eb0453db379d2bc34a961ad431eb6ebe7fc
 size 5560

 version https://git-lfs.github.com/spec/v1
+oid sha256:f519abf615386e0857c941fcb28a9140901798289aceaff057539afc5159bd3d
 size 5560