Model save

Browse files

Files changed (5) hide show

README.md +3 -3
adapter_config.json +3 -3
adapter_model.safetensors +1 -1
trainer_state.json +39 -438
training_args.bin +1 -1

README.md CHANGED Viewed

@@ -33,12 +33,12 @@ More information needed
 ### Training hyperparameters
 The following hyperparameters were used during training:
-- learning_rate: 5e-05
 - train_batch_size: 16
 - eval_batch_size: 8
 - seed: 42
-- gradient_accumulation_steps: 2
-- total_train_batch_size: 32
 - optimizer: Use adamw_hf with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
 - lr_scheduler_type: linear
 - num_epochs: 4

 ### Training hyperparameters
 The following hyperparameters were used during training:
+- learning_rate: 0.0001
 - train_batch_size: 16
 - eval_batch_size: 8
 - seed: 42
+- gradient_accumulation_steps: 16
+- total_train_batch_size: 256
 - optimizer: Use adamw_hf with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
 - lr_scheduler_type: linear
 - num_epochs: 4

adapter_config.json CHANGED Viewed

@@ -23,12 +23,12 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "q_proj",
     "down_proj",
-    "gate_proj",
-    "up_proj",
     "o_proj",
     "v_proj",
     "k_proj"
   ],
   "task_type": "CAUSAL_LM",

   "rank_pattern": {},
   "revision": null,
   "target_modules": [
     "down_proj",
+    "q_proj",
     "o_proj",
     "v_proj",
+    "up_proj",
+    "gate_proj",
     "k_proj"
   ],
   "task_type": "CAUSAL_LM",

adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:16cd9a8e5f46f50293e70457511a4af4b76946ea70231fbd4014257df5bae496
 size 1684597880

 version https://git-lfs.github.com/spec/v1
+oid sha256:8f75e051f545f7b4b5d813a9d83adce99064d6e449e1f56f879bb0c6f1513994
 size 1684597880

trainer_state.json CHANGED Viewed

@@ -3,478 +3,79 @@
   "best_model_checkpoint": null,
   "epoch": 4.0,
   "eval_steps": 500,
-  "global_step": 16480,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
-    {
-      "epoch": 0.06067961165048544,
-      "grad_norm": 1.273590087890625,
-      "learning_rate": 4.9241504854368934e-05,
-      "loss": 2.4247,
-      "step": 250
-    },
-    {
-      "epoch": 0.12135922330097088,
-      "grad_norm": 1.7102854251861572,
-      "learning_rate": 4.8483009708737866e-05,
-      "loss": 1.5428,
-      "step": 500
-    },
-    {
-      "epoch": 0.1820388349514563,
-      "grad_norm": 1.6643282175064087,
-      "learning_rate": 4.77245145631068e-05,
-      "loss": 1.3308,
-      "step": 750
-    },
-    {
-      "epoch": 0.24271844660194175,
-      "grad_norm": 2.5313572883605957,
-      "learning_rate": 4.696601941747573e-05,
-      "loss": 1.1866,
-      "step": 1000
-    },
-    {
-      "epoch": 0.30339805825242716,
-      "grad_norm": 2.8626794815063477,
-      "learning_rate": 4.620752427184466e-05,
-      "loss": 1.0742,
-      "step": 1250
-    },
-    {
-      "epoch": 0.3640776699029126,
-      "grad_norm": 6.631696701049805,
-      "learning_rate": 4.544902912621359e-05,
-      "loss": 0.9929,
-      "step": 1500
-    },
-    {
-      "epoch": 0.42475728155339804,
-      "grad_norm": 3.0348339080810547,
-      "learning_rate": 4.469053398058253e-05,
-      "loss": 0.916,
-      "step": 1750
-    },
     {
       "epoch": 0.4854368932038835,
-      "grad_norm": 2.9426558017730713,
-      "learning_rate": 4.393203883495146e-05,
-      "loss": 0.8828,
-      "step": 2000
-    },
-    {
-      "epoch": 0.5461165048543689,
-      "grad_norm": 2.728257417678833,
-      "learning_rate": 4.3173543689320386e-05,
-      "loss": 0.8532,
-      "step": 2250
-    },
-    {
-      "epoch": 0.6067961165048543,
-      "grad_norm": 3.021036386489868,
-      "learning_rate": 4.2415048543689325e-05,
-      "loss": 0.8377,
-      "step": 2500
-    },
-    {
-      "epoch": 0.6674757281553398,
-      "grad_norm": 2.473598003387451,
-      "learning_rate": 4.1656553398058256e-05,
-      "loss": 0.8116,
-      "step": 2750
-    },
-    {
-      "epoch": 0.7281553398058253,
-      "grad_norm": 2.487483024597168,
-      "learning_rate": 4.089805825242719e-05,
-      "loss": 0.7921,
-      "step": 3000
-    },
-    {
-      "epoch": 0.7888349514563107,
-      "grad_norm": 2.476027488708496,
-      "learning_rate": 4.013956310679612e-05,
-      "loss": 0.7764,
-      "step": 3250
-    },
-    {
-      "epoch": 0.8495145631067961,
-      "grad_norm": 2.163785696029663,
-      "learning_rate": 3.938106796116505e-05,
-      "loss": 0.7689,
-      "step": 3500
-    },
-    {
-      "epoch": 0.9101941747572816,
-      "grad_norm": 2.3139920234680176,
-      "learning_rate": 3.862257281553398e-05,
-      "loss": 0.7681,
-      "step": 3750
     },
     {
       "epoch": 0.970873786407767,
-      "grad_norm": 2.3527328968048096,
-      "learning_rate": 3.7864077669902914e-05,
-      "loss": 0.7573,
-      "step": 4000
-    },
-    {
-      "epoch": 1.0315533980582525,
-      "grad_norm": 2.44330096244812,
-      "learning_rate": 3.7105582524271846e-05,
-      "loss": 0.7521,
-      "step": 4250
-    },
-    {
-      "epoch": 1.0922330097087378,
-      "grad_norm": 2.1583352088928223,
-      "learning_rate": 3.634708737864078e-05,
-      "loss": 0.7423,
-      "step": 4500
-    },
-    {
-      "epoch": 1.1529126213592233,
-      "grad_norm": 2.255215644836426,
-      "learning_rate": 3.558859223300971e-05,
-      "loss": 0.7321,
-      "step": 4750
-    },
-    {
-      "epoch": 1.2135922330097086,
-      "grad_norm": 2.872974395751953,
-      "learning_rate": 3.483009708737864e-05,
-      "loss": 0.7289,
-      "step": 5000
-    },
-    {
-      "epoch": 1.2742718446601942,
-      "grad_norm": 2.6156952381134033,
-      "learning_rate": 3.407160194174757e-05,
-      "loss": 0.7313,
-      "step": 5250
-    },
-    {
-      "epoch": 1.3349514563106797,
-      "grad_norm": 2.401933193206787,
-      "learning_rate": 3.3313106796116504e-05,
-      "loss": 0.7337,
-      "step": 5500
-    },
-    {
-      "epoch": 1.395631067961165,
-      "grad_norm": 2.6419358253479004,
-      "learning_rate": 3.255461165048544e-05,
-      "loss": 0.728,
-      "step": 5750
     },
     {
       "epoch": 1.4563106796116505,
-      "grad_norm": 3.051607608795166,
-      "learning_rate": 3.1796116504854373e-05,
-      "loss": 0.7162,
-      "step": 6000
-    },
-    {
-      "epoch": 1.516990291262136,
-      "grad_norm": 2.9298923015594482,
-      "learning_rate": 3.10376213592233e-05,
-      "loss": 0.7044,
-      "step": 6250
-    },
-    {
-      "epoch": 1.5776699029126213,
-      "grad_norm": 2.739171028137207,
-      "learning_rate": 3.0279126213592237e-05,
-      "loss": 0.698,
-      "step": 6500
-    },
-    {
-      "epoch": 1.6383495145631068,
-      "grad_norm": 3.524129629135132,
-      "learning_rate": 2.9520631067961168e-05,
-      "loss": 0.7058,
-      "step": 6750
-    },
-    {
-      "epoch": 1.6990291262135924,
-      "grad_norm": 3.441566228866577,
-      "learning_rate": 2.8762135922330096e-05,
-      "loss": 0.6878,
-      "step": 7000
-    },
-    {
-      "epoch": 1.7597087378640777,
-      "grad_norm": 3.8912746906280518,
-      "learning_rate": 2.800364077669903e-05,
-      "loss": 0.6875,
-      "step": 7250
-    },
-    {
-      "epoch": 1.820388349514563,
-      "grad_norm": 3.1602139472961426,
-      "learning_rate": 2.7245145631067963e-05,
-      "loss": 0.6851,
-      "step": 7500
-    },
-    {
-      "epoch": 1.8810679611650487,
-      "grad_norm": 3.7168660163879395,
-      "learning_rate": 2.648665048543689e-05,
-      "loss": 0.6841,
-      "step": 7750
     },
     {
       "epoch": 1.941747572815534,
-      "grad_norm": 3.787256956100464,
-      "learning_rate": 2.5728155339805826e-05,
-      "loss": 0.6741,
-      "step": 8000
-    },
-    {
-      "epoch": 2.0024271844660193,
-      "grad_norm": 3.769205331802368,
-      "learning_rate": 2.4969660194174758e-05,
-      "loss": 0.6617,
-      "step": 8250
-    },
-    {
-      "epoch": 2.063106796116505,
-      "grad_norm": 5.271848201751709,
-      "learning_rate": 2.4211165048543692e-05,
-      "loss": 0.6576,
-      "step": 8500
-    },
-    {
-      "epoch": 2.1237864077669903,
-      "grad_norm": 4.380245208740234,
-      "learning_rate": 2.345266990291262e-05,
-      "loss": 0.6534,
-      "step": 8750
-    },
-    {
-      "epoch": 2.1844660194174756,
-      "grad_norm": 4.646461009979248,
-      "learning_rate": 2.2694174757281556e-05,
-      "loss": 0.6467,
-      "step": 9000
-    },
-    {
-      "epoch": 2.2451456310679614,
-      "grad_norm": 4.234325885772705,
-      "learning_rate": 2.1935679611650487e-05,
-      "loss": 0.6494,
-      "step": 9250
-    },
-    {
-      "epoch": 2.3058252427184467,
-      "grad_norm": 4.548443794250488,
-      "learning_rate": 2.117718446601942e-05,
-      "loss": 0.6407,
-      "step": 9500
-    },
-    {
-      "epoch": 2.366504854368932,
-      "grad_norm": 4.539821624755859,
-      "learning_rate": 2.041868932038835e-05,
-      "loss": 0.6275,
-      "step": 9750
     },
     {
       "epoch": 2.4271844660194173,
-      "grad_norm": 5.217247009277344,
-      "learning_rate": 1.9660194174757282e-05,
-      "loss": 0.6258,
-      "step": 10000
-    },
-    {
-      "epoch": 2.487864077669903,
-      "grad_norm": 5.887857913970947,
-      "learning_rate": 1.8901699029126217e-05,
-      "loss": 0.6184,
-      "step": 10250
-    },
-    {
-      "epoch": 2.5485436893203883,
-      "grad_norm": 4.96919584274292,
-      "learning_rate": 1.814320388349515e-05,
-      "loss": 0.622,
-      "step": 10500
-    },
-    {
-      "epoch": 2.6092233009708736,
-      "grad_norm": 5.683660984039307,
-      "learning_rate": 1.7384708737864077e-05,
-      "loss": 0.6203,
-      "step": 10750
-    },
-    {
-      "epoch": 2.6699029126213594,
-      "grad_norm": 7.270623683929443,
-      "learning_rate": 1.662621359223301e-05,
-      "loss": 0.6193,
-      "step": 11000
-    },
-    {
-      "epoch": 2.7305825242718447,
-      "grad_norm": 6.315695762634277,
-      "learning_rate": 1.5867718446601943e-05,
-      "loss": 0.6109,
-      "step": 11250
-    },
-    {
-      "epoch": 2.79126213592233,
-      "grad_norm": 5.317906856536865,
-      "learning_rate": 1.5109223300970873e-05,
-      "loss": 0.606,
-      "step": 11500
-    },
-    {
-      "epoch": 2.8519417475728153,
-      "grad_norm": 5.550199508666992,
-      "learning_rate": 1.4350728155339808e-05,
-      "loss": 0.6033,
-      "step": 11750
     },
     {
       "epoch": 2.912621359223301,
-      "grad_norm": 5.8281941413879395,
-      "learning_rate": 1.3592233009708738e-05,
-      "loss": 0.5946,
-      "step": 12000
-    },
-    {
-      "epoch": 2.9733009708737863,
-      "grad_norm": 5.76870059967041,
-      "learning_rate": 1.2833737864077671e-05,
-      "loss": 0.59,
-      "step": 12250
-    },
-    {
-      "epoch": 3.033980582524272,
-      "grad_norm": 7.43914270401001,
-      "learning_rate": 1.2075242718446603e-05,
-      "loss": 0.5843,
-      "step": 12500
-    },
-    {
-      "epoch": 3.0946601941747574,
-      "grad_norm": 5.4104390144348145,
-      "learning_rate": 1.1316747572815536e-05,
-      "loss": 0.5726,
-      "step": 12750
-    },
-    {
-      "epoch": 3.1553398058252426,
-      "grad_norm": 7.808868885040283,
-      "learning_rate": 1.0558252427184466e-05,
-      "loss": 0.5755,
-      "step": 13000
-    },
-    {
-      "epoch": 3.216019417475728,
-      "grad_norm": 5.40352201461792,
-      "learning_rate": 9.799757281553399e-06,
-      "loss": 0.5661,
-      "step": 13250
-    },
-    {
-      "epoch": 3.2766990291262137,
-      "grad_norm": 6.617573261260986,
-      "learning_rate": 9.04126213592233e-06,
-      "loss": 0.5662,
-      "step": 13500
-    },
-    {
-      "epoch": 3.337378640776699,
-      "grad_norm": 6.450667858123779,
-      "learning_rate": 8.282766990291264e-06,
-      "loss": 0.5668,
-      "step": 13750
     },
     {
       "epoch": 3.3980582524271843,
-      "grad_norm": 7.024094104766846,
-      "learning_rate": 7.524271844660194e-06,
-      "loss": 0.5654,
-      "step": 14000
-    },
-    {
-      "epoch": 3.45873786407767,
-      "grad_norm": 6.141932487487793,
-      "learning_rate": 6.765776699029126e-06,
-      "loss": 0.564,
-      "step": 14250
-    },
-    {
-      "epoch": 3.5194174757281553,
-      "grad_norm": 6.2105817794799805,
-      "learning_rate": 6.0072815533980584e-06,
-      "loss": 0.5592,
-      "step": 14500
-    },
-    {
-      "epoch": 3.5800970873786406,
-      "grad_norm": 5.812750816345215,
-      "learning_rate": 5.24878640776699e-06,
-      "loss": 0.5577,
-      "step": 14750
-    },
-    {
-      "epoch": 3.6407766990291264,
-      "grad_norm": 6.361168384552002,
-      "learning_rate": 4.490291262135922e-06,
-      "loss": 0.556,
-      "step": 15000
-    },
-    {
-      "epoch": 3.7014563106796117,
-      "grad_norm": 5.725775241851807,
-      "learning_rate": 3.7317961165048544e-06,
-      "loss": 0.5499,
-      "step": 15250
-    },
-    {
-      "epoch": 3.762135922330097,
-      "grad_norm": 5.730428695678711,
-      "learning_rate": 2.973300970873787e-06,
-      "loss": 0.5507,
-      "step": 15500
-    },
-    {
-      "epoch": 3.8228155339805827,
-      "grad_norm": 6.114981651306152,
-      "learning_rate": 2.2148058252427188e-06,
-      "loss": 0.5561,
-      "step": 15750
     },
     {
       "epoch": 3.883495145631068,
-      "grad_norm": 5.782876491546631,
-      "learning_rate": 1.4563106796116506e-06,
-      "loss": 0.5454,
-      "step": 16000
-    },
-    {
-      "epoch": 3.9441747572815533,
-      "grad_norm": 5.954553127288818,
-      "learning_rate": 6.978155339805825e-07,
-      "loss": 0.5511,
-      "step": 16250
     },
     {
       "epoch": 4.0,
-      "step": 16480,
       "total_flos": 1.4507840127190487e+18,
-      "train_loss": 0.7319364158852586,
-      "train_runtime": 27052.3827,
-      "train_samples_per_second": 19.494,
-      "train_steps_per_second": 0.609
     }
   ],
   "logging_steps": 250,
-  "max_steps": 16480,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 4,
   "save_steps": 500,

   "best_model_checkpoint": null,
   "epoch": 4.0,
   "eval_steps": 500,
+  "global_step": 2060,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
       "epoch": 0.4854368932038835,
+      "grad_norm": 9.382500648498535,
+      "learning_rate": 8.786407766990292e-05,
+      "loss": 14.3109,
+      "step": 250
     },
     {
       "epoch": 0.970873786407767,
+      "grad_norm": 8.183174133300781,
+      "learning_rate": 7.572815533980583e-05,
+      "loss": 7.1849,
+      "step": 500
     },
     {
       "epoch": 1.4563106796116505,
+      "grad_norm": 6.754209995269775,
+      "learning_rate": 6.359223300970875e-05,
+      "loss": 6.0557,
+      "step": 750
     },
     {
       "epoch": 1.941747572815534,
+      "grad_norm": 7.222278118133545,
+      "learning_rate": 5.145631067961165e-05,
+      "loss": 5.7214,
+      "step": 1000
     },
     {
       "epoch": 2.4271844660194173,
+      "grad_norm": 7.9512224197387695,
+      "learning_rate": 3.9320388349514564e-05,
+      "loss": 5.4678,
+      "step": 1250
     },
     {
       "epoch": 2.912621359223301,
+      "grad_norm": 10.402533531188965,
+      "learning_rate": 2.7184466019417475e-05,
+      "loss": 5.2727,
+      "step": 1500
     },
     {
       "epoch": 3.3980582524271843,
+      "grad_norm": 10.867375373840332,
+      "learning_rate": 1.5048543689320387e-05,
+      "loss": 5.0298,
+      "step": 1750
     },
     {
       "epoch": 3.883495145631068,
+      "grad_norm": 10.53590202331543,
+      "learning_rate": 2.912621359223301e-06,
+      "loss": 4.9242,
+      "step": 2000
     },
     {
       "epoch": 4.0,
+      "step": 2060,
       "total_flos": 1.4507840127190487e+18,
+      "train_loss": 6.692458765715071,
+      "train_runtime": 26455.3227,
+      "train_samples_per_second": 19.934,
+      "train_steps_per_second": 0.078
     }
   ],
   "logging_steps": 250,
+  "max_steps": 2060,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 4,
   "save_steps": 500,

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b2e7716b1d6e8dbd33b04ecd011c472406754200622be7ce2152850376ddfd0c
 size 5304

 version https://git-lfs.github.com/spec/v1
+oid sha256:e88d1bb0004a1075ae449c7aad3943d80eab26cf78f5dab269161b817eae4ee7
 size 5304