End of training

Browse files

Files changed (7) hide show

README.md +2 -1
all_results.json +12 -0
eval_results.json +7 -0
train_results.json +8 -0
trainer_state.json +1144 -0
training_eval_loss.png +0 -0
training_loss.png +0 -0

README.md CHANGED Viewed

@@ -4,6 +4,7 @@ license: llama3.1
 base_model: meta-llama/Meta-Llama-3.1-8B
 tags:
 - llama-factory
 - generated_from_trainer
 model-index:
 - name: top_14_ranking_stackexchange
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
 # top_14_ranking_stackexchange
-This model is a fine-tuned version of [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) on an unknown dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.8005

 base_model: meta-llama/Meta-Llama-3.1-8B
 tags:
 - llama-factory
+- full
 - generated_from_trainer
 model-index:
 - name: top_14_ranking_stackexchange
 # top_14_ranking_stackexchange
+This model is a fine-tuned version of [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) on the mlfoundations-dev/top_14_ranking_stackexchange dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.8005

all_results.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+    "epoch": 2.997084548104956,
+    "eval_loss": 0.8005240559577942,
+    "eval_runtime": 555.4162,
+    "eval_samples_per_second": 24.96,
+    "eval_steps_per_second": 0.391,
+    "total_flos": 2582698052812800.0,
+    "train_loss": 0.7834810720498151,
+    "train_runtime": 91136.0835,
+    "train_samples_per_second": 8.67,
+    "train_steps_per_second": 0.017
+}

eval_results.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+    "epoch": 2.997084548104956,
+    "eval_loss": 0.8005240559577942,
+    "eval_runtime": 555.4162,
+    "eval_samples_per_second": 24.96,
+    "eval_steps_per_second": 0.391
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 2.997084548104956,
+    "total_flos": 2582698052812800.0,
+    "train_loss": 0.7834810720498151,
+    "train_runtime": 91136.0835,
+    "train_samples_per_second": 8.67,
+    "train_steps_per_second": 0.017
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1144 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.997084548104956,
+  "eval_steps": 500,
+  "global_step": 1542,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.019436345966958212,
+      "grad_norm": 3.5477320746078096,
+      "learning_rate": 5e-06,
+      "loss": 1.0532,
+      "step": 10
+    },
+    {
+      "epoch": 0.038872691933916424,
+      "grad_norm": 2.8209235996953512,
+      "learning_rate": 5e-06,
+      "loss": 0.9684,
+      "step": 20
+    },
+    {
+      "epoch": 0.05830903790087463,
+      "grad_norm": 1.1257852692164845,
+      "learning_rate": 5e-06,
+      "loss": 0.9294,
+      "step": 30
+    },
+    {
+      "epoch": 0.07774538386783285,
+      "grad_norm": 0.888481539231949,
+      "learning_rate": 5e-06,
+      "loss": 0.9201,
+      "step": 40
+    },
+    {
+      "epoch": 0.09718172983479106,
+      "grad_norm": 0.6469537614046023,
+      "learning_rate": 5e-06,
+      "loss": 0.8971,
+      "step": 50
+    },
+    {
+      "epoch": 0.11661807580174927,
+      "grad_norm": 0.592544408615531,
+      "learning_rate": 5e-06,
+      "loss": 0.8837,
+      "step": 60
+    },
+    {
+      "epoch": 0.1360544217687075,
+      "grad_norm": 0.6433313667438364,
+      "learning_rate": 5e-06,
+      "loss": 0.8749,
+      "step": 70
+    },
+    {
+      "epoch": 0.1554907677356657,
+      "grad_norm": 1.344429915972531,
+      "learning_rate": 5e-06,
+      "loss": 0.867,
+      "step": 80
+    },
+    {
+      "epoch": 0.1749271137026239,
+      "grad_norm": 0.8707935340603853,
+      "learning_rate": 5e-06,
+      "loss": 0.8623,
+      "step": 90
+    },
+    {
+      "epoch": 0.19436345966958213,
+      "grad_norm": 0.7828161719097383,
+      "learning_rate": 5e-06,
+      "loss": 0.8624,
+      "step": 100
+    },
+    {
+      "epoch": 0.21379980563654033,
+      "grad_norm": 0.682126021781355,
+      "learning_rate": 5e-06,
+      "loss": 0.8611,
+      "step": 110
+    },
+    {
+      "epoch": 0.23323615160349853,
+      "grad_norm": 0.8081328723422389,
+      "learning_rate": 5e-06,
+      "loss": 0.8551,
+      "step": 120
+    },
+    {
+      "epoch": 0.25267249757045673,
+      "grad_norm": 0.6691994077909735,
+      "learning_rate": 5e-06,
+      "loss": 0.8583,
+      "step": 130
+    },
+    {
+      "epoch": 0.272108843537415,
+      "grad_norm": 0.586140300081327,
+      "learning_rate": 5e-06,
+      "loss": 0.8523,
+      "step": 140
+    },
+    {
+      "epoch": 0.2915451895043732,
+      "grad_norm": 0.8552001050583659,
+      "learning_rate": 5e-06,
+      "loss": 0.8497,
+      "step": 150
+    },
+    {
+      "epoch": 0.3109815354713314,
+      "grad_norm": 0.501523965918074,
+      "learning_rate": 5e-06,
+      "loss": 0.8464,
+      "step": 160
+    },
+    {
+      "epoch": 0.3304178814382896,
+      "grad_norm": 0.6785445153206255,
+      "learning_rate": 5e-06,
+      "loss": 0.848,
+      "step": 170
+    },
+    {
+      "epoch": 0.3498542274052478,
+      "grad_norm": 0.6494060293856031,
+      "learning_rate": 5e-06,
+      "loss": 0.8459,
+      "step": 180
+    },
+    {
+      "epoch": 0.369290573372206,
+      "grad_norm": 0.6020737926390343,
+      "learning_rate": 5e-06,
+      "loss": 0.8484,
+      "step": 190
+    },
+    {
+      "epoch": 0.38872691933916426,
+      "grad_norm": 0.48912875825316915,
+      "learning_rate": 5e-06,
+      "loss": 0.8449,
+      "step": 200
+    },
+    {
+      "epoch": 0.40816326530612246,
+      "grad_norm": 0.8781137047011839,
+      "learning_rate": 5e-06,
+      "loss": 0.8395,
+      "step": 210
+    },
+    {
+      "epoch": 0.42759961127308066,
+      "grad_norm": 0.5879468826056136,
+      "learning_rate": 5e-06,
+      "loss": 0.8387,
+      "step": 220
+    },
+    {
+      "epoch": 0.44703595724003886,
+      "grad_norm": 0.6017675792916065,
+      "learning_rate": 5e-06,
+      "loss": 0.8363,
+      "step": 230
+    },
+    {
+      "epoch": 0.46647230320699706,
+      "grad_norm": 0.616647981494789,
+      "learning_rate": 5e-06,
+      "loss": 0.8336,
+      "step": 240
+    },
+    {
+      "epoch": 0.4859086491739553,
+      "grad_norm": 0.855268617382177,
+      "learning_rate": 5e-06,
+      "loss": 0.8336,
+      "step": 250
+    },
+    {
+      "epoch": 0.5053449951409135,
+      "grad_norm": 0.780188445457583,
+      "learning_rate": 5e-06,
+      "loss": 0.831,
+      "step": 260
+    },
+    {
+      "epoch": 0.5247813411078717,
+      "grad_norm": 0.5840816004625115,
+      "learning_rate": 5e-06,
+      "loss": 0.8319,
+      "step": 270
+    },
+    {
+      "epoch": 0.54421768707483,
+      "grad_norm": 0.6535257947752856,
+      "learning_rate": 5e-06,
+      "loss": 0.833,
+      "step": 280
+    },
+    {
+      "epoch": 0.5636540330417882,
+      "grad_norm": 0.6710102563759031,
+      "learning_rate": 5e-06,
+      "loss": 0.8354,
+      "step": 290
+    },
+    {
+      "epoch": 0.5830903790087464,
+      "grad_norm": 0.5947252085113186,
+      "learning_rate": 5e-06,
+      "loss": 0.8315,
+      "step": 300
+    },
+    {
+      "epoch": 0.6025267249757046,
+      "grad_norm": 0.5902161171422673,
+      "learning_rate": 5e-06,
+      "loss": 0.83,
+      "step": 310
+    },
+    {
+      "epoch": 0.6219630709426628,
+      "grad_norm": 0.5757165816873938,
+      "learning_rate": 5e-06,
+      "loss": 0.8299,
+      "step": 320
+    },
+    {
+      "epoch": 0.641399416909621,
+      "grad_norm": 0.6751753868245474,
+      "learning_rate": 5e-06,
+      "loss": 0.827,
+      "step": 330
+    },
+    {
+      "epoch": 0.6608357628765792,
+      "grad_norm": 0.535560817394151,
+      "learning_rate": 5e-06,
+      "loss": 0.8264,
+      "step": 340
+    },
+    {
+      "epoch": 0.6802721088435374,
+      "grad_norm": 0.6183377886108462,
+      "learning_rate": 5e-06,
+      "loss": 0.8243,
+      "step": 350
+    },
+    {
+      "epoch": 0.6997084548104956,
+      "grad_norm": 0.6200501825686097,
+      "learning_rate": 5e-06,
+      "loss": 0.828,
+      "step": 360
+    },
+    {
+      "epoch": 0.7191448007774538,
+      "grad_norm": 0.5622254912052161,
+      "learning_rate": 5e-06,
+      "loss": 0.8287,
+      "step": 370
+    },
+    {
+      "epoch": 0.738581146744412,
+      "grad_norm": 0.6720527159909909,
+      "learning_rate": 5e-06,
+      "loss": 0.8234,
+      "step": 380
+    },
+    {
+      "epoch": 0.7580174927113703,
+      "grad_norm": 0.5315560166276624,
+      "learning_rate": 5e-06,
+      "loss": 0.824,
+      "step": 390
+    },
+    {
+      "epoch": 0.7774538386783285,
+      "grad_norm": 0.6413527148042328,
+      "learning_rate": 5e-06,
+      "loss": 0.8194,
+      "step": 400
+    },
+    {
+      "epoch": 0.7968901846452867,
+      "grad_norm": 0.6402327795437167,
+      "learning_rate": 5e-06,
+      "loss": 0.8243,
+      "step": 410
+    },
+    {
+      "epoch": 0.8163265306122449,
+      "grad_norm": 0.6290935177044384,
+      "learning_rate": 5e-06,
+      "loss": 0.8201,
+      "step": 420
+    },
+    {
+      "epoch": 0.8357628765792031,
+      "grad_norm": 0.6137598310285064,
+      "learning_rate": 5e-06,
+      "loss": 0.8187,
+      "step": 430
+    },
+    {
+      "epoch": 0.8551992225461613,
+      "grad_norm": 0.6137894354862566,
+      "learning_rate": 5e-06,
+      "loss": 0.8217,
+      "step": 440
+    },
+    {
+      "epoch": 0.8746355685131195,
+      "grad_norm": 0.7376542092364302,
+      "learning_rate": 5e-06,
+      "loss": 0.8191,
+      "step": 450
+    },
+    {
+      "epoch": 0.8940719144800777,
+      "grad_norm": 0.5443940007812901,
+      "learning_rate": 5e-06,
+      "loss": 0.8179,
+      "step": 460
+    },
+    {
+      "epoch": 0.9135082604470359,
+      "grad_norm": 0.6778023088897194,
+      "learning_rate": 5e-06,
+      "loss": 0.8158,
+      "step": 470
+    },
+    {
+      "epoch": 0.9329446064139941,
+      "grad_norm": 0.6040677193471313,
+      "learning_rate": 5e-06,
+      "loss": 0.8176,
+      "step": 480
+    },
+    {
+      "epoch": 0.9523809523809523,
+      "grad_norm": 0.5741967517130403,
+      "learning_rate": 5e-06,
+      "loss": 0.8215,
+      "step": 490
+    },
+    {
+      "epoch": 0.9718172983479106,
+      "grad_norm": 0.5184977635424605,
+      "learning_rate": 5e-06,
+      "loss": 0.8218,
+      "step": 500
+    },
+    {
+      "epoch": 0.9912536443148688,
+      "grad_norm": 0.5484601778792662,
+      "learning_rate": 5e-06,
+      "loss": 0.8158,
+      "step": 510
+    },
+    {
+      "epoch": 0.9990281827016521,
+      "eval_loss": 0.8152287006378174,
+      "eval_runtime": 547.9504,
+      "eval_samples_per_second": 25.3,
+      "eval_steps_per_second": 0.396,
+      "step": 514
+    },
+    {
+      "epoch": 1.010689990281827,
+      "grad_norm": 0.6821565020818955,
+      "learning_rate": 5e-06,
+      "loss": 0.8355,
+      "step": 520
+    },
+    {
+      "epoch": 1.0301263362487851,
+      "grad_norm": 0.6277981964271326,
+      "learning_rate": 5e-06,
+      "loss": 0.7749,
+      "step": 530
+    },
+    {
+      "epoch": 1.0495626822157433,
+      "grad_norm": 0.6114756472079457,
+      "learning_rate": 5e-06,
+      "loss": 0.7751,
+      "step": 540
+    },
+    {
+      "epoch": 1.0689990281827018,
+      "grad_norm": 0.8259615966427586,
+      "learning_rate": 5e-06,
+      "loss": 0.7756,
+      "step": 550
+    },
+    {
+      "epoch": 1.08843537414966,
+      "grad_norm": 0.6113352198439804,
+      "learning_rate": 5e-06,
+      "loss": 0.7789,
+      "step": 560
+    },
+    {
+      "epoch": 1.1078717201166182,
+      "grad_norm": 0.5269512429419262,
+      "learning_rate": 5e-06,
+      "loss": 0.7784,
+      "step": 570
+    },
+    {
+      "epoch": 1.1273080660835764,
+      "grad_norm": 0.5792710933468033,
+      "learning_rate": 5e-06,
+      "loss": 0.7757,
+      "step": 580
+    },
+    {
+      "epoch": 1.1467444120505346,
+      "grad_norm": 0.5467198449481481,
+      "learning_rate": 5e-06,
+      "loss": 0.7757,
+      "step": 590
+    },
+    {
+      "epoch": 1.1661807580174928,
+      "grad_norm": 0.6190447420364188,
+      "learning_rate": 5e-06,
+      "loss": 0.7754,
+      "step": 600
+    },
+    {
+      "epoch": 1.185617103984451,
+      "grad_norm": 0.7074708178383962,
+      "learning_rate": 5e-06,
+      "loss": 0.7738,
+      "step": 610
+    },
+    {
+      "epoch": 1.2050534499514092,
+      "grad_norm": 0.5708793884696434,
+      "learning_rate": 5e-06,
+      "loss": 0.7675,
+      "step": 620
+    },
+    {
+      "epoch": 1.2244897959183674,
+      "grad_norm": 0.5278424041049065,
+      "learning_rate": 5e-06,
+      "loss": 0.7724,
+      "step": 630
+    },
+    {
+      "epoch": 1.2439261418853256,
+      "grad_norm": 0.48269223376284837,
+      "learning_rate": 5e-06,
+      "loss": 0.7678,
+      "step": 640
+    },
+    {
+      "epoch": 1.2633624878522838,
+      "grad_norm": 0.6628438951702088,
+      "learning_rate": 5e-06,
+      "loss": 0.772,
+      "step": 650
+    },
+    {
+      "epoch": 1.282798833819242,
+      "grad_norm": 0.49956285734450795,
+      "learning_rate": 5e-06,
+      "loss": 0.78,
+      "step": 660
+    },
+    {
+      "epoch": 1.3022351797862002,
+      "grad_norm": 0.4482989241465936,
+      "learning_rate": 5e-06,
+      "loss": 0.7712,
+      "step": 670
+    },
+    {
+      "epoch": 1.3216715257531584,
+      "grad_norm": 0.515252608881534,
+      "learning_rate": 5e-06,
+      "loss": 0.7712,
+      "step": 680
+    },
+    {
+      "epoch": 1.3411078717201166,
+      "grad_norm": 0.5392787594093453,
+      "learning_rate": 5e-06,
+      "loss": 0.7733,
+      "step": 690
+    },
+    {
+      "epoch": 1.3605442176870748,
+      "grad_norm": 0.5239288581769422,
+      "learning_rate": 5e-06,
+      "loss": 0.7744,
+      "step": 700
+    },
+    {
+      "epoch": 1.379980563654033,
+      "grad_norm": 0.5368087860350439,
+      "learning_rate": 5e-06,
+      "loss": 0.7721,
+      "step": 710
+    },
+    {
+      "epoch": 1.3994169096209912,
+      "grad_norm": 0.5331498843832938,
+      "learning_rate": 5e-06,
+      "loss": 0.7702,
+      "step": 720
+    },
+    {
+      "epoch": 1.4188532555879494,
+      "grad_norm": 0.5840718481917428,
+      "learning_rate": 5e-06,
+      "loss": 0.7755,
+      "step": 730
+    },
+    {
+      "epoch": 1.4382896015549078,
+      "grad_norm": 0.537679455083028,
+      "learning_rate": 5e-06,
+      "loss": 0.7719,
+      "step": 740
+    },
+    {
+      "epoch": 1.4577259475218658,
+      "grad_norm": 0.7948828701245976,
+      "learning_rate": 5e-06,
+      "loss": 0.7717,
+      "step": 750
+    },
+    {
+      "epoch": 1.4771622934888242,
+      "grad_norm": 0.5813227807421696,
+      "learning_rate": 5e-06,
+      "loss": 0.7763,
+      "step": 760
+    },
+    {
+      "epoch": 1.4965986394557822,
+      "grad_norm": 0.6049608273143411,
+      "learning_rate": 5e-06,
+      "loss": 0.7719,
+      "step": 770
+    },
+    {
+      "epoch": 1.5160349854227406,
+      "grad_norm": 0.6057676712274179,
+      "learning_rate": 5e-06,
+      "loss": 0.7742,
+      "step": 780
+    },
+    {
+      "epoch": 1.5354713313896986,
+      "grad_norm": 0.625042201984692,
+      "learning_rate": 5e-06,
+      "loss": 0.7767,
+      "step": 790
+    },
+    {
+      "epoch": 1.554907677356657,
+      "grad_norm": 0.5502470811006085,
+      "learning_rate": 5e-06,
+      "loss": 0.769,
+      "step": 800
+    },
+    {
+      "epoch": 1.574344023323615,
+      "grad_norm": 0.5857965121292225,
+      "learning_rate": 5e-06,
+      "loss": 0.7709,
+      "step": 810
+    },
+    {
+      "epoch": 1.5937803692905734,
+      "grad_norm": 0.613513782295781,
+      "learning_rate": 5e-06,
+      "loss": 0.7755,
+      "step": 820
+    },
+    {
+      "epoch": 1.6132167152575316,
+      "grad_norm": 0.583240417199926,
+      "learning_rate": 5e-06,
+      "loss": 0.7738,
+      "step": 830
+    },
+    {
+      "epoch": 1.6326530612244898,
+      "grad_norm": 0.5231701163074167,
+      "learning_rate": 5e-06,
+      "loss": 0.7731,
+      "step": 840
+    },
+    {
+      "epoch": 1.652089407191448,
+      "grad_norm": 0.4914609295920534,
+      "learning_rate": 5e-06,
+      "loss": 0.7714,
+      "step": 850
+    },
+    {
+      "epoch": 1.6715257531584062,
+      "grad_norm": 0.5250445862469549,
+      "learning_rate": 5e-06,
+      "loss": 0.7747,
+      "step": 860
+    },
+    {
+      "epoch": 1.6909620991253644,
+      "grad_norm": 0.5415316148110199,
+      "learning_rate": 5e-06,
+      "loss": 0.7685,
+      "step": 870
+    },
+    {
+      "epoch": 1.7103984450923226,
+      "grad_norm": 0.5426871063126633,
+      "learning_rate": 5e-06,
+      "loss": 0.7687,
+      "step": 880
+    },
+    {
+      "epoch": 1.7298347910592808,
+      "grad_norm": 0.5573890854917875,
+      "learning_rate": 5e-06,
+      "loss": 0.7732,
+      "step": 890
+    },
+    {
+      "epoch": 1.749271137026239,
+      "grad_norm": 0.5966240890058521,
+      "learning_rate": 5e-06,
+      "loss": 0.7695,
+      "step": 900
+    },
+    {
+      "epoch": 1.7687074829931972,
+      "grad_norm": 0.48852508189672406,
+      "learning_rate": 5e-06,
+      "loss": 0.7716,
+      "step": 910
+    },
+    {
+      "epoch": 1.7881438289601554,
+      "grad_norm": 0.510423049422432,
+      "learning_rate": 5e-06,
+      "loss": 0.7695,
+      "step": 920
+    },
+    {
+      "epoch": 1.8075801749271136,
+      "grad_norm": 0.5128362713697912,
+      "learning_rate": 5e-06,
+      "loss": 0.768,
+      "step": 930
+    },
+    {
+      "epoch": 1.8270165208940718,
+      "grad_norm": 0.5100892708497722,
+      "learning_rate": 5e-06,
+      "loss": 0.7721,
+      "step": 940
+    },
+    {
+      "epoch": 1.8464528668610303,
+      "grad_norm": 0.5050841552954286,
+      "learning_rate": 5e-06,
+      "loss": 0.7703,
+      "step": 950
+    },
+    {
+      "epoch": 1.8658892128279883,
+      "grad_norm": 0.5100122636839687,
+      "learning_rate": 5e-06,
+      "loss": 0.7675,
+      "step": 960
+    },
+    {
+      "epoch": 1.8853255587949467,
+      "grad_norm": 0.5858103448907649,
+      "learning_rate": 5e-06,
+      "loss": 0.7719,
+      "step": 970
+    },
+    {
+      "epoch": 1.9047619047619047,
+      "grad_norm": 0.7186164442084766,
+      "learning_rate": 5e-06,
+      "loss": 0.7704,
+      "step": 980
+    },
+    {
+      "epoch": 1.924198250728863,
+      "grad_norm": 0.49187325414902705,
+      "learning_rate": 5e-06,
+      "loss": 0.7744,
+      "step": 990
+    },
+    {
+      "epoch": 1.943634596695821,
+      "grad_norm": 0.48132179189009133,
+      "learning_rate": 5e-06,
+      "loss": 0.7666,
+      "step": 1000
+    },
+    {
+      "epoch": 1.9630709426627795,
+      "grad_norm": 0.6042950330363083,
+      "learning_rate": 5e-06,
+      "loss": 0.7682,
+      "step": 1010
+    },
+    {
+      "epoch": 1.9825072886297375,
+      "grad_norm": 0.5107563561120851,
+      "learning_rate": 5e-06,
+      "loss": 0.7688,
+      "step": 1020
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 0.8017936944961548,
+      "eval_runtime": 549.3546,
+      "eval_samples_per_second": 25.235,
+      "eval_steps_per_second": 0.395,
+      "step": 1029
+    },
+    {
+      "epoch": 2.001943634596696,
+      "grad_norm": 0.8520236880105687,
+      "learning_rate": 5e-06,
+      "loss": 0.8065,
+      "step": 1030
+    },
+    {
+      "epoch": 2.021379980563654,
+      "grad_norm": 0.6759369335284802,
+      "learning_rate": 5e-06,
+      "loss": 0.7247,
+      "step": 1040
+    },
+    {
+      "epoch": 2.0408163265306123,
+      "grad_norm": 0.6810404313004184,
+      "learning_rate": 5e-06,
+      "loss": 0.7257,
+      "step": 1050
+    },
+    {
+      "epoch": 2.0602526724975703,
+      "grad_norm": 0.5976865962159595,
+      "learning_rate": 5e-06,
+      "loss": 0.7266,
+      "step": 1060
+    },
+    {
+      "epoch": 2.0796890184645287,
+      "grad_norm": 0.5170299538250477,
+      "learning_rate": 5e-06,
+      "loss": 0.7262,
+      "step": 1070
+    },
+    {
+      "epoch": 2.0991253644314867,
+      "grad_norm": 0.636746039057963,
+      "learning_rate": 5e-06,
+      "loss": 0.7271,
+      "step": 1080
+    },
+    {
+      "epoch": 2.118561710398445,
+      "grad_norm": 0.7347207445428233,
+      "learning_rate": 5e-06,
+      "loss": 0.7221,
+      "step": 1090
+    },
+    {
+      "epoch": 2.1379980563654035,
+      "grad_norm": 0.6669755018146,
+      "learning_rate": 5e-06,
+      "loss": 0.7277,
+      "step": 1100
+    },
+    {
+      "epoch": 2.1574344023323615,
+      "grad_norm": 0.5653238227925762,
+      "learning_rate": 5e-06,
+      "loss": 0.726,
+      "step": 1110
+    },
+    {
+      "epoch": 2.17687074829932,
+      "grad_norm": 0.5720425000083328,
+      "learning_rate": 5e-06,
+      "loss": 0.7271,
+      "step": 1120
+    },
+    {
+      "epoch": 2.196307094266278,
+      "grad_norm": 0.5494809428119856,
+      "learning_rate": 5e-06,
+      "loss": 0.727,
+      "step": 1130
+    },
+    {
+      "epoch": 2.2157434402332363,
+      "grad_norm": 0.6708852933316355,
+      "learning_rate": 5e-06,
+      "loss": 0.7286,
+      "step": 1140
+    },
+    {
+      "epoch": 2.2351797862001943,
+      "grad_norm": 0.5649904350477953,
+      "learning_rate": 5e-06,
+      "loss": 0.7253,
+      "step": 1150
+    },
+    {
+      "epoch": 2.2546161321671527,
+      "grad_norm": 0.6681152322447659,
+      "learning_rate": 5e-06,
+      "loss": 0.7227,
+      "step": 1160
+    },
+    {
+      "epoch": 2.2740524781341107,
+      "grad_norm": 0.7452957502891413,
+      "learning_rate": 5e-06,
+      "loss": 0.7271,
+      "step": 1170
+    },
+    {
+      "epoch": 2.293488824101069,
+      "grad_norm": 0.730891816162587,
+      "learning_rate": 5e-06,
+      "loss": 0.7297,
+      "step": 1180
+    },
+    {
+      "epoch": 2.312925170068027,
+      "grad_norm": 0.5422066002537126,
+      "learning_rate": 5e-06,
+      "loss": 0.7291,
+      "step": 1190
+    },
+    {
+      "epoch": 2.3323615160349855,
+      "grad_norm": 0.5368279161848004,
+      "learning_rate": 5e-06,
+      "loss": 0.7291,
+      "step": 1200
+    },
+    {
+      "epoch": 2.3517978620019435,
+      "grad_norm": 0.5831776325357405,
+      "learning_rate": 5e-06,
+      "loss": 0.7299,
+      "step": 1210
+    },
+    {
+      "epoch": 2.371234207968902,
+      "grad_norm": 0.4988880812457934,
+      "learning_rate": 5e-06,
+      "loss": 0.7286,
+      "step": 1220
+    },
+    {
+      "epoch": 2.39067055393586,
+      "grad_norm": 0.6745820300056689,
+      "learning_rate": 5e-06,
+      "loss": 0.7246,
+      "step": 1230
+    },
+    {
+      "epoch": 2.4101068999028183,
+      "grad_norm": 0.6502142374990822,
+      "learning_rate": 5e-06,
+      "loss": 0.7274,
+      "step": 1240
+    },
+    {
+      "epoch": 2.4295432458697763,
+      "grad_norm": 0.6686318012888572,
+      "learning_rate": 5e-06,
+      "loss": 0.7273,
+      "step": 1250
+    },
+    {
+      "epoch": 2.4489795918367347,
+      "grad_norm": 0.5959741098147713,
+      "learning_rate": 5e-06,
+      "loss": 0.7296,
+      "step": 1260
+    },
+    {
+      "epoch": 2.4684159378036927,
+      "grad_norm": 0.6933996442905096,
+      "learning_rate": 5e-06,
+      "loss": 0.7265,
+      "step": 1270
+    },
+    {
+      "epoch": 2.487852283770651,
+      "grad_norm": 0.5220203781381132,
+      "learning_rate": 5e-06,
+      "loss": 0.7279,
+      "step": 1280
+    },
+    {
+      "epoch": 2.5072886297376096,
+      "grad_norm": 0.5382425321528858,
+      "learning_rate": 5e-06,
+      "loss": 0.7305,
+      "step": 1290
+    },
+    {
+      "epoch": 2.5267249757045676,
+      "grad_norm": 0.5181218910911854,
+      "learning_rate": 5e-06,
+      "loss": 0.7249,
+      "step": 1300
+    },
+    {
+      "epoch": 2.5461613216715255,
+      "grad_norm": 0.6478067615615305,
+      "learning_rate": 5e-06,
+      "loss": 0.7319,
+      "step": 1310
+    },
+    {
+      "epoch": 2.565597667638484,
+      "grad_norm": 0.5078942293566884,
+      "learning_rate": 5e-06,
+      "loss": 0.7288,
+      "step": 1320
+    },
+    {
+      "epoch": 2.5850340136054424,
+      "grad_norm": 0.6268137880948265,
+      "learning_rate": 5e-06,
+      "loss": 0.7299,
+      "step": 1330
+    },
+    {
+      "epoch": 2.6044703595724004,
+      "grad_norm": 0.7996921164519973,
+      "learning_rate": 5e-06,
+      "loss": 0.7332,
+      "step": 1340
+    },
+    {
+      "epoch": 2.6239067055393583,
+      "grad_norm": 0.6835765316105765,
+      "learning_rate": 5e-06,
+      "loss": 0.7332,
+      "step": 1350
+    },
+    {
+      "epoch": 2.6433430515063168,
+      "grad_norm": 0.6208677881375628,
+      "learning_rate": 5e-06,
+      "loss": 0.7255,
+      "step": 1360
+    },
+    {
+      "epoch": 2.662779397473275,
+      "grad_norm": 0.5685215567462071,
+      "learning_rate": 5e-06,
+      "loss": 0.7311,
+      "step": 1370
+    },
+    {
+      "epoch": 2.682215743440233,
+      "grad_norm": 0.502556515949076,
+      "learning_rate": 5e-06,
+      "loss": 0.7288,
+      "step": 1380
+    },
+    {
+      "epoch": 2.7016520894071916,
+      "grad_norm": 0.5557319329244653,
+      "learning_rate": 5e-06,
+      "loss": 0.7252,
+      "step": 1390
+    },
+    {
+      "epoch": 2.7210884353741496,
+      "grad_norm": 0.5681474343567127,
+      "learning_rate": 5e-06,
+      "loss": 0.7324,
+      "step": 1400
+    },
+    {
+      "epoch": 2.740524781341108,
+      "grad_norm": 0.5718856742663174,
+      "learning_rate": 5e-06,
+      "loss": 0.7314,
+      "step": 1410
+    },
+    {
+      "epoch": 2.759961127308066,
+      "grad_norm": 0.5207890225289823,
+      "learning_rate": 5e-06,
+      "loss": 0.7327,
+      "step": 1420
+    },
+    {
+      "epoch": 2.7793974732750244,
+      "grad_norm": 0.5610775317663003,
+      "learning_rate": 5e-06,
+      "loss": 0.7325,
+      "step": 1430
+    },
+    {
+      "epoch": 2.7988338192419824,
+      "grad_norm": 0.49435740403271333,
+      "learning_rate": 5e-06,
+      "loss": 0.7306,
+      "step": 1440
+    },
+    {
+      "epoch": 2.818270165208941,
+      "grad_norm": 0.5548340643212172,
+      "learning_rate": 5e-06,
+      "loss": 0.7248,
+      "step": 1450
+    },
+    {
+      "epoch": 2.837706511175899,
+      "grad_norm": 0.8162943266425523,
+      "learning_rate": 5e-06,
+      "loss": 0.728,
+      "step": 1460
+    },
+    {
+      "epoch": 2.857142857142857,
+      "grad_norm": 0.5605722072050826,
+      "learning_rate": 5e-06,
+      "loss": 0.7321,
+      "step": 1470
+    },
+    {
+      "epoch": 2.8765792031098156,
+      "grad_norm": 0.5811312094740239,
+      "learning_rate": 5e-06,
+      "loss": 0.7312,
+      "step": 1480
+    },
+    {
+      "epoch": 2.8960155490767736,
+      "grad_norm": 0.581575144969956,
+      "learning_rate": 5e-06,
+      "loss": 0.7321,
+      "step": 1490
+    },
+    {
+      "epoch": 2.9154518950437316,
+      "grad_norm": 0.5099095695816087,
+      "learning_rate": 5e-06,
+      "loss": 0.7334,
+      "step": 1500
+    },
+    {
+      "epoch": 2.93488824101069,
+      "grad_norm": 0.5572721217938963,
+      "learning_rate": 5e-06,
+      "loss": 0.7308,
+      "step": 1510
+    },
+    {
+      "epoch": 2.9543245869776484,
+      "grad_norm": 0.522930114991094,
+      "learning_rate": 5e-06,
+      "loss": 0.7295,
+      "step": 1520
+    },
+    {
+      "epoch": 2.9737609329446064,
+      "grad_norm": 0.6436319992243297,
+      "learning_rate": 5e-06,
+      "loss": 0.7331,
+      "step": 1530
+    },
+    {
+      "epoch": 2.9931972789115644,
+      "grad_norm": 0.6235339876579238,
+      "learning_rate": 5e-06,
+      "loss": 0.7263,
+      "step": 1540
+    },
+    {
+      "epoch": 2.997084548104956,
+      "eval_loss": 0.8005240559577942,
+      "eval_runtime": 550.1116,
+      "eval_samples_per_second": 25.2,
+      "eval_steps_per_second": 0.394,
+      "step": 1542
+    },
+    {
+      "epoch": 2.997084548104956,
+      "step": 1542,
+      "total_flos": 2582698052812800.0,
+      "train_loss": 0.7834810720498151,
+      "train_runtime": 91136.0835,
+      "train_samples_per_second": 8.67,
+      "train_steps_per_second": 0.017
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 1542,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2582698052812800.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

training_eval_loss.png ADDED Viewed

training_loss.png ADDED Viewed