End of training

Browse files

Files changed (7) hide show

README.md +2 -1
all_results.json +12 -0
eval_results.json +7 -0
train_results.json +8 -0
trainer_state.json +997 -0
training_eval_loss.png +0 -0
training_loss.png +0 -0

README.md CHANGED Viewed

@@ -4,6 +4,7 @@ license: llama3.1
 base_model: meta-llama/Meta-Llama-3.1-8B
 tags:
 - llama-factory
 - generated_from_trainer
 model-index:
 - name: top_12_ranking_stackexchange
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
 # top_12_ranking_stackexchange
-This model is a fine-tuned version of [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) on an unknown dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.7993

 base_model: meta-llama/Meta-Llama-3.1-8B
 tags:
 - llama-factory
+- full
 - generated_from_trainer
 model-index:
 - name: top_12_ranking_stackexchange
 # top_12_ranking_stackexchange
+This model is a fine-tuned version of [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) on the mlfoundations-dev/top_12_ranking_stackexchange dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.7993

all_results.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+    "epoch": 2.994661421747682,
+    "eval_loss": 0.7992855906486511,
+    "eval_runtime": 479.0724,
+    "eval_samples_per_second": 25.023,
+    "eval_steps_per_second": 0.392,
+    "total_flos": 2230940231270400.0,
+    "train_loss": 0.7792791328630648,
+    "train_runtime": 78935.918,
+    "train_samples_per_second": 8.656,
+    "train_steps_per_second": 0.017
+}

eval_results.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+    "epoch": 2.994661421747682,
+    "eval_loss": 0.7992855906486511,
+    "eval_runtime": 479.0724,
+    "eval_samples_per_second": 25.023,
+    "eval_steps_per_second": 0.392
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 2.994661421747682,
+    "total_flos": 2230940231270400.0,
+    "train_loss": 0.7792791328630648,
+    "train_runtime": 78935.918,
+    "train_samples_per_second": 8.656,
+    "train_steps_per_second": 0.017
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,997 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.994661421747682,
+  "eval_steps": 500,
+  "global_step": 1332,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.022478224220286596,
+      "grad_norm": 33.33372823372105,
+      "learning_rate": 5e-06,
+      "loss": 1.0579,
+      "step": 10
+    },
+    {
+      "epoch": 0.04495644844057319,
+      "grad_norm": 1.0746818341634412,
+      "learning_rate": 5e-06,
+      "loss": 0.9596,
+      "step": 20
+    },
+    {
+      "epoch": 0.0674346726608598,
+      "grad_norm": 0.7321847735876122,
+      "learning_rate": 5e-06,
+      "loss": 0.911,
+      "step": 30
+    },
+    {
+      "epoch": 0.08991289688114638,
+      "grad_norm": 0.7413785484127575,
+      "learning_rate": 5e-06,
+      "loss": 0.8985,
+      "step": 40
+    },
+    {
+      "epoch": 0.11239112110143298,
+      "grad_norm": 0.6455791890910528,
+      "learning_rate": 5e-06,
+      "loss": 0.8815,
+      "step": 50
+    },
+    {
+      "epoch": 0.1348693453217196,
+      "grad_norm": 0.6017747724297141,
+      "learning_rate": 5e-06,
+      "loss": 0.8721,
+      "step": 60
+    },
+    {
+      "epoch": 0.15734756954200618,
+      "grad_norm": 0.6032337358958032,
+      "learning_rate": 5e-06,
+      "loss": 0.8712,
+      "step": 70
+    },
+    {
+      "epoch": 0.17982579376229277,
+      "grad_norm": 0.7856142351722715,
+      "learning_rate": 5e-06,
+      "loss": 0.8628,
+      "step": 80
+    },
+    {
+      "epoch": 0.20230401798257938,
+      "grad_norm": 0.53309068117808,
+      "learning_rate": 5e-06,
+      "loss": 0.8619,
+      "step": 90
+    },
+    {
+      "epoch": 0.22478224220286597,
+      "grad_norm": 0.7302586105980624,
+      "learning_rate": 5e-06,
+      "loss": 0.8536,
+      "step": 100
+    },
+    {
+      "epoch": 0.24726046642315258,
+      "grad_norm": 0.5590012456129412,
+      "learning_rate": 5e-06,
+      "loss": 0.8494,
+      "step": 110
+    },
+    {
+      "epoch": 0.2697386906434392,
+      "grad_norm": 0.8024116489610613,
+      "learning_rate": 5e-06,
+      "loss": 0.8481,
+      "step": 120
+    },
+    {
+      "epoch": 0.29221691486372575,
+      "grad_norm": 0.879956013299429,
+      "learning_rate": 5e-06,
+      "loss": 0.8416,
+      "step": 130
+    },
+    {
+      "epoch": 0.31469513908401237,
+      "grad_norm": 0.7724424487225131,
+      "learning_rate": 5e-06,
+      "loss": 0.8459,
+      "step": 140
+    },
+    {
+      "epoch": 0.337173363304299,
+      "grad_norm": 0.7280953369087129,
+      "learning_rate": 5e-06,
+      "loss": 0.8407,
+      "step": 150
+    },
+    {
+      "epoch": 0.35965158752458554,
+      "grad_norm": 0.549976756286124,
+      "learning_rate": 5e-06,
+      "loss": 0.8383,
+      "step": 160
+    },
+    {
+      "epoch": 0.38212981174487215,
+      "grad_norm": 0.6289114641518077,
+      "learning_rate": 5e-06,
+      "loss": 0.835,
+      "step": 170
+    },
+    {
+      "epoch": 0.40460803596515876,
+      "grad_norm": 0.6648128128576364,
+      "learning_rate": 5e-06,
+      "loss": 0.834,
+      "step": 180
+    },
+    {
+      "epoch": 0.4270862601854454,
+      "grad_norm": 0.5103947816514711,
+      "learning_rate": 5e-06,
+      "loss": 0.8365,
+      "step": 190
+    },
+    {
+      "epoch": 0.44956448440573193,
+      "grad_norm": 0.6232935960390383,
+      "learning_rate": 5e-06,
+      "loss": 0.8326,
+      "step": 200
+    },
+    {
+      "epoch": 0.47204270862601855,
+      "grad_norm": 0.6038963808726584,
+      "learning_rate": 5e-06,
+      "loss": 0.8335,
+      "step": 210
+    },
+    {
+      "epoch": 0.49452093284630516,
+      "grad_norm": 0.5922907804331954,
+      "learning_rate": 5e-06,
+      "loss": 0.8295,
+      "step": 220
+    },
+    {
+      "epoch": 0.5169991570665917,
+      "grad_norm": 0.585830487528667,
+      "learning_rate": 5e-06,
+      "loss": 0.8252,
+      "step": 230
+    },
+    {
+      "epoch": 0.5394773812868784,
+      "grad_norm": 0.5455286191351291,
+      "learning_rate": 5e-06,
+      "loss": 0.8295,
+      "step": 240
+    },
+    {
+      "epoch": 0.561955605507165,
+      "grad_norm": 0.5826278411995965,
+      "learning_rate": 5e-06,
+      "loss": 0.8309,
+      "step": 250
+    },
+    {
+      "epoch": 0.5844338297274515,
+      "grad_norm": 0.628487407925322,
+      "learning_rate": 5e-06,
+      "loss": 0.8291,
+      "step": 260
+    },
+    {
+      "epoch": 0.6069120539477382,
+      "grad_norm": 0.5179593922905324,
+      "learning_rate": 5e-06,
+      "loss": 0.8224,
+      "step": 270
+    },
+    {
+      "epoch": 0.6293902781680247,
+      "grad_norm": 0.6849863947657101,
+      "learning_rate": 5e-06,
+      "loss": 0.8286,
+      "step": 280
+    },
+    {
+      "epoch": 0.6518685023883113,
+      "grad_norm": 0.5306421600279095,
+      "learning_rate": 5e-06,
+      "loss": 0.8239,
+      "step": 290
+    },
+    {
+      "epoch": 0.674346726608598,
+      "grad_norm": 0.5652336198819432,
+      "learning_rate": 5e-06,
+      "loss": 0.823,
+      "step": 300
+    },
+    {
+      "epoch": 0.6968249508288845,
+      "grad_norm": 0.5081331704644918,
+      "learning_rate": 5e-06,
+      "loss": 0.8234,
+      "step": 310
+    },
+    {
+      "epoch": 0.7193031750491711,
+      "grad_norm": 0.5996298789398387,
+      "learning_rate": 5e-06,
+      "loss": 0.8254,
+      "step": 320
+    },
+    {
+      "epoch": 0.7417813992694577,
+      "grad_norm": 0.5164629633736417,
+      "learning_rate": 5e-06,
+      "loss": 0.8169,
+      "step": 330
+    },
+    {
+      "epoch": 0.7642596234897443,
+      "grad_norm": 0.4728695844824827,
+      "learning_rate": 5e-06,
+      "loss": 0.8189,
+      "step": 340
+    },
+    {
+      "epoch": 0.7867378477100309,
+      "grad_norm": 0.5186229875938196,
+      "learning_rate": 5e-06,
+      "loss": 0.819,
+      "step": 350
+    },
+    {
+      "epoch": 0.8092160719303175,
+      "grad_norm": 0.6132017941059994,
+      "learning_rate": 5e-06,
+      "loss": 0.8165,
+      "step": 360
+    },
+    {
+      "epoch": 0.8316942961506041,
+      "grad_norm": 0.53282736043642,
+      "learning_rate": 5e-06,
+      "loss": 0.8196,
+      "step": 370
+    },
+    {
+      "epoch": 0.8541725203708908,
+      "grad_norm": 0.5547116807812702,
+      "learning_rate": 5e-06,
+      "loss": 0.8219,
+      "step": 380
+    },
+    {
+      "epoch": 0.8766507445911773,
+      "grad_norm": 0.5185569454044568,
+      "learning_rate": 5e-06,
+      "loss": 0.8155,
+      "step": 390
+    },
+    {
+      "epoch": 0.8991289688114639,
+      "grad_norm": 0.5854723337738515,
+      "learning_rate": 5e-06,
+      "loss": 0.8153,
+      "step": 400
+    },
+    {
+      "epoch": 0.9216071930317505,
+      "grad_norm": 0.535827573700132,
+      "learning_rate": 5e-06,
+      "loss": 0.8116,
+      "step": 410
+    },
+    {
+      "epoch": 0.9440854172520371,
+      "grad_norm": 0.6607158975384394,
+      "learning_rate": 5e-06,
+      "loss": 0.8109,
+      "step": 420
+    },
+    {
+      "epoch": 0.9665636414723237,
+      "grad_norm": 0.5944011061376792,
+      "learning_rate": 5e-06,
+      "loss": 0.8117,
+      "step": 430
+    },
+    {
+      "epoch": 0.9890418656926103,
+      "grad_norm": 0.5508674094850188,
+      "learning_rate": 5e-06,
+      "loss": 0.8066,
+      "step": 440
+    },
+    {
+      "epoch": 0.9980331553807249,
+      "eval_loss": 0.8152613639831543,
+      "eval_runtime": 475.9259,
+      "eval_samples_per_second": 25.189,
+      "eval_steps_per_second": 0.395,
+      "step": 444
+    },
+    {
+      "epoch": 1.0118010677156504,
+      "grad_norm": 0.7278302813509685,
+      "learning_rate": 5e-06,
+      "loss": 0.8129,
+      "step": 450
+    },
+    {
+      "epoch": 1.0342792919359372,
+      "grad_norm": 0.5287930468348986,
+      "learning_rate": 5e-06,
+      "loss": 0.7733,
+      "step": 460
+    },
+    {
+      "epoch": 1.0567575161562237,
+      "grad_norm": 0.6518686331871915,
+      "learning_rate": 5e-06,
+      "loss": 0.7633,
+      "step": 470
+    },
+    {
+      "epoch": 1.0792357403765103,
+      "grad_norm": 0.7380154444162458,
+      "learning_rate": 5e-06,
+      "loss": 0.7655,
+      "step": 480
+    },
+    {
+      "epoch": 1.1017139645967968,
+      "grad_norm": 0.5703044113679665,
+      "learning_rate": 5e-06,
+      "loss": 0.7684,
+      "step": 490
+    },
+    {
+      "epoch": 1.1241921888170834,
+      "grad_norm": 0.667061719861843,
+      "learning_rate": 5e-06,
+      "loss": 0.7703,
+      "step": 500
+    },
+    {
+      "epoch": 1.14667041303737,
+      "grad_norm": 0.49640152865057224,
+      "learning_rate": 5e-06,
+      "loss": 0.7677,
+      "step": 510
+    },
+    {
+      "epoch": 1.1691486372576567,
+      "grad_norm": 0.6823293155486599,
+      "learning_rate": 5e-06,
+      "loss": 0.7733,
+      "step": 520
+    },
+    {
+      "epoch": 1.1916268614779433,
+      "grad_norm": 0.8130260554922801,
+      "learning_rate": 5e-06,
+      "loss": 0.7694,
+      "step": 530
+    },
+    {
+      "epoch": 1.2141050856982298,
+      "grad_norm": 0.5402831179025009,
+      "learning_rate": 5e-06,
+      "loss": 0.7738,
+      "step": 540
+    },
+    {
+      "epoch": 1.2365833099185164,
+      "grad_norm": 0.7004205303241327,
+      "learning_rate": 5e-06,
+      "loss": 0.7703,
+      "step": 550
+    },
+    {
+      "epoch": 1.259061534138803,
+      "grad_norm": 0.5715169691190888,
+      "learning_rate": 5e-06,
+      "loss": 0.7728,
+      "step": 560
+    },
+    {
+      "epoch": 1.2815397583590897,
+      "grad_norm": 0.5291552273092583,
+      "learning_rate": 5e-06,
+      "loss": 0.776,
+      "step": 570
+    },
+    {
+      "epoch": 1.3040179825793763,
+      "grad_norm": 0.6092348267517466,
+      "learning_rate": 5e-06,
+      "loss": 0.7745,
+      "step": 580
+    },
+    {
+      "epoch": 1.3264962067996628,
+      "grad_norm": 0.6042527846865735,
+      "learning_rate": 5e-06,
+      "loss": 0.7667,
+      "step": 590
+    },
+    {
+      "epoch": 1.3489744310199494,
+      "grad_norm": 0.5783859809295108,
+      "learning_rate": 5e-06,
+      "loss": 0.7702,
+      "step": 600
+    },
+    {
+      "epoch": 1.371452655240236,
+      "grad_norm": 0.7456788631066553,
+      "learning_rate": 5e-06,
+      "loss": 0.7675,
+      "step": 610
+    },
+    {
+      "epoch": 1.3939308794605227,
+      "grad_norm": 0.5257587143597946,
+      "learning_rate": 5e-06,
+      "loss": 0.7688,
+      "step": 620
+    },
+    {
+      "epoch": 1.416409103680809,
+      "grad_norm": 0.48973885745829204,
+      "learning_rate": 5e-06,
+      "loss": 0.7712,
+      "step": 630
+    },
+    {
+      "epoch": 1.4388873279010959,
+      "grad_norm": 0.5351811080578055,
+      "learning_rate": 5e-06,
+      "loss": 0.7652,
+      "step": 640
+    },
+    {
+      "epoch": 1.4613655521213824,
+      "grad_norm": 0.5694063448256315,
+      "learning_rate": 5e-06,
+      "loss": 0.7731,
+      "step": 650
+    },
+    {
+      "epoch": 1.483843776341669,
+      "grad_norm": 0.6140412877051478,
+      "learning_rate": 5e-06,
+      "loss": 0.7666,
+      "step": 660
+    },
+    {
+      "epoch": 1.5063220005619558,
+      "grad_norm": 0.5277178288175658,
+      "learning_rate": 5e-06,
+      "loss": 0.7654,
+      "step": 670
+    },
+    {
+      "epoch": 1.528800224782242,
+      "grad_norm": 0.5149765953205256,
+      "learning_rate": 5e-06,
+      "loss": 0.7681,
+      "step": 680
+    },
+    {
+      "epoch": 1.5512784490025289,
+      "grad_norm": 0.6475419905061228,
+      "learning_rate": 5e-06,
+      "loss": 0.7666,
+      "step": 690
+    },
+    {
+      "epoch": 1.5737566732228154,
+      "grad_norm": 0.5166580562353909,
+      "learning_rate": 5e-06,
+      "loss": 0.7691,
+      "step": 700
+    },
+    {
+      "epoch": 1.596234897443102,
+      "grad_norm": 0.4787094497221161,
+      "learning_rate": 5e-06,
+      "loss": 0.7698,
+      "step": 710
+    },
+    {
+      "epoch": 1.6187131216633885,
+      "grad_norm": 0.4996359884347099,
+      "learning_rate": 5e-06,
+      "loss": 0.7678,
+      "step": 720
+    },
+    {
+      "epoch": 1.641191345883675,
+      "grad_norm": 0.5342877473493819,
+      "learning_rate": 5e-06,
+      "loss": 0.7676,
+      "step": 730
+    },
+    {
+      "epoch": 1.6636695701039619,
+      "grad_norm": 0.5893380084853027,
+      "learning_rate": 5e-06,
+      "loss": 0.7664,
+      "step": 740
+    },
+    {
+      "epoch": 1.6861477943242484,
+      "grad_norm": 0.5219388717823797,
+      "learning_rate": 5e-06,
+      "loss": 0.7652,
+      "step": 750
+    },
+    {
+      "epoch": 1.708626018544535,
+      "grad_norm": 0.48023628858244755,
+      "learning_rate": 5e-06,
+      "loss": 0.7669,
+      "step": 760
+    },
+    {
+      "epoch": 1.7311042427648216,
+      "grad_norm": 0.6019840339786192,
+      "learning_rate": 5e-06,
+      "loss": 0.7639,
+      "step": 770
+    },
+    {
+      "epoch": 1.7535824669851081,
+      "grad_norm": 0.7082749102315159,
+      "learning_rate": 5e-06,
+      "loss": 0.7699,
+      "step": 780
+    },
+    {
+      "epoch": 1.776060691205395,
+      "grad_norm": 0.6095877731067036,
+      "learning_rate": 5e-06,
+      "loss": 0.7655,
+      "step": 790
+    },
+    {
+      "epoch": 1.7985389154256812,
+      "grad_norm": 0.5263754173295848,
+      "learning_rate": 5e-06,
+      "loss": 0.7638,
+      "step": 800
+    },
+    {
+      "epoch": 1.821017139645968,
+      "grad_norm": 0.5559042865148136,
+      "learning_rate": 5e-06,
+      "loss": 0.7673,
+      "step": 810
+    },
+    {
+      "epoch": 1.8434953638662546,
+      "grad_norm": 0.5806755734084227,
+      "learning_rate": 5e-06,
+      "loss": 0.7637,
+      "step": 820
+    },
+    {
+      "epoch": 1.8659735880865411,
+      "grad_norm": 0.5027027691217111,
+      "learning_rate": 5e-06,
+      "loss": 0.7689,
+      "step": 830
+    },
+    {
+      "epoch": 1.888451812306828,
+      "grad_norm": 0.5835360323428179,
+      "learning_rate": 5e-06,
+      "loss": 0.7651,
+      "step": 840
+    },
+    {
+      "epoch": 1.9109300365271142,
+      "grad_norm": 0.5756674505040741,
+      "learning_rate": 5e-06,
+      "loss": 0.7646,
+      "step": 850
+    },
+    {
+      "epoch": 1.933408260747401,
+      "grad_norm": 0.6170172971003131,
+      "learning_rate": 5e-06,
+      "loss": 0.7685,
+      "step": 860
+    },
+    {
+      "epoch": 1.9558864849676876,
+      "grad_norm": 0.5744859525833788,
+      "learning_rate": 5e-06,
+      "loss": 0.77,
+      "step": 870
+    },
+    {
+      "epoch": 1.9783647091879741,
+      "grad_norm": 0.5307996136237888,
+      "learning_rate": 5e-06,
+      "loss": 0.765,
+      "step": 880
+    },
+    {
+      "epoch": 1.998595110986232,
+      "eval_loss": 0.8013474345207214,
+      "eval_runtime": 476.3278,
+      "eval_samples_per_second": 25.168,
+      "eval_steps_per_second": 0.395,
+      "step": 889
+    },
+    {
+      "epoch": 2.0011239112110144,
+      "grad_norm": 0.7180506387912537,
+      "learning_rate": 5e-06,
+      "loss": 0.7904,
+      "step": 890
+    },
+    {
+      "epoch": 2.0236021354313007,
+      "grad_norm": 0.8180532035670276,
+      "learning_rate": 5e-06,
+      "loss": 0.7189,
+      "step": 900
+    },
+    {
+      "epoch": 2.0460803596515875,
+      "grad_norm": 0.5436885253799979,
+      "learning_rate": 5e-06,
+      "loss": 0.7202,
+      "step": 910
+    },
+    {
+      "epoch": 2.0685585838718743,
+      "grad_norm": 0.6619250789509951,
+      "learning_rate": 5e-06,
+      "loss": 0.7223,
+      "step": 920
+    },
+    {
+      "epoch": 2.0910368080921606,
+      "grad_norm": 0.5679017431009896,
+      "learning_rate": 5e-06,
+      "loss": 0.7187,
+      "step": 930
+    },
+    {
+      "epoch": 2.1135150323124474,
+      "grad_norm": 0.580647118753186,
+      "learning_rate": 5e-06,
+      "loss": 0.7224,
+      "step": 940
+    },
+    {
+      "epoch": 2.1359932565327338,
+      "grad_norm": 0.5616410388098043,
+      "learning_rate": 5e-06,
+      "loss": 0.7213,
+      "step": 950
+    },
+    {
+      "epoch": 2.1584714807530205,
+      "grad_norm": 0.5750449067160542,
+      "learning_rate": 5e-06,
+      "loss": 0.7233,
+      "step": 960
+    },
+    {
+      "epoch": 2.1809497049733073,
+      "grad_norm": 0.648340798833132,
+      "learning_rate": 5e-06,
+      "loss": 0.7234,
+      "step": 970
+    },
+    {
+      "epoch": 2.2034279291935936,
+      "grad_norm": 0.5562199933691032,
+      "learning_rate": 5e-06,
+      "loss": 0.7238,
+      "step": 980
+    },
+    {
+      "epoch": 2.2259061534138804,
+      "grad_norm": 0.5994203497541317,
+      "learning_rate": 5e-06,
+      "loss": 0.7197,
+      "step": 990
+    },
+    {
+      "epoch": 2.2483843776341668,
+      "grad_norm": 0.5268766948768074,
+      "learning_rate": 5e-06,
+      "loss": 0.7239,
+      "step": 1000
+    },
+    {
+      "epoch": 2.2708626018544535,
+      "grad_norm": 0.5268496126487976,
+      "learning_rate": 5e-06,
+      "loss": 0.7237,
+      "step": 1010
+    },
+    {
+      "epoch": 2.29334082607474,
+      "grad_norm": 0.5327610201359371,
+      "learning_rate": 5e-06,
+      "loss": 0.7246,
+      "step": 1020
+    },
+    {
+      "epoch": 2.3158190502950267,
+      "grad_norm": 0.5638169818039437,
+      "learning_rate": 5e-06,
+      "loss": 0.7253,
+      "step": 1030
+    },
+    {
+      "epoch": 2.3382972745153134,
+      "grad_norm": 0.6391691143776261,
+      "learning_rate": 5e-06,
+      "loss": 0.7257,
+      "step": 1040
+    },
+    {
+      "epoch": 2.3607754987355998,
+      "grad_norm": 0.5627143397713585,
+      "learning_rate": 5e-06,
+      "loss": 0.7275,
+      "step": 1050
+    },
+    {
+      "epoch": 2.3832537229558866,
+      "grad_norm": 0.6289665122460788,
+      "learning_rate": 5e-06,
+      "loss": 0.7257,
+      "step": 1060
+    },
+    {
+      "epoch": 2.405731947176173,
+      "grad_norm": 0.5758684477805143,
+      "learning_rate": 5e-06,
+      "loss": 0.7272,
+      "step": 1070
+    },
+    {
+      "epoch": 2.4282101713964597,
+      "grad_norm": 0.5385204702334082,
+      "learning_rate": 5e-06,
+      "loss": 0.7264,
+      "step": 1080
+    },
+    {
+      "epoch": 2.4506883956167465,
+      "grad_norm": 0.6150195793596126,
+      "learning_rate": 5e-06,
+      "loss": 0.7255,
+      "step": 1090
+    },
+    {
+      "epoch": 2.473166619837033,
+      "grad_norm": 0.5418174401143092,
+      "learning_rate": 5e-06,
+      "loss": 0.7241,
+      "step": 1100
+    },
+    {
+      "epoch": 2.4956448440573196,
+      "grad_norm": 0.5691109067038872,
+      "learning_rate": 5e-06,
+      "loss": 0.7243,
+      "step": 1110
+    },
+    {
+      "epoch": 2.518123068277606,
+      "grad_norm": 0.5352586504544353,
+      "learning_rate": 5e-06,
+      "loss": 0.7247,
+      "step": 1120
+    },
+    {
+      "epoch": 2.5406012924978927,
+      "grad_norm": 0.5023069113499318,
+      "learning_rate": 5e-06,
+      "loss": 0.7214,
+      "step": 1130
+    },
+    {
+      "epoch": 2.5630795167181795,
+      "grad_norm": 0.6122471735672584,
+      "learning_rate": 5e-06,
+      "loss": 0.7261,
+      "step": 1140
+    },
+    {
+      "epoch": 2.585557740938466,
+      "grad_norm": 0.6574159621177923,
+      "learning_rate": 5e-06,
+      "loss": 0.7251,
+      "step": 1150
+    },
+    {
+      "epoch": 2.6080359651587526,
+      "grad_norm": 0.5528066749443077,
+      "learning_rate": 5e-06,
+      "loss": 0.7257,
+      "step": 1160
+    },
+    {
+      "epoch": 2.630514189379039,
+      "grad_norm": 0.631501888391289,
+      "learning_rate": 5e-06,
+      "loss": 0.7243,
+      "step": 1170
+    },
+    {
+      "epoch": 2.6529924135993257,
+      "grad_norm": 0.6448576692585793,
+      "learning_rate": 5e-06,
+      "loss": 0.7285,
+      "step": 1180
+    },
+    {
+      "epoch": 2.675470637819612,
+      "grad_norm": 0.6302476804716267,
+      "learning_rate": 5e-06,
+      "loss": 0.7279,
+      "step": 1190
+    },
+    {
+      "epoch": 2.697948862039899,
+      "grad_norm": 0.6863144289974997,
+      "learning_rate": 5e-06,
+      "loss": 0.7219,
+      "step": 1200
+    },
+    {
+      "epoch": 2.7204270862601856,
+      "grad_norm": 0.5633795290177185,
+      "learning_rate": 5e-06,
+      "loss": 0.7241,
+      "step": 1210
+    },
+    {
+      "epoch": 2.742905310480472,
+      "grad_norm": 0.5494311885409552,
+      "learning_rate": 5e-06,
+      "loss": 0.7282,
+      "step": 1220
+    },
+    {
+      "epoch": 2.7653835347007587,
+      "grad_norm": 0.5030682576319576,
+      "learning_rate": 5e-06,
+      "loss": 0.7242,
+      "step": 1230
+    },
+    {
+      "epoch": 2.7878617589210455,
+      "grad_norm": 0.5992059557152434,
+      "learning_rate": 5e-06,
+      "loss": 0.7222,
+      "step": 1240
+    },
+    {
+      "epoch": 2.810339983141332,
+      "grad_norm": 0.5824170874560094,
+      "learning_rate": 5e-06,
+      "loss": 0.7217,
+      "step": 1250
+    },
+    {
+      "epoch": 2.832818207361618,
+      "grad_norm": 0.5579212327960739,
+      "learning_rate": 5e-06,
+      "loss": 0.7215,
+      "step": 1260
+    },
+    {
+      "epoch": 2.855296431581905,
+      "grad_norm": 0.5580492783375522,
+      "learning_rate": 5e-06,
+      "loss": 0.7309,
+      "step": 1270
+    },
+    {
+      "epoch": 2.8777746558021917,
+      "grad_norm": 0.8439296859021013,
+      "learning_rate": 5e-06,
+      "loss": 0.7266,
+      "step": 1280
+    },
+    {
+      "epoch": 2.900252880022478,
+      "grad_norm": 0.5787147746004252,
+      "learning_rate": 5e-06,
+      "loss": 0.7282,
+      "step": 1290
+    },
+    {
+      "epoch": 2.922731104242765,
+      "grad_norm": 0.5649986792315236,
+      "learning_rate": 5e-06,
+      "loss": 0.7279,
+      "step": 1300
+    },
+    {
+      "epoch": 2.9452093284630516,
+      "grad_norm": 0.544211010945195,
+      "learning_rate": 5e-06,
+      "loss": 0.7227,
+      "step": 1310
+    },
+    {
+      "epoch": 2.967687552683338,
+      "grad_norm": 0.5684918426978717,
+      "learning_rate": 5e-06,
+      "loss": 0.7311,
+      "step": 1320
+    },
+    {
+      "epoch": 2.9901657769036247,
+      "grad_norm": 0.4877016715810142,
+      "learning_rate": 5e-06,
+      "loss": 0.723,
+      "step": 1330
+    },
+    {
+      "epoch": 2.994661421747682,
+      "eval_loss": 0.7992855906486511,
+      "eval_runtime": 476.027,
+      "eval_samples_per_second": 25.183,
+      "eval_steps_per_second": 0.395,
+      "step": 1332
+    },
+    {
+      "epoch": 2.994661421747682,
+      "step": 1332,
+      "total_flos": 2230940231270400.0,
+      "train_loss": 0.7792791328630648,
+      "train_runtime": 78935.918,
+      "train_samples_per_second": 8.656,
+      "train_steps_per_second": 0.017
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 1332,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2230940231270400.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

training_eval_loss.png ADDED Viewed

training_loss.png ADDED Viewed