End of training

Browse files

Files changed (7) hide show

README.md +2 -1
all_results.json +12 -0
eval_results.json +7 -0
train_results.json +8 -0
trainer_state.json +1634 -0
training_eval_loss.png +0 -0
training_loss.png +0 -0

README.md CHANGED Viewed

@@ -4,6 +4,7 @@ license: llama3.1
 base_model: meta-llama/Meta-Llama-3.1-8B
 tags:
 - llama-factory
 - generated_from_trainer
 model-index:
 - name: top_20_ranking_stackexchange
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
 # top_20_ranking_stackexchange
-This model is a fine-tuned version of [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) on an unknown dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.7927

 base_model: meta-llama/Meta-Llama-3.1-8B
 tags:
 - llama-factory
+- full
 - generated_from_trainer
 model-index:
 - name: top_20_ranking_stackexchange
 # top_20_ranking_stackexchange
+This model is a fine-tuned version of [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) on the mlfoundations-dev/top_20_ranking_stackexchange dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.7927

all_results.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+    "epoch": 2.999498746867168,
+    "eval_loss": 0.7926730513572693,
+    "eval_runtime": 796.5551,
+    "eval_samples_per_second": 25.308,
+    "eval_steps_per_second": 0.395,
+    "total_flos": 3758574199111680.0,
+    "train_loss": 0.7796513685780625,
+    "train_runtime": 132137.1731,
+    "train_samples_per_second": 8.696,
+    "train_steps_per_second": 0.017
+}

eval_results.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+    "epoch": 2.999498746867168,
+    "eval_loss": 0.7926730513572693,
+    "eval_runtime": 796.5551,
+    "eval_samples_per_second": 25.308,
+    "eval_steps_per_second": 0.395
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 2.999498746867168,
+    "total_flos": 3758574199111680.0,
+    "train_loss": 0.7796513685780625,
+    "train_runtime": 132137.1731,
+    "train_samples_per_second": 8.696,
+    "train_steps_per_second": 0.017
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1634 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.999498746867168,
+  "eval_steps": 500,
+  "global_step": 2244,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.013366750208855471,
+      "grad_norm": 2.4323846059062397,
+      "learning_rate": 5e-06,
+      "loss": 1.0521,
+      "step": 10
+    },
+    {
+      "epoch": 0.026733500417710943,
+      "grad_norm": 1.2707159898783558,
+      "learning_rate": 5e-06,
+      "loss": 0.9449,
+      "step": 20
+    },
+    {
+      "epoch": 0.040100250626566414,
+      "grad_norm": 0.6645760066182232,
+      "learning_rate": 5e-06,
+      "loss": 0.9205,
+      "step": 30
+    },
+    {
+      "epoch": 0.053467000835421885,
+      "grad_norm": 0.6860381528425127,
+      "learning_rate": 5e-06,
+      "loss": 0.9062,
+      "step": 40
+    },
+    {
+      "epoch": 0.06683375104427736,
+      "grad_norm": 0.8462056832267063,
+      "learning_rate": 5e-06,
+      "loss": 0.8941,
+      "step": 50
+    },
+    {
+      "epoch": 0.08020050125313283,
+      "grad_norm": 0.5498617128094427,
+      "learning_rate": 5e-06,
+      "loss": 0.8866,
+      "step": 60
+    },
+    {
+      "epoch": 0.0935672514619883,
+      "grad_norm": 0.6217303867910247,
+      "learning_rate": 5e-06,
+      "loss": 0.8719,
+      "step": 70
+    },
+    {
+      "epoch": 0.10693400167084377,
+      "grad_norm": 0.687429978149511,
+      "learning_rate": 5e-06,
+      "loss": 0.876,
+      "step": 80
+    },
+    {
+      "epoch": 0.12030075187969924,
+      "grad_norm": 0.7639829931940186,
+      "learning_rate": 5e-06,
+      "loss": 0.8704,
+      "step": 90
+    },
+    {
+      "epoch": 0.1336675020885547,
+      "grad_norm": 0.5349974897408032,
+      "learning_rate": 5e-06,
+      "loss": 0.8677,
+      "step": 100
+    },
+    {
+      "epoch": 0.14703425229741018,
+      "grad_norm": 0.6212381364086903,
+      "learning_rate": 5e-06,
+      "loss": 0.8624,
+      "step": 110
+    },
+    {
+      "epoch": 0.16040100250626566,
+      "grad_norm": 0.5610901155787884,
+      "learning_rate": 5e-06,
+      "loss": 0.8621,
+      "step": 120
+    },
+    {
+      "epoch": 0.17376775271512113,
+      "grad_norm": 0.6155926013296407,
+      "learning_rate": 5e-06,
+      "loss": 0.8582,
+      "step": 130
+    },
+    {
+      "epoch": 0.1871345029239766,
+      "grad_norm": 0.6528571036607788,
+      "learning_rate": 5e-06,
+      "loss": 0.8532,
+      "step": 140
+    },
+    {
+      "epoch": 0.20050125313283207,
+      "grad_norm": 0.5372075443842537,
+      "learning_rate": 5e-06,
+      "loss": 0.8492,
+      "step": 150
+    },
+    {
+      "epoch": 0.21386800334168754,
+      "grad_norm": 0.7095829143035569,
+      "learning_rate": 5e-06,
+      "loss": 0.8494,
+      "step": 160
+    },
+    {
+      "epoch": 0.227234753550543,
+      "grad_norm": 0.7745444177509586,
+      "learning_rate": 5e-06,
+      "loss": 0.8476,
+      "step": 170
+    },
+    {
+      "epoch": 0.24060150375939848,
+      "grad_norm": 0.7586050901974903,
+      "learning_rate": 5e-06,
+      "loss": 0.8494,
+      "step": 180
+    },
+    {
+      "epoch": 0.25396825396825395,
+      "grad_norm": 0.5964597569119979,
+      "learning_rate": 5e-06,
+      "loss": 0.8498,
+      "step": 190
+    },
+    {
+      "epoch": 0.2673350041771094,
+      "grad_norm": 0.6293549963407589,
+      "learning_rate": 5e-06,
+      "loss": 0.8432,
+      "step": 200
+    },
+    {
+      "epoch": 0.2807017543859649,
+      "grad_norm": 0.5524407679849426,
+      "learning_rate": 5e-06,
+      "loss": 0.8475,
+      "step": 210
+    },
+    {
+      "epoch": 0.29406850459482037,
+      "grad_norm": 0.524350214049005,
+      "learning_rate": 5e-06,
+      "loss": 0.8431,
+      "step": 220
+    },
+    {
+      "epoch": 0.30743525480367584,
+      "grad_norm": 0.6760002252683699,
+      "learning_rate": 5e-06,
+      "loss": 0.8386,
+      "step": 230
+    },
+    {
+      "epoch": 0.3208020050125313,
+      "grad_norm": 0.5906902446596286,
+      "learning_rate": 5e-06,
+      "loss": 0.8349,
+      "step": 240
+    },
+    {
+      "epoch": 0.3341687552213868,
+      "grad_norm": 0.5723926384792003,
+      "learning_rate": 5e-06,
+      "loss": 0.8361,
+      "step": 250
+    },
+    {
+      "epoch": 0.34753550543024225,
+      "grad_norm": 0.5616096712561062,
+      "learning_rate": 5e-06,
+      "loss": 0.8368,
+      "step": 260
+    },
+    {
+      "epoch": 0.3609022556390977,
+      "grad_norm": 0.5507735559959206,
+      "learning_rate": 5e-06,
+      "loss": 0.835,
+      "step": 270
+    },
+    {
+      "epoch": 0.3742690058479532,
+      "grad_norm": 0.4803949597709757,
+      "learning_rate": 5e-06,
+      "loss": 0.8414,
+      "step": 280
+    },
+    {
+      "epoch": 0.38763575605680867,
+      "grad_norm": 0.5121852118343002,
+      "learning_rate": 5e-06,
+      "loss": 0.8325,
+      "step": 290
+    },
+    {
+      "epoch": 0.40100250626566414,
+      "grad_norm": 0.5559477754717894,
+      "learning_rate": 5e-06,
+      "loss": 0.8364,
+      "step": 300
+    },
+    {
+      "epoch": 0.4143692564745196,
+      "grad_norm": 0.7469026400245374,
+      "learning_rate": 5e-06,
+      "loss": 0.8306,
+      "step": 310
+    },
+    {
+      "epoch": 0.4277360066833751,
+      "grad_norm": 0.5090947427034287,
+      "learning_rate": 5e-06,
+      "loss": 0.8339,
+      "step": 320
+    },
+    {
+      "epoch": 0.44110275689223055,
+      "grad_norm": 0.6018861983279394,
+      "learning_rate": 5e-06,
+      "loss": 0.8283,
+      "step": 330
+    },
+    {
+      "epoch": 0.454469507101086,
+      "grad_norm": 0.5434521657719814,
+      "learning_rate": 5e-06,
+      "loss": 0.8285,
+      "step": 340
+    },
+    {
+      "epoch": 0.4678362573099415,
+      "grad_norm": 0.5903702809830117,
+      "learning_rate": 5e-06,
+      "loss": 0.8324,
+      "step": 350
+    },
+    {
+      "epoch": 0.48120300751879697,
+      "grad_norm": 0.6243867601355255,
+      "learning_rate": 5e-06,
+      "loss": 0.8284,
+      "step": 360
+    },
+    {
+      "epoch": 0.49456975772765244,
+      "grad_norm": 0.6094144532555286,
+      "learning_rate": 5e-06,
+      "loss": 0.8283,
+      "step": 370
+    },
+    {
+      "epoch": 0.5079365079365079,
+      "grad_norm": 0.5482360219270039,
+      "learning_rate": 5e-06,
+      "loss": 0.8289,
+      "step": 380
+    },
+    {
+      "epoch": 0.5213032581453634,
+      "grad_norm": 0.5061542985510644,
+      "learning_rate": 5e-06,
+      "loss": 0.8317,
+      "step": 390
+    },
+    {
+      "epoch": 0.5346700083542189,
+      "grad_norm": 0.6652440131533577,
+      "learning_rate": 5e-06,
+      "loss": 0.8256,
+      "step": 400
+    },
+    {
+      "epoch": 0.5480367585630743,
+      "grad_norm": 0.5613018728699922,
+      "learning_rate": 5e-06,
+      "loss": 0.8252,
+      "step": 410
+    },
+    {
+      "epoch": 0.5614035087719298,
+      "grad_norm": 0.7255190718604577,
+      "learning_rate": 5e-06,
+      "loss": 0.8247,
+      "step": 420
+    },
+    {
+      "epoch": 0.5747702589807853,
+      "grad_norm": 0.6781380945175464,
+      "learning_rate": 5e-06,
+      "loss": 0.823,
+      "step": 430
+    },
+    {
+      "epoch": 0.5881370091896407,
+      "grad_norm": 0.5530197743336887,
+      "learning_rate": 5e-06,
+      "loss": 0.8251,
+      "step": 440
+    },
+    {
+      "epoch": 0.6015037593984962,
+      "grad_norm": 0.571851888660113,
+      "learning_rate": 5e-06,
+      "loss": 0.8232,
+      "step": 450
+    },
+    {
+      "epoch": 0.6148705096073517,
+      "grad_norm": 0.5208791337420644,
+      "learning_rate": 5e-06,
+      "loss": 0.8235,
+      "step": 460
+    },
+    {
+      "epoch": 0.6282372598162071,
+      "grad_norm": 0.5198842932978275,
+      "learning_rate": 5e-06,
+      "loss": 0.8238,
+      "step": 470
+    },
+    {
+      "epoch": 0.6416040100250626,
+      "grad_norm": 0.48452315583166233,
+      "learning_rate": 5e-06,
+      "loss": 0.8221,
+      "step": 480
+    },
+    {
+      "epoch": 0.6549707602339181,
+      "grad_norm": 0.5219240912238245,
+      "learning_rate": 5e-06,
+      "loss": 0.8168,
+      "step": 490
+    },
+    {
+      "epoch": 0.6683375104427736,
+      "grad_norm": 0.51813285089071,
+      "learning_rate": 5e-06,
+      "loss": 0.8173,
+      "step": 500
+    },
+    {
+      "epoch": 0.681704260651629,
+      "grad_norm": 0.49897768190410446,
+      "learning_rate": 5e-06,
+      "loss": 0.8193,
+      "step": 510
+    },
+    {
+      "epoch": 0.6950710108604845,
+      "grad_norm": 0.546834157816808,
+      "learning_rate": 5e-06,
+      "loss": 0.8129,
+      "step": 520
+    },
+    {
+      "epoch": 0.70843776106934,
+      "grad_norm": 0.5295360571693272,
+      "learning_rate": 5e-06,
+      "loss": 0.8194,
+      "step": 530
+    },
+    {
+      "epoch": 0.7218045112781954,
+      "grad_norm": 0.6854942956404928,
+      "learning_rate": 5e-06,
+      "loss": 0.8193,
+      "step": 540
+    },
+    {
+      "epoch": 0.7351712614870509,
+      "grad_norm": 0.6819748794747951,
+      "learning_rate": 5e-06,
+      "loss": 0.8161,
+      "step": 550
+    },
+    {
+      "epoch": 0.7485380116959064,
+      "grad_norm": 0.7134808000164234,
+      "learning_rate": 5e-06,
+      "loss": 0.8166,
+      "step": 560
+    },
+    {
+      "epoch": 0.7619047619047619,
+      "grad_norm": 0.6412479917820569,
+      "learning_rate": 5e-06,
+      "loss": 0.8172,
+      "step": 570
+    },
+    {
+      "epoch": 0.7752715121136173,
+      "grad_norm": 0.5246142664617556,
+      "learning_rate": 5e-06,
+      "loss": 0.8145,
+      "step": 580
+    },
+    {
+      "epoch": 0.7886382623224728,
+      "grad_norm": 0.588843604202556,
+      "learning_rate": 5e-06,
+      "loss": 0.82,
+      "step": 590
+    },
+    {
+      "epoch": 0.8020050125313283,
+      "grad_norm": 0.5124861711768851,
+      "learning_rate": 5e-06,
+      "loss": 0.8156,
+      "step": 600
+    },
+    {
+      "epoch": 0.8153717627401837,
+      "grad_norm": 0.5015203839251716,
+      "learning_rate": 5e-06,
+      "loss": 0.8191,
+      "step": 610
+    },
+    {
+      "epoch": 0.8287385129490392,
+      "grad_norm": 0.6441893371422894,
+      "learning_rate": 5e-06,
+      "loss": 0.812,
+      "step": 620
+    },
+    {
+      "epoch": 0.8421052631578947,
+      "grad_norm": 0.5838304398634407,
+      "learning_rate": 5e-06,
+      "loss": 0.8086,
+      "step": 630
+    },
+    {
+      "epoch": 0.8554720133667502,
+      "grad_norm": 0.5107304906894905,
+      "learning_rate": 5e-06,
+      "loss": 0.8155,
+      "step": 640
+    },
+    {
+      "epoch": 0.8688387635756056,
+      "grad_norm": 0.5122885155184959,
+      "learning_rate": 5e-06,
+      "loss": 0.8131,
+      "step": 650
+    },
+    {
+      "epoch": 0.8822055137844611,
+      "grad_norm": 0.5985811394437027,
+      "learning_rate": 5e-06,
+      "loss": 0.8104,
+      "step": 660
+    },
+    {
+      "epoch": 0.8955722639933166,
+      "grad_norm": 0.5323936368547137,
+      "learning_rate": 5e-06,
+      "loss": 0.8186,
+      "step": 670
+    },
+    {
+      "epoch": 0.908939014202172,
+      "grad_norm": 0.616312309430872,
+      "learning_rate": 5e-06,
+      "loss": 0.8124,
+      "step": 680
+    },
+    {
+      "epoch": 0.9223057644110275,
+      "grad_norm": 0.6593022396181776,
+      "learning_rate": 5e-06,
+      "loss": 0.8156,
+      "step": 690
+    },
+    {
+      "epoch": 0.935672514619883,
+      "grad_norm": 0.5181097754729659,
+      "learning_rate": 5e-06,
+      "loss": 0.8135,
+      "step": 700
+    },
+    {
+      "epoch": 0.9490392648287385,
+      "grad_norm": 0.5160202542043503,
+      "learning_rate": 5e-06,
+      "loss": 0.8108,
+      "step": 710
+    },
+    {
+      "epoch": 0.9624060150375939,
+      "grad_norm": 0.5439429222609182,
+      "learning_rate": 5e-06,
+      "loss": 0.8098,
+      "step": 720
+    },
+    {
+      "epoch": 0.9757727652464494,
+      "grad_norm": 0.5666778381149935,
+      "learning_rate": 5e-06,
+      "loss": 0.8064,
+      "step": 730
+    },
+    {
+      "epoch": 0.9891395154553049,
+      "grad_norm": 0.5087008142559319,
+      "learning_rate": 5e-06,
+      "loss": 0.8124,
+      "step": 740
+    },
+    {
+      "epoch": 0.9998329156223893,
+      "eval_loss": 0.8087860345840454,
+      "eval_runtime": 793.9439,
+      "eval_samples_per_second": 25.391,
+      "eval_steps_per_second": 0.397,
+      "step": 748
+    },
+    {
+      "epoch": 1.0025062656641603,
+      "grad_norm": 0.6885103061332264,
+      "learning_rate": 5e-06,
+      "loss": 0.8763,
+      "step": 750
+    },
+    {
+      "epoch": 1.0158730158730158,
+      "grad_norm": 0.6156521836752095,
+      "learning_rate": 5e-06,
+      "loss": 0.7692,
+      "step": 760
+    },
+    {
+      "epoch": 1.0292397660818713,
+      "grad_norm": 0.6134559509903806,
+      "learning_rate": 5e-06,
+      "loss": 0.7719,
+      "step": 770
+    },
+    {
+      "epoch": 1.0426065162907268,
+      "grad_norm": 0.635583159755333,
+      "learning_rate": 5e-06,
+      "loss": 0.7724,
+      "step": 780
+    },
+    {
+      "epoch": 1.0559732664995822,
+      "grad_norm": 0.5771840092558814,
+      "learning_rate": 5e-06,
+      "loss": 0.7724,
+      "step": 790
+    },
+    {
+      "epoch": 1.0693400167084377,
+      "grad_norm": 0.5138399093282234,
+      "learning_rate": 5e-06,
+      "loss": 0.7671,
+      "step": 800
+    },
+    {
+      "epoch": 1.0827067669172932,
+      "grad_norm": 0.5865180500219783,
+      "learning_rate": 5e-06,
+      "loss": 0.7741,
+      "step": 810
+    },
+    {
+      "epoch": 1.0960735171261486,
+      "grad_norm": 0.5737059877569465,
+      "learning_rate": 5e-06,
+      "loss": 0.7735,
+      "step": 820
+    },
+    {
+      "epoch": 1.1094402673350041,
+      "grad_norm": 0.7198057887439943,
+      "learning_rate": 5e-06,
+      "loss": 0.7715,
+      "step": 830
+    },
+    {
+      "epoch": 1.1228070175438596,
+      "grad_norm": 0.723247678442899,
+      "learning_rate": 5e-06,
+      "loss": 0.7688,
+      "step": 840
+    },
+    {
+      "epoch": 1.136173767752715,
+      "grad_norm": 0.5724777994659187,
+      "learning_rate": 5e-06,
+      "loss": 0.7709,
+      "step": 850
+    },
+    {
+      "epoch": 1.1495405179615705,
+      "grad_norm": 0.6343455699124487,
+      "learning_rate": 5e-06,
+      "loss": 0.7756,
+      "step": 860
+    },
+    {
+      "epoch": 1.162907268170426,
+      "grad_norm": 0.5975092244071976,
+      "learning_rate": 5e-06,
+      "loss": 0.7762,
+      "step": 870
+    },
+    {
+      "epoch": 1.1762740183792815,
+      "grad_norm": 0.5550810138685736,
+      "learning_rate": 5e-06,
+      "loss": 0.7713,
+      "step": 880
+    },
+    {
+      "epoch": 1.189640768588137,
+      "grad_norm": 0.6031833100946619,
+      "learning_rate": 5e-06,
+      "loss": 0.7717,
+      "step": 890
+    },
+    {
+      "epoch": 1.2030075187969924,
+      "grad_norm": 0.5674692784021945,
+      "learning_rate": 5e-06,
+      "loss": 0.7714,
+      "step": 900
+    },
+    {
+      "epoch": 1.2163742690058479,
+      "grad_norm": 0.6831373781930358,
+      "learning_rate": 5e-06,
+      "loss": 0.7727,
+      "step": 910
+    },
+    {
+      "epoch": 1.2297410192147034,
+      "grad_norm": 0.517398562451772,
+      "learning_rate": 5e-06,
+      "loss": 0.7715,
+      "step": 920
+    },
+    {
+      "epoch": 1.2431077694235588,
+      "grad_norm": 0.5689793551691444,
+      "learning_rate": 5e-06,
+      "loss": 0.7682,
+      "step": 930
+    },
+    {
+      "epoch": 1.2564745196324143,
+      "grad_norm": 0.6979997189308218,
+      "learning_rate": 5e-06,
+      "loss": 0.7753,
+      "step": 940
+    },
+    {
+      "epoch": 1.2698412698412698,
+      "grad_norm": 0.5431703707142987,
+      "learning_rate": 5e-06,
+      "loss": 0.7726,
+      "step": 950
+    },
+    {
+      "epoch": 1.2832080200501252,
+      "grad_norm": 0.5341233588300426,
+      "learning_rate": 5e-06,
+      "loss": 0.7721,
+      "step": 960
+    },
+    {
+      "epoch": 1.2965747702589807,
+      "grad_norm": 0.5621957425809071,
+      "learning_rate": 5e-06,
+      "loss": 0.7702,
+      "step": 970
+    },
+    {
+      "epoch": 1.3099415204678362,
+      "grad_norm": 0.6187116295591158,
+      "learning_rate": 5e-06,
+      "loss": 0.7755,
+      "step": 980
+    },
+    {
+      "epoch": 1.3233082706766917,
+      "grad_norm": 0.6251656247161459,
+      "learning_rate": 5e-06,
+      "loss": 0.7742,
+      "step": 990
+    },
+    {
+      "epoch": 1.3366750208855471,
+      "grad_norm": 0.6092934361550684,
+      "learning_rate": 5e-06,
+      "loss": 0.7732,
+      "step": 1000
+    },
+    {
+      "epoch": 1.3500417710944026,
+      "grad_norm": 0.8086073910477094,
+      "learning_rate": 5e-06,
+      "loss": 0.7663,
+      "step": 1010
+    },
+    {
+      "epoch": 1.363408521303258,
+      "grad_norm": 0.6337909009600926,
+      "learning_rate": 5e-06,
+      "loss": 0.7698,
+      "step": 1020
+    },
+    {
+      "epoch": 1.3767752715121135,
+      "grad_norm": 0.6156017975821142,
+      "learning_rate": 5e-06,
+      "loss": 0.7687,
+      "step": 1030
+    },
+    {
+      "epoch": 1.390142021720969,
+      "grad_norm": 0.4791494199069362,
+      "learning_rate": 5e-06,
+      "loss": 0.7707,
+      "step": 1040
+    },
+    {
+      "epoch": 1.4035087719298245,
+      "grad_norm": 0.5102907384647386,
+      "learning_rate": 5e-06,
+      "loss": 0.7698,
+      "step": 1050
+    },
+    {
+      "epoch": 1.41687552213868,
+      "grad_norm": 0.60763231448239,
+      "learning_rate": 5e-06,
+      "loss": 0.7722,
+      "step": 1060
+    },
+    {
+      "epoch": 1.4302422723475354,
+      "grad_norm": 0.5538961425736992,
+      "learning_rate": 5e-06,
+      "loss": 0.7769,
+      "step": 1070
+    },
+    {
+      "epoch": 1.443609022556391,
+      "grad_norm": 0.511489662319519,
+      "learning_rate": 5e-06,
+      "loss": 0.7709,
+      "step": 1080
+    },
+    {
+      "epoch": 1.4569757727652464,
+      "grad_norm": 0.5006381424370965,
+      "learning_rate": 5e-06,
+      "loss": 0.7652,
+      "step": 1090
+    },
+    {
+      "epoch": 1.4703425229741018,
+      "grad_norm": 0.6446877306415851,
+      "learning_rate": 5e-06,
+      "loss": 0.7668,
+      "step": 1100
+    },
+    {
+      "epoch": 1.4837092731829573,
+      "grad_norm": 0.6472792025046472,
+      "learning_rate": 5e-06,
+      "loss": 0.7748,
+      "step": 1110
+    },
+    {
+      "epoch": 1.4970760233918128,
+      "grad_norm": 0.5297094594069526,
+      "learning_rate": 5e-06,
+      "loss": 0.7716,
+      "step": 1120
+    },
+    {
+      "epoch": 1.5104427736006683,
+      "grad_norm": 0.5172754876638852,
+      "learning_rate": 5e-06,
+      "loss": 0.7693,
+      "step": 1130
+    },
+    {
+      "epoch": 1.5238095238095237,
+      "grad_norm": 0.5499645842959932,
+      "learning_rate": 5e-06,
+      "loss": 0.7663,
+      "step": 1140
+    },
+    {
+      "epoch": 1.5371762740183792,
+      "grad_norm": 0.5115786493746641,
+      "learning_rate": 5e-06,
+      "loss": 0.7707,
+      "step": 1150
+    },
+    {
+      "epoch": 1.5505430242272347,
+      "grad_norm": 0.5733666230248589,
+      "learning_rate": 5e-06,
+      "loss": 0.7708,
+      "step": 1160
+    },
+    {
+      "epoch": 1.5639097744360901,
+      "grad_norm": 0.4914243878129098,
+      "learning_rate": 5e-06,
+      "loss": 0.769,
+      "step": 1170
+    },
+    {
+      "epoch": 1.5772765246449456,
+      "grad_norm": 0.5986514689445189,
+      "learning_rate": 5e-06,
+      "loss": 0.7722,
+      "step": 1180
+    },
+    {
+      "epoch": 1.590643274853801,
+      "grad_norm": 0.49301214049058534,
+      "learning_rate": 5e-06,
+      "loss": 0.7709,
+      "step": 1190
+    },
+    {
+      "epoch": 1.6040100250626566,
+      "grad_norm": 0.49122462674305145,
+      "learning_rate": 5e-06,
+      "loss": 0.7684,
+      "step": 1200
+    },
+    {
+      "epoch": 1.617376775271512,
+      "grad_norm": 0.5231320343494373,
+      "learning_rate": 5e-06,
+      "loss": 0.773,
+      "step": 1210
+    },
+    {
+      "epoch": 1.6307435254803675,
+      "grad_norm": 0.5974519524827527,
+      "learning_rate": 5e-06,
+      "loss": 0.7703,
+      "step": 1220
+    },
+    {
+      "epoch": 1.644110275689223,
+      "grad_norm": 0.49755848059450075,
+      "learning_rate": 5e-06,
+      "loss": 0.7684,
+      "step": 1230
+    },
+    {
+      "epoch": 1.6574770258980784,
+      "grad_norm": 0.49980350150699104,
+      "learning_rate": 5e-06,
+      "loss": 0.7648,
+      "step": 1240
+    },
+    {
+      "epoch": 1.670843776106934,
+      "grad_norm": 0.660197673406872,
+      "learning_rate": 5e-06,
+      "loss": 0.7663,
+      "step": 1250
+    },
+    {
+      "epoch": 1.6842105263157894,
+      "grad_norm": 0.501447743813946,
+      "learning_rate": 5e-06,
+      "loss": 0.7687,
+      "step": 1260
+    },
+    {
+      "epoch": 1.6975772765246449,
+      "grad_norm": 0.47339053427865196,
+      "learning_rate": 5e-06,
+      "loss": 0.7677,
+      "step": 1270
+    },
+    {
+      "epoch": 1.7109440267335003,
+      "grad_norm": 0.4776630843112484,
+      "learning_rate": 5e-06,
+      "loss": 0.7705,
+      "step": 1280
+    },
+    {
+      "epoch": 1.7243107769423558,
+      "grad_norm": 0.5805611285838953,
+      "learning_rate": 5e-06,
+      "loss": 0.7664,
+      "step": 1290
+    },
+    {
+      "epoch": 1.7376775271512113,
+      "grad_norm": 0.5589747352729452,
+      "learning_rate": 5e-06,
+      "loss": 0.7643,
+      "step": 1300
+    },
+    {
+      "epoch": 1.7510442773600667,
+      "grad_norm": 0.5862892637271495,
+      "learning_rate": 5e-06,
+      "loss": 0.767,
+      "step": 1310
+    },
+    {
+      "epoch": 1.7644110275689222,
+      "grad_norm": 0.6267084370944045,
+      "learning_rate": 5e-06,
+      "loss": 0.7701,
+      "step": 1320
+    },
+    {
+      "epoch": 1.7777777777777777,
+      "grad_norm": 0.5590629149887701,
+      "learning_rate": 5e-06,
+      "loss": 0.7725,
+      "step": 1330
+    },
+    {
+      "epoch": 1.7911445279866332,
+      "grad_norm": 0.589200505231269,
+      "learning_rate": 5e-06,
+      "loss": 0.768,
+      "step": 1340
+    },
+    {
+      "epoch": 1.8045112781954886,
+      "grad_norm": 0.4948446583957624,
+      "learning_rate": 5e-06,
+      "loss": 0.7685,
+      "step": 1350
+    },
+    {
+      "epoch": 1.817878028404344,
+      "grad_norm": 0.471229575382462,
+      "learning_rate": 5e-06,
+      "loss": 0.7685,
+      "step": 1360
+    },
+    {
+      "epoch": 1.8312447786131996,
+      "grad_norm": 0.5347363048336566,
+      "learning_rate": 5e-06,
+      "loss": 0.7668,
+      "step": 1370
+    },
+    {
+      "epoch": 1.844611528822055,
+      "grad_norm": 0.6085798758140744,
+      "learning_rate": 5e-06,
+      "loss": 0.7685,
+      "step": 1380
+    },
+    {
+      "epoch": 1.8579782790309105,
+      "grad_norm": 0.49237779847072155,
+      "learning_rate": 5e-06,
+      "loss": 0.766,
+      "step": 1390
+    },
+    {
+      "epoch": 1.871345029239766,
+      "grad_norm": 0.5429938063483495,
+      "learning_rate": 5e-06,
+      "loss": 0.7675,
+      "step": 1400
+    },
+    {
+      "epoch": 1.8847117794486214,
+      "grad_norm": 0.5315522378087794,
+      "learning_rate": 5e-06,
+      "loss": 0.7651,
+      "step": 1410
+    },
+    {
+      "epoch": 1.898078529657477,
+      "grad_norm": 0.5774851920268103,
+      "learning_rate": 5e-06,
+      "loss": 0.7683,
+      "step": 1420
+    },
+    {
+      "epoch": 1.9114452798663324,
+      "grad_norm": 0.4774206459938876,
+      "learning_rate": 5e-06,
+      "loss": 0.7651,
+      "step": 1430
+    },
+    {
+      "epoch": 1.9248120300751879,
+      "grad_norm": 0.48893280928600313,
+      "learning_rate": 5e-06,
+      "loss": 0.7664,
+      "step": 1440
+    },
+    {
+      "epoch": 1.9381787802840433,
+      "grad_norm": 0.47709822943051283,
+      "learning_rate": 5e-06,
+      "loss": 0.7667,
+      "step": 1450
+    },
+    {
+      "epoch": 1.9515455304928988,
+      "grad_norm": 0.5221458173728611,
+      "learning_rate": 5e-06,
+      "loss": 0.7649,
+      "step": 1460
+    },
+    {
+      "epoch": 1.9649122807017543,
+      "grad_norm": 0.5458985479332612,
+      "learning_rate": 5e-06,
+      "loss": 0.7653,
+      "step": 1470
+    },
+    {
+      "epoch": 1.9782790309106097,
+      "grad_norm": 0.5449151757658263,
+      "learning_rate": 5e-06,
+      "loss": 0.7665,
+      "step": 1480
+    },
+    {
+      "epoch": 1.9916457811194652,
+      "grad_norm": 0.5792068417255367,
+      "learning_rate": 5e-06,
+      "loss": 0.7674,
+      "step": 1490
+    },
+    {
+      "epoch": 1.9996658312447786,
+      "eval_loss": 0.7951143383979797,
+      "eval_runtime": 795.386,
+      "eval_samples_per_second": 25.345,
+      "eval_steps_per_second": 0.396,
+      "step": 1496
+    },
+    {
+      "epoch": 2.0050125313283207,
+      "grad_norm": 0.7521880602206925,
+      "learning_rate": 5e-06,
+      "loss": 0.8233,
+      "step": 1500
+    },
+    {
+      "epoch": 2.018379281537176,
+      "grad_norm": 0.6560054074439666,
+      "learning_rate": 5e-06,
+      "loss": 0.7256,
+      "step": 1510
+    },
+    {
+      "epoch": 2.0317460317460316,
+      "grad_norm": 0.5201512747130638,
+      "learning_rate": 5e-06,
+      "loss": 0.7218,
+      "step": 1520
+    },
+    {
+      "epoch": 2.045112781954887,
+      "grad_norm": 0.5262590120532872,
+      "learning_rate": 5e-06,
+      "loss": 0.7285,
+      "step": 1530
+    },
+    {
+      "epoch": 2.0584795321637426,
+      "grad_norm": 0.5393650388873087,
+      "learning_rate": 5e-06,
+      "loss": 0.7229,
+      "step": 1540
+    },
+    {
+      "epoch": 2.071846282372598,
+      "grad_norm": 0.5105428821348765,
+      "learning_rate": 5e-06,
+      "loss": 0.7231,
+      "step": 1550
+    },
+    {
+      "epoch": 2.0852130325814535,
+      "grad_norm": 0.6021970483052078,
+      "learning_rate": 5e-06,
+      "loss": 0.7239,
+      "step": 1560
+    },
+    {
+      "epoch": 2.098579782790309,
+      "grad_norm": 0.5009099309313954,
+      "learning_rate": 5e-06,
+      "loss": 0.7226,
+      "step": 1570
+    },
+    {
+      "epoch": 2.1119465329991645,
+      "grad_norm": 0.5605434690720502,
+      "learning_rate": 5e-06,
+      "loss": 0.7277,
+      "step": 1580
+    },
+    {
+      "epoch": 2.12531328320802,
+      "grad_norm": 0.5732299598938305,
+      "learning_rate": 5e-06,
+      "loss": 0.7286,
+      "step": 1590
+    },
+    {
+      "epoch": 2.1386800334168754,
+      "grad_norm": 0.5399334511302041,
+      "learning_rate": 5e-06,
+      "loss": 0.726,
+      "step": 1600
+    },
+    {
+      "epoch": 2.152046783625731,
+      "grad_norm": 0.505832452848056,
+      "learning_rate": 5e-06,
+      "loss": 0.7304,
+      "step": 1610
+    },
+    {
+      "epoch": 2.1654135338345863,
+      "grad_norm": 0.5674143618926153,
+      "learning_rate": 5e-06,
+      "loss": 0.7232,
+      "step": 1620
+    },
+    {
+      "epoch": 2.178780284043442,
+      "grad_norm": 0.5068914103748654,
+      "learning_rate": 5e-06,
+      "loss": 0.7336,
+      "step": 1630
+    },
+    {
+      "epoch": 2.1921470342522973,
+      "grad_norm": 0.5118320329600874,
+      "learning_rate": 5e-06,
+      "loss": 0.7255,
+      "step": 1640
+    },
+    {
+      "epoch": 2.2055137844611528,
+      "grad_norm": 0.5156250232792499,
+      "learning_rate": 5e-06,
+      "loss": 0.7295,
+      "step": 1650
+    },
+    {
+      "epoch": 2.2188805346700082,
+      "grad_norm": 0.6165225897496419,
+      "learning_rate": 5e-06,
+      "loss": 0.7274,
+      "step": 1660
+    },
+    {
+      "epoch": 2.2322472848788637,
+      "grad_norm": 0.5863877720536036,
+      "learning_rate": 5e-06,
+      "loss": 0.7256,
+      "step": 1670
+    },
+    {
+      "epoch": 2.245614035087719,
+      "grad_norm": 0.5641007704480012,
+      "learning_rate": 5e-06,
+      "loss": 0.7308,
+      "step": 1680
+    },
+    {
+      "epoch": 2.2589807852965746,
+      "grad_norm": 0.6101312501534099,
+      "learning_rate": 5e-06,
+      "loss": 0.7314,
+      "step": 1690
+    },
+    {
+      "epoch": 2.27234753550543,
+      "grad_norm": 0.5200998469176243,
+      "learning_rate": 5e-06,
+      "loss": 0.7275,
+      "step": 1700
+    },
+    {
+      "epoch": 2.2857142857142856,
+      "grad_norm": 0.5398343134194046,
+      "learning_rate": 5e-06,
+      "loss": 0.727,
+      "step": 1710
+    },
+    {
+      "epoch": 2.299081035923141,
+      "grad_norm": 0.5247712631574941,
+      "learning_rate": 5e-06,
+      "loss": 0.727,
+      "step": 1720
+    },
+    {
+      "epoch": 2.3124477861319965,
+      "grad_norm": 0.5655985095958795,
+      "learning_rate": 5e-06,
+      "loss": 0.7286,
+      "step": 1730
+    },
+    {
+      "epoch": 2.325814536340852,
+      "grad_norm": 0.5927409653328921,
+      "learning_rate": 5e-06,
+      "loss": 0.7271,
+      "step": 1740
+    },
+    {
+      "epoch": 2.3391812865497075,
+      "grad_norm": 0.6148593425957483,
+      "learning_rate": 5e-06,
+      "loss": 0.733,
+      "step": 1750
+    },
+    {
+      "epoch": 2.352548036758563,
+      "grad_norm": 0.5969831864554942,
+      "learning_rate": 5e-06,
+      "loss": 0.7302,
+      "step": 1760
+    },
+    {
+      "epoch": 2.3659147869674184,
+      "grad_norm": 0.4985456007136878,
+      "learning_rate": 5e-06,
+      "loss": 0.7341,
+      "step": 1770
+    },
+    {
+      "epoch": 2.379281537176274,
+      "grad_norm": 0.5005254522981937,
+      "learning_rate": 5e-06,
+      "loss": 0.7244,
+      "step": 1780
+    },
+    {
+      "epoch": 2.3926482873851294,
+      "grad_norm": 0.5288709360617612,
+      "learning_rate": 5e-06,
+      "loss": 0.7312,
+      "step": 1790
+    },
+    {
+      "epoch": 2.406015037593985,
+      "grad_norm": 0.5355584900475018,
+      "learning_rate": 5e-06,
+      "loss": 0.727,
+      "step": 1800
+    },
+    {
+      "epoch": 2.4193817878028403,
+      "grad_norm": 0.5666733459714918,
+      "learning_rate": 5e-06,
+      "loss": 0.731,
+      "step": 1810
+    },
+    {
+      "epoch": 2.4327485380116958,
+      "grad_norm": 0.5939862506331437,
+      "learning_rate": 5e-06,
+      "loss": 0.7292,
+      "step": 1820
+    },
+    {
+      "epoch": 2.4461152882205512,
+      "grad_norm": 0.5696153125681646,
+      "learning_rate": 5e-06,
+      "loss": 0.7295,
+      "step": 1830
+    },
+    {
+      "epoch": 2.4594820384294067,
+      "grad_norm": 0.5263801998302109,
+      "learning_rate": 5e-06,
+      "loss": 0.7289,
+      "step": 1840
+    },
+    {
+      "epoch": 2.472848788638262,
+      "grad_norm": 0.5564137280433736,
+      "learning_rate": 5e-06,
+      "loss": 0.7289,
+      "step": 1850
+    },
+    {
+      "epoch": 2.4862155388471177,
+      "grad_norm": 0.6117589560276474,
+      "learning_rate": 5e-06,
+      "loss": 0.7281,
+      "step": 1860
+    },
+    {
+      "epoch": 2.499582289055973,
+      "grad_norm": 0.5556838242891475,
+      "learning_rate": 5e-06,
+      "loss": 0.7296,
+      "step": 1870
+    },
+    {
+      "epoch": 2.5129490392648286,
+      "grad_norm": 0.4681598446789898,
+      "learning_rate": 5e-06,
+      "loss": 0.7296,
+      "step": 1880
+    },
+    {
+      "epoch": 2.526315789473684,
+      "grad_norm": 0.5231611697501862,
+      "learning_rate": 5e-06,
+      "loss": 0.7303,
+      "step": 1890
+    },
+    {
+      "epoch": 2.5396825396825395,
+      "grad_norm": 0.5126109088017671,
+      "learning_rate": 5e-06,
+      "loss": 0.7324,
+      "step": 1900
+    },
+    {
+      "epoch": 2.553049289891395,
+      "grad_norm": 0.5300428577804921,
+      "learning_rate": 5e-06,
+      "loss": 0.7273,
+      "step": 1910
+    },
+    {
+      "epoch": 2.5664160401002505,
+      "grad_norm": 0.4968055663040118,
+      "learning_rate": 5e-06,
+      "loss": 0.729,
+      "step": 1920
+    },
+    {
+      "epoch": 2.579782790309106,
+      "grad_norm": 0.568494743059541,
+      "learning_rate": 5e-06,
+      "loss": 0.7269,
+      "step": 1930
+    },
+    {
+      "epoch": 2.5931495405179614,
+      "grad_norm": 0.5482221484283202,
+      "learning_rate": 5e-06,
+      "loss": 0.7285,
+      "step": 1940
+    },
+    {
+      "epoch": 2.606516290726817,
+      "grad_norm": 0.47129332867964935,
+      "learning_rate": 5e-06,
+      "loss": 0.7292,
+      "step": 1950
+    },
+    {
+      "epoch": 2.6198830409356724,
+      "grad_norm": 0.5198836974979396,
+      "learning_rate": 5e-06,
+      "loss": 0.7264,
+      "step": 1960
+    },
+    {
+      "epoch": 2.633249791144528,
+      "grad_norm": 0.4945939304862693,
+      "learning_rate": 5e-06,
+      "loss": 0.7279,
+      "step": 1970
+    },
+    {
+      "epoch": 2.6466165413533833,
+      "grad_norm": 0.5751403403674279,
+      "learning_rate": 5e-06,
+      "loss": 0.7282,
+      "step": 1980
+    },
+    {
+      "epoch": 2.659983291562239,
+      "grad_norm": 0.5611452949151137,
+      "learning_rate": 5e-06,
+      "loss": 0.7331,
+      "step": 1990
+    },
+    {
+      "epoch": 2.6733500417710943,
+      "grad_norm": 0.6119128996618558,
+      "learning_rate": 5e-06,
+      "loss": 0.7296,
+      "step": 2000
+    },
+    {
+      "epoch": 2.6867167919799497,
+      "grad_norm": 0.4799215562608329,
+      "learning_rate": 5e-06,
+      "loss": 0.7298,
+      "step": 2010
+    },
+    {
+      "epoch": 2.700083542188805,
+      "grad_norm": 0.5541418078345739,
+      "learning_rate": 5e-06,
+      "loss": 0.7268,
+      "step": 2020
+    },
+    {
+      "epoch": 2.7134502923976607,
+      "grad_norm": 0.6870311878219804,
+      "learning_rate": 5e-06,
+      "loss": 0.7277,
+      "step": 2030
+    },
+    {
+      "epoch": 2.726817042606516,
+      "grad_norm": 0.5687894755714459,
+      "learning_rate": 5e-06,
+      "loss": 0.7298,
+      "step": 2040
+    },
+    {
+      "epoch": 2.7401837928153716,
+      "grad_norm": 0.5330460246090263,
+      "learning_rate": 5e-06,
+      "loss": 0.7325,
+      "step": 2050
+    },
+    {
+      "epoch": 2.753550543024227,
+      "grad_norm": 0.5427879116319339,
+      "learning_rate": 5e-06,
+      "loss": 0.7296,
+      "step": 2060
+    },
+    {
+      "epoch": 2.7669172932330826,
+      "grad_norm": 0.6013738539276209,
+      "learning_rate": 5e-06,
+      "loss": 0.7281,
+      "step": 2070
+    },
+    {
+      "epoch": 2.780284043441938,
+      "grad_norm": 0.6091854363964149,
+      "learning_rate": 5e-06,
+      "loss": 0.7294,
+      "step": 2080
+    },
+    {
+      "epoch": 2.7936507936507935,
+      "grad_norm": 0.5190279913663577,
+      "learning_rate": 5e-06,
+      "loss": 0.7248,
+      "step": 2090
+    },
+    {
+      "epoch": 2.807017543859649,
+      "grad_norm": 0.5126718278939274,
+      "learning_rate": 5e-06,
+      "loss": 0.7311,
+      "step": 2100
+    },
+    {
+      "epoch": 2.8203842940685044,
+      "grad_norm": 0.5571607138857257,
+      "learning_rate": 5e-06,
+      "loss": 0.7318,
+      "step": 2110
+    },
+    {
+      "epoch": 2.83375104427736,
+      "grad_norm": 0.5341175882686895,
+      "learning_rate": 5e-06,
+      "loss": 0.7336,
+      "step": 2120
+    },
+    {
+      "epoch": 2.8471177944862154,
+      "grad_norm": 0.4817774606348232,
+      "learning_rate": 5e-06,
+      "loss": 0.731,
+      "step": 2130
+    },
+    {
+      "epoch": 2.860484544695071,
+      "grad_norm": 0.5487220776810837,
+      "learning_rate": 5e-06,
+      "loss": 0.7282,
+      "step": 2140
+    },
+    {
+      "epoch": 2.8738512949039263,
+      "grad_norm": 0.6342699103351254,
+      "learning_rate": 5e-06,
+      "loss": 0.7335,
+      "step": 2150
+    },
+    {
+      "epoch": 2.887218045112782,
+      "grad_norm": 0.5078552425291176,
+      "learning_rate": 5e-06,
+      "loss": 0.7273,
+      "step": 2160
+    },
+    {
+      "epoch": 2.9005847953216373,
+      "grad_norm": 0.4819316377635323,
+      "learning_rate": 5e-06,
+      "loss": 0.7332,
+      "step": 2170
+    },
+    {
+      "epoch": 2.9139515455304927,
+      "grad_norm": 0.4627017239179797,
+      "learning_rate": 5e-06,
+      "loss": 0.7306,
+      "step": 2180
+    },
+    {
+      "epoch": 2.927318295739348,
+      "grad_norm": 0.4761325291977869,
+      "learning_rate": 5e-06,
+      "loss": 0.7314,
+      "step": 2190
+    },
+    {
+      "epoch": 2.9406850459482037,
+      "grad_norm": 0.5784029020001881,
+      "learning_rate": 5e-06,
+      "loss": 0.7298,
+      "step": 2200
+    },
+    {
+      "epoch": 2.954051796157059,
+      "grad_norm": 0.5120822643666457,
+      "learning_rate": 5e-06,
+      "loss": 0.731,
+      "step": 2210
+    },
+    {
+      "epoch": 2.9674185463659146,
+      "grad_norm": 0.5116915736315969,
+      "learning_rate": 5e-06,
+      "loss": 0.7322,
+      "step": 2220
+    },
+    {
+      "epoch": 2.98078529657477,
+      "grad_norm": 0.5021133290964584,
+      "learning_rate": 5e-06,
+      "loss": 0.7269,
+      "step": 2230
+    },
+    {
+      "epoch": 2.9941520467836256,
+      "grad_norm": 0.5317540745896701,
+      "learning_rate": 5e-06,
+      "loss": 0.7322,
+      "step": 2240
+    },
+    {
+      "epoch": 2.999498746867168,
+      "eval_loss": 0.7926730513572693,
+      "eval_runtime": 792.6639,
+      "eval_samples_per_second": 25.432,
+      "eval_steps_per_second": 0.397,
+      "step": 2244
+    },
+    {
+      "epoch": 2.999498746867168,
+      "step": 2244,
+      "total_flos": 3758574199111680.0,
+      "train_loss": 0.7796513685780625,
+      "train_runtime": 132137.1731,
+      "train_samples_per_second": 8.696,
+      "train_steps_per_second": 0.017
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 2244,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3758574199111680.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

training_eval_loss.png ADDED Viewed

training_loss.png ADDED Viewed