End of training

Browse files

Files changed (7) hide show

README.md +2 -1
all_results.json +12 -0
eval_results.json +7 -0
train_results.json +8 -0
trainer_state.json +1340 -0
training_eval_loss.png +0 -0
training_loss.png +0 -0

README.md CHANGED Viewed

@@ -4,6 +4,7 @@ license: llama3.1
 base_model: meta-llama/Meta-Llama-3.1-8B
 tags:
 - llama-factory
 - generated_from_trainer
 model-index:
 - name: top_17_ranking_stackexchange
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
 # top_17_ranking_stackexchange
-This model is a fine-tuned version of [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) on an unknown dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.7988

 base_model: meta-llama/Meta-Llama-3.1-8B
 tags:
 - llama-factory
+- full
 - generated_from_trainer
 model-index:
 - name: top_17_ranking_stackexchange
 # top_17_ranking_stackexchange
+This model is a fine-tuned version of [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) on the mlfoundations-dev/top_17_ranking_stackexchange dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.7988

all_results.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+    "epoch": 2.998768978251949,
+    "eval_loss": 0.7988426685333252,
+    "eval_runtime": 652.4279,
+    "eval_samples_per_second": 25.163,
+    "eval_steps_per_second": 0.394,
+    "total_flos": 3060083667763200.0,
+    "train_loss": 0.7828072778631836,
+    "train_runtime": 108782.0419,
+    "train_samples_per_second": 8.602,
+    "train_steps_per_second": 0.017
+}

eval_results.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+    "epoch": 2.998768978251949,
+    "eval_loss": 0.7988426685333252,
+    "eval_runtime": 652.4279,
+    "eval_samples_per_second": 25.163,
+    "eval_steps_per_second": 0.394
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 2.998768978251949,
+    "total_flos": 3060083667763200.0,
+    "train_loss": 0.7828072778631836,
+    "train_runtime": 108782.0419,
+    "train_samples_per_second": 8.602,
+    "train_steps_per_second": 0.017
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1340 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.998768978251949,
+  "eval_steps": 500,
+  "global_step": 1827,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.016413623307345096,
+      "grad_norm": 8.99270138013034,
+      "learning_rate": 5e-06,
+      "loss": 1.0659,
+      "step": 10
+    },
+    {
+      "epoch": 0.03282724661469019,
+      "grad_norm": 3.053529266595301,
+      "learning_rate": 5e-06,
+      "loss": 0.9734,
+      "step": 20
+    },
+    {
+      "epoch": 0.04924086992203529,
+      "grad_norm": 2.633730745764626,
+      "learning_rate": 5e-06,
+      "loss": 0.9341,
+      "step": 30
+    },
+    {
+      "epoch": 0.06565449322938038,
+      "grad_norm": 0.7908528977431902,
+      "learning_rate": 5e-06,
+      "loss": 0.9191,
+      "step": 40
+    },
+    {
+      "epoch": 0.08206811653672548,
+      "grad_norm": 1.0408150537576215,
+      "learning_rate": 5e-06,
+      "loss": 0.9013,
+      "step": 50
+    },
+    {
+      "epoch": 0.09848173984407058,
+      "grad_norm": 1.0009107528629961,
+      "learning_rate": 5e-06,
+      "loss": 0.8956,
+      "step": 60
+    },
+    {
+      "epoch": 0.11489536315141567,
+      "grad_norm": 0.8433960502782532,
+      "learning_rate": 5e-06,
+      "loss": 0.8783,
+      "step": 70
+    },
+    {
+      "epoch": 0.13130898645876077,
+      "grad_norm": 1.2184758422903226,
+      "learning_rate": 5e-06,
+      "loss": 0.8776,
+      "step": 80
+    },
+    {
+      "epoch": 0.14772260976610588,
+      "grad_norm": 0.7980413481780791,
+      "learning_rate": 5e-06,
+      "loss": 0.8707,
+      "step": 90
+    },
+    {
+      "epoch": 0.16413623307345096,
+      "grad_norm": 0.5562074295693149,
+      "learning_rate": 5e-06,
+      "loss": 0.8672,
+      "step": 100
+    },
+    {
+      "epoch": 0.18054985638079607,
+      "grad_norm": 0.5260694903772933,
+      "learning_rate": 5e-06,
+      "loss": 0.8611,
+      "step": 110
+    },
+    {
+      "epoch": 0.19696347968814115,
+      "grad_norm": 0.6264931158468416,
+      "learning_rate": 5e-06,
+      "loss": 0.8621,
+      "step": 120
+    },
+    {
+      "epoch": 0.21337710299548626,
+      "grad_norm": 0.6191079265416527,
+      "learning_rate": 5e-06,
+      "loss": 0.8541,
+      "step": 130
+    },
+    {
+      "epoch": 0.22979072630283134,
+      "grad_norm": 0.5921376117856568,
+      "learning_rate": 5e-06,
+      "loss": 0.8535,
+      "step": 140
+    },
+    {
+      "epoch": 0.24620434961017645,
+      "grad_norm": 0.5621484849345678,
+      "learning_rate": 5e-06,
+      "loss": 0.8548,
+      "step": 150
+    },
+    {
+      "epoch": 0.26261797291752154,
+      "grad_norm": 0.5294016331059697,
+      "learning_rate": 5e-06,
+      "loss": 0.8538,
+      "step": 160
+    },
+    {
+      "epoch": 0.2790315962248666,
+      "grad_norm": 0.5007519115014486,
+      "learning_rate": 5e-06,
+      "loss": 0.849,
+      "step": 170
+    },
+    {
+      "epoch": 0.29544521953221176,
+      "grad_norm": 0.5887354661692235,
+      "learning_rate": 5e-06,
+      "loss": 0.8488,
+      "step": 180
+    },
+    {
+      "epoch": 0.31185884283955684,
+      "grad_norm": 0.5081716506810537,
+      "learning_rate": 5e-06,
+      "loss": 0.8484,
+      "step": 190
+    },
+    {
+      "epoch": 0.3282724661469019,
+      "grad_norm": 0.6188657430216449,
+      "learning_rate": 5e-06,
+      "loss": 0.8426,
+      "step": 200
+    },
+    {
+      "epoch": 0.344686089454247,
+      "grad_norm": 1.0610412603180521,
+      "learning_rate": 5e-06,
+      "loss": 0.8399,
+      "step": 210
+    },
+    {
+      "epoch": 0.36109971276159214,
+      "grad_norm": 0.7317657510287235,
+      "learning_rate": 5e-06,
+      "loss": 0.8455,
+      "step": 220
+    },
+    {
+      "epoch": 0.3775133360689372,
+      "grad_norm": 0.5199899282009807,
+      "learning_rate": 5e-06,
+      "loss": 0.8437,
+      "step": 230
+    },
+    {
+      "epoch": 0.3939269593762823,
+      "grad_norm": 0.701918684276049,
+      "learning_rate": 5e-06,
+      "loss": 0.8391,
+      "step": 240
+    },
+    {
+      "epoch": 0.4103405826836274,
+      "grad_norm": 0.5559902755445636,
+      "learning_rate": 5e-06,
+      "loss": 0.8358,
+      "step": 250
+    },
+    {
+      "epoch": 0.4267542059909725,
+      "grad_norm": 0.5581199955306168,
+      "learning_rate": 5e-06,
+      "loss": 0.8394,
+      "step": 260
+    },
+    {
+      "epoch": 0.4431678292983176,
+      "grad_norm": 0.6690583184335058,
+      "learning_rate": 5e-06,
+      "loss": 0.8373,
+      "step": 270
+    },
+    {
+      "epoch": 0.4595814526056627,
+      "grad_norm": 0.505413325131573,
+      "learning_rate": 5e-06,
+      "loss": 0.8321,
+      "step": 280
+    },
+    {
+      "epoch": 0.47599507591300777,
+      "grad_norm": 0.4902726775641345,
+      "learning_rate": 5e-06,
+      "loss": 0.8407,
+      "step": 290
+    },
+    {
+      "epoch": 0.4924086992203529,
+      "grad_norm": 0.7023706220694934,
+      "learning_rate": 5e-06,
+      "loss": 0.8309,
+      "step": 300
+    },
+    {
+      "epoch": 0.508822322527698,
+      "grad_norm": 0.5762877961049813,
+      "learning_rate": 5e-06,
+      "loss": 0.8318,
+      "step": 310
+    },
+    {
+      "epoch": 0.5252359458350431,
+      "grad_norm": 0.47853098636746905,
+      "learning_rate": 5e-06,
+      "loss": 0.8332,
+      "step": 320
+    },
+    {
+      "epoch": 0.5416495691423882,
+      "grad_norm": 0.5281284739639729,
+      "learning_rate": 5e-06,
+      "loss": 0.83,
+      "step": 330
+    },
+    {
+      "epoch": 0.5580631924497332,
+      "grad_norm": 0.5794359727324211,
+      "learning_rate": 5e-06,
+      "loss": 0.827,
+      "step": 340
+    },
+    {
+      "epoch": 0.5744768157570784,
+      "grad_norm": 0.542547522553918,
+      "learning_rate": 5e-06,
+      "loss": 0.8289,
+      "step": 350
+    },
+    {
+      "epoch": 0.5908904390644235,
+      "grad_norm": 0.5615169892984633,
+      "learning_rate": 5e-06,
+      "loss": 0.8261,
+      "step": 360
+    },
+    {
+      "epoch": 0.6073040623717686,
+      "grad_norm": 0.5992859282011177,
+      "learning_rate": 5e-06,
+      "loss": 0.8265,
+      "step": 370
+    },
+    {
+      "epoch": 0.6237176856791137,
+      "grad_norm": 0.6682966476034522,
+      "learning_rate": 5e-06,
+      "loss": 0.8272,
+      "step": 380
+    },
+    {
+      "epoch": 0.6401313089864588,
+      "grad_norm": 0.5419933138601745,
+      "learning_rate": 5e-06,
+      "loss": 0.8258,
+      "step": 390
+    },
+    {
+      "epoch": 0.6565449322938038,
+      "grad_norm": 0.49991773880492635,
+      "learning_rate": 5e-06,
+      "loss": 0.8249,
+      "step": 400
+    },
+    {
+      "epoch": 0.6729585556011489,
+      "grad_norm": 0.6878795353167779,
+      "learning_rate": 5e-06,
+      "loss": 0.8222,
+      "step": 410
+    },
+    {
+      "epoch": 0.689372178908494,
+      "grad_norm": 0.4911573402330116,
+      "learning_rate": 5e-06,
+      "loss": 0.825,
+      "step": 420
+    },
+    {
+      "epoch": 0.7057858022158392,
+      "grad_norm": 0.5697346305894231,
+      "learning_rate": 5e-06,
+      "loss": 0.825,
+      "step": 430
+    },
+    {
+      "epoch": 0.7221994255231843,
+      "grad_norm": 0.5513743964148584,
+      "learning_rate": 5e-06,
+      "loss": 0.8246,
+      "step": 440
+    },
+    {
+      "epoch": 0.7386130488305294,
+      "grad_norm": 0.688660194802024,
+      "learning_rate": 5e-06,
+      "loss": 0.8222,
+      "step": 450
+    },
+    {
+      "epoch": 0.7550266721378744,
+      "grad_norm": 0.511273977957526,
+      "learning_rate": 5e-06,
+      "loss": 0.8213,
+      "step": 460
+    },
+    {
+      "epoch": 0.7714402954452195,
+      "grad_norm": 0.5063886517354912,
+      "learning_rate": 5e-06,
+      "loss": 0.8222,
+      "step": 470
+    },
+    {
+      "epoch": 0.7878539187525646,
+      "grad_norm": 0.5454872366957495,
+      "learning_rate": 5e-06,
+      "loss": 0.8193,
+      "step": 480
+    },
+    {
+      "epoch": 0.8042675420599097,
+      "grad_norm": 0.565210030152154,
+      "learning_rate": 5e-06,
+      "loss": 0.8177,
+      "step": 490
+    },
+    {
+      "epoch": 0.8206811653672548,
+      "grad_norm": 0.6047647526413709,
+      "learning_rate": 5e-06,
+      "loss": 0.8184,
+      "step": 500
+    },
+    {
+      "epoch": 0.8370947886746,
+      "grad_norm": 0.4922366597743525,
+      "learning_rate": 5e-06,
+      "loss": 0.8172,
+      "step": 510
+    },
+    {
+      "epoch": 0.853508411981945,
+      "grad_norm": 0.5659941668447491,
+      "learning_rate": 5e-06,
+      "loss": 0.8204,
+      "step": 520
+    },
+    {
+      "epoch": 0.8699220352892901,
+      "grad_norm": 0.7936193835249891,
+      "learning_rate": 5e-06,
+      "loss": 0.8156,
+      "step": 530
+    },
+    {
+      "epoch": 0.8863356585966352,
+      "grad_norm": 0.635704407895363,
+      "learning_rate": 5e-06,
+      "loss": 0.8168,
+      "step": 540
+    },
+    {
+      "epoch": 0.9027492819039803,
+      "grad_norm": 0.5271560225642157,
+      "learning_rate": 5e-06,
+      "loss": 0.8154,
+      "step": 550
+    },
+    {
+      "epoch": 0.9191629052113254,
+      "grad_norm": 0.4819325527832828,
+      "learning_rate": 5e-06,
+      "loss": 0.818,
+      "step": 560
+    },
+    {
+      "epoch": 0.9355765285186705,
+      "grad_norm": 0.5777651614118724,
+      "learning_rate": 5e-06,
+      "loss": 0.8168,
+      "step": 570
+    },
+    {
+      "epoch": 0.9519901518260155,
+      "grad_norm": 0.7370846415410045,
+      "learning_rate": 5e-06,
+      "loss": 0.8152,
+      "step": 580
+    },
+    {
+      "epoch": 0.9684037751333607,
+      "grad_norm": 0.6007353019630401,
+      "learning_rate": 5e-06,
+      "loss": 0.8173,
+      "step": 590
+    },
+    {
+      "epoch": 0.9848173984407058,
+      "grad_norm": 0.56593529564363,
+      "learning_rate": 5e-06,
+      "loss": 0.8157,
+      "step": 600
+    },
+    {
+      "epoch": 0.9995896594173164,
+      "eval_loss": 0.8145917654037476,
+      "eval_runtime": 649.6437,
+      "eval_samples_per_second": 25.271,
+      "eval_steps_per_second": 0.396,
+      "step": 609
+    },
+    {
+      "epoch": 1.001231021748051,
+      "grad_norm": 1.5425883939459828,
+      "learning_rate": 5e-06,
+      "loss": 0.8731,
+      "step": 610
+    },
+    {
+      "epoch": 1.017644645055396,
+      "grad_norm": 0.7619005303987774,
+      "learning_rate": 5e-06,
+      "loss": 0.7758,
+      "step": 620
+    },
+    {
+      "epoch": 1.034058268362741,
+      "grad_norm": 0.6185096912027249,
+      "learning_rate": 5e-06,
+      "loss": 0.7704,
+      "step": 630
+    },
+    {
+      "epoch": 1.0504718916700861,
+      "grad_norm": 0.5425736459453947,
+      "learning_rate": 5e-06,
+      "loss": 0.774,
+      "step": 640
+    },
+    {
+      "epoch": 1.0668855149774312,
+      "grad_norm": 0.5336001597783918,
+      "learning_rate": 5e-06,
+      "loss": 0.7706,
+      "step": 650
+    },
+    {
+      "epoch": 1.0832991382847763,
+      "grad_norm": 0.5273934170840361,
+      "learning_rate": 5e-06,
+      "loss": 0.7812,
+      "step": 660
+    },
+    {
+      "epoch": 1.0997127615921214,
+      "grad_norm": 0.49999784470967323,
+      "learning_rate": 5e-06,
+      "loss": 0.7786,
+      "step": 670
+    },
+    {
+      "epoch": 1.1161263848994665,
+      "grad_norm": 0.5332221891356449,
+      "learning_rate": 5e-06,
+      "loss": 0.7749,
+      "step": 680
+    },
+    {
+      "epoch": 1.1325400082068118,
+      "grad_norm": 0.5733836629913084,
+      "learning_rate": 5e-06,
+      "loss": 0.7782,
+      "step": 690
+    },
+    {
+      "epoch": 1.1489536315141566,
+      "grad_norm": 0.49502621904597705,
+      "learning_rate": 5e-06,
+      "loss": 0.771,
+      "step": 700
+    },
+    {
+      "epoch": 1.165367254821502,
+      "grad_norm": 0.527378879948705,
+      "learning_rate": 5e-06,
+      "loss": 0.769,
+      "step": 710
+    },
+    {
+      "epoch": 1.181780878128847,
+      "grad_norm": 0.6092891531058294,
+      "learning_rate": 5e-06,
+      "loss": 0.7735,
+      "step": 720
+    },
+    {
+      "epoch": 1.198194501436192,
+      "grad_norm": 0.7409709215243082,
+      "learning_rate": 5e-06,
+      "loss": 0.7735,
+      "step": 730
+    },
+    {
+      "epoch": 1.2146081247435372,
+      "grad_norm": 0.5407098999014308,
+      "learning_rate": 5e-06,
+      "loss": 0.7735,
+      "step": 740
+    },
+    {
+      "epoch": 1.2310217480508823,
+      "grad_norm": 0.558991862523269,
+      "learning_rate": 5e-06,
+      "loss": 0.7773,
+      "step": 750
+    },
+    {
+      "epoch": 1.2474353713582274,
+      "grad_norm": 0.5542175525118204,
+      "learning_rate": 5e-06,
+      "loss": 0.7776,
+      "step": 760
+    },
+    {
+      "epoch": 1.2638489946655724,
+      "grad_norm": 0.5783902400184568,
+      "learning_rate": 5e-06,
+      "loss": 0.7746,
+      "step": 770
+    },
+    {
+      "epoch": 1.2802626179729175,
+      "grad_norm": 0.5521887369776995,
+      "learning_rate": 5e-06,
+      "loss": 0.7707,
+      "step": 780
+    },
+    {
+      "epoch": 1.2966762412802626,
+      "grad_norm": 0.8444645957997873,
+      "learning_rate": 5e-06,
+      "loss": 0.7725,
+      "step": 790
+    },
+    {
+      "epoch": 1.3130898645876077,
+      "grad_norm": 0.5736975526122204,
+      "learning_rate": 5e-06,
+      "loss": 0.7744,
+      "step": 800
+    },
+    {
+      "epoch": 1.3295034878949528,
+      "grad_norm": 0.48292249290479,
+      "learning_rate": 5e-06,
+      "loss": 0.7733,
+      "step": 810
+    },
+    {
+      "epoch": 1.3459171112022978,
+      "grad_norm": 0.5049987037399899,
+      "learning_rate": 5e-06,
+      "loss": 0.7714,
+      "step": 820
+    },
+    {
+      "epoch": 1.362330734509643,
+      "grad_norm": 0.5062380261284263,
+      "learning_rate": 5e-06,
+      "loss": 0.7715,
+      "step": 830
+    },
+    {
+      "epoch": 1.3787443578169882,
+      "grad_norm": 0.5278903296959995,
+      "learning_rate": 5e-06,
+      "loss": 0.7726,
+      "step": 840
+    },
+    {
+      "epoch": 1.395157981124333,
+      "grad_norm": 0.5450750708926599,
+      "learning_rate": 5e-06,
+      "loss": 0.7747,
+      "step": 850
+    },
+    {
+      "epoch": 1.4115716044316784,
+      "grad_norm": 0.6396261396718704,
+      "learning_rate": 5e-06,
+      "loss": 0.7744,
+      "step": 860
+    },
+    {
+      "epoch": 1.4279852277390233,
+      "grad_norm": 0.5482394718697263,
+      "learning_rate": 5e-06,
+      "loss": 0.7727,
+      "step": 870
+    },
+    {
+      "epoch": 1.4443988510463686,
+      "grad_norm": 0.46959189800427537,
+      "learning_rate": 5e-06,
+      "loss": 0.7744,
+      "step": 880
+    },
+    {
+      "epoch": 1.4608124743537136,
+      "grad_norm": 0.5691557302620813,
+      "learning_rate": 5e-06,
+      "loss": 0.77,
+      "step": 890
+    },
+    {
+      "epoch": 1.4772260976610587,
+      "grad_norm": 0.5438893971371171,
+      "learning_rate": 5e-06,
+      "loss": 0.7742,
+      "step": 900
+    },
+    {
+      "epoch": 1.4936397209684038,
+      "grad_norm": 0.515108075044442,
+      "learning_rate": 5e-06,
+      "loss": 0.7674,
+      "step": 910
+    },
+    {
+      "epoch": 1.5100533442757489,
+      "grad_norm": 0.535062525509266,
+      "learning_rate": 5e-06,
+      "loss": 0.7716,
+      "step": 920
+    },
+    {
+      "epoch": 1.526466967583094,
+      "grad_norm": 0.48864573799251787,
+      "learning_rate": 5e-06,
+      "loss": 0.7711,
+      "step": 930
+    },
+    {
+      "epoch": 1.542880590890439,
+      "grad_norm": 0.512031098496549,
+      "learning_rate": 5e-06,
+      "loss": 0.7724,
+      "step": 940
+    },
+    {
+      "epoch": 1.5592942141977841,
+      "grad_norm": 0.5349855948928464,
+      "learning_rate": 5e-06,
+      "loss": 0.7693,
+      "step": 950
+    },
+    {
+      "epoch": 1.5757078375051292,
+      "grad_norm": 0.662086219432035,
+      "learning_rate": 5e-06,
+      "loss": 0.7712,
+      "step": 960
+    },
+    {
+      "epoch": 1.5921214608124743,
+      "grad_norm": 0.5526444522821828,
+      "learning_rate": 5e-06,
+      "loss": 0.7709,
+      "step": 970
+    },
+    {
+      "epoch": 1.6085350841198194,
+      "grad_norm": 0.55082436990172,
+      "learning_rate": 5e-06,
+      "loss": 0.7772,
+      "step": 980
+    },
+    {
+      "epoch": 1.6249487074271647,
+      "grad_norm": 0.5815245751790243,
+      "learning_rate": 5e-06,
+      "loss": 0.7722,
+      "step": 990
+    },
+    {
+      "epoch": 1.6413623307345095,
+      "grad_norm": 0.5154000934765319,
+      "learning_rate": 5e-06,
+      "loss": 0.7694,
+      "step": 1000
+    },
+    {
+      "epoch": 1.6577759540418548,
+      "grad_norm": 0.6023290973410581,
+      "learning_rate": 5e-06,
+      "loss": 0.7673,
+      "step": 1010
+    },
+    {
+      "epoch": 1.6741895773491997,
+      "grad_norm": 0.600577558938854,
+      "learning_rate": 5e-06,
+      "loss": 0.7713,
+      "step": 1020
+    },
+    {
+      "epoch": 1.690603200656545,
+      "grad_norm": 0.6079065129634816,
+      "learning_rate": 5e-06,
+      "loss": 0.7687,
+      "step": 1030
+    },
+    {
+      "epoch": 1.7070168239638899,
+      "grad_norm": 0.693841170522645,
+      "learning_rate": 5e-06,
+      "loss": 0.7697,
+      "step": 1040
+    },
+    {
+      "epoch": 1.7234304472712352,
+      "grad_norm": 0.48888900591836165,
+      "learning_rate": 5e-06,
+      "loss": 0.7691,
+      "step": 1050
+    },
+    {
+      "epoch": 1.7398440705785803,
+      "grad_norm": 0.7010501927100856,
+      "learning_rate": 5e-06,
+      "loss": 0.7741,
+      "step": 1060
+    },
+    {
+      "epoch": 1.7562576938859253,
+      "grad_norm": 0.4911843241601219,
+      "learning_rate": 5e-06,
+      "loss": 0.7661,
+      "step": 1070
+    },
+    {
+      "epoch": 1.7726713171932704,
+      "grad_norm": 0.5931976251774361,
+      "learning_rate": 5e-06,
+      "loss": 0.7689,
+      "step": 1080
+    },
+    {
+      "epoch": 1.7890849405006155,
+      "grad_norm": 0.6443181893958313,
+      "learning_rate": 5e-06,
+      "loss": 0.7705,
+      "step": 1090
+    },
+    {
+      "epoch": 1.8054985638079606,
+      "grad_norm": 0.470844146128102,
+      "learning_rate": 5e-06,
+      "loss": 0.7726,
+      "step": 1100
+    },
+    {
+      "epoch": 1.8219121871153057,
+      "grad_norm": 0.6664878595725717,
+      "learning_rate": 5e-06,
+      "loss": 0.7698,
+      "step": 1110
+    },
+    {
+      "epoch": 1.8383258104226508,
+      "grad_norm": 0.507697225304388,
+      "learning_rate": 5e-06,
+      "loss": 0.767,
+      "step": 1120
+    },
+    {
+      "epoch": 1.8547394337299958,
+      "grad_norm": 0.5439425837704747,
+      "learning_rate": 5e-06,
+      "loss": 0.7686,
+      "step": 1130
+    },
+    {
+      "epoch": 1.8711530570373411,
+      "grad_norm": 0.8621901744125671,
+      "learning_rate": 5e-06,
+      "loss": 0.7717,
+      "step": 1140
+    },
+    {
+      "epoch": 1.887566680344686,
+      "grad_norm": 0.5511696877991991,
+      "learning_rate": 5e-06,
+      "loss": 0.7715,
+      "step": 1150
+    },
+    {
+      "epoch": 1.9039803036520313,
+      "grad_norm": 0.4950879715118595,
+      "learning_rate": 5e-06,
+      "loss": 0.7719,
+      "step": 1160
+    },
+    {
+      "epoch": 1.9203939269593762,
+      "grad_norm": 0.48824900399092436,
+      "learning_rate": 5e-06,
+      "loss": 0.7715,
+      "step": 1170
+    },
+    {
+      "epoch": 1.9368075502667215,
+      "grad_norm": 0.5450863319774812,
+      "learning_rate": 5e-06,
+      "loss": 0.7653,
+      "step": 1180
+    },
+    {
+      "epoch": 1.9532211735740663,
+      "grad_norm": 0.5501310306692955,
+      "learning_rate": 5e-06,
+      "loss": 0.7678,
+      "step": 1190
+    },
+    {
+      "epoch": 1.9696347968814116,
+      "grad_norm": 0.5175687374091981,
+      "learning_rate": 5e-06,
+      "loss": 0.7687,
+      "step": 1200
+    },
+    {
+      "epoch": 1.9860484201887567,
+      "grad_norm": 0.5729581338876775,
+      "learning_rate": 5e-06,
+      "loss": 0.7667,
+      "step": 1210
+    },
+    {
+      "epoch": 1.9991793188346327,
+      "eval_loss": 0.8003594875335693,
+      "eval_runtime": 649.053,
+      "eval_samples_per_second": 25.294,
+      "eval_steps_per_second": 0.396,
+      "step": 1218
+    },
+    {
+      "epoch": 2.002462043496102,
+      "grad_norm": 0.6463781219114217,
+      "learning_rate": 5e-06,
+      "loss": 0.8157,
+      "step": 1220
+    },
+    {
+      "epoch": 2.0188756668034467,
+      "grad_norm": 0.5511864743662498,
+      "learning_rate": 5e-06,
+      "loss": 0.7226,
+      "step": 1230
+    },
+    {
+      "epoch": 2.035289290110792,
+      "grad_norm": 0.5903816264405134,
+      "learning_rate": 5e-06,
+      "loss": 0.723,
+      "step": 1240
+    },
+    {
+      "epoch": 2.0517029134181373,
+      "grad_norm": 0.6048277468103348,
+      "learning_rate": 5e-06,
+      "loss": 0.7279,
+      "step": 1250
+    },
+    {
+      "epoch": 2.068116536725482,
+      "grad_norm": 0.6180095343734895,
+      "learning_rate": 5e-06,
+      "loss": 0.7231,
+      "step": 1260
+    },
+    {
+      "epoch": 2.0845301600328274,
+      "grad_norm": 0.5510967685128281,
+      "learning_rate": 5e-06,
+      "loss": 0.7251,
+      "step": 1270
+    },
+    {
+      "epoch": 2.1009437833401723,
+      "grad_norm": 0.5188988483266073,
+      "learning_rate": 5e-06,
+      "loss": 0.721,
+      "step": 1280
+    },
+    {
+      "epoch": 2.1173574066475176,
+      "grad_norm": 0.5609918054348771,
+      "learning_rate": 5e-06,
+      "loss": 0.7247,
+      "step": 1290
+    },
+    {
+      "epoch": 2.1337710299548625,
+      "grad_norm": 0.5166068490722786,
+      "learning_rate": 5e-06,
+      "loss": 0.7239,
+      "step": 1300
+    },
+    {
+      "epoch": 2.1501846532622078,
+      "grad_norm": 0.5214337691948149,
+      "learning_rate": 5e-06,
+      "loss": 0.7265,
+      "step": 1310
+    },
+    {
+      "epoch": 2.1665982765695526,
+      "grad_norm": 0.5346119768256951,
+      "learning_rate": 5e-06,
+      "loss": 0.7271,
+      "step": 1320
+    },
+    {
+      "epoch": 2.183011899876898,
+      "grad_norm": 0.5300060629839389,
+      "learning_rate": 5e-06,
+      "loss": 0.728,
+      "step": 1330
+    },
+    {
+      "epoch": 2.199425523184243,
+      "grad_norm": 0.6130748309683199,
+      "learning_rate": 5e-06,
+      "loss": 0.7304,
+      "step": 1340
+    },
+    {
+      "epoch": 2.215839146491588,
+      "grad_norm": 0.5131757232254592,
+      "learning_rate": 5e-06,
+      "loss": 0.7289,
+      "step": 1350
+    },
+    {
+      "epoch": 2.232252769798933,
+      "grad_norm": 0.5452450696684072,
+      "learning_rate": 5e-06,
+      "loss": 0.7279,
+      "step": 1360
+    },
+    {
+      "epoch": 2.2486663931062782,
+      "grad_norm": 0.5644129980449457,
+      "learning_rate": 5e-06,
+      "loss": 0.7294,
+      "step": 1370
+    },
+    {
+      "epoch": 2.2650800164136236,
+      "grad_norm": 0.5723600015296851,
+      "learning_rate": 5e-06,
+      "loss": 0.7282,
+      "step": 1380
+    },
+    {
+      "epoch": 2.2814936397209684,
+      "grad_norm": 0.5745168828163788,
+      "learning_rate": 5e-06,
+      "loss": 0.7241,
+      "step": 1390
+    },
+    {
+      "epoch": 2.2979072630283133,
+      "grad_norm": 0.6316489371250509,
+      "learning_rate": 5e-06,
+      "loss": 0.7318,
+      "step": 1400
+    },
+    {
+      "epoch": 2.3143208863356586,
+      "grad_norm": 0.5884122209110337,
+      "learning_rate": 5e-06,
+      "loss": 0.726,
+      "step": 1410
+    },
+    {
+      "epoch": 2.330734509643004,
+      "grad_norm": 0.5694563289563384,
+      "learning_rate": 5e-06,
+      "loss": 0.7303,
+      "step": 1420
+    },
+    {
+      "epoch": 2.3471481329503487,
+      "grad_norm": 0.5726932495181197,
+      "learning_rate": 5e-06,
+      "loss": 0.7288,
+      "step": 1430
+    },
+    {
+      "epoch": 2.363561756257694,
+      "grad_norm": 0.5879297609823882,
+      "learning_rate": 5e-06,
+      "loss": 0.7296,
+      "step": 1440
+    },
+    {
+      "epoch": 2.379975379565039,
+      "grad_norm": 0.5883121862822648,
+      "learning_rate": 5e-06,
+      "loss": 0.733,
+      "step": 1450
+    },
+    {
+      "epoch": 2.396389002872384,
+      "grad_norm": 0.5536017215166609,
+      "learning_rate": 5e-06,
+      "loss": 0.7273,
+      "step": 1460
+    },
+    {
+      "epoch": 2.412802626179729,
+      "grad_norm": 0.5153804582312451,
+      "learning_rate": 5e-06,
+      "loss": 0.7315,
+      "step": 1470
+    },
+    {
+      "epoch": 2.4292162494870744,
+      "grad_norm": 0.5281531288573597,
+      "learning_rate": 5e-06,
+      "loss": 0.7276,
+      "step": 1480
+    },
+    {
+      "epoch": 2.4456298727944192,
+      "grad_norm": 0.5456727623872021,
+      "learning_rate": 5e-06,
+      "loss": 0.7303,
+      "step": 1490
+    },
+    {
+      "epoch": 2.4620434961017645,
+      "grad_norm": 0.6682595967854461,
+      "learning_rate": 5e-06,
+      "loss": 0.7268,
+      "step": 1500
+    },
+    {
+      "epoch": 2.4784571194091094,
+      "grad_norm": 0.5277543058901882,
+      "learning_rate": 5e-06,
+      "loss": 0.7308,
+      "step": 1510
+    },
+    {
+      "epoch": 2.4948707427164547,
+      "grad_norm": 0.5377911403023388,
+      "learning_rate": 5e-06,
+      "loss": 0.7307,
+      "step": 1520
+    },
+    {
+      "epoch": 2.5112843660237996,
+      "grad_norm": 0.5405851958754275,
+      "learning_rate": 5e-06,
+      "loss": 0.7276,
+      "step": 1530
+    },
+    {
+      "epoch": 2.527697989331145,
+      "grad_norm": 0.5939416097504914,
+      "learning_rate": 5e-06,
+      "loss": 0.7276,
+      "step": 1540
+    },
+    {
+      "epoch": 2.54411161263849,
+      "grad_norm": 0.4915672499433413,
+      "learning_rate": 5e-06,
+      "loss": 0.7302,
+      "step": 1550
+    },
+    {
+      "epoch": 2.560525235945835,
+      "grad_norm": 0.5714622711240627,
+      "learning_rate": 5e-06,
+      "loss": 0.7304,
+      "step": 1560
+    },
+    {
+      "epoch": 2.57693885925318,
+      "grad_norm": 0.5369778418982702,
+      "learning_rate": 5e-06,
+      "loss": 0.7288,
+      "step": 1570
+    },
+    {
+      "epoch": 2.593352482560525,
+      "grad_norm": 0.6119186113930412,
+      "learning_rate": 5e-06,
+      "loss": 0.7292,
+      "step": 1580
+    },
+    {
+      "epoch": 2.6097661058678705,
+      "grad_norm": 0.6419697927822549,
+      "learning_rate": 5e-06,
+      "loss": 0.7329,
+      "step": 1590
+    },
+    {
+      "epoch": 2.6261797291752154,
+      "grad_norm": 0.519894007258766,
+      "learning_rate": 5e-06,
+      "loss": 0.7289,
+      "step": 1600
+    },
+    {
+      "epoch": 2.6425933524825607,
+      "grad_norm": 0.5472253996087528,
+      "learning_rate": 5e-06,
+      "loss": 0.7331,
+      "step": 1610
+    },
+    {
+      "epoch": 2.6590069757899055,
+      "grad_norm": 0.516270579267269,
+      "learning_rate": 5e-06,
+      "loss": 0.73,
+      "step": 1620
+    },
+    {
+      "epoch": 2.675420599097251,
+      "grad_norm": 0.6435228852860398,
+      "learning_rate": 5e-06,
+      "loss": 0.7302,
+      "step": 1630
+    },
+    {
+      "epoch": 2.6918342224045957,
+      "grad_norm": 0.5501601501281529,
+      "learning_rate": 5e-06,
+      "loss": 0.7326,
+      "step": 1640
+    },
+    {
+      "epoch": 2.708247845711941,
+      "grad_norm": 0.5949109159278116,
+      "learning_rate": 5e-06,
+      "loss": 0.7316,
+      "step": 1650
+    },
+    {
+      "epoch": 2.724661469019286,
+      "grad_norm": 0.6480843118367907,
+      "learning_rate": 5e-06,
+      "loss": 0.7309,
+      "step": 1660
+    },
+    {
+      "epoch": 2.741075092326631,
+      "grad_norm": 0.6058791921384546,
+      "learning_rate": 5e-06,
+      "loss": 0.7348,
+      "step": 1670
+    },
+    {
+      "epoch": 2.7574887156339765,
+      "grad_norm": 0.5131902537364029,
+      "learning_rate": 5e-06,
+      "loss": 0.7286,
+      "step": 1680
+    },
+    {
+      "epoch": 2.7739023389413213,
+      "grad_norm": 0.6053781021635353,
+      "learning_rate": 5e-06,
+      "loss": 0.7342,
+      "step": 1690
+    },
+    {
+      "epoch": 2.790315962248666,
+      "grad_norm": 0.6434931495576977,
+      "learning_rate": 5e-06,
+      "loss": 0.7303,
+      "step": 1700
+    },
+    {
+      "epoch": 2.8067295855560115,
+      "grad_norm": 0.521875321768183,
+      "learning_rate": 5e-06,
+      "loss": 0.7292,
+      "step": 1710
+    },
+    {
+      "epoch": 2.823143208863357,
+      "grad_norm": 0.5213429202976669,
+      "learning_rate": 5e-06,
+      "loss": 0.7255,
+      "step": 1720
+    },
+    {
+      "epoch": 2.8395568321707017,
+      "grad_norm": 0.5054795512549872,
+      "learning_rate": 5e-06,
+      "loss": 0.7305,
+      "step": 1730
+    },
+    {
+      "epoch": 2.8559704554780465,
+      "grad_norm": 0.5445431336579684,
+      "learning_rate": 5e-06,
+      "loss": 0.7297,
+      "step": 1740
+    },
+    {
+      "epoch": 2.872384078785392,
+      "grad_norm": 0.4823512740461331,
+      "learning_rate": 5e-06,
+      "loss": 0.7274,
+      "step": 1750
+    },
+    {
+      "epoch": 2.888797702092737,
+      "grad_norm": 0.7211033918425558,
+      "learning_rate": 5e-06,
+      "loss": 0.7321,
+      "step": 1760
+    },
+    {
+      "epoch": 2.905211325400082,
+      "grad_norm": 0.4979306966545238,
+      "learning_rate": 5e-06,
+      "loss": 0.7315,
+      "step": 1770
+    },
+    {
+      "epoch": 2.9216249487074273,
+      "grad_norm": 0.5227086332578315,
+      "learning_rate": 5e-06,
+      "loss": 0.7266,
+      "step": 1780
+    },
+    {
+      "epoch": 2.938038572014772,
+      "grad_norm": 0.48731020276155895,
+      "learning_rate": 5e-06,
+      "loss": 0.7326,
+      "step": 1790
+    },
+    {
+      "epoch": 2.9544521953221174,
+      "grad_norm": 0.5441906751482174,
+      "learning_rate": 5e-06,
+      "loss": 0.7299,
+      "step": 1800
+    },
+    {
+      "epoch": 2.9708658186294623,
+      "grad_norm": 0.4959846025308401,
+      "learning_rate": 5e-06,
+      "loss": 0.7363,
+      "step": 1810
+    },
+    {
+      "epoch": 2.9872794419368076,
+      "grad_norm": 0.5582701492514177,
+      "learning_rate": 5e-06,
+      "loss": 0.7286,
+      "step": 1820
+    },
+    {
+      "epoch": 2.998768978251949,
+      "eval_loss": 0.7988426685333252,
+      "eval_runtime": 652.5981,
+      "eval_samples_per_second": 25.156,
+      "eval_steps_per_second": 0.394,
+      "step": 1827
+    },
+    {
+      "epoch": 2.998768978251949,
+      "step": 1827,
+      "total_flos": 3060083667763200.0,
+      "train_loss": 0.7828072778631836,
+      "train_runtime": 108782.0419,
+      "train_samples_per_second": 8.602,
+      "train_steps_per_second": 0.017
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 1827,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3060083667763200.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

training_eval_loss.png ADDED Viewed

training_loss.png ADDED Viewed