End of training

Browse files

Files changed (7) hide show

README.md +2 -1
all_results.json +12 -0
eval_results.json +7 -0
train_results.json +8 -0
trainer_state.json +1501 -0
training_eval_loss.png +0 -0
training_loss.png +0 -0

README.md CHANGED Viewed

@@ -4,6 +4,7 @@ license: llama3.1
 base_model: meta-llama/Meta-Llama-3.1-8B
 tags:
 - llama-factory
 - generated_from_trainer
 model-index:
 - name: top_19_ranking_stackexchange
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
 # top_19_ranking_stackexchange
-This model is a fine-tuned version of [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) on an unknown dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.8026

 base_model: meta-llama/Meta-Llama-3.1-8B
 tags:
 - llama-factory
+- full
 - generated_from_trainer
 model-index:
 - name: top_19_ranking_stackexchange
 # top_19_ranking_stackexchange
+This model is a fine-tuned version of [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) on the mlfoundations-dev/top_19_ranking_stackexchange dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.8026

all_results.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+    "epoch": 2.9961671837926627,
+    "eval_loss": 0.8026307821273804,
+    "eval_runtime": 735.7112,
+    "eval_samples_per_second": 25.085,
+    "eval_steps_per_second": 0.393,
+    "total_flos": 3436967047987200.0,
+    "train_loss": 0.7862561077286161,
+    "train_runtime": 121149.8135,
+    "train_samples_per_second": 8.683,
+    "train_steps_per_second": 0.017
+}

eval_results.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+    "epoch": 2.9961671837926627,
+    "eval_loss": 0.8026307821273804,
+    "eval_runtime": 735.7112,
+    "eval_samples_per_second": 25.085,
+    "eval_steps_per_second": 0.393
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 2.9961671837926627,
+    "total_flos": 3436967047987200.0,
+    "train_loss": 0.7862561077286161,
+    "train_runtime": 121149.8135,
+    "train_samples_per_second": 8.683,
+    "train_steps_per_second": 0.017
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1501 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.9961671837926627,
+  "eval_steps": 500,
+  "global_step": 2052,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.014601204599379448,
+      "grad_norm": 31.200588193935076,
+      "learning_rate": 5e-06,
+      "loss": 1.0665,
+      "step": 10
+    },
+    {
+      "epoch": 0.029202409198758897,
+      "grad_norm": 2.1030174817289726,
+      "learning_rate": 5e-06,
+      "loss": 0.9851,
+      "step": 20
+    },
+    {
+      "epoch": 0.043803613798138345,
+      "grad_norm": 1.7889745137305164,
+      "learning_rate": 5e-06,
+      "loss": 0.9413,
+      "step": 30
+    },
+    {
+      "epoch": 0.058404818397517794,
+      "grad_norm": 0.7798640642359425,
+      "learning_rate": 5e-06,
+      "loss": 0.9234,
+      "step": 40
+    },
+    {
+      "epoch": 0.07300602299689725,
+      "grad_norm": 2.7003950764835163,
+      "learning_rate": 5e-06,
+      "loss": 0.9077,
+      "step": 50
+    },
+    {
+      "epoch": 0.08760722759627669,
+      "grad_norm": 6.611494407685939,
+      "learning_rate": 5e-06,
+      "loss": 0.9045,
+      "step": 60
+    },
+    {
+      "epoch": 0.10220843219565615,
+      "grad_norm": 1.081246334652253,
+      "learning_rate": 5e-06,
+      "loss": 0.8965,
+      "step": 70
+    },
+    {
+      "epoch": 0.11680963679503559,
+      "grad_norm": 0.8101716177472186,
+      "learning_rate": 5e-06,
+      "loss": 0.8855,
+      "step": 80
+    },
+    {
+      "epoch": 0.13141084139441503,
+      "grad_norm": 0.8801376249396373,
+      "learning_rate": 5e-06,
+      "loss": 0.8827,
+      "step": 90
+    },
+    {
+      "epoch": 0.1460120459937945,
+      "grad_norm": 0.6385442821205225,
+      "learning_rate": 5e-06,
+      "loss": 0.8714,
+      "step": 100
+    },
+    {
+      "epoch": 0.16061325059317394,
+      "grad_norm": 0.6872556086848158,
+      "learning_rate": 5e-06,
+      "loss": 0.8735,
+      "step": 110
+    },
+    {
+      "epoch": 0.17521445519255338,
+      "grad_norm": 0.6992499417358933,
+      "learning_rate": 5e-06,
+      "loss": 0.8682,
+      "step": 120
+    },
+    {
+      "epoch": 0.18981565979193282,
+      "grad_norm": 0.6013577080845172,
+      "learning_rate": 5e-06,
+      "loss": 0.8662,
+      "step": 130
+    },
+    {
+      "epoch": 0.2044168643913123,
+      "grad_norm": 0.5988600270173734,
+      "learning_rate": 5e-06,
+      "loss": 0.8649,
+      "step": 140
+    },
+    {
+      "epoch": 0.21901806899069173,
+      "grad_norm": 0.6460225651419232,
+      "learning_rate": 5e-06,
+      "loss": 0.8647,
+      "step": 150
+    },
+    {
+      "epoch": 0.23361927359007117,
+      "grad_norm": 0.5603551860244527,
+      "learning_rate": 5e-06,
+      "loss": 0.8533,
+      "step": 160
+    },
+    {
+      "epoch": 0.24822047818945062,
+      "grad_norm": 0.5553710866489995,
+      "learning_rate": 5e-06,
+      "loss": 0.8544,
+      "step": 170
+    },
+    {
+      "epoch": 0.26282168278883006,
+      "grad_norm": 0.6554387689230537,
+      "learning_rate": 5e-06,
+      "loss": 0.8592,
+      "step": 180
+    },
+    {
+      "epoch": 0.2774228873882095,
+      "grad_norm": 0.6786037864304745,
+      "learning_rate": 5e-06,
+      "loss": 0.8518,
+      "step": 190
+    },
+    {
+      "epoch": 0.292024091987589,
+      "grad_norm": 0.5561181889312125,
+      "learning_rate": 5e-06,
+      "loss": 0.8505,
+      "step": 200
+    },
+    {
+      "epoch": 0.3066252965869684,
+      "grad_norm": 0.7917259340608006,
+      "learning_rate": 5e-06,
+      "loss": 0.8475,
+      "step": 210
+    },
+    {
+      "epoch": 0.3212265011863479,
+      "grad_norm": 0.5838972916992158,
+      "learning_rate": 5e-06,
+      "loss": 0.8468,
+      "step": 220
+    },
+    {
+      "epoch": 0.33582770578572735,
+      "grad_norm": 0.5415756077794452,
+      "learning_rate": 5e-06,
+      "loss": 0.8478,
+      "step": 230
+    },
+    {
+      "epoch": 0.35042891038510676,
+      "grad_norm": 0.561460816685303,
+      "learning_rate": 5e-06,
+      "loss": 0.8471,
+      "step": 240
+    },
+    {
+      "epoch": 0.36503011498448623,
+      "grad_norm": 0.5431016015146285,
+      "learning_rate": 5e-06,
+      "loss": 0.845,
+      "step": 250
+    },
+    {
+      "epoch": 0.37963131958386565,
+      "grad_norm": 0.5777928639036234,
+      "learning_rate": 5e-06,
+      "loss": 0.8427,
+      "step": 260
+    },
+    {
+      "epoch": 0.3942325241832451,
+      "grad_norm": 0.6855206088461627,
+      "learning_rate": 5e-06,
+      "loss": 0.8406,
+      "step": 270
+    },
+    {
+      "epoch": 0.4088337287826246,
+      "grad_norm": 0.5959537867792327,
+      "learning_rate": 5e-06,
+      "loss": 0.843,
+      "step": 280
+    },
+    {
+      "epoch": 0.423434933382004,
+      "grad_norm": 0.5787987185587301,
+      "learning_rate": 5e-06,
+      "loss": 0.842,
+      "step": 290
+    },
+    {
+      "epoch": 0.43803613798138347,
+      "grad_norm": 0.9829016985861171,
+      "learning_rate": 5e-06,
+      "loss": 0.841,
+      "step": 300
+    },
+    {
+      "epoch": 0.45263734258076294,
+      "grad_norm": 0.7109664833342627,
+      "learning_rate": 5e-06,
+      "loss": 0.8376,
+      "step": 310
+    },
+    {
+      "epoch": 0.46723854718014235,
+      "grad_norm": 0.5953929615896101,
+      "learning_rate": 5e-06,
+      "loss": 0.8352,
+      "step": 320
+    },
+    {
+      "epoch": 0.4818397517795218,
+      "grad_norm": 0.6459745420821242,
+      "learning_rate": 5e-06,
+      "loss": 0.8322,
+      "step": 330
+    },
+    {
+      "epoch": 0.49644095637890123,
+      "grad_norm": 0.7286780710714444,
+      "learning_rate": 5e-06,
+      "loss": 0.833,
+      "step": 340
+    },
+    {
+      "epoch": 0.5110421609782807,
+      "grad_norm": 0.8624769767543123,
+      "learning_rate": 5e-06,
+      "loss": 0.8326,
+      "step": 350
+    },
+    {
+      "epoch": 0.5256433655776601,
+      "grad_norm": 0.7489286697832975,
+      "learning_rate": 5e-06,
+      "loss": 0.8344,
+      "step": 360
+    },
+    {
+      "epoch": 0.5402445701770396,
+      "grad_norm": 0.6965027768353624,
+      "learning_rate": 5e-06,
+      "loss": 0.8329,
+      "step": 370
+    },
+    {
+      "epoch": 0.554845774776419,
+      "grad_norm": 0.5898605613508874,
+      "learning_rate": 5e-06,
+      "loss": 0.8396,
+      "step": 380
+    },
+    {
+      "epoch": 0.5694469793757985,
+      "grad_norm": 0.669429389652064,
+      "learning_rate": 5e-06,
+      "loss": 0.8318,
+      "step": 390
+    },
+    {
+      "epoch": 0.584048183975178,
+      "grad_norm": 0.6580798841963941,
+      "learning_rate": 5e-06,
+      "loss": 0.8314,
+      "step": 400
+    },
+    {
+      "epoch": 0.5986493885745574,
+      "grad_norm": 0.624128356604639,
+      "learning_rate": 5e-06,
+      "loss": 0.8282,
+      "step": 410
+    },
+    {
+      "epoch": 0.6132505931739368,
+      "grad_norm": 0.5755522646670556,
+      "learning_rate": 5e-06,
+      "loss": 0.8321,
+      "step": 420
+    },
+    {
+      "epoch": 0.6278517977733163,
+      "grad_norm": 0.8196980265974857,
+      "learning_rate": 5e-06,
+      "loss": 0.8313,
+      "step": 430
+    },
+    {
+      "epoch": 0.6424530023726958,
+      "grad_norm": 0.48261475886925087,
+      "learning_rate": 5e-06,
+      "loss": 0.8238,
+      "step": 440
+    },
+    {
+      "epoch": 0.6570542069720752,
+      "grad_norm": 0.5343014097762563,
+      "learning_rate": 5e-06,
+      "loss": 0.8296,
+      "step": 450
+    },
+    {
+      "epoch": 0.6716554115714547,
+      "grad_norm": 0.8585815714707374,
+      "learning_rate": 5e-06,
+      "loss": 0.823,
+      "step": 460
+    },
+    {
+      "epoch": 0.6862566161708341,
+      "grad_norm": 0.7315620836524508,
+      "learning_rate": 5e-06,
+      "loss": 0.8331,
+      "step": 470
+    },
+    {
+      "epoch": 0.7008578207702135,
+      "grad_norm": 0.4711661790189355,
+      "learning_rate": 5e-06,
+      "loss": 0.8245,
+      "step": 480
+    },
+    {
+      "epoch": 0.7154590253695929,
+      "grad_norm": 0.546263482109446,
+      "learning_rate": 5e-06,
+      "loss": 0.8212,
+      "step": 490
+    },
+    {
+      "epoch": 0.7300602299689725,
+      "grad_norm": 0.5757304431326317,
+      "learning_rate": 5e-06,
+      "loss": 0.8252,
+      "step": 500
+    },
+    {
+      "epoch": 0.7446614345683519,
+      "grad_norm": 0.5563752904399338,
+      "learning_rate": 5e-06,
+      "loss": 0.8251,
+      "step": 510
+    },
+    {
+      "epoch": 0.7592626391677313,
+      "grad_norm": 0.48890029763799747,
+      "learning_rate": 5e-06,
+      "loss": 0.8244,
+      "step": 520
+    },
+    {
+      "epoch": 0.7738638437671108,
+      "grad_norm": 0.6121148728397559,
+      "learning_rate": 5e-06,
+      "loss": 0.8219,
+      "step": 530
+    },
+    {
+      "epoch": 0.7884650483664902,
+      "grad_norm": 0.651565586948898,
+      "learning_rate": 5e-06,
+      "loss": 0.8203,
+      "step": 540
+    },
+    {
+      "epoch": 0.8030662529658696,
+      "grad_norm": 0.5365587518038645,
+      "learning_rate": 5e-06,
+      "loss": 0.8244,
+      "step": 550
+    },
+    {
+      "epoch": 0.8176674575652492,
+      "grad_norm": 0.5585874614674294,
+      "learning_rate": 5e-06,
+      "loss": 0.8261,
+      "step": 560
+    },
+    {
+      "epoch": 0.8322686621646286,
+      "grad_norm": 0.48225482309598716,
+      "learning_rate": 5e-06,
+      "loss": 0.828,
+      "step": 570
+    },
+    {
+      "epoch": 0.846869866764008,
+      "grad_norm": 0.6379018399000604,
+      "learning_rate": 5e-06,
+      "loss": 0.8187,
+      "step": 580
+    },
+    {
+      "epoch": 0.8614710713633875,
+      "grad_norm": 0.8248757003628987,
+      "learning_rate": 5e-06,
+      "loss": 0.8245,
+      "step": 590
+    },
+    {
+      "epoch": 0.8760722759627669,
+      "grad_norm": 0.7072642911500023,
+      "learning_rate": 5e-06,
+      "loss": 0.8199,
+      "step": 600
+    },
+    {
+      "epoch": 0.8906734805621463,
+      "grad_norm": 0.6066965111128374,
+      "learning_rate": 5e-06,
+      "loss": 0.821,
+      "step": 610
+    },
+    {
+      "epoch": 0.9052746851615259,
+      "grad_norm": 0.49608072224953953,
+      "learning_rate": 5e-06,
+      "loss": 0.8263,
+      "step": 620
+    },
+    {
+      "epoch": 0.9198758897609053,
+      "grad_norm": 0.6053461220096085,
+      "learning_rate": 5e-06,
+      "loss": 0.8225,
+      "step": 630
+    },
+    {
+      "epoch": 0.9344770943602847,
+      "grad_norm": 0.5575666035835788,
+      "learning_rate": 5e-06,
+      "loss": 0.8211,
+      "step": 640
+    },
+    {
+      "epoch": 0.9490782989596642,
+      "grad_norm": 0.5170427420902555,
+      "learning_rate": 5e-06,
+      "loss": 0.8202,
+      "step": 650
+    },
+    {
+      "epoch": 0.9636795035590436,
+      "grad_norm": 0.5652214016440857,
+      "learning_rate": 5e-06,
+      "loss": 0.8219,
+      "step": 660
+    },
+    {
+      "epoch": 0.978280708158423,
+      "grad_norm": 0.5065476265832586,
+      "learning_rate": 5e-06,
+      "loss": 0.8121,
+      "step": 670
+    },
+    {
+      "epoch": 0.9928819127578025,
+      "grad_norm": 0.5713479763199619,
+      "learning_rate": 5e-06,
+      "loss": 0.8154,
+      "step": 680
+    },
+    {
+      "epoch": 0.9987223945975543,
+      "eval_loss": 0.8185040950775146,
+      "eval_runtime": 729.2812,
+      "eval_samples_per_second": 25.306,
+      "eval_steps_per_second": 0.396,
+      "step": 684
+    },
+    {
+      "epoch": 1.0074831173571819,
+      "grad_norm": 0.5659100587324225,
+      "learning_rate": 5e-06,
+      "loss": 0.8068,
+      "step": 690
+    },
+    {
+      "epoch": 1.0220843219565614,
+      "grad_norm": 0.5725390345160268,
+      "learning_rate": 5e-06,
+      "loss": 0.7796,
+      "step": 700
+    },
+    {
+      "epoch": 1.036685526555941,
+      "grad_norm": 0.5067331567131128,
+      "learning_rate": 5e-06,
+      "loss": 0.7784,
+      "step": 710
+    },
+    {
+      "epoch": 1.0512867311553202,
+      "grad_norm": 0.5633875492368658,
+      "learning_rate": 5e-06,
+      "loss": 0.7789,
+      "step": 720
+    },
+    {
+      "epoch": 1.0658879357546998,
+      "grad_norm": 0.6503391798526155,
+      "learning_rate": 5e-06,
+      "loss": 0.7785,
+      "step": 730
+    },
+    {
+      "epoch": 1.0804891403540793,
+      "grad_norm": 0.6157238494098765,
+      "learning_rate": 5e-06,
+      "loss": 0.7788,
+      "step": 740
+    },
+    {
+      "epoch": 1.0950903449534586,
+      "grad_norm": 0.6692246242398756,
+      "learning_rate": 5e-06,
+      "loss": 0.7791,
+      "step": 750
+    },
+    {
+      "epoch": 1.109691549552838,
+      "grad_norm": 0.772994893504376,
+      "learning_rate": 5e-06,
+      "loss": 0.7772,
+      "step": 760
+    },
+    {
+      "epoch": 1.1242927541522176,
+      "grad_norm": 0.5654007018077802,
+      "learning_rate": 5e-06,
+      "loss": 0.778,
+      "step": 770
+    },
+    {
+      "epoch": 1.138893958751597,
+      "grad_norm": 0.5871850769943243,
+      "learning_rate": 5e-06,
+      "loss": 0.7797,
+      "step": 780
+    },
+    {
+      "epoch": 1.1534951633509765,
+      "grad_norm": 0.6081431556291285,
+      "learning_rate": 5e-06,
+      "loss": 0.7776,
+      "step": 790
+    },
+    {
+      "epoch": 1.168096367950356,
+      "grad_norm": 0.5943447291419969,
+      "learning_rate": 5e-06,
+      "loss": 0.7812,
+      "step": 800
+    },
+    {
+      "epoch": 1.1826975725497353,
+      "grad_norm": 0.5174382592861106,
+      "learning_rate": 5e-06,
+      "loss": 0.7742,
+      "step": 810
+    },
+    {
+      "epoch": 1.1972987771491148,
+      "grad_norm": 0.5335467085784507,
+      "learning_rate": 5e-06,
+      "loss": 0.7821,
+      "step": 820
+    },
+    {
+      "epoch": 1.2118999817484943,
+      "grad_norm": 0.5424184832410203,
+      "learning_rate": 5e-06,
+      "loss": 0.7832,
+      "step": 830
+    },
+    {
+      "epoch": 1.2265011863478736,
+      "grad_norm": 0.5401853269685924,
+      "learning_rate": 5e-06,
+      "loss": 0.7764,
+      "step": 840
+    },
+    {
+      "epoch": 1.2411023909472532,
+      "grad_norm": 0.5532297607385643,
+      "learning_rate": 5e-06,
+      "loss": 0.776,
+      "step": 850
+    },
+    {
+      "epoch": 1.2557035955466325,
+      "grad_norm": 0.4600563956098031,
+      "learning_rate": 5e-06,
+      "loss": 0.7746,
+      "step": 860
+    },
+    {
+      "epoch": 1.270304800146012,
+      "grad_norm": 0.5135474289282321,
+      "learning_rate": 5e-06,
+      "loss": 0.7725,
+      "step": 870
+    },
+    {
+      "epoch": 1.2849060047453915,
+      "grad_norm": 0.6354802982105713,
+      "learning_rate": 5e-06,
+      "loss": 0.7787,
+      "step": 880
+    },
+    {
+      "epoch": 1.299507209344771,
+      "grad_norm": 0.5869839476501474,
+      "learning_rate": 5e-06,
+      "loss": 0.7712,
+      "step": 890
+    },
+    {
+      "epoch": 1.3141084139441503,
+      "grad_norm": 0.49495760536344496,
+      "learning_rate": 5e-06,
+      "loss": 0.777,
+      "step": 900
+    },
+    {
+      "epoch": 1.3287096185435299,
+      "grad_norm": 0.5322628773610525,
+      "learning_rate": 5e-06,
+      "loss": 0.7791,
+      "step": 910
+    },
+    {
+      "epoch": 1.3433108231429092,
+      "grad_norm": 0.6394355119269733,
+      "learning_rate": 5e-06,
+      "loss": 0.7813,
+      "step": 920
+    },
+    {
+      "epoch": 1.3579120277422887,
+      "grad_norm": 0.6150475948115007,
+      "learning_rate": 5e-06,
+      "loss": 0.7718,
+      "step": 930
+    },
+    {
+      "epoch": 1.3725132323416682,
+      "grad_norm": 0.6284466998832495,
+      "learning_rate": 5e-06,
+      "loss": 0.7716,
+      "step": 940
+    },
+    {
+      "epoch": 1.3871144369410477,
+      "grad_norm": 0.4995594773156744,
+      "learning_rate": 5e-06,
+      "loss": 0.7801,
+      "step": 950
+    },
+    {
+      "epoch": 1.401715641540427,
+      "grad_norm": 0.5533231758658743,
+      "learning_rate": 5e-06,
+      "loss": 0.7749,
+      "step": 960
+    },
+    {
+      "epoch": 1.4163168461398066,
+      "grad_norm": 0.5566318311264558,
+      "learning_rate": 5e-06,
+      "loss": 0.7809,
+      "step": 970
+    },
+    {
+      "epoch": 1.4309180507391859,
+      "grad_norm": 0.5996092713965696,
+      "learning_rate": 5e-06,
+      "loss": 0.7769,
+      "step": 980
+    },
+    {
+      "epoch": 1.4455192553385654,
+      "grad_norm": 0.4923370749506076,
+      "learning_rate": 5e-06,
+      "loss": 0.7733,
+      "step": 990
+    },
+    {
+      "epoch": 1.460120459937945,
+      "grad_norm": 0.5718051545730899,
+      "learning_rate": 5e-06,
+      "loss": 0.778,
+      "step": 1000
+    },
+    {
+      "epoch": 1.4747216645373245,
+      "grad_norm": 0.4966605100244046,
+      "learning_rate": 5e-06,
+      "loss": 0.7755,
+      "step": 1010
+    },
+    {
+      "epoch": 1.4893228691367038,
+      "grad_norm": 0.5104108866561695,
+      "learning_rate": 5e-06,
+      "loss": 0.7762,
+      "step": 1020
+    },
+    {
+      "epoch": 1.5039240737360833,
+      "grad_norm": 0.5790841364965528,
+      "learning_rate": 5e-06,
+      "loss": 0.775,
+      "step": 1030
+    },
+    {
+      "epoch": 1.5185252783354626,
+      "grad_norm": 0.5079205962955746,
+      "learning_rate": 5e-06,
+      "loss": 0.7791,
+      "step": 1040
+    },
+    {
+      "epoch": 1.533126482934842,
+      "grad_norm": 0.4897829483446737,
+      "learning_rate": 5e-06,
+      "loss": 0.7732,
+      "step": 1050
+    },
+    {
+      "epoch": 1.5477276875342216,
+      "grad_norm": 0.5375326427308407,
+      "learning_rate": 5e-06,
+      "loss": 0.7734,
+      "step": 1060
+    },
+    {
+      "epoch": 1.5623288921336012,
+      "grad_norm": 0.4714533263773857,
+      "learning_rate": 5e-06,
+      "loss": 0.7786,
+      "step": 1070
+    },
+    {
+      "epoch": 1.5769300967329805,
+      "grad_norm": 0.5170403858384673,
+      "learning_rate": 5e-06,
+      "loss": 0.772,
+      "step": 1080
+    },
+    {
+      "epoch": 1.5915313013323598,
+      "grad_norm": 0.5584745095875884,
+      "learning_rate": 5e-06,
+      "loss": 0.7788,
+      "step": 1090
+    },
+    {
+      "epoch": 1.6061325059317393,
+      "grad_norm": 0.5632792125524021,
+      "learning_rate": 5e-06,
+      "loss": 0.7764,
+      "step": 1100
+    },
+    {
+      "epoch": 1.6207337105311188,
+      "grad_norm": 0.5303585273369582,
+      "learning_rate": 5e-06,
+      "loss": 0.7698,
+      "step": 1110
+    },
+    {
+      "epoch": 1.6353349151304983,
+      "grad_norm": 0.5292556194617752,
+      "learning_rate": 5e-06,
+      "loss": 0.7754,
+      "step": 1120
+    },
+    {
+      "epoch": 1.6499361197298779,
+      "grad_norm": 0.5319736770394399,
+      "learning_rate": 5e-06,
+      "loss": 0.7754,
+      "step": 1130
+    },
+    {
+      "epoch": 1.6645373243292572,
+      "grad_norm": 0.5409862397072692,
+      "learning_rate": 5e-06,
+      "loss": 0.7732,
+      "step": 1140
+    },
+    {
+      "epoch": 1.6791385289286365,
+      "grad_norm": 0.5347398767131228,
+      "learning_rate": 5e-06,
+      "loss": 0.775,
+      "step": 1150
+    },
+    {
+      "epoch": 1.693739733528016,
+      "grad_norm": 0.5887598823053857,
+      "learning_rate": 5e-06,
+      "loss": 0.7734,
+      "step": 1160
+    },
+    {
+      "epoch": 1.7083409381273955,
+      "grad_norm": 0.588980481311897,
+      "learning_rate": 5e-06,
+      "loss": 0.7776,
+      "step": 1170
+    },
+    {
+      "epoch": 1.722942142726775,
+      "grad_norm": 0.5476017973657227,
+      "learning_rate": 5e-06,
+      "loss": 0.7718,
+      "step": 1180
+    },
+    {
+      "epoch": 1.7375433473261546,
+      "grad_norm": 0.5548638443373327,
+      "learning_rate": 5e-06,
+      "loss": 0.778,
+      "step": 1190
+    },
+    {
+      "epoch": 1.7521445519255339,
+      "grad_norm": 0.5443995408512653,
+      "learning_rate": 5e-06,
+      "loss": 0.7731,
+      "step": 1200
+    },
+    {
+      "epoch": 1.7667457565249132,
+      "grad_norm": 0.5134399032378028,
+      "learning_rate": 5e-06,
+      "loss": 0.7762,
+      "step": 1210
+    },
+    {
+      "epoch": 1.7813469611242927,
+      "grad_norm": 0.5143443520325698,
+      "learning_rate": 5e-06,
+      "loss": 0.7737,
+      "step": 1220
+    },
+    {
+      "epoch": 1.7959481657236722,
+      "grad_norm": 0.5712512301925389,
+      "learning_rate": 5e-06,
+      "loss": 0.7752,
+      "step": 1230
+    },
+    {
+      "epoch": 1.8105493703230517,
+      "grad_norm": 0.5022436155237929,
+      "learning_rate": 5e-06,
+      "loss": 0.7746,
+      "step": 1240
+    },
+    {
+      "epoch": 1.825150574922431,
+      "grad_norm": 0.5183607046169039,
+      "learning_rate": 5e-06,
+      "loss": 0.7758,
+      "step": 1250
+    },
+    {
+      "epoch": 1.8397517795218106,
+      "grad_norm": 0.5327048894936923,
+      "learning_rate": 5e-06,
+      "loss": 0.7737,
+      "step": 1260
+    },
+    {
+      "epoch": 1.8543529841211899,
+      "grad_norm": 0.4828373941208032,
+      "learning_rate": 5e-06,
+      "loss": 0.7734,
+      "step": 1270
+    },
+    {
+      "epoch": 1.8689541887205694,
+      "grad_norm": 0.4692396568766125,
+      "learning_rate": 5e-06,
+      "loss": 0.7702,
+      "step": 1280
+    },
+    {
+      "epoch": 1.883555393319949,
+      "grad_norm": 0.5272353821412613,
+      "learning_rate": 5e-06,
+      "loss": 0.7738,
+      "step": 1290
+    },
+    {
+      "epoch": 1.8981565979193284,
+      "grad_norm": 0.5974378803453756,
+      "learning_rate": 5e-06,
+      "loss": 0.7744,
+      "step": 1300
+    },
+    {
+      "epoch": 1.9127578025187078,
+      "grad_norm": 0.45897414900404526,
+      "learning_rate": 5e-06,
+      "loss": 0.7707,
+      "step": 1310
+    },
+    {
+      "epoch": 1.9273590071180873,
+      "grad_norm": 0.5661797189222842,
+      "learning_rate": 5e-06,
+      "loss": 0.7729,
+      "step": 1320
+    },
+    {
+      "epoch": 1.9419602117174666,
+      "grad_norm": 0.5291159788169262,
+      "learning_rate": 5e-06,
+      "loss": 0.7704,
+      "step": 1330
+    },
+    {
+      "epoch": 1.956561416316846,
+      "grad_norm": 0.5803039839795054,
+      "learning_rate": 5e-06,
+      "loss": 0.7711,
+      "step": 1340
+    },
+    {
+      "epoch": 1.9711626209162256,
+      "grad_norm": 0.5668430658536179,
+      "learning_rate": 5e-06,
+      "loss": 0.7714,
+      "step": 1350
+    },
+    {
+      "epoch": 1.9857638255156052,
+      "grad_norm": 0.5292288364377395,
+      "learning_rate": 5e-06,
+      "loss": 0.7681,
+      "step": 1360
+    },
+    {
+      "epoch": 1.9989049096550464,
+      "eval_loss": 0.8045554161071777,
+      "eval_runtime": 732.5889,
+      "eval_samples_per_second": 25.191,
+      "eval_steps_per_second": 0.394,
+      "step": 1369
+    },
+    {
+      "epoch": 2.0003650301149847,
+      "grad_norm": 0.5917011918049785,
+      "learning_rate": 5e-06,
+      "loss": 0.7813,
+      "step": 1370
+    },
+    {
+      "epoch": 2.0149662347143638,
+      "grad_norm": 0.6437888983866474,
+      "learning_rate": 5e-06,
+      "loss": 0.7275,
+      "step": 1380
+    },
+    {
+      "epoch": 2.0295674393137433,
+      "grad_norm": 0.5993159674827806,
+      "learning_rate": 5e-06,
+      "loss": 0.7301,
+      "step": 1390
+    },
+    {
+      "epoch": 2.044168643913123,
+      "grad_norm": 0.6004688564094799,
+      "learning_rate": 5e-06,
+      "loss": 0.7279,
+      "step": 1400
+    },
+    {
+      "epoch": 2.0587698485125023,
+      "grad_norm": 0.5968765010977406,
+      "learning_rate": 5e-06,
+      "loss": 0.732,
+      "step": 1410
+    },
+    {
+      "epoch": 2.073371053111882,
+      "grad_norm": 0.6368286520923802,
+      "learning_rate": 5e-06,
+      "loss": 0.7327,
+      "step": 1420
+    },
+    {
+      "epoch": 2.0879722577112614,
+      "grad_norm": 0.5121224799191383,
+      "learning_rate": 5e-06,
+      "loss": 0.7289,
+      "step": 1430
+    },
+    {
+      "epoch": 2.1025734623106405,
+      "grad_norm": 0.5426488835411897,
+      "learning_rate": 5e-06,
+      "loss": 0.729,
+      "step": 1440
+    },
+    {
+      "epoch": 2.11717466691002,
+      "grad_norm": 0.5360711433495,
+      "learning_rate": 5e-06,
+      "loss": 0.7321,
+      "step": 1450
+    },
+    {
+      "epoch": 2.1317758715093995,
+      "grad_norm": 0.570345866307846,
+      "learning_rate": 5e-06,
+      "loss": 0.7283,
+      "step": 1460
+    },
+    {
+      "epoch": 2.146377076108779,
+      "grad_norm": 0.5646482286111343,
+      "learning_rate": 5e-06,
+      "loss": 0.7341,
+      "step": 1470
+    },
+    {
+      "epoch": 2.1609782807081586,
+      "grad_norm": 0.6170916412089019,
+      "learning_rate": 5e-06,
+      "loss": 0.7273,
+      "step": 1480
+    },
+    {
+      "epoch": 2.175579485307538,
+      "grad_norm": 0.5669820051659463,
+      "learning_rate": 5e-06,
+      "loss": 0.7327,
+      "step": 1490
+    },
+    {
+      "epoch": 2.190180689906917,
+      "grad_norm": 0.5032603903192315,
+      "learning_rate": 5e-06,
+      "loss": 0.7259,
+      "step": 1500
+    },
+    {
+      "epoch": 2.2047818945062967,
+      "grad_norm": 0.536250519862031,
+      "learning_rate": 5e-06,
+      "loss": 0.7309,
+      "step": 1510
+    },
+    {
+      "epoch": 2.219383099105676,
+      "grad_norm": 0.4989195000116721,
+      "learning_rate": 5e-06,
+      "loss": 0.7351,
+      "step": 1520
+    },
+    {
+      "epoch": 2.2339843037050557,
+      "grad_norm": 0.48084465295135953,
+      "learning_rate": 5e-06,
+      "loss": 0.7319,
+      "step": 1530
+    },
+    {
+      "epoch": 2.2485855083044353,
+      "grad_norm": 0.6041492880249871,
+      "learning_rate": 5e-06,
+      "loss": 0.7331,
+      "step": 1540
+    },
+    {
+      "epoch": 2.2631867129038143,
+      "grad_norm": 0.48501706414438883,
+      "learning_rate": 5e-06,
+      "loss": 0.7364,
+      "step": 1550
+    },
+    {
+      "epoch": 2.277787917503194,
+      "grad_norm": 0.486621431249399,
+      "learning_rate": 5e-06,
+      "loss": 0.7336,
+      "step": 1560
+    },
+    {
+      "epoch": 2.2923891221025734,
+      "grad_norm": 0.6249224122437264,
+      "learning_rate": 5e-06,
+      "loss": 0.7335,
+      "step": 1570
+    },
+    {
+      "epoch": 2.306990326701953,
+      "grad_norm": 0.6255968466832591,
+      "learning_rate": 5e-06,
+      "loss": 0.7315,
+      "step": 1580
+    },
+    {
+      "epoch": 2.3215915313013324,
+      "grad_norm": 0.5271636183137467,
+      "learning_rate": 5e-06,
+      "loss": 0.7322,
+      "step": 1590
+    },
+    {
+      "epoch": 2.336192735900712,
+      "grad_norm": 0.5924207573496902,
+      "learning_rate": 5e-06,
+      "loss": 0.7345,
+      "step": 1600
+    },
+    {
+      "epoch": 2.3507939405000915,
+      "grad_norm": 0.5587622345452513,
+      "learning_rate": 5e-06,
+      "loss": 0.7344,
+      "step": 1610
+    },
+    {
+      "epoch": 2.3653951450994706,
+      "grad_norm": 0.6269568570302153,
+      "learning_rate": 5e-06,
+      "loss": 0.7359,
+      "step": 1620
+    },
+    {
+      "epoch": 2.37999634969885,
+      "grad_norm": 0.6533009549527986,
+      "learning_rate": 5e-06,
+      "loss": 0.7368,
+      "step": 1630
+    },
+    {
+      "epoch": 2.3945975542982296,
+      "grad_norm": 0.5195452792704209,
+      "learning_rate": 5e-06,
+      "loss": 0.7355,
+      "step": 1640
+    },
+    {
+      "epoch": 2.409198758897609,
+      "grad_norm": 0.5528537289554203,
+      "learning_rate": 5e-06,
+      "loss": 0.7355,
+      "step": 1650
+    },
+    {
+      "epoch": 2.4237999634969887,
+      "grad_norm": 0.5434124551027695,
+      "learning_rate": 5e-06,
+      "loss": 0.7343,
+      "step": 1660
+    },
+    {
+      "epoch": 2.4384011680963678,
+      "grad_norm": 0.5166289013156083,
+      "learning_rate": 5e-06,
+      "loss": 0.7348,
+      "step": 1670
+    },
+    {
+      "epoch": 2.4530023726957473,
+      "grad_norm": 0.5085481998475664,
+      "learning_rate": 5e-06,
+      "loss": 0.7384,
+      "step": 1680
+    },
+    {
+      "epoch": 2.467603577295127,
+      "grad_norm": 0.47895714940004425,
+      "learning_rate": 5e-06,
+      "loss": 0.7319,
+      "step": 1690
+    },
+    {
+      "epoch": 2.4822047818945063,
+      "grad_norm": 0.6790087073589551,
+      "learning_rate": 5e-06,
+      "loss": 0.7344,
+      "step": 1700
+    },
+    {
+      "epoch": 2.496805986493886,
+      "grad_norm": 0.5337750367727961,
+      "learning_rate": 5e-06,
+      "loss": 0.7348,
+      "step": 1710
+    },
+    {
+      "epoch": 2.511407191093265,
+      "grad_norm": 0.578024943637955,
+      "learning_rate": 5e-06,
+      "loss": 0.7335,
+      "step": 1720
+    },
+    {
+      "epoch": 2.526008395692645,
+      "grad_norm": 0.5321237652992599,
+      "learning_rate": 5e-06,
+      "loss": 0.7341,
+      "step": 1730
+    },
+    {
+      "epoch": 2.540609600292024,
+      "grad_norm": 0.552250362014024,
+      "learning_rate": 5e-06,
+      "loss": 0.7345,
+      "step": 1740
+    },
+    {
+      "epoch": 2.5552108048914035,
+      "grad_norm": 0.47716769494866995,
+      "learning_rate": 5e-06,
+      "loss": 0.7364,
+      "step": 1750
+    },
+    {
+      "epoch": 2.569812009490783,
+      "grad_norm": 0.5382810891628738,
+      "learning_rate": 5e-06,
+      "loss": 0.7327,
+      "step": 1760
+    },
+    {
+      "epoch": 2.5844132140901626,
+      "grad_norm": 0.5291652309846363,
+      "learning_rate": 5e-06,
+      "loss": 0.734,
+      "step": 1770
+    },
+    {
+      "epoch": 2.599014418689542,
+      "grad_norm": 0.583404809051004,
+      "learning_rate": 5e-06,
+      "loss": 0.7309,
+      "step": 1780
+    },
+    {
+      "epoch": 2.613615623288921,
+      "grad_norm": 0.607832211058636,
+      "learning_rate": 5e-06,
+      "loss": 0.7352,
+      "step": 1790
+    },
+    {
+      "epoch": 2.6282168278883007,
+      "grad_norm": 0.5598928601891838,
+      "learning_rate": 5e-06,
+      "loss": 0.7356,
+      "step": 1800
+    },
+    {
+      "epoch": 2.64281803248768,
+      "grad_norm": 0.5949750898099526,
+      "learning_rate": 5e-06,
+      "loss": 0.7353,
+      "step": 1810
+    },
+    {
+      "epoch": 2.6574192370870597,
+      "grad_norm": 0.6066820856022053,
+      "learning_rate": 5e-06,
+      "loss": 0.7347,
+      "step": 1820
+    },
+    {
+      "epoch": 2.6720204416864393,
+      "grad_norm": 0.5198033507111653,
+      "learning_rate": 5e-06,
+      "loss": 0.7354,
+      "step": 1830
+    },
+    {
+      "epoch": 2.6866216462858183,
+      "grad_norm": 0.5040452692645214,
+      "learning_rate": 5e-06,
+      "loss": 0.7391,
+      "step": 1840
+    },
+    {
+      "epoch": 2.701222850885198,
+      "grad_norm": 0.5156854247429866,
+      "learning_rate": 5e-06,
+      "loss": 0.7388,
+      "step": 1850
+    },
+    {
+      "epoch": 2.7158240554845774,
+      "grad_norm": 0.5617334329140413,
+      "learning_rate": 5e-06,
+      "loss": 0.737,
+      "step": 1860
+    },
+    {
+      "epoch": 2.730425260083957,
+      "grad_norm": 0.4970472716656489,
+      "learning_rate": 5e-06,
+      "loss": 0.7359,
+      "step": 1870
+    },
+    {
+      "epoch": 2.7450264646833364,
+      "grad_norm": 0.6666729572656519,
+      "learning_rate": 5e-06,
+      "loss": 0.7349,
+      "step": 1880
+    },
+    {
+      "epoch": 2.759627669282716,
+      "grad_norm": 0.660456603270783,
+      "learning_rate": 5e-06,
+      "loss": 0.7363,
+      "step": 1890
+    },
+    {
+      "epoch": 2.7742288738820955,
+      "grad_norm": 0.5479397279932245,
+      "learning_rate": 5e-06,
+      "loss": 0.7359,
+      "step": 1900
+    },
+    {
+      "epoch": 2.7888300784814746,
+      "grad_norm": 0.5184737073351016,
+      "learning_rate": 5e-06,
+      "loss": 0.7383,
+      "step": 1910
+    },
+    {
+      "epoch": 2.803431283080854,
+      "grad_norm": 0.501451603194624,
+      "learning_rate": 5e-06,
+      "loss": 0.7344,
+      "step": 1920
+    },
+    {
+      "epoch": 2.8180324876802336,
+      "grad_norm": 0.5543991291124852,
+      "learning_rate": 5e-06,
+      "loss": 0.7382,
+      "step": 1930
+    },
+    {
+      "epoch": 2.832633692279613,
+      "grad_norm": 0.6053239113120223,
+      "learning_rate": 5e-06,
+      "loss": 0.7356,
+      "step": 1940
+    },
+    {
+      "epoch": 2.8472348968789927,
+      "grad_norm": 0.5618006505391813,
+      "learning_rate": 5e-06,
+      "loss": 0.7377,
+      "step": 1950
+    },
+    {
+      "epoch": 2.8618361014783718,
+      "grad_norm": 0.5815392261505143,
+      "learning_rate": 5e-06,
+      "loss": 0.7337,
+      "step": 1960
+    },
+    {
+      "epoch": 2.8764373060777513,
+      "grad_norm": 0.7488694605510656,
+      "learning_rate": 5e-06,
+      "loss": 0.7362,
+      "step": 1970
+    },
+    {
+      "epoch": 2.891038510677131,
+      "grad_norm": 0.5769073126410138,
+      "learning_rate": 5e-06,
+      "loss": 0.7359,
+      "step": 1980
+    },
+    {
+      "epoch": 2.9056397152765103,
+      "grad_norm": 0.5750570915989177,
+      "learning_rate": 5e-06,
+      "loss": 0.7331,
+      "step": 1990
+    },
+    {
+      "epoch": 2.92024091987589,
+      "grad_norm": 0.5354199731148004,
+      "learning_rate": 5e-06,
+      "loss": 0.7341,
+      "step": 2000
+    },
+    {
+      "epoch": 2.9348421244752694,
+      "grad_norm": 0.5855570342179945,
+      "learning_rate": 5e-06,
+      "loss": 0.7404,
+      "step": 2010
+    },
+    {
+      "epoch": 2.949443329074649,
+      "grad_norm": 0.6261526281235102,
+      "learning_rate": 5e-06,
+      "loss": 0.7337,
+      "step": 2020
+    },
+    {
+      "epoch": 2.964044533674028,
+      "grad_norm": 0.5504549828167312,
+      "learning_rate": 5e-06,
+      "loss": 0.7348,
+      "step": 2030
+    },
+    {
+      "epoch": 2.9786457382734075,
+      "grad_norm": 0.529021801831048,
+      "learning_rate": 5e-06,
+      "loss": 0.7354,
+      "step": 2040
+    },
+    {
+      "epoch": 2.993246942872787,
+      "grad_norm": 0.5245972765419218,
+      "learning_rate": 5e-06,
+      "loss": 0.7372,
+      "step": 2050
+    },
+    {
+      "epoch": 2.9961671837926627,
+      "eval_loss": 0.8026307821273804,
+      "eval_runtime": 732.5471,
+      "eval_samples_per_second": 25.193,
+      "eval_steps_per_second": 0.395,
+      "step": 2052
+    },
+    {
+      "epoch": 2.9961671837926627,
+      "step": 2052,
+      "total_flos": 3436967047987200.0,
+      "train_loss": 0.7862561077286161,
+      "train_runtime": 121149.8135,
+      "train_samples_per_second": 8.683,
+      "train_steps_per_second": 0.017
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 2052,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3436967047987200.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

training_eval_loss.png ADDED Viewed

training_loss.png ADDED Viewed