End of training

Browse files

Files changed (7) hide show

README.md +2 -1
all_results.json +12 -0
eval_results.json +7 -0
train_results.json +8 -0
trainer_state.json +962 -0
training_eval_loss.png +0 -0
training_loss.png +0 -0

README.md CHANGED Viewed

@@ -4,6 +4,7 @@ license: llama3.1
 base_model: meta-llama/Meta-Llama-3.1-8B
 tags:
 - llama-factory
 - generated_from_trainer
 model-index:
 - name: llama3-1_8b_math_500000_samples
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
 # llama3-1_8b_math_500000_samples
-This model is a fine-tuned version of [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) on an unknown dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.5069

 base_model: meta-llama/Meta-Llama-3.1-8B
 tags:
 - llama-factory
+- full
 - generated_from_trainer
 model-index:
 - name: llama3-1_8b_math_500000_samples
 # llama3-1_8b_math_500000_samples
+This model is a fine-tuned version of [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) on the mlfoundations-dev/math_500000_samples dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.5069

all_results.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+    "epoch": 3.0,
+    "eval_loss": 0.5069288611412048,
+    "eval_runtime": 41.6306,
+    "eval_samples_per_second": 276.311,
+    "eval_steps_per_second": 1.081,
+    "total_flos": 2145722711408640.0,
+    "train_loss": 0.4994005665799587,
+    "train_runtime": 8118.1452,
+    "train_samples_per_second": 80.761,
+    "train_steps_per_second": 0.158
+}

eval_results.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+    "epoch": 3.0,
+    "eval_loss": 0.5069288611412048,
+    "eval_runtime": 41.6306,
+    "eval_samples_per_second": 276.311,
+    "eval_steps_per_second": 1.081
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 3.0,
+    "total_flos": 2145722711408640.0,
+    "train_loss": 0.4994005665799587,
+    "train_runtime": 8118.1452,
+    "train_samples_per_second": 80.761,
+    "train_steps_per_second": 0.158
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,962 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.0,
+  "eval_steps": 500,
+  "global_step": 1281,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0234192037470726,
+      "grad_norm": 2.481647447902812,
+      "learning_rate": 5e-06,
+      "loss": 0.7124,
+      "step": 10
+    },
+    {
+      "epoch": 0.0468384074941452,
+      "grad_norm": 0.7262736099420435,
+      "learning_rate": 5e-06,
+      "loss": 0.6153,
+      "step": 20
+    },
+    {
+      "epoch": 0.0702576112412178,
+      "grad_norm": 0.7790978795240339,
+      "learning_rate": 5e-06,
+      "loss": 0.5859,
+      "step": 30
+    },
+    {
+      "epoch": 0.0936768149882904,
+      "grad_norm": 0.6467962423003429,
+      "learning_rate": 5e-06,
+      "loss": 0.5711,
+      "step": 40
+    },
+    {
+      "epoch": 0.117096018735363,
+      "grad_norm": 0.5143977117058031,
+      "learning_rate": 5e-06,
+      "loss": 0.5668,
+      "step": 50
+    },
+    {
+      "epoch": 0.1405152224824356,
+      "grad_norm": 0.600469324649081,
+      "learning_rate": 5e-06,
+      "loss": 0.559,
+      "step": 60
+    },
+    {
+      "epoch": 0.16393442622950818,
+      "grad_norm": 0.5054683279603859,
+      "learning_rate": 5e-06,
+      "loss": 0.555,
+      "step": 70
+    },
+    {
+      "epoch": 0.1873536299765808,
+      "grad_norm": 0.520518786315232,
+      "learning_rate": 5e-06,
+      "loss": 0.548,
+      "step": 80
+    },
+    {
+      "epoch": 0.2107728337236534,
+      "grad_norm": 1.008487324396366,
+      "learning_rate": 5e-06,
+      "loss": 0.5481,
+      "step": 90
+    },
+    {
+      "epoch": 0.234192037470726,
+      "grad_norm": 0.6357300755902857,
+      "learning_rate": 5e-06,
+      "loss": 0.5449,
+      "step": 100
+    },
+    {
+      "epoch": 0.2576112412177986,
+      "grad_norm": 0.4471586394958172,
+      "learning_rate": 5e-06,
+      "loss": 0.541,
+      "step": 110
+    },
+    {
+      "epoch": 0.2810304449648712,
+      "grad_norm": 0.42874334693852684,
+      "learning_rate": 5e-06,
+      "loss": 0.5397,
+      "step": 120
+    },
+    {
+      "epoch": 0.3044496487119438,
+      "grad_norm": 0.5493546613118618,
+      "learning_rate": 5e-06,
+      "loss": 0.5357,
+      "step": 130
+    },
+    {
+      "epoch": 0.32786885245901637,
+      "grad_norm": 0.7309908680131434,
+      "learning_rate": 5e-06,
+      "loss": 0.535,
+      "step": 140
+    },
+    {
+      "epoch": 0.351288056206089,
+      "grad_norm": 0.5231239547569523,
+      "learning_rate": 5e-06,
+      "loss": 0.5373,
+      "step": 150
+    },
+    {
+      "epoch": 0.3747072599531616,
+      "grad_norm": 0.47995952741525777,
+      "learning_rate": 5e-06,
+      "loss": 0.5346,
+      "step": 160
+    },
+    {
+      "epoch": 0.3981264637002342,
+      "grad_norm": 0.562861554643795,
+      "learning_rate": 5e-06,
+      "loss": 0.5312,
+      "step": 170
+    },
+    {
+      "epoch": 0.4215456674473068,
+      "grad_norm": 0.6323157343840412,
+      "learning_rate": 5e-06,
+      "loss": 0.5332,
+      "step": 180
+    },
+    {
+      "epoch": 0.4449648711943794,
+      "grad_norm": 0.6022835533604722,
+      "learning_rate": 5e-06,
+      "loss": 0.5368,
+      "step": 190
+    },
+    {
+      "epoch": 0.468384074941452,
+      "grad_norm": 0.5241376216453505,
+      "learning_rate": 5e-06,
+      "loss": 0.5348,
+      "step": 200
+    },
+    {
+      "epoch": 0.4918032786885246,
+      "grad_norm": 0.4560855929707371,
+      "learning_rate": 5e-06,
+      "loss": 0.5298,
+      "step": 210
+    },
+    {
+      "epoch": 0.5152224824355972,
+      "grad_norm": 0.4775160386802804,
+      "learning_rate": 5e-06,
+      "loss": 0.527,
+      "step": 220
+    },
+    {
+      "epoch": 0.5386416861826698,
+      "grad_norm": 0.5062169069603658,
+      "learning_rate": 5e-06,
+      "loss": 0.5295,
+      "step": 230
+    },
+    {
+      "epoch": 0.5620608899297423,
+      "grad_norm": 0.43311731742710935,
+      "learning_rate": 5e-06,
+      "loss": 0.5264,
+      "step": 240
+    },
+    {
+      "epoch": 0.585480093676815,
+      "grad_norm": 0.4911544553586758,
+      "learning_rate": 5e-06,
+      "loss": 0.5292,
+      "step": 250
+    },
+    {
+      "epoch": 0.6088992974238876,
+      "grad_norm": 0.5749030738074241,
+      "learning_rate": 5e-06,
+      "loss": 0.5253,
+      "step": 260
+    },
+    {
+      "epoch": 0.6323185011709602,
+      "grad_norm": 0.5580576917700846,
+      "learning_rate": 5e-06,
+      "loss": 0.5238,
+      "step": 270
+    },
+    {
+      "epoch": 0.6557377049180327,
+      "grad_norm": 0.4760038513178861,
+      "learning_rate": 5e-06,
+      "loss": 0.522,
+      "step": 280
+    },
+    {
+      "epoch": 0.6791569086651054,
+      "grad_norm": 0.5216361788356643,
+      "learning_rate": 5e-06,
+      "loss": 0.5243,
+      "step": 290
+    },
+    {
+      "epoch": 0.702576112412178,
+      "grad_norm": 0.46276681576162904,
+      "learning_rate": 5e-06,
+      "loss": 0.5217,
+      "step": 300
+    },
+    {
+      "epoch": 0.7259953161592506,
+      "grad_norm": 0.4747845254798478,
+      "learning_rate": 5e-06,
+      "loss": 0.5226,
+      "step": 310
+    },
+    {
+      "epoch": 0.7494145199063232,
+      "grad_norm": 0.5960080896982851,
+      "learning_rate": 5e-06,
+      "loss": 0.521,
+      "step": 320
+    },
+    {
+      "epoch": 0.7728337236533958,
+      "grad_norm": 0.5212901742846952,
+      "learning_rate": 5e-06,
+      "loss": 0.5201,
+      "step": 330
+    },
+    {
+      "epoch": 0.7962529274004684,
+      "grad_norm": 0.5133380400247927,
+      "learning_rate": 5e-06,
+      "loss": 0.5199,
+      "step": 340
+    },
+    {
+      "epoch": 0.819672131147541,
+      "grad_norm": 0.4368489322167074,
+      "learning_rate": 5e-06,
+      "loss": 0.5204,
+      "step": 350
+    },
+    {
+      "epoch": 0.8430913348946136,
+      "grad_norm": 0.5101656467162784,
+      "learning_rate": 5e-06,
+      "loss": 0.5221,
+      "step": 360
+    },
+    {
+      "epoch": 0.8665105386416861,
+      "grad_norm": 0.5226647593757339,
+      "learning_rate": 5e-06,
+      "loss": 0.519,
+      "step": 370
+    },
+    {
+      "epoch": 0.8899297423887588,
+      "grad_norm": 0.4265472443674912,
+      "learning_rate": 5e-06,
+      "loss": 0.5193,
+      "step": 380
+    },
+    {
+      "epoch": 0.9133489461358314,
+      "grad_norm": 0.42184387318773725,
+      "learning_rate": 5e-06,
+      "loss": 0.5178,
+      "step": 390
+    },
+    {
+      "epoch": 0.936768149882904,
+      "grad_norm": 0.4773759814767807,
+      "learning_rate": 5e-06,
+      "loss": 0.5201,
+      "step": 400
+    },
+    {
+      "epoch": 0.9601873536299765,
+      "grad_norm": 0.5049194555570107,
+      "learning_rate": 5e-06,
+      "loss": 0.5146,
+      "step": 410
+    },
+    {
+      "epoch": 0.9836065573770492,
+      "grad_norm": 0.4836846129062704,
+      "learning_rate": 5e-06,
+      "loss": 0.514,
+      "step": 420
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 0.5167434215545654,
+      "eval_runtime": 41.6336,
+      "eval_samples_per_second": 276.291,
+      "eval_steps_per_second": 1.081,
+      "step": 427
+    },
+    {
+      "epoch": 1.0070257611241218,
+      "grad_norm": 0.4520838332937754,
+      "learning_rate": 5e-06,
+      "loss": 0.51,
+      "step": 430
+    },
+    {
+      "epoch": 1.0304449648711944,
+      "grad_norm": 0.4866921792731558,
+      "learning_rate": 5e-06,
+      "loss": 0.4977,
+      "step": 440
+    },
+    {
+      "epoch": 1.053864168618267,
+      "grad_norm": 0.46612714801971344,
+      "learning_rate": 5e-06,
+      "loss": 0.4974,
+      "step": 450
+    },
+    {
+      "epoch": 1.0772833723653397,
+      "grad_norm": 0.47487180548015817,
+      "learning_rate": 5e-06,
+      "loss": 0.4947,
+      "step": 460
+    },
+    {
+      "epoch": 1.100702576112412,
+      "grad_norm": 0.5023694335040125,
+      "learning_rate": 5e-06,
+      "loss": 0.4909,
+      "step": 470
+    },
+    {
+      "epoch": 1.1241217798594847,
+      "grad_norm": 0.5257140618717152,
+      "learning_rate": 5e-06,
+      "loss": 0.4932,
+      "step": 480
+    },
+    {
+      "epoch": 1.1475409836065573,
+      "grad_norm": 0.39736116687044365,
+      "learning_rate": 5e-06,
+      "loss": 0.4894,
+      "step": 490
+    },
+    {
+      "epoch": 1.17096018735363,
+      "grad_norm": 0.42753669693798274,
+      "learning_rate": 5e-06,
+      "loss": 0.4943,
+      "step": 500
+    },
+    {
+      "epoch": 1.1943793911007026,
+      "grad_norm": 0.4539073963232022,
+      "learning_rate": 5e-06,
+      "loss": 0.497,
+      "step": 510
+    },
+    {
+      "epoch": 1.2177985948477752,
+      "grad_norm": 0.40221054966287245,
+      "learning_rate": 5e-06,
+      "loss": 0.4916,
+      "step": 520
+    },
+    {
+      "epoch": 1.2412177985948478,
+      "grad_norm": 0.40071702689512967,
+      "learning_rate": 5e-06,
+      "loss": 0.4904,
+      "step": 530
+    },
+    {
+      "epoch": 1.2646370023419204,
+      "grad_norm": 0.4914284242936407,
+      "learning_rate": 5e-06,
+      "loss": 0.4941,
+      "step": 540
+    },
+    {
+      "epoch": 1.288056206088993,
+      "grad_norm": 0.4409757840452587,
+      "learning_rate": 5e-06,
+      "loss": 0.4924,
+      "step": 550
+    },
+    {
+      "epoch": 1.3114754098360657,
+      "grad_norm": 0.4137810147419586,
+      "learning_rate": 5e-06,
+      "loss": 0.4925,
+      "step": 560
+    },
+    {
+      "epoch": 1.334894613583138,
+      "grad_norm": 0.48726970931068914,
+      "learning_rate": 5e-06,
+      "loss": 0.4943,
+      "step": 570
+    },
+    {
+      "epoch": 1.3583138173302107,
+      "grad_norm": 0.5494057601615131,
+      "learning_rate": 5e-06,
+      "loss": 0.4935,
+      "step": 580
+    },
+    {
+      "epoch": 1.3817330210772834,
+      "grad_norm": 0.41251177208283474,
+      "learning_rate": 5e-06,
+      "loss": 0.4956,
+      "step": 590
+    },
+    {
+      "epoch": 1.405152224824356,
+      "grad_norm": 0.39118790995394576,
+      "learning_rate": 5e-06,
+      "loss": 0.4883,
+      "step": 600
+    },
+    {
+      "epoch": 1.4285714285714286,
+      "grad_norm": 0.49321602369191925,
+      "learning_rate": 5e-06,
+      "loss": 0.496,
+      "step": 610
+    },
+    {
+      "epoch": 1.4519906323185012,
+      "grad_norm": 0.5208838513759695,
+      "learning_rate": 5e-06,
+      "loss": 0.4953,
+      "step": 620
+    },
+    {
+      "epoch": 1.4754098360655736,
+      "grad_norm": 0.43794255642976027,
+      "learning_rate": 5e-06,
+      "loss": 0.4951,
+      "step": 630
+    },
+    {
+      "epoch": 1.4988290398126463,
+      "grad_norm": 0.5557497245301519,
+      "learning_rate": 5e-06,
+      "loss": 0.4929,
+      "step": 640
+    },
+    {
+      "epoch": 1.5222482435597189,
+      "grad_norm": 0.5087889678152296,
+      "learning_rate": 5e-06,
+      "loss": 0.4896,
+      "step": 650
+    },
+    {
+      "epoch": 1.5456674473067915,
+      "grad_norm": 0.38311143724497726,
+      "learning_rate": 5e-06,
+      "loss": 0.4928,
+      "step": 660
+    },
+    {
+      "epoch": 1.5690866510538641,
+      "grad_norm": 0.48635422306380777,
+      "learning_rate": 5e-06,
+      "loss": 0.488,
+      "step": 670
+    },
+    {
+      "epoch": 1.5925058548009368,
+      "grad_norm": 0.6136317892186699,
+      "learning_rate": 5e-06,
+      "loss": 0.4949,
+      "step": 680
+    },
+    {
+      "epoch": 1.6159250585480094,
+      "grad_norm": 0.47270773640049263,
+      "learning_rate": 5e-06,
+      "loss": 0.4883,
+      "step": 690
+    },
+    {
+      "epoch": 1.639344262295082,
+      "grad_norm": 0.5783159568227515,
+      "learning_rate": 5e-06,
+      "loss": 0.4936,
+      "step": 700
+    },
+    {
+      "epoch": 1.6627634660421546,
+      "grad_norm": 0.43336426848256004,
+      "learning_rate": 5e-06,
+      "loss": 0.4938,
+      "step": 710
+    },
+    {
+      "epoch": 1.6861826697892273,
+      "grad_norm": 0.4314026408601509,
+      "learning_rate": 5e-06,
+      "loss": 0.4967,
+      "step": 720
+    },
+    {
+      "epoch": 1.7096018735362999,
+      "grad_norm": 0.38409075682155797,
+      "learning_rate": 5e-06,
+      "loss": 0.4891,
+      "step": 730
+    },
+    {
+      "epoch": 1.7330210772833725,
+      "grad_norm": 0.5211936246593988,
+      "learning_rate": 5e-06,
+      "loss": 0.491,
+      "step": 740
+    },
+    {
+      "epoch": 1.756440281030445,
+      "grad_norm": 0.46808950576809366,
+      "learning_rate": 5e-06,
+      "loss": 0.4949,
+      "step": 750
+    },
+    {
+      "epoch": 1.7798594847775175,
+      "grad_norm": 0.3970884421781967,
+      "learning_rate": 5e-06,
+      "loss": 0.4864,
+      "step": 760
+    },
+    {
+      "epoch": 1.8032786885245902,
+      "grad_norm": 0.4338133573209166,
+      "learning_rate": 5e-06,
+      "loss": 0.4877,
+      "step": 770
+    },
+    {
+      "epoch": 1.8266978922716628,
+      "grad_norm": 0.5302929743875919,
+      "learning_rate": 5e-06,
+      "loss": 0.491,
+      "step": 780
+    },
+    {
+      "epoch": 1.8501170960187352,
+      "grad_norm": 0.48309786628518675,
+      "learning_rate": 5e-06,
+      "loss": 0.4919,
+      "step": 790
+    },
+    {
+      "epoch": 1.8735362997658078,
+      "grad_norm": 0.44165745399512696,
+      "learning_rate": 5e-06,
+      "loss": 0.4875,
+      "step": 800
+    },
+    {
+      "epoch": 1.8969555035128804,
+      "grad_norm": 0.46223180789120477,
+      "learning_rate": 5e-06,
+      "loss": 0.4936,
+      "step": 810
+    },
+    {
+      "epoch": 1.920374707259953,
+      "grad_norm": 0.48226738955985293,
+      "learning_rate": 5e-06,
+      "loss": 0.4888,
+      "step": 820
+    },
+    {
+      "epoch": 1.9437939110070257,
+      "grad_norm": 0.39089405316919673,
+      "learning_rate": 5e-06,
+      "loss": 0.4893,
+      "step": 830
+    },
+    {
+      "epoch": 1.9672131147540983,
+      "grad_norm": 0.3967255297864678,
+      "learning_rate": 5e-06,
+      "loss": 0.4915,
+      "step": 840
+    },
+    {
+      "epoch": 1.990632318501171,
+      "grad_norm": 0.4053292002238054,
+      "learning_rate": 5e-06,
+      "loss": 0.4902,
+      "step": 850
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 0.5077147483825684,
+      "eval_runtime": 41.7501,
+      "eval_samples_per_second": 275.521,
+      "eval_steps_per_second": 1.078,
+      "step": 854
+    },
+    {
+      "epoch": 2.0140515222482436,
+      "grad_norm": 0.5268697371178085,
+      "learning_rate": 5e-06,
+      "loss": 0.4752,
+      "step": 860
+    },
+    {
+      "epoch": 2.037470725995316,
+      "grad_norm": 0.5447609017298904,
+      "learning_rate": 5e-06,
+      "loss": 0.4669,
+      "step": 870
+    },
+    {
+      "epoch": 2.060889929742389,
+      "grad_norm": 0.5637527273857,
+      "learning_rate": 5e-06,
+      "loss": 0.4614,
+      "step": 880
+    },
+    {
+      "epoch": 2.0843091334894615,
+      "grad_norm": 0.3841529769069848,
+      "learning_rate": 5e-06,
+      "loss": 0.4659,
+      "step": 890
+    },
+    {
+      "epoch": 2.107728337236534,
+      "grad_norm": 0.42444670935905104,
+      "learning_rate": 5e-06,
+      "loss": 0.4593,
+      "step": 900
+    },
+    {
+      "epoch": 2.1311475409836067,
+      "grad_norm": 0.4111116772372611,
+      "learning_rate": 5e-06,
+      "loss": 0.4677,
+      "step": 910
+    },
+    {
+      "epoch": 2.1545667447306793,
+      "grad_norm": 0.6373094219643812,
+      "learning_rate": 5e-06,
+      "loss": 0.4652,
+      "step": 920
+    },
+    {
+      "epoch": 2.177985948477752,
+      "grad_norm": 0.6160202621538906,
+      "learning_rate": 5e-06,
+      "loss": 0.4681,
+      "step": 930
+    },
+    {
+      "epoch": 2.201405152224824,
+      "grad_norm": 0.37701921381345616,
+      "learning_rate": 5e-06,
+      "loss": 0.4586,
+      "step": 940
+    },
+    {
+      "epoch": 2.2248243559718968,
+      "grad_norm": 0.4220733688149805,
+      "learning_rate": 5e-06,
+      "loss": 0.4669,
+      "step": 950
+    },
+    {
+      "epoch": 2.2482435597189694,
+      "grad_norm": 0.5051333977952606,
+      "learning_rate": 5e-06,
+      "loss": 0.4665,
+      "step": 960
+    },
+    {
+      "epoch": 2.271662763466042,
+      "grad_norm": 0.45928248167044633,
+      "learning_rate": 5e-06,
+      "loss": 0.4665,
+      "step": 970
+    },
+    {
+      "epoch": 2.2950819672131146,
+      "grad_norm": 0.4495617426425146,
+      "learning_rate": 5e-06,
+      "loss": 0.4662,
+      "step": 980
+    },
+    {
+      "epoch": 2.3185011709601873,
+      "grad_norm": 0.4517571833465374,
+      "learning_rate": 5e-06,
+      "loss": 0.4646,
+      "step": 990
+    },
+    {
+      "epoch": 2.34192037470726,
+      "grad_norm": 0.4138564151461986,
+      "learning_rate": 5e-06,
+      "loss": 0.4647,
+      "step": 1000
+    },
+    {
+      "epoch": 2.3653395784543325,
+      "grad_norm": 0.5738034318601656,
+      "learning_rate": 5e-06,
+      "loss": 0.4659,
+      "step": 1010
+    },
+    {
+      "epoch": 2.388758782201405,
+      "grad_norm": 0.631953187344588,
+      "learning_rate": 5e-06,
+      "loss": 0.4663,
+      "step": 1020
+    },
+    {
+      "epoch": 2.4121779859484778,
+      "grad_norm": 0.46549566538492926,
+      "learning_rate": 5e-06,
+      "loss": 0.4676,
+      "step": 1030
+    },
+    {
+      "epoch": 2.4355971896955504,
+      "grad_norm": 0.43668516137249047,
+      "learning_rate": 5e-06,
+      "loss": 0.4673,
+      "step": 1040
+    },
+    {
+      "epoch": 2.459016393442623,
+      "grad_norm": 0.42948375636759983,
+      "learning_rate": 5e-06,
+      "loss": 0.4654,
+      "step": 1050
+    },
+    {
+      "epoch": 2.4824355971896956,
+      "grad_norm": 0.4391102312189122,
+      "learning_rate": 5e-06,
+      "loss": 0.469,
+      "step": 1060
+    },
+    {
+      "epoch": 2.5058548009367683,
+      "grad_norm": 0.49842008382902114,
+      "learning_rate": 5e-06,
+      "loss": 0.47,
+      "step": 1070
+    },
+    {
+      "epoch": 2.529274004683841,
+      "grad_norm": 0.5588818151397613,
+      "learning_rate": 5e-06,
+      "loss": 0.4685,
+      "step": 1080
+    },
+    {
+      "epoch": 2.552693208430913,
+      "grad_norm": 0.4037726802957124,
+      "learning_rate": 5e-06,
+      "loss": 0.4677,
+      "step": 1090
+    },
+    {
+      "epoch": 2.576112412177986,
+      "grad_norm": 0.48821405764171133,
+      "learning_rate": 5e-06,
+      "loss": 0.4695,
+      "step": 1100
+    },
+    {
+      "epoch": 2.5995316159250583,
+      "grad_norm": 0.3858402261553971,
+      "learning_rate": 5e-06,
+      "loss": 0.4677,
+      "step": 1110
+    },
+    {
+      "epoch": 2.6229508196721314,
+      "grad_norm": 0.4203466943287186,
+      "learning_rate": 5e-06,
+      "loss": 0.4696,
+      "step": 1120
+    },
+    {
+      "epoch": 2.6463700234192036,
+      "grad_norm": 0.48915413911853883,
+      "learning_rate": 5e-06,
+      "loss": 0.4664,
+      "step": 1130
+    },
+    {
+      "epoch": 2.669789227166276,
+      "grad_norm": 0.43449855854006547,
+      "learning_rate": 5e-06,
+      "loss": 0.4693,
+      "step": 1140
+    },
+    {
+      "epoch": 2.693208430913349,
+      "grad_norm": 0.5093894446854779,
+      "learning_rate": 5e-06,
+      "loss": 0.4692,
+      "step": 1150
+    },
+    {
+      "epoch": 2.7166276346604215,
+      "grad_norm": 0.44982206423576104,
+      "learning_rate": 5e-06,
+      "loss": 0.4704,
+      "step": 1160
+    },
+    {
+      "epoch": 2.740046838407494,
+      "grad_norm": 0.40753147448920485,
+      "learning_rate": 5e-06,
+      "loss": 0.4648,
+      "step": 1170
+    },
+    {
+      "epoch": 2.7634660421545667,
+      "grad_norm": 0.36811758162537594,
+      "learning_rate": 5e-06,
+      "loss": 0.4629,
+      "step": 1180
+    },
+    {
+      "epoch": 2.7868852459016393,
+      "grad_norm": 0.3949349460619723,
+      "learning_rate": 5e-06,
+      "loss": 0.4704,
+      "step": 1190
+    },
+    {
+      "epoch": 2.810304449648712,
+      "grad_norm": 0.41402013254949793,
+      "learning_rate": 5e-06,
+      "loss": 0.4707,
+      "step": 1200
+    },
+    {
+      "epoch": 2.8337236533957846,
+      "grad_norm": 0.5432806369874894,
+      "learning_rate": 5e-06,
+      "loss": 0.4679,
+      "step": 1210
+    },
+    {
+      "epoch": 2.857142857142857,
+      "grad_norm": 0.3788514557857583,
+      "learning_rate": 5e-06,
+      "loss": 0.4711,
+      "step": 1220
+    },
+    {
+      "epoch": 2.88056206088993,
+      "grad_norm": 0.39636258806401947,
+      "learning_rate": 5e-06,
+      "loss": 0.4653,
+      "step": 1230
+    },
+    {
+      "epoch": 2.9039812646370025,
+      "grad_norm": 0.3936100271064975,
+      "learning_rate": 5e-06,
+      "loss": 0.4667,
+      "step": 1240
+    },
+    {
+      "epoch": 2.927400468384075,
+      "grad_norm": 0.4271813759224886,
+      "learning_rate": 5e-06,
+      "loss": 0.4664,
+      "step": 1250
+    },
+    {
+      "epoch": 2.9508196721311473,
+      "grad_norm": 0.4720023108766376,
+      "learning_rate": 5e-06,
+      "loss": 0.468,
+      "step": 1260
+    },
+    {
+      "epoch": 2.9742388758782203,
+      "grad_norm": 0.42625818049428776,
+      "learning_rate": 5e-06,
+      "loss": 0.4654,
+      "step": 1270
+    },
+    {
+      "epoch": 2.9976580796252925,
+      "grad_norm": 0.4282654656901964,
+      "learning_rate": 5e-06,
+      "loss": 0.47,
+      "step": 1280
+    },
+    {
+      "epoch": 3.0,
+      "eval_loss": 0.5069288611412048,
+      "eval_runtime": 41.6671,
+      "eval_samples_per_second": 276.069,
+      "eval_steps_per_second": 1.08,
+      "step": 1281
+    },
+    {
+      "epoch": 3.0,
+      "step": 1281,
+      "total_flos": 2145722711408640.0,
+      "train_loss": 0.4994005665799587,
+      "train_runtime": 8118.1452,
+      "train_samples_per_second": 80.761,
+      "train_steps_per_second": 0.158
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 1281,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2145722711408640.0,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}

training_eval_loss.png ADDED Viewed

training_loss.png ADDED Viewed