End of training

Browse files

Files changed (7) hide show

README.md +2 -1
all_results.json +12 -0
eval_results.json +7 -0
train_results.json +8 -0
trainer_state.json +1130 -0
training_eval_loss.png +0 -0
training_loss.png +0 -0

README.md CHANGED Viewed

@@ -4,6 +4,7 @@ license: apache-2.0
 base_model: mistralai/Mistral-7B-v0.3
 tags:
 - llama-factory
 - generated_from_trainer
 model-index:
 - name: mistral_7b_0-3_oh-dcft-v3.1-claude-3-5-haiku-20241022
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
 # mistral_7b_0-3_oh-dcft-v3.1-claude-3-5-haiku-20241022
-This model is a fine-tuned version of [mistralai/Mistral-7B-v0.3](https://huggingface.co/mistralai/Mistral-7B-v0.3) on an unknown dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.3708

 base_model: mistralai/Mistral-7B-v0.3
 tags:
 - llama-factory
+- full
 - generated_from_trainer
 model-index:
 - name: mistral_7b_0-3_oh-dcft-v3.1-claude-3-5-haiku-20241022
 # mistral_7b_0-3_oh-dcft-v3.1-claude-3-5-haiku-20241022
+This model is a fine-tuned version of [mistralai/Mistral-7B-v0.3](https://huggingface.co/mistralai/Mistral-7B-v0.3) on the mlfoundations-dev/oh-dcft-v3.1-claude-3-5-haiku-20241022 dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.3708

all_results.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+    "epoch": 3.0,
+    "eval_loss": 0.37075862288475037,
+    "eval_runtime": 53.1485,
+    "eval_samples_per_second": 256.639,
+    "eval_steps_per_second": 1.016,
+    "total_flos": 2547731650314240.0,
+    "train_loss": 0.2954053143131192,
+    "train_runtime": 9131.3778,
+    "train_samples_per_second": 85.143,
+    "train_steps_per_second": 0.167
+}

eval_results.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+    "epoch": 3.0,
+    "eval_loss": 0.37075862288475037,
+    "eval_runtime": 53.1485,
+    "eval_samples_per_second": 256.639,
+    "eval_steps_per_second": 1.016
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 3.0,
+    "total_flos": 2547731650314240.0,
+    "train_loss": 0.2954053143131192,
+    "train_runtime": 9131.3778,
+    "train_samples_per_second": 85.143,
+    "train_steps_per_second": 0.167
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1130 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.0,
+  "eval_steps": 500,
+  "global_step": 1521,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.01972386587771203,
+      "grad_norm": 8.260985845744274,
+      "learning_rate": 6.493506493506493e-07,
+      "loss": 0.7556,
+      "step": 10
+    },
+    {
+      "epoch": 0.03944773175542406,
+      "grad_norm": 3.285262290141762,
+      "learning_rate": 1.2987012987012986e-06,
+      "loss": 0.6243,
+      "step": 20
+    },
+    {
+      "epoch": 0.05917159763313609,
+      "grad_norm": 1.8240152669785012,
+      "learning_rate": 1.9480519480519483e-06,
+      "loss": 0.527,
+      "step": 30
+    },
+    {
+      "epoch": 0.07889546351084813,
+      "grad_norm": 2.484918347587673,
+      "learning_rate": 2.597402597402597e-06,
+      "loss": 0.4858,
+      "step": 40
+    },
+    {
+      "epoch": 0.09861932938856016,
+      "grad_norm": 1.6071106659008887,
+      "learning_rate": 3.246753246753247e-06,
+      "loss": 0.4616,
+      "step": 50
+    },
+    {
+      "epoch": 0.11834319526627218,
+      "grad_norm": 1.7753595146102916,
+      "learning_rate": 3.896103896103897e-06,
+      "loss": 0.4467,
+      "step": 60
+    },
+    {
+      "epoch": 0.13806706114398423,
+      "grad_norm": 1.9952784096044014,
+      "learning_rate": 4.5454545454545455e-06,
+      "loss": 0.4355,
+      "step": 70
+    },
+    {
+      "epoch": 0.15779092702169625,
+      "grad_norm": 2.0090088525329057,
+      "learning_rate": 4.999952075361122e-06,
+      "loss": 0.4303,
+      "step": 80
+    },
+    {
+      "epoch": 0.17751479289940827,
+      "grad_norm": 1.9429866993343683,
+      "learning_rate": 4.99910013857428e-06,
+      "loss": 0.4213,
+      "step": 90
+    },
+    {
+      "epoch": 0.19723865877712032,
+      "grad_norm": 2.2150406308730166,
+      "learning_rate": 4.997183673954895e-06,
+      "loss": 0.4205,
+      "step": 100
+    },
+    {
+      "epoch": 0.21696252465483234,
+      "grad_norm": 2.3280715715799105,
+      "learning_rate": 4.994203588590157e-06,
+      "loss": 0.4132,
+      "step": 110
+    },
+    {
+      "epoch": 0.23668639053254437,
+      "grad_norm": 2.0514718162160617,
+      "learning_rate": 4.9901612929925455e-06,
+      "loss": 0.4097,
+      "step": 120
+    },
+    {
+      "epoch": 0.2564102564102564,
+      "grad_norm": 2.248051724393392,
+      "learning_rate": 4.985058700432217e-06,
+      "loss": 0.4078,
+      "step": 130
+    },
+    {
+      "epoch": 0.27613412228796846,
+      "grad_norm": 2.4477065193392114,
+      "learning_rate": 4.978898226031426e-06,
+      "loss": 0.4035,
+      "step": 140
+    },
+    {
+      "epoch": 0.2958579881656805,
+      "grad_norm": 2.3530821376592317,
+      "learning_rate": 4.97168278562142e-06,
+      "loss": 0.3988,
+      "step": 150
+    },
+    {
+      "epoch": 0.3155818540433925,
+      "grad_norm": 2.0658208779463796,
+      "learning_rate": 4.9634157943623345e-06,
+      "loss": 0.4008,
+      "step": 160
+    },
+    {
+      "epoch": 0.33530571992110453,
+      "grad_norm": 1.6308701318103827,
+      "learning_rate": 4.954101165126764e-06,
+      "loss": 0.3955,
+      "step": 170
+    },
+    {
+      "epoch": 0.35502958579881655,
+      "grad_norm": 1.8767575875235638,
+      "learning_rate": 4.943743306647738e-06,
+      "loss": 0.3964,
+      "step": 180
+    },
+    {
+      "epoch": 0.3747534516765286,
+      "grad_norm": 2.158851334024998,
+      "learning_rate": 4.932347121432018e-06,
+      "loss": 0.3955,
+      "step": 190
+    },
+    {
+      "epoch": 0.39447731755424065,
+      "grad_norm": 2.2424601067528367,
+      "learning_rate": 4.919918003439677e-06,
+      "loss": 0.3929,
+      "step": 200
+    },
+    {
+      "epoch": 0.41420118343195267,
+      "grad_norm": 1.4704562127782181,
+      "learning_rate": 4.9064618355310694e-06,
+      "loss": 0.3951,
+      "step": 210
+    },
+    {
+      "epoch": 0.4339250493096647,
+      "grad_norm": 1.5325962055467024,
+      "learning_rate": 4.8919849866823955e-06,
+      "loss": 0.3936,
+      "step": 220
+    },
+    {
+      "epoch": 0.4536489151873767,
+      "grad_norm": 1.752553432251344,
+      "learning_rate": 4.8764943089711876e-06,
+      "loss": 0.3894,
+      "step": 230
+    },
+    {
+      "epoch": 0.47337278106508873,
+      "grad_norm": 3.582185649197669,
+      "learning_rate": 4.859997134333133e-06,
+      "loss": 0.39,
+      "step": 240
+    },
+    {
+      "epoch": 0.4930966469428008,
+      "grad_norm": 2.283623608488685,
+      "learning_rate": 4.842501271091773e-06,
+      "loss": 0.3845,
+      "step": 250
+    },
+    {
+      "epoch": 0.5128205128205128,
+      "grad_norm": 2.954635543178996,
+      "learning_rate": 4.8240150002627285e-06,
+      "loss": 0.3853,
+      "step": 260
+    },
+    {
+      "epoch": 0.5325443786982249,
+      "grad_norm": 2.621411991175976,
+      "learning_rate": 4.80454707163418e-06,
+      "loss": 0.3802,
+      "step": 270
+    },
+    {
+      "epoch": 0.5522682445759369,
+      "grad_norm": 3.0076538937186554,
+      "learning_rate": 4.784106699625493e-06,
+      "loss": 0.3778,
+      "step": 280
+    },
+    {
+      "epoch": 0.571992110453649,
+      "grad_norm": 2.620788244299813,
+      "learning_rate": 4.762703558925907e-06,
+      "loss": 0.381,
+      "step": 290
+    },
+    {
+      "epoch": 0.591715976331361,
+      "grad_norm": 2.600774288511616,
+      "learning_rate": 4.740347779915384e-06,
+      "loss": 0.3795,
+      "step": 300
+    },
+    {
+      "epoch": 0.611439842209073,
+      "grad_norm": 2.825934593188172,
+      "learning_rate": 4.717049943869774e-06,
+      "loss": 0.3754,
+      "step": 310
+    },
+    {
+      "epoch": 0.631163708086785,
+      "grad_norm": 1.9636455738063043,
+      "learning_rate": 4.692821077952556e-06,
+      "loss": 0.3709,
+      "step": 320
+    },
+    {
+      "epoch": 0.650887573964497,
+      "grad_norm": 1.465934093555826,
+      "learning_rate": 4.667672649995539e-06,
+      "loss": 0.3686,
+      "step": 330
+    },
+    {
+      "epoch": 0.6706114398422091,
+      "grad_norm": 1.6730733158146738,
+      "learning_rate": 4.641616563071003e-06,
+      "loss": 0.374,
+      "step": 340
+    },
+    {
+      "epoch": 0.6903353057199211,
+      "grad_norm": 1.6420981338152472,
+      "learning_rate": 4.6146651498578095e-06,
+      "loss": 0.3725,
+      "step": 350
+    },
+    {
+      "epoch": 0.7100591715976331,
+      "grad_norm": 1.7081311753490396,
+      "learning_rate": 4.586831166804191e-06,
+      "loss": 0.3723,
+      "step": 360
+    },
+    {
+      "epoch": 0.7297830374753451,
+      "grad_norm": 1.7004420091082517,
+      "learning_rate": 4.558127788089966e-06,
+      "loss": 0.3685,
+      "step": 370
+    },
+    {
+      "epoch": 0.7495069033530573,
+      "grad_norm": 1.813129623101683,
+      "learning_rate": 4.5285685993910246e-06,
+      "loss": 0.3693,
+      "step": 380
+    },
+    {
+      "epoch": 0.7692307692307693,
+      "grad_norm": 1.441392302489358,
+      "learning_rate": 4.49816759144906e-06,
+      "loss": 0.3672,
+      "step": 390
+    },
+    {
+      "epoch": 0.7889546351084813,
+      "grad_norm": 1.743528342139816,
+      "learning_rate": 4.466939153449565e-06,
+      "loss": 0.3629,
+      "step": 400
+    },
+    {
+      "epoch": 0.8086785009861933,
+      "grad_norm": 1.5505480061250534,
+      "learning_rate": 4.434898066211255e-06,
+      "loss": 0.3647,
+      "step": 410
+    },
+    {
+      "epoch": 0.8284023668639053,
+      "grad_norm": 1.748134152515452,
+      "learning_rate": 4.402059495190112e-06,
+      "loss": 0.3687,
+      "step": 420
+    },
+    {
+      "epoch": 0.8481262327416174,
+      "grad_norm": 1.888131474531523,
+      "learning_rate": 4.368438983301382e-06,
+      "loss": 0.368,
+      "step": 430
+    },
+    {
+      "epoch": 0.8678500986193294,
+      "grad_norm": 1.3077877777100417,
+      "learning_rate": 4.334052443562914e-06,
+      "loss": 0.364,
+      "step": 440
+    },
+    {
+      "epoch": 0.8875739644970414,
+      "grad_norm": 1.7143497390643974,
+      "learning_rate": 4.298916151563324e-06,
+      "loss": 0.3662,
+      "step": 450
+    },
+    {
+      "epoch": 0.9072978303747534,
+      "grad_norm": 1.2650560376490414,
+      "learning_rate": 4.263046737758557e-06,
+      "loss": 0.3634,
+      "step": 460
+    },
+    {
+      "epoch": 0.9270216962524654,
+      "grad_norm": 1.325272234023546,
+      "learning_rate": 4.226461179600474e-06,
+      "loss": 0.3647,
+      "step": 470
+    },
+    {
+      "epoch": 0.9467455621301775,
+      "grad_norm": 1.7799396783443953,
+      "learning_rate": 4.189176793501208e-06,
+      "loss": 0.3601,
+      "step": 480
+    },
+    {
+      "epoch": 0.9664694280078896,
+      "grad_norm": 1.6138030010077298,
+      "learning_rate": 4.151211226637083e-06,
+      "loss": 0.3639,
+      "step": 490
+    },
+    {
+      "epoch": 0.9861932938856016,
+      "grad_norm": 1.6475058606657829,
+      "learning_rate": 4.112582448595989e-06,
+      "loss": 0.3631,
+      "step": 500
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 0.3610161542892456,
+      "eval_runtime": 46.5378,
+      "eval_samples_per_second": 293.095,
+      "eval_steps_per_second": 1.16,
+      "step": 507
+    },
+    {
+      "epoch": 1.0059171597633136,
+      "grad_norm": 2.318083617694004,
+      "learning_rate": 4.073308742872136e-06,
+      "loss": 0.339,
+      "step": 510
+    },
+    {
+      "epoch": 1.0256410256410255,
+      "grad_norm": 2.26507527796031,
+      "learning_rate": 4.033408698212244e-06,
+      "loss": 0.2904,
+      "step": 520
+    },
+    {
+      "epoch": 1.0453648915187377,
+      "grad_norm": 2.129210352759771,
+      "learning_rate": 3.99290119981726e-06,
+      "loss": 0.2845,
+      "step": 530
+    },
+    {
+      "epoch": 1.0650887573964498,
+      "grad_norm": 2.0458511034566897,
+      "learning_rate": 3.95180542040374e-06,
+      "loss": 0.2826,
+      "step": 540
+    },
+    {
+      "epoch": 1.0848126232741617,
+      "grad_norm": 2.34540520465628,
+      "learning_rate": 3.910140811129166e-06,
+      "loss": 0.2817,
+      "step": 550
+    },
+    {
+      "epoch": 1.1045364891518739,
+      "grad_norm": 1.5731137478504271,
+      "learning_rate": 3.8679270923854596e-06,
+      "loss": 0.2816,
+      "step": 560
+    },
+    {
+      "epoch": 1.1242603550295858,
+      "grad_norm": 1.9641564243584235,
+      "learning_rate": 3.825184244465071e-06,
+      "loss": 0.2833,
+      "step": 570
+    },
+    {
+      "epoch": 1.143984220907298,
+      "grad_norm": 1.5653763677552233,
+      "learning_rate": 3.7819324981040517e-06,
+      "loss": 0.2835,
+      "step": 580
+    },
+    {
+      "epoch": 1.1637080867850098,
+      "grad_norm": 1.4455902546137582,
+      "learning_rate": 3.7381923249065838e-06,
+      "loss": 0.2806,
+      "step": 590
+    },
+    {
+      "epoch": 1.183431952662722,
+      "grad_norm": 1.4589441051909717,
+      "learning_rate": 3.6939844276555146e-06,
+      "loss": 0.2842,
+      "step": 600
+    },
+    {
+      "epoch": 1.2031558185404339,
+      "grad_norm": 1.4737079619190827,
+      "learning_rate": 3.649329730513461e-06,
+      "loss": 0.2818,
+      "step": 610
+    },
+    {
+      "epoch": 1.222879684418146,
+      "grad_norm": 1.424470321783783,
+      "learning_rate": 3.6042493691191377e-06,
+      "loss": 0.2835,
+      "step": 620
+    },
+    {
+      "epoch": 1.242603550295858,
+      "grad_norm": 1.43822809638539,
+      "learning_rate": 3.558764680583589e-06,
+      "loss": 0.2829,
+      "step": 630
+    },
+    {
+      "epoch": 1.26232741617357,
+      "grad_norm": 1.4491877471048427,
+      "learning_rate": 3.51289719339106e-06,
+      "loss": 0.2823,
+      "step": 640
+    },
+    {
+      "epoch": 1.282051282051282,
+      "grad_norm": 1.4979353903583295,
+      "learning_rate": 3.4666686172092927e-06,
+      "loss": 0.2859,
+      "step": 650
+    },
+    {
+      "epoch": 1.301775147928994,
+      "grad_norm": 1.4793881592613725,
+      "learning_rate": 3.4201008326140596e-06,
+      "loss": 0.2849,
+      "step": 660
+    },
+    {
+      "epoch": 1.3214990138067062,
+      "grad_norm": 1.6343693105840815,
+      "learning_rate": 3.3732158807328116e-06,
+      "loss": 0.2875,
+      "step": 670
+    },
+    {
+      "epoch": 1.3412228796844181,
+      "grad_norm": 1.5638318327999918,
+      "learning_rate": 3.3260359528123266e-06,
+      "loss": 0.2877,
+      "step": 680
+    },
+    {
+      "epoch": 1.3609467455621302,
+      "grad_norm": 1.434550639059279,
+      "learning_rate": 3.2785833797153115e-06,
+      "loss": 0.2817,
+      "step": 690
+    },
+    {
+      "epoch": 1.3806706114398422,
+      "grad_norm": 1.3783604211664602,
+      "learning_rate": 3.2308806213509204e-06,
+      "loss": 0.2809,
+      "step": 700
+    },
+    {
+      "epoch": 1.4003944773175543,
+      "grad_norm": 1.7104337243982326,
+      "learning_rate": 3.182950256044188e-06,
+      "loss": 0.2825,
+      "step": 710
+    },
+    {
+      "epoch": 1.4201183431952662,
+      "grad_norm": 1.9527331404429782,
+      "learning_rate": 3.1348149698494233e-06,
+      "loss": 0.2827,
+      "step": 720
+    },
+    {
+      "epoch": 1.4398422090729783,
+      "grad_norm": 1.5082040480125063,
+      "learning_rate": 3.0864975458126158e-06,
+      "loss": 0.2857,
+      "step": 730
+    },
+    {
+      "epoch": 1.4595660749506902,
+      "grad_norm": 1.5939434329404958,
+      "learning_rate": 3.038020853187914e-06,
+      "loss": 0.2831,
+      "step": 740
+    },
+    {
+      "epoch": 1.4792899408284024,
+      "grad_norm": 1.425454732201556,
+      "learning_rate": 2.98940783661333e-06,
+      "loss": 0.2802,
+      "step": 750
+    },
+    {
+      "epoch": 1.4990138067061145,
+      "grad_norm": 1.4324944544127631,
+      "learning_rate": 2.940681505250742e-06,
+      "loss": 0.2848,
+      "step": 760
+    },
+    {
+      "epoch": 1.5187376725838264,
+      "grad_norm": 1.4082984304420074,
+      "learning_rate": 2.8918649218953624e-06,
+      "loss": 0.2801,
+      "step": 770
+    },
+    {
+      "epoch": 1.5384615384615383,
+      "grad_norm": 1.5895657718154816,
+      "learning_rate": 2.84298119205983e-06,
+      "loss": 0.2807,
+      "step": 780
+    },
+    {
+      "epoch": 1.5581854043392505,
+      "grad_norm": 1.6080440377232041,
+      "learning_rate": 2.7940534530380666e-06,
+      "loss": 0.2835,
+      "step": 790
+    },
+    {
+      "epoch": 1.5779092702169626,
+      "grad_norm": 1.404915797241871,
+      "learning_rate": 2.7451048629541045e-06,
+      "loss": 0.2808,
+      "step": 800
+    },
+    {
+      "epoch": 1.5976331360946747,
+      "grad_norm": 1.4879672080505235,
+      "learning_rate": 2.6961585898010523e-06,
+      "loss": 0.2806,
+      "step": 810
+    },
+    {
+      "epoch": 1.6173570019723866,
+      "grad_norm": 1.3888602093522253,
+      "learning_rate": 2.647237800475384e-06,
+      "loss": 0.2832,
+      "step": 820
+    },
+    {
+      "epoch": 1.6370808678500985,
+      "grad_norm": 1.3670120148082392,
+      "learning_rate": 2.5983656498117525e-06,
+      "loss": 0.2825,
+      "step": 830
+    },
+    {
+      "epoch": 1.6568047337278107,
+      "grad_norm": 1.2812642080517738,
+      "learning_rate": 2.54956526962351e-06,
+      "loss": 0.279,
+      "step": 840
+    },
+    {
+      "epoch": 1.6765285996055228,
+      "grad_norm": 1.252430854449729,
+      "learning_rate": 2.5008597577541288e-06,
+      "loss": 0.2814,
+      "step": 850
+    },
+    {
+      "epoch": 1.6962524654832347,
+      "grad_norm": 1.2750427994477165,
+      "learning_rate": 2.45227216714469e-06,
+      "loss": 0.2792,
+      "step": 860
+    },
+    {
+      "epoch": 1.7159763313609466,
+      "grad_norm": 1.354377403404739,
+      "learning_rate": 2.403825494922636e-06,
+      "loss": 0.282,
+      "step": 870
+    },
+    {
+      "epoch": 1.7357001972386588,
+      "grad_norm": 1.4267990848182481,
+      "learning_rate": 2.3555426715169396e-06,
+      "loss": 0.2791,
+      "step": 880
+    },
+    {
+      "epoch": 1.755424063116371,
+      "grad_norm": 1.252857555239978,
+      "learning_rate": 2.3074465498048303e-06,
+      "loss": 0.2826,
+      "step": 890
+    },
+    {
+      "epoch": 1.7751479289940828,
+      "grad_norm": 1.2876786054611615,
+      "learning_rate": 2.259559894295244e-06,
+      "loss": 0.2789,
+      "step": 900
+    },
+    {
+      "epoch": 1.7948717948717947,
+      "grad_norm": 1.2629901820145135,
+      "learning_rate": 2.2119053703540866e-06,
+      "loss": 0.2791,
+      "step": 910
+    },
+    {
+      "epoch": 1.8145956607495068,
+      "grad_norm": 1.3562733049556417,
+      "learning_rate": 2.1645055334764237e-06,
+      "loss": 0.2807,
+      "step": 920
+    },
+    {
+      "epoch": 1.834319526627219,
+      "grad_norm": 1.3132542320273741,
+      "learning_rate": 2.1173828186106828e-06,
+      "loss": 0.2782,
+      "step": 930
+    },
+    {
+      "epoch": 1.854043392504931,
+      "grad_norm": 1.372645351488049,
+      "learning_rate": 2.0705595295399e-06,
+      "loss": 0.28,
+      "step": 940
+    },
+    {
+      "epoch": 1.873767258382643,
+      "grad_norm": 1.286506818666612,
+      "learning_rate": 2.0240578283250596e-06,
+      "loss": 0.2788,
+      "step": 950
+    },
+    {
+      "epoch": 1.893491124260355,
+      "grad_norm": 1.343985774681719,
+      "learning_rate": 1.9778997248155013e-06,
+      "loss": 0.2779,
+      "step": 960
+    },
+    {
+      "epoch": 1.913214990138067,
+      "grad_norm": 1.3873943864064089,
+      "learning_rate": 1.9321070662313824e-06,
+      "loss": 0.2768,
+      "step": 970
+    },
+    {
+      "epoch": 1.9329388560157792,
+      "grad_norm": 1.3822544572854645,
+      "learning_rate": 1.88670152682311e-06,
+      "loss": 0.2753,
+      "step": 980
+    },
+    {
+      "epoch": 1.952662721893491,
+      "grad_norm": 1.3724554338840655,
+      "learning_rate": 1.8417045976126347e-06,
+      "loss": 0.274,
+      "step": 990
+    },
+    {
+      "epoch": 1.972386587771203,
+      "grad_norm": 1.428387339598408,
+      "learning_rate": 1.797137576221482e-06,
+      "loss": 0.2775,
+      "step": 1000
+    },
+    {
+      "epoch": 1.9921104536489151,
+      "grad_norm": 1.2370547509299645,
+      "learning_rate": 1.753021556790314e-06,
+      "loss": 0.2746,
+      "step": 1010
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 0.3482723832130432,
+      "eval_runtime": 46.4255,
+      "eval_samples_per_second": 293.804,
+      "eval_steps_per_second": 1.163,
+      "step": 1014
+    },
+    {
+      "epoch": 2.0118343195266273,
+      "grad_norm": 1.9502351693684774,
+      "learning_rate": 1.7093774199948004e-06,
+      "loss": 0.2309,
+      "step": 1020
+    },
+    {
+      "epoch": 2.0315581854043394,
+      "grad_norm": 1.5862323859503984,
+      "learning_rate": 1.6662258231625331e-06,
+      "loss": 0.2026,
+      "step": 1030
+    },
+    {
+      "epoch": 2.051282051282051,
+      "grad_norm": 1.3292614459089434,
+      "learning_rate": 1.6235871904956431e-06,
+      "loss": 0.2034,
+      "step": 1040
+    },
+    {
+      "epoch": 2.0710059171597632,
+      "grad_norm": 1.2370582334736997,
+      "learning_rate": 1.5814817034037715e-06,
+      "loss": 0.2008,
+      "step": 1050
+    },
+    {
+      "epoch": 2.0907297830374754,
+      "grad_norm": 1.325897622024457,
+      "learning_rate": 1.5399292909519422e-06,
+      "loss": 0.2042,
+      "step": 1060
+    },
+    {
+      "epoch": 2.1104536489151875,
+      "grad_norm": 1.4548395791353137,
+      "learning_rate": 1.4989496204278897e-06,
+      "loss": 0.2025,
+      "step": 1070
+    },
+    {
+      "epoch": 2.1301775147928996,
+      "grad_norm": 1.36179677292465,
+      "learning_rate": 1.458562088033273e-06,
+      "loss": 0.1978,
+      "step": 1080
+    },
+    {
+      "epoch": 2.1499013806706113,
+      "grad_norm": 1.4589926591648759,
+      "learning_rate": 1.4187858097032086e-06,
+      "loss": 0.2024,
+      "step": 1090
+    },
+    {
+      "epoch": 2.1696252465483234,
+      "grad_norm": 1.3095440667780154,
+      "learning_rate": 1.3796396120584576e-06,
+      "loss": 0.2032,
+      "step": 1100
+    },
+    {
+      "epoch": 2.1893491124260356,
+      "grad_norm": 1.3522834520399176,
+      "learning_rate": 1.341142023494537e-06,
+      "loss": 0.1992,
+      "step": 1110
+    },
+    {
+      "epoch": 2.2090729783037477,
+      "grad_norm": 1.3914925068585928,
+      "learning_rate": 1.3033112654120032e-06,
+      "loss": 0.2029,
+      "step": 1120
+    },
+    {
+      "epoch": 2.2287968441814594,
+      "grad_norm": 1.2392072409116117,
+      "learning_rate": 1.266165243592024e-06,
+      "loss": 0.2019,
+      "step": 1130
+    },
+    {
+      "epoch": 2.2485207100591715,
+      "grad_norm": 1.450828785906611,
+      "learning_rate": 1.2297215397213442e-06,
+      "loss": 0.2029,
+      "step": 1140
+    },
+    {
+      "epoch": 2.2682445759368837,
+      "grad_norm": 1.3539897715774756,
+      "learning_rate": 1.1939974030706499e-06,
+      "loss": 0.1989,
+      "step": 1150
+    },
+    {
+      "epoch": 2.287968441814596,
+      "grad_norm": 1.3124427663284721,
+      "learning_rate": 1.1590097423302681e-06,
+      "loss": 0.2013,
+      "step": 1160
+    },
+    {
+      "epoch": 2.3076923076923075,
+      "grad_norm": 1.2751387286158546,
+      "learning_rate": 1.1247751176070688e-06,
+      "loss": 0.2003,
+      "step": 1170
+    },
+    {
+      "epoch": 2.3274161735700196,
+      "grad_norm": 1.2826788452929796,
+      "learning_rate": 1.0913097325863526e-06,
+      "loss": 0.2013,
+      "step": 1180
+    },
+    {
+      "epoch": 2.3471400394477318,
+      "grad_norm": 1.3449233167779666,
+      "learning_rate": 1.0586294268624391e-06,
+      "loss": 0.2031,
+      "step": 1190
+    },
+    {
+      "epoch": 2.366863905325444,
+      "grad_norm": 1.3034368496811286,
+      "learning_rate": 1.026749668441587e-06,
+      "loss": 0.1994,
+      "step": 1200
+    },
+    {
+      "epoch": 2.386587771203156,
+      "grad_norm": 1.3565807097213252,
+      "learning_rate": 9.956855464207873e-07,
+      "loss": 0.2,
+      "step": 1210
+    },
+    {
+      "epoch": 2.4063116370808677,
+      "grad_norm": 1.451004027193357,
+      "learning_rate": 9.654517638459015e-07,
+      "loss": 0.1996,
+      "step": 1220
+    },
+    {
+      "epoch": 2.42603550295858,
+      "grad_norm": 1.3107553476519733,
+      "learning_rate": 9.360626307525231e-07,
+      "loss": 0.2004,
+      "step": 1230
+    },
+    {
+      "epoch": 2.445759368836292,
+      "grad_norm": 1.2866100592193557,
+      "learning_rate": 9.075320573928513e-07,
+      "loss": 0.2026,
+      "step": 1240
+    },
+    {
+      "epoch": 2.465483234714004,
+      "grad_norm": 1.3169876215045113,
+      "learning_rate": 8.798735476517964e-07,
+      "loss": 0.2027,
+      "step": 1250
+    },
+    {
+      "epoch": 2.485207100591716,
+      "grad_norm": 1.2821201625196061,
+      "learning_rate": 8.531001926554134e-07,
+      "loss": 0.2011,
+      "step": 1260
+    },
+    {
+      "epoch": 2.504930966469428,
+      "grad_norm": 1.315132765819279,
+      "learning_rate": 8.272246645747072e-07,
+      "loss": 0.199,
+      "step": 1270
+    },
+    {
+      "epoch": 2.52465483234714,
+      "grad_norm": 1.276154658164099,
+      "learning_rate": 8.022592106277332e-07,
+      "loss": 0.2008,
+      "step": 1280
+    },
+    {
+      "epoch": 2.544378698224852,
+      "grad_norm": 1.2274421062761773,
+      "learning_rate": 7.782156472828299e-07,
+      "loss": 0.1998,
+      "step": 1290
+    },
+    {
+      "epoch": 2.564102564102564,
+      "grad_norm": 1.2435720383981574,
+      "learning_rate": 7.551053546657356e-07,
+      "loss": 0.1995,
+      "step": 1300
+    },
+    {
+      "epoch": 2.583826429980276,
+      "grad_norm": 1.2327909078947592,
+      "learning_rate": 7.329392711732278e-07,
+      "loss": 0.2024,
+      "step": 1310
+    },
+    {
+      "epoch": 2.603550295857988,
+      "grad_norm": 1.1783489485507048,
+      "learning_rate": 7.117278882958421e-07,
+      "loss": 0.2003,
+      "step": 1320
+    },
+    {
+      "epoch": 2.6232741617357003,
+      "grad_norm": 1.2687230261577986,
+      "learning_rate": 6.914812456521138e-07,
+      "loss": 0.2006,
+      "step": 1330
+    },
+    {
+      "epoch": 2.6429980276134124,
+      "grad_norm": 1.2646158919927277,
+      "learning_rate": 6.722089262366993e-07,
+      "loss": 0.1982,
+      "step": 1340
+    },
+    {
+      "epoch": 2.662721893491124,
+      "grad_norm": 1.2236131305338422,
+      "learning_rate": 6.539200518846226e-07,
+      "loss": 0.2001,
+      "step": 1350
+    },
+    {
+      "epoch": 2.6824457593688362,
+      "grad_norm": 1.2428023457207789,
+      "learning_rate": 6.366232789537923e-07,
+      "loss": 0.2048,
+      "step": 1360
+    },
+    {
+      "epoch": 2.7021696252465484,
+      "grad_norm": 1.2559417256017682,
+      "learning_rate": 6.203267942278395e-07,
+      "loss": 0.2012,
+      "step": 1370
+    },
+    {
+      "epoch": 2.7218934911242605,
+      "grad_norm": 1.2572564112264348,
+      "learning_rate": 6.050383110412069e-07,
+      "loss": 0.1994,
+      "step": 1380
+    },
+    {
+      "epoch": 2.7416173570019726,
+      "grad_norm": 1.1764889460619852,
+      "learning_rate": 5.907650656283289e-07,
+      "loss": 0.2002,
+      "step": 1390
+    },
+    {
+      "epoch": 2.7613412228796843,
+      "grad_norm": 1.2804661059833917,
+      "learning_rate": 5.775138136986298e-07,
+      "loss": 0.2002,
+      "step": 1400
+    },
+    {
+      "epoch": 2.7810650887573964,
+      "grad_norm": 1.3077263435732718,
+      "learning_rate": 5.652908272389604e-07,
+      "loss": 0.1995,
+      "step": 1410
+    },
+    {
+      "epoch": 2.8007889546351086,
+      "grad_norm": 1.231137370296971,
+      "learning_rate": 5.541018915449863e-07,
+      "loss": 0.1989,
+      "step": 1420
+    },
+    {
+      "epoch": 2.8205128205128203,
+      "grad_norm": 1.3443797697665705,
+      "learning_rate": 5.439523024829335e-07,
+      "loss": 0.1983,
+      "step": 1430
+    },
+    {
+      "epoch": 2.8402366863905324,
+      "grad_norm": 1.2092638219767884,
+      "learning_rate": 5.348468639829871e-07,
+      "loss": 0.2007,
+      "step": 1440
+    },
+    {
+      "epoch": 2.8599605522682445,
+      "grad_norm": 1.2392545674361426,
+      "learning_rate": 5.267898857655307e-07,
+      "loss": 0.201,
+      "step": 1450
+    },
+    {
+      "epoch": 2.8796844181459567,
+      "grad_norm": 1.255507262390408,
+      "learning_rate": 5.19785181301299e-07,
+      "loss": 0.2008,
+      "step": 1460
+    },
+    {
+      "epoch": 2.899408284023669,
+      "grad_norm": 1.2545629120536586,
+      "learning_rate": 5.138360660064146e-07,
+      "loss": 0.1979,
+      "step": 1470
+    },
+    {
+      "epoch": 2.9191321499013805,
+      "grad_norm": 1.2279624795193589,
+      "learning_rate": 5.08945355673159e-07,
+      "loss": 0.201,
+      "step": 1480
+    },
+    {
+      "epoch": 2.9388560157790926,
+      "grad_norm": 1.2395946923655343,
+      "learning_rate": 5.05115365137222e-07,
+      "loss": 0.1999,
+      "step": 1490
+    },
+    {
+      "epoch": 2.9585798816568047,
+      "grad_norm": 1.2212433583596156,
+      "learning_rate": 5.023479071820607e-07,
+      "loss": 0.1989,
+      "step": 1500
+    },
+    {
+      "epoch": 2.978303747534517,
+      "grad_norm": 1.298954785136158,
+      "learning_rate": 5.006442916808849e-07,
+      "loss": 0.2019,
+      "step": 1510
+    },
+    {
+      "epoch": 2.998027613412229,
+      "grad_norm": 1.3586461216494594,
+      "learning_rate": 5.000053249766787e-07,
+      "loss": 0.1999,
+      "step": 1520
+    },
+    {
+      "epoch": 3.0,
+      "eval_loss": 0.37075862288475037,
+      "eval_runtime": 53.9042,
+      "eval_samples_per_second": 253.041,
+      "eval_steps_per_second": 1.002,
+      "step": 1521
+    },
+    {
+      "epoch": 3.0,
+      "step": 1521,
+      "total_flos": 2547731650314240.0,
+      "train_loss": 0.2954053143131192,
+      "train_runtime": 9131.3778,
+      "train_samples_per_second": 85.143,
+      "train_steps_per_second": 0.167
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 1521,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2547731650314240.0,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}

training_eval_loss.png ADDED Viewed

training_loss.png ADDED Viewed