End of training

Browse files

Files changed (7) hide show

README.md +2 -1
all_results.json +12 -0
eval_results.json +7 -0
train_results.json +8 -0
trainer_state.json +1291 -0
training_eval_loss.png +0 -0
training_loss.png +0 -0

README.md CHANGED Viewed

@@ -4,6 +4,7 @@ license: llama3.1
 base_model: meta-llama/Meta-Llama-3.1-8B
 tags:
 - llama-factory
 - generated_from_trainer
 model-index:
 - name: top_16_ranking_stackexchange
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
 # top_16_ranking_stackexchange
-This model is a fine-tuned version of [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) on an unknown dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.7988

 base_model: meta-llama/Meta-Llama-3.1-8B
 tags:
 - llama-factory
+- full
 - generated_from_trainer
 model-index:
 - name: top_16_ranking_stackexchange
 # top_16_ranking_stackexchange
+This model is a fine-tuned version of [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) on the mlfoundations-dev/top_16_ranking_stackexchange dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.7988

all_results.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+    "epoch": 2.9980781550288276,
+    "eval_loss": 0.7988265156745911,
+    "eval_runtime": 629.3142,
+    "eval_samples_per_second": 25.067,
+    "eval_steps_per_second": 0.392,
+    "total_flos": 2939480986091520.0,
+    "train_loss": 0.7835989086716263,
+    "train_runtime": 103740.1935,
+    "train_samples_per_second": 8.667,
+    "train_steps_per_second": 0.017
+}

eval_results.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+    "epoch": 2.9980781550288276,
+    "eval_loss": 0.7988265156745911,
+    "eval_runtime": 629.3142,
+    "eval_samples_per_second": 25.067,
+    "eval_steps_per_second": 0.392
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 2.9980781550288276,
+    "total_flos": 2939480986091520.0,
+    "train_loss": 0.7835989086716263,
+    "train_runtime": 103740.1935,
+    "train_samples_per_second": 8.667,
+    "train_steps_per_second": 0.017
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1291 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.9980781550288276,
+  "eval_steps": 500,
+  "global_step": 1755,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.01708306641042067,
+      "grad_norm": 30.988456901868677,
+      "learning_rate": 5e-06,
+      "loss": 1.0716,
+      "step": 10
+    },
+    {
+      "epoch": 0.03416613282084134,
+      "grad_norm": 3.981743724280262,
+      "learning_rate": 5e-06,
+      "loss": 0.9952,
+      "step": 20
+    },
+    {
+      "epoch": 0.05124919923126201,
+      "grad_norm": 0.9238645350776216,
+      "learning_rate": 5e-06,
+      "loss": 0.9502,
+      "step": 30
+    },
+    {
+      "epoch": 0.06833226564168268,
+      "grad_norm": 1.0528903755575718,
+      "learning_rate": 5e-06,
+      "loss": 0.9206,
+      "step": 40
+    },
+    {
+      "epoch": 0.08541533205210335,
+      "grad_norm": 1.1750428009605969,
+      "learning_rate": 5e-06,
+      "loss": 0.9102,
+      "step": 50
+    },
+    {
+      "epoch": 0.10249839846252402,
+      "grad_norm": 0.7437586737148641,
+      "learning_rate": 5e-06,
+      "loss": 0.8945,
+      "step": 60
+    },
+    {
+      "epoch": 0.11958146487294469,
+      "grad_norm": 0.6775772425198474,
+      "learning_rate": 5e-06,
+      "loss": 0.8836,
+      "step": 70
+    },
+    {
+      "epoch": 0.13666453128336536,
+      "grad_norm": 0.5992902350468045,
+      "learning_rate": 5e-06,
+      "loss": 0.8771,
+      "step": 80
+    },
+    {
+      "epoch": 0.15374759769378604,
+      "grad_norm": 0.5246527905235602,
+      "learning_rate": 5e-06,
+      "loss": 0.8688,
+      "step": 90
+    },
+    {
+      "epoch": 0.1708306641042067,
+      "grad_norm": 0.6744242377338667,
+      "learning_rate": 5e-06,
+      "loss": 0.8674,
+      "step": 100
+    },
+    {
+      "epoch": 0.18791373051462737,
+      "grad_norm": 0.5815020177486712,
+      "learning_rate": 5e-06,
+      "loss": 0.8658,
+      "step": 110
+    },
+    {
+      "epoch": 0.20499679692504805,
+      "grad_norm": 0.612553188685598,
+      "learning_rate": 5e-06,
+      "loss": 0.8607,
+      "step": 120
+    },
+    {
+      "epoch": 0.22207986333546872,
+      "grad_norm": 0.550502713970348,
+      "learning_rate": 5e-06,
+      "loss": 0.8597,
+      "step": 130
+    },
+    {
+      "epoch": 0.23916292974588937,
+      "grad_norm": 0.6852509049344128,
+      "learning_rate": 5e-06,
+      "loss": 0.854,
+      "step": 140
+    },
+    {
+      "epoch": 0.25624599615631005,
+      "grad_norm": 0.578544920290097,
+      "learning_rate": 5e-06,
+      "loss": 0.8583,
+      "step": 150
+    },
+    {
+      "epoch": 0.27332906256673073,
+      "grad_norm": 0.5189175609736144,
+      "learning_rate": 5e-06,
+      "loss": 0.851,
+      "step": 160
+    },
+    {
+      "epoch": 0.2904121289771514,
+      "grad_norm": 0.5087573435881564,
+      "learning_rate": 5e-06,
+      "loss": 0.8505,
+      "step": 170
+    },
+    {
+      "epoch": 0.3074951953875721,
+      "grad_norm": 0.6966346988112697,
+      "learning_rate": 5e-06,
+      "loss": 0.8454,
+      "step": 180
+    },
+    {
+      "epoch": 0.32457826179799276,
+      "grad_norm": 0.5997059592749316,
+      "learning_rate": 5e-06,
+      "loss": 0.8484,
+      "step": 190
+    },
+    {
+      "epoch": 0.3416613282084134,
+      "grad_norm": 0.6852631417751086,
+      "learning_rate": 5e-06,
+      "loss": 0.8483,
+      "step": 200
+    },
+    {
+      "epoch": 0.35874439461883406,
+      "grad_norm": 0.6164595381164006,
+      "learning_rate": 5e-06,
+      "loss": 0.8462,
+      "step": 210
+    },
+    {
+      "epoch": 0.37582746102925474,
+      "grad_norm": 0.6904944801515591,
+      "learning_rate": 5e-06,
+      "loss": 0.8445,
+      "step": 220
+    },
+    {
+      "epoch": 0.3929105274396754,
+      "grad_norm": 0.6512004476683885,
+      "learning_rate": 5e-06,
+      "loss": 0.8399,
+      "step": 230
+    },
+    {
+      "epoch": 0.4099935938500961,
+      "grad_norm": 0.6184327993659001,
+      "learning_rate": 5e-06,
+      "loss": 0.8391,
+      "step": 240
+    },
+    {
+      "epoch": 0.42707666026051677,
+      "grad_norm": 0.802165112115216,
+      "learning_rate": 5e-06,
+      "loss": 0.8384,
+      "step": 250
+    },
+    {
+      "epoch": 0.44415972667093745,
+      "grad_norm": 0.6499733286909369,
+      "learning_rate": 5e-06,
+      "loss": 0.8364,
+      "step": 260
+    },
+    {
+      "epoch": 0.4612427930813581,
+      "grad_norm": 0.5388351566067404,
+      "learning_rate": 5e-06,
+      "loss": 0.8303,
+      "step": 270
+    },
+    {
+      "epoch": 0.47832585949177875,
+      "grad_norm": 0.5438990498346035,
+      "learning_rate": 5e-06,
+      "loss": 0.832,
+      "step": 280
+    },
+    {
+      "epoch": 0.4954089259021994,
+      "grad_norm": 0.5187067150821502,
+      "learning_rate": 5e-06,
+      "loss": 0.8325,
+      "step": 290
+    },
+    {
+      "epoch": 0.5124919923126201,
+      "grad_norm": 0.6697424824465937,
+      "learning_rate": 5e-06,
+      "loss": 0.8337,
+      "step": 300
+    },
+    {
+      "epoch": 0.5295750587230408,
+      "grad_norm": 0.6633949919937228,
+      "learning_rate": 5e-06,
+      "loss": 0.8296,
+      "step": 310
+    },
+    {
+      "epoch": 0.5466581251334615,
+      "grad_norm": 0.5550184232084733,
+      "learning_rate": 5e-06,
+      "loss": 0.8293,
+      "step": 320
+    },
+    {
+      "epoch": 0.5637411915438821,
+      "grad_norm": 0.725923344191194,
+      "learning_rate": 5e-06,
+      "loss": 0.8315,
+      "step": 330
+    },
+    {
+      "epoch": 0.5808242579543028,
+      "grad_norm": 0.6017986140852183,
+      "learning_rate": 5e-06,
+      "loss": 0.828,
+      "step": 340
+    },
+    {
+      "epoch": 0.5979073243647235,
+      "grad_norm": 0.5482521266135052,
+      "learning_rate": 5e-06,
+      "loss": 0.8299,
+      "step": 350
+    },
+    {
+      "epoch": 0.6149903907751442,
+      "grad_norm": 0.5278215410540681,
+      "learning_rate": 5e-06,
+      "loss": 0.832,
+      "step": 360
+    },
+    {
+      "epoch": 0.6320734571855648,
+      "grad_norm": 0.6984298439291815,
+      "learning_rate": 5e-06,
+      "loss": 0.8291,
+      "step": 370
+    },
+    {
+      "epoch": 0.6491565235959855,
+      "grad_norm": 0.5017079870431141,
+      "learning_rate": 5e-06,
+      "loss": 0.827,
+      "step": 380
+    },
+    {
+      "epoch": 0.6662395900064062,
+      "grad_norm": 0.5032298742038609,
+      "learning_rate": 5e-06,
+      "loss": 0.8272,
+      "step": 390
+    },
+    {
+      "epoch": 0.6833226564168268,
+      "grad_norm": 0.5330416146652471,
+      "learning_rate": 5e-06,
+      "loss": 0.8247,
+      "step": 400
+    },
+    {
+      "epoch": 0.7004057228272474,
+      "grad_norm": 0.57373005832922,
+      "learning_rate": 5e-06,
+      "loss": 0.8242,
+      "step": 410
+    },
+    {
+      "epoch": 0.7174887892376681,
+      "grad_norm": 0.5257091555093115,
+      "learning_rate": 5e-06,
+      "loss": 0.8266,
+      "step": 420
+    },
+    {
+      "epoch": 0.7345718556480888,
+      "grad_norm": 0.5789856149074786,
+      "learning_rate": 5e-06,
+      "loss": 0.8242,
+      "step": 430
+    },
+    {
+      "epoch": 0.7516549220585095,
+      "grad_norm": 0.5299653272462573,
+      "learning_rate": 5e-06,
+      "loss": 0.816,
+      "step": 440
+    },
+    {
+      "epoch": 0.7687379884689302,
+      "grad_norm": 0.5964593947123102,
+      "learning_rate": 5e-06,
+      "loss": 0.8242,
+      "step": 450
+    },
+    {
+      "epoch": 0.7858210548793508,
+      "grad_norm": 0.7283098877992732,
+      "learning_rate": 5e-06,
+      "loss": 0.8241,
+      "step": 460
+    },
+    {
+      "epoch": 0.8029041212897715,
+      "grad_norm": 0.5985448658584591,
+      "learning_rate": 5e-06,
+      "loss": 0.8197,
+      "step": 470
+    },
+    {
+      "epoch": 0.8199871877001922,
+      "grad_norm": 0.5623410005491558,
+      "learning_rate": 5e-06,
+      "loss": 0.8213,
+      "step": 480
+    },
+    {
+      "epoch": 0.8370702541106129,
+      "grad_norm": 0.6408816581220068,
+      "learning_rate": 5e-06,
+      "loss": 0.823,
+      "step": 490
+    },
+    {
+      "epoch": 0.8541533205210335,
+      "grad_norm": 0.6249632483859644,
+      "learning_rate": 5e-06,
+      "loss": 0.8184,
+      "step": 500
+    },
+    {
+      "epoch": 0.8712363869314542,
+      "grad_norm": 0.5922346578431508,
+      "learning_rate": 5e-06,
+      "loss": 0.8144,
+      "step": 510
+    },
+    {
+      "epoch": 0.8883194533418749,
+      "grad_norm": 0.690797808616181,
+      "learning_rate": 5e-06,
+      "loss": 0.8179,
+      "step": 520
+    },
+    {
+      "epoch": 0.9054025197522956,
+      "grad_norm": 0.5637410385766849,
+      "learning_rate": 5e-06,
+      "loss": 0.8153,
+      "step": 530
+    },
+    {
+      "epoch": 0.9224855861627163,
+      "grad_norm": 0.6713092701845222,
+      "learning_rate": 5e-06,
+      "loss": 0.8156,
+      "step": 540
+    },
+    {
+      "epoch": 0.9395686525731369,
+      "grad_norm": 0.5614251903253611,
+      "learning_rate": 5e-06,
+      "loss": 0.8151,
+      "step": 550
+    },
+    {
+      "epoch": 0.9566517189835575,
+      "grad_norm": 0.488524190594288,
+      "learning_rate": 5e-06,
+      "loss": 0.8165,
+      "step": 560
+    },
+    {
+      "epoch": 0.9737347853939782,
+      "grad_norm": 0.5588451830957717,
+      "learning_rate": 5e-06,
+      "loss": 0.8147,
+      "step": 570
+    },
+    {
+      "epoch": 0.9908178518043989,
+      "grad_norm": 0.5319341754740086,
+      "learning_rate": 5e-06,
+      "loss": 0.8146,
+      "step": 580
+    },
+    {
+      "epoch": 0.9993593850096092,
+      "eval_loss": 0.8145768046379089,
+      "eval_runtime": 623.2585,
+      "eval_samples_per_second": 25.311,
+      "eval_steps_per_second": 0.396,
+      "step": 585
+    },
+    {
+      "epoch": 1.0079009182148195,
+      "grad_norm": 0.7444773824556985,
+      "learning_rate": 5e-06,
+      "loss": 0.8416,
+      "step": 590
+    },
+    {
+      "epoch": 1.0249839846252402,
+      "grad_norm": 0.6182001774270124,
+      "learning_rate": 5e-06,
+      "loss": 0.7786,
+      "step": 600
+    },
+    {
+      "epoch": 1.0420670510356609,
+      "grad_norm": 0.5471145639195996,
+      "learning_rate": 5e-06,
+      "loss": 0.7689,
+      "step": 610
+    },
+    {
+      "epoch": 1.0591501174460816,
+      "grad_norm": 0.5749072203498992,
+      "learning_rate": 5e-06,
+      "loss": 0.774,
+      "step": 620
+    },
+    {
+      "epoch": 1.0762331838565022,
+      "grad_norm": 0.5458121480997504,
+      "learning_rate": 5e-06,
+      "loss": 0.7727,
+      "step": 630
+    },
+    {
+      "epoch": 1.093316250266923,
+      "grad_norm": 0.57658998771773,
+      "learning_rate": 5e-06,
+      "loss": 0.7723,
+      "step": 640
+    },
+    {
+      "epoch": 1.1103993166773436,
+      "grad_norm": 0.718911287142942,
+      "learning_rate": 5e-06,
+      "loss": 0.7761,
+      "step": 650
+    },
+    {
+      "epoch": 1.1274823830877643,
+      "grad_norm": 0.7129614149484951,
+      "learning_rate": 5e-06,
+      "loss": 0.7791,
+      "step": 660
+    },
+    {
+      "epoch": 1.144565449498185,
+      "grad_norm": 0.5411663435831485,
+      "learning_rate": 5e-06,
+      "loss": 0.7737,
+      "step": 670
+    },
+    {
+      "epoch": 1.1616485159086056,
+      "grad_norm": 0.7987307718934373,
+      "learning_rate": 5e-06,
+      "loss": 0.7665,
+      "step": 680
+    },
+    {
+      "epoch": 1.1787315823190263,
+      "grad_norm": 0.5752310814305064,
+      "learning_rate": 5e-06,
+      "loss": 0.7742,
+      "step": 690
+    },
+    {
+      "epoch": 1.195814648729447,
+      "grad_norm": 0.5310768207788683,
+      "learning_rate": 5e-06,
+      "loss": 0.7738,
+      "step": 700
+    },
+    {
+      "epoch": 1.2128977151398677,
+      "grad_norm": 0.5646734820206145,
+      "learning_rate": 5e-06,
+      "loss": 0.7745,
+      "step": 710
+    },
+    {
+      "epoch": 1.2299807815502883,
+      "grad_norm": 0.5131848643270003,
+      "learning_rate": 5e-06,
+      "loss": 0.7749,
+      "step": 720
+    },
+    {
+      "epoch": 1.247063847960709,
+      "grad_norm": 0.7018347821869065,
+      "learning_rate": 5e-06,
+      "loss": 0.7761,
+      "step": 730
+    },
+    {
+      "epoch": 1.2641469143711297,
+      "grad_norm": 0.5677858771240941,
+      "learning_rate": 5e-06,
+      "loss": 0.7733,
+      "step": 740
+    },
+    {
+      "epoch": 1.2812299807815504,
+      "grad_norm": 0.5314774866996713,
+      "learning_rate": 5e-06,
+      "loss": 0.7751,
+      "step": 750
+    },
+    {
+      "epoch": 1.298313047191971,
+      "grad_norm": 0.6656368518895404,
+      "learning_rate": 5e-06,
+      "loss": 0.7749,
+      "step": 760
+    },
+    {
+      "epoch": 1.3153961136023917,
+      "grad_norm": 0.5039492371844833,
+      "learning_rate": 5e-06,
+      "loss": 0.7741,
+      "step": 770
+    },
+    {
+      "epoch": 1.3324791800128124,
+      "grad_norm": 0.5105033014036762,
+      "learning_rate": 5e-06,
+      "loss": 0.7784,
+      "step": 780
+    },
+    {
+      "epoch": 1.349562246423233,
+      "grad_norm": 0.5030749236842763,
+      "learning_rate": 5e-06,
+      "loss": 0.7758,
+      "step": 790
+    },
+    {
+      "epoch": 1.3666453128336538,
+      "grad_norm": 0.5846299051076495,
+      "learning_rate": 5e-06,
+      "loss": 0.7733,
+      "step": 800
+    },
+    {
+      "epoch": 1.3837283792440744,
+      "grad_norm": 0.5992440442463463,
+      "learning_rate": 5e-06,
+      "loss": 0.7739,
+      "step": 810
+    },
+    {
+      "epoch": 1.4008114456544951,
+      "grad_norm": 0.547090040748775,
+      "learning_rate": 5e-06,
+      "loss": 0.7744,
+      "step": 820
+    },
+    {
+      "epoch": 1.4178945120649158,
+      "grad_norm": 0.5741884776951681,
+      "learning_rate": 5e-06,
+      "loss": 0.7703,
+      "step": 830
+    },
+    {
+      "epoch": 1.4349775784753362,
+      "grad_norm": 0.5238258323687885,
+      "learning_rate": 5e-06,
+      "loss": 0.7701,
+      "step": 840
+    },
+    {
+      "epoch": 1.452060644885757,
+      "grad_norm": 0.5265486787202277,
+      "learning_rate": 5e-06,
+      "loss": 0.7687,
+      "step": 850
+    },
+    {
+      "epoch": 1.4691437112961776,
+      "grad_norm": 0.5591682134523662,
+      "learning_rate": 5e-06,
+      "loss": 0.7694,
+      "step": 860
+    },
+    {
+      "epoch": 1.4862267777065983,
+      "grad_norm": 0.5637486227419112,
+      "learning_rate": 5e-06,
+      "loss": 0.7713,
+      "step": 870
+    },
+    {
+      "epoch": 1.503309844117019,
+      "grad_norm": 0.5276872431482891,
+      "learning_rate": 5e-06,
+      "loss": 0.7687,
+      "step": 880
+    },
+    {
+      "epoch": 1.5203929105274396,
+      "grad_norm": 0.5299879511165935,
+      "learning_rate": 5e-06,
+      "loss": 0.7719,
+      "step": 890
+    },
+    {
+      "epoch": 1.5374759769378603,
+      "grad_norm": 0.48443212446653844,
+      "learning_rate": 5e-06,
+      "loss": 0.7704,
+      "step": 900
+    },
+    {
+      "epoch": 1.554559043348281,
+      "grad_norm": 0.5258029162836203,
+      "learning_rate": 5e-06,
+      "loss": 0.7681,
+      "step": 910
+    },
+    {
+      "epoch": 1.5716421097587017,
+      "grad_norm": 0.5839360099287706,
+      "learning_rate": 5e-06,
+      "loss": 0.772,
+      "step": 920
+    },
+    {
+      "epoch": 1.5887251761691223,
+      "grad_norm": 0.5806331874369932,
+      "learning_rate": 5e-06,
+      "loss": 0.7736,
+      "step": 930
+    },
+    {
+      "epoch": 1.605808242579543,
+      "grad_norm": 0.6613985728737157,
+      "learning_rate": 5e-06,
+      "loss": 0.7724,
+      "step": 940
+    },
+    {
+      "epoch": 1.6228913089899637,
+      "grad_norm": 0.5224489011940004,
+      "learning_rate": 5e-06,
+      "loss": 0.7711,
+      "step": 950
+    },
+    {
+      "epoch": 1.6399743754003844,
+      "grad_norm": 0.5454437716534818,
+      "learning_rate": 5e-06,
+      "loss": 0.7715,
+      "step": 960
+    },
+    {
+      "epoch": 1.657057441810805,
+      "grad_norm": 0.5161526858636564,
+      "learning_rate": 5e-06,
+      "loss": 0.7783,
+      "step": 970
+    },
+    {
+      "epoch": 1.6741405082212257,
+      "grad_norm": 0.7631274530949943,
+      "learning_rate": 5e-06,
+      "loss": 0.7721,
+      "step": 980
+    },
+    {
+      "epoch": 1.6912235746316464,
+      "grad_norm": 0.6083126340996768,
+      "learning_rate": 5e-06,
+      "loss": 0.7718,
+      "step": 990
+    },
+    {
+      "epoch": 1.708306641042067,
+      "grad_norm": 0.5310268793627193,
+      "learning_rate": 5e-06,
+      "loss": 0.7741,
+      "step": 1000
+    },
+    {
+      "epoch": 1.7253897074524878,
+      "grad_norm": 0.4883757515317452,
+      "learning_rate": 5e-06,
+      "loss": 0.771,
+      "step": 1010
+    },
+    {
+      "epoch": 1.7424727738629084,
+      "grad_norm": 0.5215621795180689,
+      "learning_rate": 5e-06,
+      "loss": 0.7728,
+      "step": 1020
+    },
+    {
+      "epoch": 1.759555840273329,
+      "grad_norm": 0.49887803010112675,
+      "learning_rate": 5e-06,
+      "loss": 0.7702,
+      "step": 1030
+    },
+    {
+      "epoch": 1.7766389066837496,
+      "grad_norm": 0.538143965723932,
+      "learning_rate": 5e-06,
+      "loss": 0.7695,
+      "step": 1040
+    },
+    {
+      "epoch": 1.7937219730941703,
+      "grad_norm": 0.5943865951120142,
+      "learning_rate": 5e-06,
+      "loss": 0.7713,
+      "step": 1050
+    },
+    {
+      "epoch": 1.810805039504591,
+      "grad_norm": 0.5034904524114908,
+      "learning_rate": 5e-06,
+      "loss": 0.7707,
+      "step": 1060
+    },
+    {
+      "epoch": 1.8278881059150116,
+      "grad_norm": 0.5739027654813702,
+      "learning_rate": 5e-06,
+      "loss": 0.767,
+      "step": 1070
+    },
+    {
+      "epoch": 1.8449711723254323,
+      "grad_norm": 0.5345337736484315,
+      "learning_rate": 5e-06,
+      "loss": 0.7739,
+      "step": 1080
+    },
+    {
+      "epoch": 1.862054238735853,
+      "grad_norm": 0.8233000709404549,
+      "learning_rate": 5e-06,
+      "loss": 0.7697,
+      "step": 1090
+    },
+    {
+      "epoch": 1.8791373051462736,
+      "grad_norm": 0.5699216631288021,
+      "learning_rate": 5e-06,
+      "loss": 0.7715,
+      "step": 1100
+    },
+    {
+      "epoch": 1.8962203715566943,
+      "grad_norm": 0.5480692157923471,
+      "learning_rate": 5e-06,
+      "loss": 0.7715,
+      "step": 1110
+    },
+    {
+      "epoch": 1.913303437967115,
+      "grad_norm": 0.6493917189844246,
+      "learning_rate": 5e-06,
+      "loss": 0.776,
+      "step": 1120
+    },
+    {
+      "epoch": 1.9303865043775357,
+      "grad_norm": 0.4971193417821817,
+      "learning_rate": 5e-06,
+      "loss": 0.7689,
+      "step": 1130
+    },
+    {
+      "epoch": 1.9474695707879563,
+      "grad_norm": 0.5213534104860004,
+      "learning_rate": 5e-06,
+      "loss": 0.7691,
+      "step": 1140
+    },
+    {
+      "epoch": 1.964552637198377,
+      "grad_norm": 0.5515331643144213,
+      "learning_rate": 5e-06,
+      "loss": 0.7684,
+      "step": 1150
+    },
+    {
+      "epoch": 1.9816357036087977,
+      "grad_norm": 0.6132524891266977,
+      "learning_rate": 5e-06,
+      "loss": 0.7651,
+      "step": 1160
+    },
+    {
+      "epoch": 1.9987187700192184,
+      "grad_norm": 0.5207251406889574,
+      "learning_rate": 5e-06,
+      "loss": 0.7679,
+      "step": 1170
+    },
+    {
+      "epoch": 1.9987187700192184,
+      "eval_loss": 0.8001261949539185,
+      "eval_runtime": 623.1773,
+      "eval_samples_per_second": 25.314,
+      "eval_steps_per_second": 0.396,
+      "step": 1170
+    },
+    {
+      "epoch": 2.015801836429639,
+      "grad_norm": 0.696280654661383,
+      "learning_rate": 5e-06,
+      "loss": 0.7761,
+      "step": 1180
+    },
+    {
+      "epoch": 2.0328849028400597,
+      "grad_norm": 0.6141244434895877,
+      "learning_rate": 5e-06,
+      "loss": 0.7251,
+      "step": 1190
+    },
+    {
+      "epoch": 2.0499679692504804,
+      "grad_norm": 0.6049456191917907,
+      "learning_rate": 5e-06,
+      "loss": 0.7275,
+      "step": 1200
+    },
+    {
+      "epoch": 2.067051035660901,
+      "grad_norm": 0.5271804189451788,
+      "learning_rate": 5e-06,
+      "loss": 0.7222,
+      "step": 1210
+    },
+    {
+      "epoch": 2.0841341020713218,
+      "grad_norm": 0.5825184609454925,
+      "learning_rate": 5e-06,
+      "loss": 0.7265,
+      "step": 1220
+    },
+    {
+      "epoch": 2.1012171684817424,
+      "grad_norm": 0.5032946228138522,
+      "learning_rate": 5e-06,
+      "loss": 0.7253,
+      "step": 1230
+    },
+    {
+      "epoch": 2.118300234892163,
+      "grad_norm": 0.5147327767567343,
+      "learning_rate": 5e-06,
+      "loss": 0.7237,
+      "step": 1240
+    },
+    {
+      "epoch": 2.135383301302584,
+      "grad_norm": 0.5337482087146928,
+      "learning_rate": 5e-06,
+      "loss": 0.7281,
+      "step": 1250
+    },
+    {
+      "epoch": 2.1524663677130045,
+      "grad_norm": 0.59826922882338,
+      "learning_rate": 5e-06,
+      "loss": 0.7328,
+      "step": 1260
+    },
+    {
+      "epoch": 2.169549434123425,
+      "grad_norm": 0.5946570187866338,
+      "learning_rate": 5e-06,
+      "loss": 0.7295,
+      "step": 1270
+    },
+    {
+      "epoch": 2.186632500533846,
+      "grad_norm": 0.6437960040336966,
+      "learning_rate": 5e-06,
+      "loss": 0.7337,
+      "step": 1280
+    },
+    {
+      "epoch": 2.2037155669442665,
+      "grad_norm": 0.5667571654097528,
+      "learning_rate": 5e-06,
+      "loss": 0.7257,
+      "step": 1290
+    },
+    {
+      "epoch": 2.220798633354687,
+      "grad_norm": 0.5711027651356156,
+      "learning_rate": 5e-06,
+      "loss": 0.7295,
+      "step": 1300
+    },
+    {
+      "epoch": 2.237881699765108,
+      "grad_norm": 0.5604913831263466,
+      "learning_rate": 5e-06,
+      "loss": 0.7258,
+      "step": 1310
+    },
+    {
+      "epoch": 2.2549647661755285,
+      "grad_norm": 0.49584992475251044,
+      "learning_rate": 5e-06,
+      "loss": 0.7282,
+      "step": 1320
+    },
+    {
+      "epoch": 2.2720478325859492,
+      "grad_norm": 0.48636367384700585,
+      "learning_rate": 5e-06,
+      "loss": 0.7289,
+      "step": 1330
+    },
+    {
+      "epoch": 2.28913089899637,
+      "grad_norm": 0.5593094273198317,
+      "learning_rate": 5e-06,
+      "loss": 0.7261,
+      "step": 1340
+    },
+    {
+      "epoch": 2.3062139654067906,
+      "grad_norm": 0.5334937979304919,
+      "learning_rate": 5e-06,
+      "loss": 0.7274,
+      "step": 1350
+    },
+    {
+      "epoch": 2.3232970318172113,
+      "grad_norm": 0.63384705400206,
+      "learning_rate": 5e-06,
+      "loss": 0.7295,
+      "step": 1360
+    },
+    {
+      "epoch": 2.340380098227632,
+      "grad_norm": 0.5143434005458392,
+      "learning_rate": 5e-06,
+      "loss": 0.7283,
+      "step": 1370
+    },
+    {
+      "epoch": 2.3574631646380526,
+      "grad_norm": 0.6076783258962472,
+      "learning_rate": 5e-06,
+      "loss": 0.7296,
+      "step": 1380
+    },
+    {
+      "epoch": 2.3745462310484733,
+      "grad_norm": 0.6067756593571197,
+      "learning_rate": 5e-06,
+      "loss": 0.7248,
+      "step": 1390
+    },
+    {
+      "epoch": 2.391629297458894,
+      "grad_norm": 0.6071395306047531,
+      "learning_rate": 5e-06,
+      "loss": 0.7318,
+      "step": 1400
+    },
+    {
+      "epoch": 2.4087123638693146,
+      "grad_norm": 0.5997962743795122,
+      "learning_rate": 5e-06,
+      "loss": 0.7272,
+      "step": 1410
+    },
+    {
+      "epoch": 2.4257954302797353,
+      "grad_norm": 0.6633499264729928,
+      "learning_rate": 5e-06,
+      "loss": 0.7277,
+      "step": 1420
+    },
+    {
+      "epoch": 2.442878496690156,
+      "grad_norm": 0.5544821713317218,
+      "learning_rate": 5e-06,
+      "loss": 0.7307,
+      "step": 1430
+    },
+    {
+      "epoch": 2.4599615631005767,
+      "grad_norm": 0.5348856989916878,
+      "learning_rate": 5e-06,
+      "loss": 0.7313,
+      "step": 1440
+    },
+    {
+      "epoch": 2.4770446295109974,
+      "grad_norm": 0.5567282104551005,
+      "learning_rate": 5e-06,
+      "loss": 0.7304,
+      "step": 1450
+    },
+    {
+      "epoch": 2.494127695921418,
+      "grad_norm": 0.49476735212888745,
+      "learning_rate": 5e-06,
+      "loss": 0.7301,
+      "step": 1460
+    },
+    {
+      "epoch": 2.5112107623318387,
+      "grad_norm": 0.5172195774829064,
+      "learning_rate": 5e-06,
+      "loss": 0.7279,
+      "step": 1470
+    },
+    {
+      "epoch": 2.5282938287422594,
+      "grad_norm": 0.5822469173059942,
+      "learning_rate": 5e-06,
+      "loss": 0.728,
+      "step": 1480
+    },
+    {
+      "epoch": 2.54537689515268,
+      "grad_norm": 0.5665275024242861,
+      "learning_rate": 5e-06,
+      "loss": 0.727,
+      "step": 1490
+    },
+    {
+      "epoch": 2.5624599615631007,
+      "grad_norm": 0.5261365140179813,
+      "learning_rate": 5e-06,
+      "loss": 0.7326,
+      "step": 1500
+    },
+    {
+      "epoch": 2.5795430279735214,
+      "grad_norm": 0.5317881820166406,
+      "learning_rate": 5e-06,
+      "loss": 0.7316,
+      "step": 1510
+    },
+    {
+      "epoch": 2.596626094383942,
+      "grad_norm": 0.5623482133625999,
+      "learning_rate": 5e-06,
+      "loss": 0.7312,
+      "step": 1520
+    },
+    {
+      "epoch": 2.6137091607943628,
+      "grad_norm": 0.5379867031953368,
+      "learning_rate": 5e-06,
+      "loss": 0.729,
+      "step": 1530
+    },
+    {
+      "epoch": 2.6307922272047835,
+      "grad_norm": 0.552606133346205,
+      "learning_rate": 5e-06,
+      "loss": 0.7282,
+      "step": 1540
+    },
+    {
+      "epoch": 2.647875293615204,
+      "grad_norm": 0.5260176310975024,
+      "learning_rate": 5e-06,
+      "loss": 0.7316,
+      "step": 1550
+    },
+    {
+      "epoch": 2.664958360025625,
+      "grad_norm": 0.5417204925891144,
+      "learning_rate": 5e-06,
+      "loss": 0.7311,
+      "step": 1560
+    },
+    {
+      "epoch": 2.6820414264360455,
+      "grad_norm": 0.5191792624557837,
+      "learning_rate": 5e-06,
+      "loss": 0.7317,
+      "step": 1570
+    },
+    {
+      "epoch": 2.699124492846466,
+      "grad_norm": 0.5082503207244659,
+      "learning_rate": 5e-06,
+      "loss": 0.7308,
+      "step": 1580
+    },
+    {
+      "epoch": 2.716207559256887,
+      "grad_norm": 0.5352199374254042,
+      "learning_rate": 5e-06,
+      "loss": 0.7322,
+      "step": 1590
+    },
+    {
+      "epoch": 2.7332906256673075,
+      "grad_norm": 0.4938969791102419,
+      "learning_rate": 5e-06,
+      "loss": 0.7299,
+      "step": 1600
+    },
+    {
+      "epoch": 2.750373692077728,
+      "grad_norm": 0.5349619842682974,
+      "learning_rate": 5e-06,
+      "loss": 0.7311,
+      "step": 1610
+    },
+    {
+      "epoch": 2.767456758488149,
+      "grad_norm": 0.5982776306942509,
+      "learning_rate": 5e-06,
+      "loss": 0.7326,
+      "step": 1620
+    },
+    {
+      "epoch": 2.7845398248985695,
+      "grad_norm": 0.5610641447482575,
+      "learning_rate": 5e-06,
+      "loss": 0.7283,
+      "step": 1630
+    },
+    {
+      "epoch": 2.8016228913089902,
+      "grad_norm": 0.5289582066062115,
+      "learning_rate": 5e-06,
+      "loss": 0.7322,
+      "step": 1640
+    },
+    {
+      "epoch": 2.818705957719411,
+      "grad_norm": 0.5307084408188756,
+      "learning_rate": 5e-06,
+      "loss": 0.7305,
+      "step": 1650
+    },
+    {
+      "epoch": 2.8357890241298316,
+      "grad_norm": 0.5768867367143191,
+      "learning_rate": 5e-06,
+      "loss": 0.7318,
+      "step": 1660
+    },
+    {
+      "epoch": 2.852872090540252,
+      "grad_norm": 0.5013755884966334,
+      "learning_rate": 5e-06,
+      "loss": 0.7261,
+      "step": 1670
+    },
+    {
+      "epoch": 2.8699551569506725,
+      "grad_norm": 0.5386292168646896,
+      "learning_rate": 5e-06,
+      "loss": 0.7326,
+      "step": 1680
+    },
+    {
+      "epoch": 2.887038223361093,
+      "grad_norm": 0.5042887110473108,
+      "learning_rate": 5e-06,
+      "loss": 0.7267,
+      "step": 1690
+    },
+    {
+      "epoch": 2.904121289771514,
+      "grad_norm": 0.5778864247918416,
+      "learning_rate": 5e-06,
+      "loss": 0.7304,
+      "step": 1700
+    },
+    {
+      "epoch": 2.9212043561819345,
+      "grad_norm": 0.4628270969407437,
+      "learning_rate": 5e-06,
+      "loss": 0.7337,
+      "step": 1710
+    },
+    {
+      "epoch": 2.938287422592355,
+      "grad_norm": 0.4828533328054976,
+      "learning_rate": 5e-06,
+      "loss": 0.7305,
+      "step": 1720
+    },
+    {
+      "epoch": 2.955370489002776,
+      "grad_norm": 0.5335294858018457,
+      "learning_rate": 5e-06,
+      "loss": 0.7255,
+      "step": 1730
+    },
+    {
+      "epoch": 2.9724535554131966,
+      "grad_norm": 0.4855853932089583,
+      "learning_rate": 5e-06,
+      "loss": 0.7299,
+      "step": 1740
+    },
+    {
+      "epoch": 2.9895366218236172,
+      "grad_norm": 0.4876186210552259,
+      "learning_rate": 5e-06,
+      "loss": 0.7294,
+      "step": 1750
+    },
+    {
+      "epoch": 2.9980781550288276,
+      "eval_loss": 0.7988265156745911,
+      "eval_runtime": 623.1353,
+      "eval_samples_per_second": 25.316,
+      "eval_steps_per_second": 0.396,
+      "step": 1755
+    },
+    {
+      "epoch": 2.9980781550288276,
+      "step": 1755,
+      "total_flos": 2939480986091520.0,
+      "train_loss": 0.7835989086716263,
+      "train_runtime": 103740.1935,
+      "train_samples_per_second": 8.667,
+      "train_steps_per_second": 0.017
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 1755,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2939480986091520.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

training_eval_loss.png ADDED Viewed

training_loss.png ADDED Viewed