{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.999498746867168,
"eval_steps": 500,
"global_step": 2244,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.013366750208855471,
"grad_norm": 2.4323846059062397,
"learning_rate": 5e-06,
"loss": 1.0521,
"step": 10
},
{
"epoch": 0.026733500417710943,
"grad_norm": 1.2707159898783558,
"learning_rate": 5e-06,
"loss": 0.9449,
"step": 20
},
{
"epoch": 0.040100250626566414,
"grad_norm": 0.6645760066182232,
"learning_rate": 5e-06,
"loss": 0.9205,
"step": 30
},
{
"epoch": 0.053467000835421885,
"grad_norm": 0.6860381528425127,
"learning_rate": 5e-06,
"loss": 0.9062,
"step": 40
},
{
"epoch": 0.06683375104427736,
"grad_norm": 0.8462056832267063,
"learning_rate": 5e-06,
"loss": 0.8941,
"step": 50
},
{
"epoch": 0.08020050125313283,
"grad_norm": 0.5498617128094427,
"learning_rate": 5e-06,
"loss": 0.8866,
"step": 60
},
{
"epoch": 0.0935672514619883,
"grad_norm": 0.6217303867910247,
"learning_rate": 5e-06,
"loss": 0.8719,
"step": 70
},
{
"epoch": 0.10693400167084377,
"grad_norm": 0.687429978149511,
"learning_rate": 5e-06,
"loss": 0.876,
"step": 80
},
{
"epoch": 0.12030075187969924,
"grad_norm": 0.7639829931940186,
"learning_rate": 5e-06,
"loss": 0.8704,
"step": 90
},
{
"epoch": 0.1336675020885547,
"grad_norm": 0.5349974897408032,
"learning_rate": 5e-06,
"loss": 0.8677,
"step": 100
},
{
"epoch": 0.14703425229741018,
"grad_norm": 0.6212381364086903,
"learning_rate": 5e-06,
"loss": 0.8624,
"step": 110
},
{
"epoch": 0.16040100250626566,
"grad_norm": 0.5610901155787884,
"learning_rate": 5e-06,
"loss": 0.8621,
"step": 120
},
{
"epoch": 0.17376775271512113,
"grad_norm": 0.6155926013296407,
"learning_rate": 5e-06,
"loss": 0.8582,
"step": 130
},
{
"epoch": 0.1871345029239766,
"grad_norm": 0.6528571036607788,
"learning_rate": 5e-06,
"loss": 0.8532,
"step": 140
},
{
"epoch": 0.20050125313283207,
"grad_norm": 0.5372075443842537,
"learning_rate": 5e-06,
"loss": 0.8492,
"step": 150
},
{
"epoch": 0.21386800334168754,
"grad_norm": 0.7095829143035569,
"learning_rate": 5e-06,
"loss": 0.8494,
"step": 160
},
{
"epoch": 0.227234753550543,
"grad_norm": 0.7745444177509586,
"learning_rate": 5e-06,
"loss": 0.8476,
"step": 170
},
{
"epoch": 0.24060150375939848,
"grad_norm": 0.7586050901974903,
"learning_rate": 5e-06,
"loss": 0.8494,
"step": 180
},
{
"epoch": 0.25396825396825395,
"grad_norm": 0.5964597569119979,
"learning_rate": 5e-06,
"loss": 0.8498,
"step": 190
},
{
"epoch": 0.2673350041771094,
"grad_norm": 0.6293549963407589,
"learning_rate": 5e-06,
"loss": 0.8432,
"step": 200
},
{
"epoch": 0.2807017543859649,
"grad_norm": 0.5524407679849426,
"learning_rate": 5e-06,
"loss": 0.8475,
"step": 210
},
{
"epoch": 0.29406850459482037,
"grad_norm": 0.524350214049005,
"learning_rate": 5e-06,
"loss": 0.8431,
"step": 220
},
{
"epoch": 0.30743525480367584,
"grad_norm": 0.6760002252683699,
"learning_rate": 5e-06,
"loss": 0.8386,
"step": 230
},
{
"epoch": 0.3208020050125313,
"grad_norm": 0.5906902446596286,
"learning_rate": 5e-06,
"loss": 0.8349,
"step": 240
},
{
"epoch": 0.3341687552213868,
"grad_norm": 0.5723926384792003,
"learning_rate": 5e-06,
"loss": 0.8361,
"step": 250
},
{
"epoch": 0.34753550543024225,
"grad_norm": 0.5616096712561062,
"learning_rate": 5e-06,
"loss": 0.8368,
"step": 260
},
{
"epoch": 0.3609022556390977,
"grad_norm": 0.5507735559959206,
"learning_rate": 5e-06,
"loss": 0.835,
"step": 270
},
{
"epoch": 0.3742690058479532,
"grad_norm": 0.4803949597709757,
"learning_rate": 5e-06,
"loss": 0.8414,
"step": 280
},
{
"epoch": 0.38763575605680867,
"grad_norm": 0.5121852118343002,
"learning_rate": 5e-06,
"loss": 0.8325,
"step": 290
},
{
"epoch": 0.40100250626566414,
"grad_norm": 0.5559477754717894,
"learning_rate": 5e-06,
"loss": 0.8364,
"step": 300
},
{
"epoch": 0.4143692564745196,
"grad_norm": 0.7469026400245374,
"learning_rate": 5e-06,
"loss": 0.8306,
"step": 310
},
{
"epoch": 0.4277360066833751,
"grad_norm": 0.5090947427034287,
"learning_rate": 5e-06,
"loss": 0.8339,
"step": 320
},
{
"epoch": 0.44110275689223055,
"grad_norm": 0.6018861983279394,
"learning_rate": 5e-06,
"loss": 0.8283,
"step": 330
},
{
"epoch": 0.454469507101086,
"grad_norm": 0.5434521657719814,
"learning_rate": 5e-06,
"loss": 0.8285,
"step": 340
},
{
"epoch": 0.4678362573099415,
"grad_norm": 0.5903702809830117,
"learning_rate": 5e-06,
"loss": 0.8324,
"step": 350
},
{
"epoch": 0.48120300751879697,
"grad_norm": 0.6243867601355255,
"learning_rate": 5e-06,
"loss": 0.8284,
"step": 360
},
{
"epoch": 0.49456975772765244,
"grad_norm": 0.6094144532555286,
"learning_rate": 5e-06,
"loss": 0.8283,
"step": 370
},
{
"epoch": 0.5079365079365079,
"grad_norm": 0.5482360219270039,
"learning_rate": 5e-06,
"loss": 0.8289,
"step": 380
},
{
"epoch": 0.5213032581453634,
"grad_norm": 0.5061542985510644,
"learning_rate": 5e-06,
"loss": 0.8317,
"step": 390
},
{
"epoch": 0.5346700083542189,
"grad_norm": 0.6652440131533577,
"learning_rate": 5e-06,
"loss": 0.8256,
"step": 400
},
{
"epoch": 0.5480367585630743,
"grad_norm": 0.5613018728699922,
"learning_rate": 5e-06,
"loss": 0.8252,
"step": 410
},
{
"epoch": 0.5614035087719298,
"grad_norm": 0.7255190718604577,
"learning_rate": 5e-06,
"loss": 0.8247,
"step": 420
},
{
"epoch": 0.5747702589807853,
"grad_norm": 0.6781380945175464,
"learning_rate": 5e-06,
"loss": 0.823,
"step": 430
},
{
"epoch": 0.5881370091896407,
"grad_norm": 0.5530197743336887,
"learning_rate": 5e-06,
"loss": 0.8251,
"step": 440
},
{
"epoch": 0.6015037593984962,
"grad_norm": 0.571851888660113,
"learning_rate": 5e-06,
"loss": 0.8232,
"step": 450
},
{
"epoch": 0.6148705096073517,
"grad_norm": 0.5208791337420644,
"learning_rate": 5e-06,
"loss": 0.8235,
"step": 460
},
{
"epoch": 0.6282372598162071,
"grad_norm": 0.5198842932978275,
"learning_rate": 5e-06,
"loss": 0.8238,
"step": 470
},
{
"epoch": 0.6416040100250626,
"grad_norm": 0.48452315583166233,
"learning_rate": 5e-06,
"loss": 0.8221,
"step": 480
},
{
"epoch": 0.6549707602339181,
"grad_norm": 0.5219240912238245,
"learning_rate": 5e-06,
"loss": 0.8168,
"step": 490
},
{
"epoch": 0.6683375104427736,
"grad_norm": 0.51813285089071,
"learning_rate": 5e-06,
"loss": 0.8173,
"step": 500
},
{
"epoch": 0.681704260651629,
"grad_norm": 0.49897768190410446,
"learning_rate": 5e-06,
"loss": 0.8193,
"step": 510
},
{
"epoch": 0.6950710108604845,
"grad_norm": 0.546834157816808,
"learning_rate": 5e-06,
"loss": 0.8129,
"step": 520
},
{
"epoch": 0.70843776106934,
"grad_norm": 0.5295360571693272,
"learning_rate": 5e-06,
"loss": 0.8194,
"step": 530
},
{
"epoch": 0.7218045112781954,
"grad_norm": 0.6854942956404928,
"learning_rate": 5e-06,
"loss": 0.8193,
"step": 540
},
{
"epoch": 0.7351712614870509,
"grad_norm": 0.6819748794747951,
"learning_rate": 5e-06,
"loss": 0.8161,
"step": 550
},
{
"epoch": 0.7485380116959064,
"grad_norm": 0.7134808000164234,
"learning_rate": 5e-06,
"loss": 0.8166,
"step": 560
},
{
"epoch": 0.7619047619047619,
"grad_norm": 0.6412479917820569,
"learning_rate": 5e-06,
"loss": 0.8172,
"step": 570
},
{
"epoch": 0.7752715121136173,
"grad_norm": 0.5246142664617556,
"learning_rate": 5e-06,
"loss": 0.8145,
"step": 580
},
{
"epoch": 0.7886382623224728,
"grad_norm": 0.588843604202556,
"learning_rate": 5e-06,
"loss": 0.82,
"step": 590
},
{
"epoch": 0.8020050125313283,
"grad_norm": 0.5124861711768851,
"learning_rate": 5e-06,
"loss": 0.8156,
"step": 600
},
{
"epoch": 0.8153717627401837,
"grad_norm": 0.5015203839251716,
"learning_rate": 5e-06,
"loss": 0.8191,
"step": 610
},
{
"epoch": 0.8287385129490392,
"grad_norm": 0.6441893371422894,
"learning_rate": 5e-06,
"loss": 0.812,
"step": 620
},
{
"epoch": 0.8421052631578947,
"grad_norm": 0.5838304398634407,
"learning_rate": 5e-06,
"loss": 0.8086,
"step": 630
},
{
"epoch": 0.8554720133667502,
"grad_norm": 0.5107304906894905,
"learning_rate": 5e-06,
"loss": 0.8155,
"step": 640
},
{
"epoch": 0.8688387635756056,
"grad_norm": 0.5122885155184959,
"learning_rate": 5e-06,
"loss": 0.8131,
"step": 650
},
{
"epoch": 0.8822055137844611,
"grad_norm": 0.5985811394437027,
"learning_rate": 5e-06,
"loss": 0.8104,
"step": 660
},
{
"epoch": 0.8955722639933166,
"grad_norm": 0.5323936368547137,
"learning_rate": 5e-06,
"loss": 0.8186,
"step": 670
},
{
"epoch": 0.908939014202172,
"grad_norm": 0.616312309430872,
"learning_rate": 5e-06,
"loss": 0.8124,
"step": 680
},
{
"epoch": 0.9223057644110275,
"grad_norm": 0.6593022396181776,
"learning_rate": 5e-06,
"loss": 0.8156,
"step": 690
},
{
"epoch": 0.935672514619883,
"grad_norm": 0.5181097754729659,
"learning_rate": 5e-06,
"loss": 0.8135,
"step": 700
},
{
"epoch": 0.9490392648287385,
"grad_norm": 0.5160202542043503,
"learning_rate": 5e-06,
"loss": 0.8108,
"step": 710
},
{
"epoch": 0.9624060150375939,
"grad_norm": 0.5439429222609182,
"learning_rate": 5e-06,
"loss": 0.8098,
"step": 720
},
{
"epoch": 0.9757727652464494,
"grad_norm": 0.5666778381149935,
"learning_rate": 5e-06,
"loss": 0.8064,
"step": 730
},
{
"epoch": 0.9891395154553049,
"grad_norm": 0.5087008142559319,
"learning_rate": 5e-06,
"loss": 0.8124,
"step": 740
},
{
"epoch": 0.9998329156223893,
"eval_loss": 0.8087860345840454,
"eval_runtime": 793.9439,
"eval_samples_per_second": 25.391,
"eval_steps_per_second": 0.397,
"step": 748
},
{
"epoch": 1.0025062656641603,
"grad_norm": 0.6885103061332264,
"learning_rate": 5e-06,
"loss": 0.8763,
"step": 750
},
{
"epoch": 1.0158730158730158,
"grad_norm": 0.6156521836752095,
"learning_rate": 5e-06,
"loss": 0.7692,
"step": 760
},
{
"epoch": 1.0292397660818713,
"grad_norm": 0.6134559509903806,
"learning_rate": 5e-06,
"loss": 0.7719,
"step": 770
},
{
"epoch": 1.0426065162907268,
"grad_norm": 0.635583159755333,
"learning_rate": 5e-06,
"loss": 0.7724,
"step": 780
},
{
"epoch": 1.0559732664995822,
"grad_norm": 0.5771840092558814,
"learning_rate": 5e-06,
"loss": 0.7724,
"step": 790
},
{
"epoch": 1.0693400167084377,
"grad_norm": 0.5138399093282234,
"learning_rate": 5e-06,
"loss": 0.7671,
"step": 800
},
{
"epoch": 1.0827067669172932,
"grad_norm": 0.5865180500219783,
"learning_rate": 5e-06,
"loss": 0.7741,
"step": 810
},
{
"epoch": 1.0960735171261486,
"grad_norm": 0.5737059877569465,
"learning_rate": 5e-06,
"loss": 0.7735,
"step": 820
},
{
"epoch": 1.1094402673350041,
"grad_norm": 0.7198057887439943,
"learning_rate": 5e-06,
"loss": 0.7715,
"step": 830
},
{
"epoch": 1.1228070175438596,
"grad_norm": 0.723247678442899,
"learning_rate": 5e-06,
"loss": 0.7688,
"step": 840
},
{
"epoch": 1.136173767752715,
"grad_norm": 0.5724777994659187,
"learning_rate": 5e-06,
"loss": 0.7709,
"step": 850
},
{
"epoch": 1.1495405179615705,
"grad_norm": 0.6343455699124487,
"learning_rate": 5e-06,
"loss": 0.7756,
"step": 860
},
{
"epoch": 1.162907268170426,
"grad_norm": 0.5975092244071976,
"learning_rate": 5e-06,
"loss": 0.7762,
"step": 870
},
{
"epoch": 1.1762740183792815,
"grad_norm": 0.5550810138685736,
"learning_rate": 5e-06,
"loss": 0.7713,
"step": 880
},
{
"epoch": 1.189640768588137,
"grad_norm": 0.6031833100946619,
"learning_rate": 5e-06,
"loss": 0.7717,
"step": 890
},
{
"epoch": 1.2030075187969924,
"grad_norm": 0.5674692784021945,
"learning_rate": 5e-06,
"loss": 0.7714,
"step": 900
},
{
"epoch": 1.2163742690058479,
"grad_norm": 0.6831373781930358,
"learning_rate": 5e-06,
"loss": 0.7727,
"step": 910
},
{
"epoch": 1.2297410192147034,
"grad_norm": 0.517398562451772,
"learning_rate": 5e-06,
"loss": 0.7715,
"step": 920
},
{
"epoch": 1.2431077694235588,
"grad_norm": 0.5689793551691444,
"learning_rate": 5e-06,
"loss": 0.7682,
"step": 930
},
{
"epoch": 1.2564745196324143,
"grad_norm": 0.6979997189308218,
"learning_rate": 5e-06,
"loss": 0.7753,
"step": 940
},
{
"epoch": 1.2698412698412698,
"grad_norm": 0.5431703707142987,
"learning_rate": 5e-06,
"loss": 0.7726,
"step": 950
},
{
"epoch": 1.2832080200501252,
"grad_norm": 0.5341233588300426,
"learning_rate": 5e-06,
"loss": 0.7721,
"step": 960
},
{
"epoch": 1.2965747702589807,
"grad_norm": 0.5621957425809071,
"learning_rate": 5e-06,
"loss": 0.7702,
"step": 970
},
{
"epoch": 1.3099415204678362,
"grad_norm": 0.6187116295591158,
"learning_rate": 5e-06,
"loss": 0.7755,
"step": 980
},
{
"epoch": 1.3233082706766917,
"grad_norm": 0.6251656247161459,
"learning_rate": 5e-06,
"loss": 0.7742,
"step": 990
},
{
"epoch": 1.3366750208855471,
"grad_norm": 0.6092934361550684,
"learning_rate": 5e-06,
"loss": 0.7732,
"step": 1000
},
{
"epoch": 1.3500417710944026,
"grad_norm": 0.8086073910477094,
"learning_rate": 5e-06,
"loss": 0.7663,
"step": 1010
},
{
"epoch": 1.363408521303258,
"grad_norm": 0.6337909009600926,
"learning_rate": 5e-06,
"loss": 0.7698,
"step": 1020
},
{
"epoch": 1.3767752715121135,
"grad_norm": 0.6156017975821142,
"learning_rate": 5e-06,
"loss": 0.7687,
"step": 1030
},
{
"epoch": 1.390142021720969,
"grad_norm": 0.4791494199069362,
"learning_rate": 5e-06,
"loss": 0.7707,
"step": 1040
},
{
"epoch": 1.4035087719298245,
"grad_norm": 0.5102907384647386,
"learning_rate": 5e-06,
"loss": 0.7698,
"step": 1050
},
{
"epoch": 1.41687552213868,
"grad_norm": 0.60763231448239,
"learning_rate": 5e-06,
"loss": 0.7722,
"step": 1060
},
{
"epoch": 1.4302422723475354,
"grad_norm": 0.5538961425736992,
"learning_rate": 5e-06,
"loss": 0.7769,
"step": 1070
},
{
"epoch": 1.443609022556391,
"grad_norm": 0.511489662319519,
"learning_rate": 5e-06,
"loss": 0.7709,
"step": 1080
},
{
"epoch": 1.4569757727652464,
"grad_norm": 0.5006381424370965,
"learning_rate": 5e-06,
"loss": 0.7652,
"step": 1090
},
{
"epoch": 1.4703425229741018,
"grad_norm": 0.6446877306415851,
"learning_rate": 5e-06,
"loss": 0.7668,
"step": 1100
},
{
"epoch": 1.4837092731829573,
"grad_norm": 0.6472792025046472,
"learning_rate": 5e-06,
"loss": 0.7748,
"step": 1110
},
{
"epoch": 1.4970760233918128,
"grad_norm": 0.5297094594069526,
"learning_rate": 5e-06,
"loss": 0.7716,
"step": 1120
},
{
"epoch": 1.5104427736006683,
"grad_norm": 0.5172754876638852,
"learning_rate": 5e-06,
"loss": 0.7693,
"step": 1130
},
{
"epoch": 1.5238095238095237,
"grad_norm": 0.5499645842959932,
"learning_rate": 5e-06,
"loss": 0.7663,
"step": 1140
},
{
"epoch": 1.5371762740183792,
"grad_norm": 0.5115786493746641,
"learning_rate": 5e-06,
"loss": 0.7707,
"step": 1150
},
{
"epoch": 1.5505430242272347,
"grad_norm": 0.5733666230248589,
"learning_rate": 5e-06,
"loss": 0.7708,
"step": 1160
},
{
"epoch": 1.5639097744360901,
"grad_norm": 0.4914243878129098,
"learning_rate": 5e-06,
"loss": 0.769,
"step": 1170
},
{
"epoch": 1.5772765246449456,
"grad_norm": 0.5986514689445189,
"learning_rate": 5e-06,
"loss": 0.7722,
"step": 1180
},
{
"epoch": 1.590643274853801,
"grad_norm": 0.49301214049058534,
"learning_rate": 5e-06,
"loss": 0.7709,
"step": 1190
},
{
"epoch": 1.6040100250626566,
"grad_norm": 0.49122462674305145,
"learning_rate": 5e-06,
"loss": 0.7684,
"step": 1200
},
{
"epoch": 1.617376775271512,
"grad_norm": 0.5231320343494373,
"learning_rate": 5e-06,
"loss": 0.773,
"step": 1210
},
{
"epoch": 1.6307435254803675,
"grad_norm": 0.5974519524827527,
"learning_rate": 5e-06,
"loss": 0.7703,
"step": 1220
},
{
"epoch": 1.644110275689223,
"grad_norm": 0.49755848059450075,
"learning_rate": 5e-06,
"loss": 0.7684,
"step": 1230
},
{
"epoch": 1.6574770258980784,
"grad_norm": 0.49980350150699104,
"learning_rate": 5e-06,
"loss": 0.7648,
"step": 1240
},
{
"epoch": 1.670843776106934,
"grad_norm": 0.660197673406872,
"learning_rate": 5e-06,
"loss": 0.7663,
"step": 1250
},
{
"epoch": 1.6842105263157894,
"grad_norm": 0.501447743813946,
"learning_rate": 5e-06,
"loss": 0.7687,
"step": 1260
},
{
"epoch": 1.6975772765246449,
"grad_norm": 0.47339053427865196,
"learning_rate": 5e-06,
"loss": 0.7677,
"step": 1270
},
{
"epoch": 1.7109440267335003,
"grad_norm": 0.4776630843112484,
"learning_rate": 5e-06,
"loss": 0.7705,
"step": 1280
},
{
"epoch": 1.7243107769423558,
"grad_norm": 0.5805611285838953,
"learning_rate": 5e-06,
"loss": 0.7664,
"step": 1290
},
{
"epoch": 1.7376775271512113,
"grad_norm": 0.5589747352729452,
"learning_rate": 5e-06,
"loss": 0.7643,
"step": 1300
},
{
"epoch": 1.7510442773600667,
"grad_norm": 0.5862892637271495,
"learning_rate": 5e-06,
"loss": 0.767,
"step": 1310
},
{
"epoch": 1.7644110275689222,
"grad_norm": 0.6267084370944045,
"learning_rate": 5e-06,
"loss": 0.7701,
"step": 1320
},
{
"epoch": 1.7777777777777777,
"grad_norm": 0.5590629149887701,
"learning_rate": 5e-06,
"loss": 0.7725,
"step": 1330
},
{
"epoch": 1.7911445279866332,
"grad_norm": 0.589200505231269,
"learning_rate": 5e-06,
"loss": 0.768,
"step": 1340
},
{
"epoch": 1.8045112781954886,
"grad_norm": 0.4948446583957624,
"learning_rate": 5e-06,
"loss": 0.7685,
"step": 1350
},
{
"epoch": 1.817878028404344,
"grad_norm": 0.471229575382462,
"learning_rate": 5e-06,
"loss": 0.7685,
"step": 1360
},
{
"epoch": 1.8312447786131996,
"grad_norm": 0.5347363048336566,
"learning_rate": 5e-06,
"loss": 0.7668,
"step": 1370
},
{
"epoch": 1.844611528822055,
"grad_norm": 0.6085798758140744,
"learning_rate": 5e-06,
"loss": 0.7685,
"step": 1380
},
{
"epoch": 1.8579782790309105,
"grad_norm": 0.49237779847072155,
"learning_rate": 5e-06,
"loss": 0.766,
"step": 1390
},
{
"epoch": 1.871345029239766,
"grad_norm": 0.5429938063483495,
"learning_rate": 5e-06,
"loss": 0.7675,
"step": 1400
},
{
"epoch": 1.8847117794486214,
"grad_norm": 0.5315522378087794,
"learning_rate": 5e-06,
"loss": 0.7651,
"step": 1410
},
{
"epoch": 1.898078529657477,
"grad_norm": 0.5774851920268103,
"learning_rate": 5e-06,
"loss": 0.7683,
"step": 1420
},
{
"epoch": 1.9114452798663324,
"grad_norm": 0.4774206459938876,
"learning_rate": 5e-06,
"loss": 0.7651,
"step": 1430
},
{
"epoch": 1.9248120300751879,
"grad_norm": 0.48893280928600313,
"learning_rate": 5e-06,
"loss": 0.7664,
"step": 1440
},
{
"epoch": 1.9381787802840433,
"grad_norm": 0.47709822943051283,
"learning_rate": 5e-06,
"loss": 0.7667,
"step": 1450
},
{
"epoch": 1.9515455304928988,
"grad_norm": 0.5221458173728611,
"learning_rate": 5e-06,
"loss": 0.7649,
"step": 1460
},
{
"epoch": 1.9649122807017543,
"grad_norm": 0.5458985479332612,
"learning_rate": 5e-06,
"loss": 0.7653,
"step": 1470
},
{
"epoch": 1.9782790309106097,
"grad_norm": 0.5449151757658263,
"learning_rate": 5e-06,
"loss": 0.7665,
"step": 1480
},
{
"epoch": 1.9916457811194652,
"grad_norm": 0.5792068417255367,
"learning_rate": 5e-06,
"loss": 0.7674,
"step": 1490
},
{
"epoch": 1.9996658312447786,
"eval_loss": 0.7951143383979797,
"eval_runtime": 795.386,
"eval_samples_per_second": 25.345,
"eval_steps_per_second": 0.396,
"step": 1496
},
{
"epoch": 2.0050125313283207,
"grad_norm": 0.7521880602206925,
"learning_rate": 5e-06,
"loss": 0.8233,
"step": 1500
},
{
"epoch": 2.018379281537176,
"grad_norm": 0.6560054074439666,
"learning_rate": 5e-06,
"loss": 0.7256,
"step": 1510
},
{
"epoch": 2.0317460317460316,
"grad_norm": 0.5201512747130638,
"learning_rate": 5e-06,
"loss": 0.7218,
"step": 1520
},
{
"epoch": 2.045112781954887,
"grad_norm": 0.5262590120532872,
"learning_rate": 5e-06,
"loss": 0.7285,
"step": 1530
},
{
"epoch": 2.0584795321637426,
"grad_norm": 0.5393650388873087,
"learning_rate": 5e-06,
"loss": 0.7229,
"step": 1540
},
{
"epoch": 2.071846282372598,
"grad_norm": 0.5105428821348765,
"learning_rate": 5e-06,
"loss": 0.7231,
"step": 1550
},
{
"epoch": 2.0852130325814535,
"grad_norm": 0.6021970483052078,
"learning_rate": 5e-06,
"loss": 0.7239,
"step": 1560
},
{
"epoch": 2.098579782790309,
"grad_norm": 0.5009099309313954,
"learning_rate": 5e-06,
"loss": 0.7226,
"step": 1570
},
{
"epoch": 2.1119465329991645,
"grad_norm": 0.5605434690720502,
"learning_rate": 5e-06,
"loss": 0.7277,
"step": 1580
},
{
"epoch": 2.12531328320802,
"grad_norm": 0.5732299598938305,
"learning_rate": 5e-06,
"loss": 0.7286,
"step": 1590
},
{
"epoch": 2.1386800334168754,
"grad_norm": 0.5399334511302041,
"learning_rate": 5e-06,
"loss": 0.726,
"step": 1600
},
{
"epoch": 2.152046783625731,
"grad_norm": 0.505832452848056,
"learning_rate": 5e-06,
"loss": 0.7304,
"step": 1610
},
{
"epoch": 2.1654135338345863,
"grad_norm": 0.5674143618926153,
"learning_rate": 5e-06,
"loss": 0.7232,
"step": 1620
},
{
"epoch": 2.178780284043442,
"grad_norm": 0.5068914103748654,
"learning_rate": 5e-06,
"loss": 0.7336,
"step": 1630
},
{
"epoch": 2.1921470342522973,
"grad_norm": 0.5118320329600874,
"learning_rate": 5e-06,
"loss": 0.7255,
"step": 1640
},
{
"epoch": 2.2055137844611528,
"grad_norm": 0.5156250232792499,
"learning_rate": 5e-06,
"loss": 0.7295,
"step": 1650
},
{
"epoch": 2.2188805346700082,
"grad_norm": 0.6165225897496419,
"learning_rate": 5e-06,
"loss": 0.7274,
"step": 1660
},
{
"epoch": 2.2322472848788637,
"grad_norm": 0.5863877720536036,
"learning_rate": 5e-06,
"loss": 0.7256,
"step": 1670
},
{
"epoch": 2.245614035087719,
"grad_norm": 0.5641007704480012,
"learning_rate": 5e-06,
"loss": 0.7308,
"step": 1680
},
{
"epoch": 2.2589807852965746,
"grad_norm": 0.6101312501534099,
"learning_rate": 5e-06,
"loss": 0.7314,
"step": 1690
},
{
"epoch": 2.27234753550543,
"grad_norm": 0.5200998469176243,
"learning_rate": 5e-06,
"loss": 0.7275,
"step": 1700
},
{
"epoch": 2.2857142857142856,
"grad_norm": 0.5398343134194046,
"learning_rate": 5e-06,
"loss": 0.727,
"step": 1710
},
{
"epoch": 2.299081035923141,
"grad_norm": 0.5247712631574941,
"learning_rate": 5e-06,
"loss": 0.727,
"step": 1720
},
{
"epoch": 2.3124477861319965,
"grad_norm": 0.5655985095958795,
"learning_rate": 5e-06,
"loss": 0.7286,
"step": 1730
},
{
"epoch": 2.325814536340852,
"grad_norm": 0.5927409653328921,
"learning_rate": 5e-06,
"loss": 0.7271,
"step": 1740
},
{
"epoch": 2.3391812865497075,
"grad_norm": 0.6148593425957483,
"learning_rate": 5e-06,
"loss": 0.733,
"step": 1750
},
{
"epoch": 2.352548036758563,
"grad_norm": 0.5969831864554942,
"learning_rate": 5e-06,
"loss": 0.7302,
"step": 1760
},
{
"epoch": 2.3659147869674184,
"grad_norm": 0.4985456007136878,
"learning_rate": 5e-06,
"loss": 0.7341,
"step": 1770
},
{
"epoch": 2.379281537176274,
"grad_norm": 0.5005254522981937,
"learning_rate": 5e-06,
"loss": 0.7244,
"step": 1780
},
{
"epoch": 2.3926482873851294,
"grad_norm": 0.5288709360617612,
"learning_rate": 5e-06,
"loss": 0.7312,
"step": 1790
},
{
"epoch": 2.406015037593985,
"grad_norm": 0.5355584900475018,
"learning_rate": 5e-06,
"loss": 0.727,
"step": 1800
},
{
"epoch": 2.4193817878028403,
"grad_norm": 0.5666733459714918,
"learning_rate": 5e-06,
"loss": 0.731,
"step": 1810
},
{
"epoch": 2.4327485380116958,
"grad_norm": 0.5939862506331437,
"learning_rate": 5e-06,
"loss": 0.7292,
"step": 1820
},
{
"epoch": 2.4461152882205512,
"grad_norm": 0.5696153125681646,
"learning_rate": 5e-06,
"loss": 0.7295,
"step": 1830
},
{
"epoch": 2.4594820384294067,
"grad_norm": 0.5263801998302109,
"learning_rate": 5e-06,
"loss": 0.7289,
"step": 1840
},
{
"epoch": 2.472848788638262,
"grad_norm": 0.5564137280433736,
"learning_rate": 5e-06,
"loss": 0.7289,
"step": 1850
},
{
"epoch": 2.4862155388471177,
"grad_norm": 0.6117589560276474,
"learning_rate": 5e-06,
"loss": 0.7281,
"step": 1860
},
{
"epoch": 2.499582289055973,
"grad_norm": 0.5556838242891475,
"learning_rate": 5e-06,
"loss": 0.7296,
"step": 1870
},
{
"epoch": 2.5129490392648286,
"grad_norm": 0.4681598446789898,
"learning_rate": 5e-06,
"loss": 0.7296,
"step": 1880
},
{
"epoch": 2.526315789473684,
"grad_norm": 0.5231611697501862,
"learning_rate": 5e-06,
"loss": 0.7303,
"step": 1890
},
{
"epoch": 2.5396825396825395,
"grad_norm": 0.5126109088017671,
"learning_rate": 5e-06,
"loss": 0.7324,
"step": 1900
},
{
"epoch": 2.553049289891395,
"grad_norm": 0.5300428577804921,
"learning_rate": 5e-06,
"loss": 0.7273,
"step": 1910
},
{
"epoch": 2.5664160401002505,
"grad_norm": 0.4968055663040118,
"learning_rate": 5e-06,
"loss": 0.729,
"step": 1920
},
{
"epoch": 2.579782790309106,
"grad_norm": 0.568494743059541,
"learning_rate": 5e-06,
"loss": 0.7269,
"step": 1930
},
{
"epoch": 2.5931495405179614,
"grad_norm": 0.5482221484283202,
"learning_rate": 5e-06,
"loss": 0.7285,
"step": 1940
},
{
"epoch": 2.606516290726817,
"grad_norm": 0.47129332867964935,
"learning_rate": 5e-06,
"loss": 0.7292,
"step": 1950
},
{
"epoch": 2.6198830409356724,
"grad_norm": 0.5198836974979396,
"learning_rate": 5e-06,
"loss": 0.7264,
"step": 1960
},
{
"epoch": 2.633249791144528,
"grad_norm": 0.4945939304862693,
"learning_rate": 5e-06,
"loss": 0.7279,
"step": 1970
},
{
"epoch": 2.6466165413533833,
"grad_norm": 0.5751403403674279,
"learning_rate": 5e-06,
"loss": 0.7282,
"step": 1980
},
{
"epoch": 2.659983291562239,
"grad_norm": 0.5611452949151137,
"learning_rate": 5e-06,
"loss": 0.7331,
"step": 1990
},
{
"epoch": 2.6733500417710943,
"grad_norm": 0.6119128996618558,
"learning_rate": 5e-06,
"loss": 0.7296,
"step": 2000
},
{
"epoch": 2.6867167919799497,
"grad_norm": 0.4799215562608329,
"learning_rate": 5e-06,
"loss": 0.7298,
"step": 2010
},
{
"epoch": 2.700083542188805,
"grad_norm": 0.5541418078345739,
"learning_rate": 5e-06,
"loss": 0.7268,
"step": 2020
},
{
"epoch": 2.7134502923976607,
"grad_norm": 0.6870311878219804,
"learning_rate": 5e-06,
"loss": 0.7277,
"step": 2030
},
{
"epoch": 2.726817042606516,
"grad_norm": 0.5687894755714459,
"learning_rate": 5e-06,
"loss": 0.7298,
"step": 2040
},
{
"epoch": 2.7401837928153716,
"grad_norm": 0.5330460246090263,
"learning_rate": 5e-06,
"loss": 0.7325,
"step": 2050
},
{
"epoch": 2.753550543024227,
"grad_norm": 0.5427879116319339,
"learning_rate": 5e-06,
"loss": 0.7296,
"step": 2060
},
{
"epoch": 2.7669172932330826,
"grad_norm": 0.6013738539276209,
"learning_rate": 5e-06,
"loss": 0.7281,
"step": 2070
},
{
"epoch": 2.780284043441938,
"grad_norm": 0.6091854363964149,
"learning_rate": 5e-06,
"loss": 0.7294,
"step": 2080
},
{
"epoch": 2.7936507936507935,
"grad_norm": 0.5190279913663577,
"learning_rate": 5e-06,
"loss": 0.7248,
"step": 2090
},
{
"epoch": 2.807017543859649,
"grad_norm": 0.5126718278939274,
"learning_rate": 5e-06,
"loss": 0.7311,
"step": 2100
},
{
"epoch": 2.8203842940685044,
"grad_norm": 0.5571607138857257,
"learning_rate": 5e-06,
"loss": 0.7318,
"step": 2110
},
{
"epoch": 2.83375104427736,
"grad_norm": 0.5341175882686895,
"learning_rate": 5e-06,
"loss": 0.7336,
"step": 2120
},
{
"epoch": 2.8471177944862154,
"grad_norm": 0.4817774606348232,
"learning_rate": 5e-06,
"loss": 0.731,
"step": 2130
},
{
"epoch": 2.860484544695071,
"grad_norm": 0.5487220776810837,
"learning_rate": 5e-06,
"loss": 0.7282,
"step": 2140
},
{
"epoch": 2.8738512949039263,
"grad_norm": 0.6342699103351254,
"learning_rate": 5e-06,
"loss": 0.7335,
"step": 2150
},
{
"epoch": 2.887218045112782,
"grad_norm": 0.5078552425291176,
"learning_rate": 5e-06,
"loss": 0.7273,
"step": 2160
},
{
"epoch": 2.9005847953216373,
"grad_norm": 0.4819316377635323,
"learning_rate": 5e-06,
"loss": 0.7332,
"step": 2170
},
{
"epoch": 2.9139515455304927,
"grad_norm": 0.4627017239179797,
"learning_rate": 5e-06,
"loss": 0.7306,
"step": 2180
},
{
"epoch": 2.927318295739348,
"grad_norm": 0.4761325291977869,
"learning_rate": 5e-06,
"loss": 0.7314,
"step": 2190
},
{
"epoch": 2.9406850459482037,
"grad_norm": 0.5784029020001881,
"learning_rate": 5e-06,
"loss": 0.7298,
"step": 2200
},
{
"epoch": 2.954051796157059,
"grad_norm": 0.5120822643666457,
"learning_rate": 5e-06,
"loss": 0.731,
"step": 2210
},
{
"epoch": 2.9674185463659146,
"grad_norm": 0.5116915736315969,
"learning_rate": 5e-06,
"loss": 0.7322,
"step": 2220
},
{
"epoch": 2.98078529657477,
"grad_norm": 0.5021133290964584,
"learning_rate": 5e-06,
"loss": 0.7269,
"step": 2230
},
{
"epoch": 2.9941520467836256,
"grad_norm": 0.5317540745896701,
"learning_rate": 5e-06,
"loss": 0.7322,
"step": 2240
},
{
"epoch": 2.999498746867168,
"eval_loss": 0.7926730513572693,
"eval_runtime": 792.6639,
"eval_samples_per_second": 25.432,
"eval_steps_per_second": 0.397,
"step": 2244
},
{
"epoch": 2.999498746867168,
"step": 2244,
"total_flos": 3758574199111680.0,
"train_loss": 0.7796513685780625,
"train_runtime": 132137.1731,
"train_samples_per_second": 8.696,
"train_steps_per_second": 0.017
}
],
"logging_steps": 10,
"max_steps": 2244,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3758574199111680.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}