Training in progress, step 2100, checkpoint (commit 0fbb40e, verified)
{
"best_metric": 0.5752944946289062,
"best_model_checkpoint": "./output/checkpoint-1200",
"epoch": 4.719101123595506,
"eval_steps": 150,
"global_step": 2100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02247191011235955,
"grad_norm": 40.276283264160156,
"learning_rate": 1.25e-06,
"loss": 2.1095,
"step": 10
},
{
"epoch": 0.0449438202247191,
"grad_norm": 15.660475730895996,
"learning_rate": 2.5e-06,
"loss": 1.7071,
"step": 20
},
{
"epoch": 0.06741573033707865,
"grad_norm": 32.55162811279297,
"learning_rate": 3.75e-06,
"loss": 1.746,
"step": 30
},
{
"epoch": 0.0898876404494382,
"grad_norm": 15.043834686279297,
"learning_rate": 5e-06,
"loss": 1.5731,
"step": 40
},
{
"epoch": 0.11235955056179775,
"grad_norm": 24.83352279663086,
"learning_rate": 6.25e-06,
"loss": 1.3603,
"step": 50
},
{
"epoch": 0.1348314606741573,
"grad_norm": 35.10358428955078,
"learning_rate": 7.5e-06,
"loss": 1.3892,
"step": 60
},
{
"epoch": 0.15730337078651685,
"grad_norm": 35.139347076416016,
"learning_rate": 8.75e-06,
"loss": 1.2658,
"step": 70
},
{
"epoch": 0.1797752808988764,
"grad_norm": 28.40645980834961,
"learning_rate": 1e-05,
"loss": 1.187,
"step": 80
},
{
"epoch": 0.20224719101123595,
"grad_norm": 29.196102142333984,
"learning_rate": 1.125e-05,
"loss": 1.237,
"step": 90
},
{
"epoch": 0.2247191011235955,
"grad_norm": 25.024761199951172,
"learning_rate": 1.25e-05,
"loss": 1.2521,
"step": 100
},
{
"epoch": 0.24719101123595505,
"grad_norm": 25.139511108398438,
"learning_rate": 1.2499871543489788e-05,
"loss": 1.1565,
"step": 110
},
{
"epoch": 0.2696629213483146,
"grad_norm": 12.163933753967285,
"learning_rate": 1.2499486179239496e-05,
"loss": 1.1179,
"step": 120
},
{
"epoch": 0.29213483146067415,
"grad_norm": 7.7505998611450195,
"learning_rate": 1.2498843923089939e-05,
"loss": 1.1047,
"step": 130
},
{
"epoch": 0.3146067415730337,
"grad_norm": 21.35865592956543,
"learning_rate": 1.249794480144175e-05,
"loss": 1.1226,
"step": 140
},
{
"epoch": 0.33707865168539325,
"grad_norm": 15.02802562713623,
"learning_rate": 1.24967888512543e-05,
"loss": 0.9727,
"step": 150
},
{
"epoch": 0.33707865168539325,
"eval_loss": 0.9761117696762085,
"eval_runtime": 28.1312,
"eval_samples_per_second": 14.077,
"eval_steps_per_second": 14.077,
"step": 150
},
{
"epoch": 0.3595505617977528,
"grad_norm": 21.93962287902832,
"learning_rate": 1.2495376120044174e-05,
"loss": 1.1303,
"step": 160
},
{
"epoch": 0.38202247191011235,
"grad_norm": 22.738882064819336,
"learning_rate": 1.2493706665883217e-05,
"loss": 1.1115,
"step": 170
},
{
"epoch": 0.4044943820224719,
"grad_norm": 19.962158203125,
"learning_rate": 1.2491780557396153e-05,
"loss": 1.0215,
"step": 180
},
{
"epoch": 0.42696629213483145,
"grad_norm": 22.92249870300293,
"learning_rate": 1.2489597873757757e-05,
"loss": 1.0086,
"step": 190
},
{
"epoch": 0.449438202247191,
"grad_norm": 20.59571075439453,
"learning_rate": 1.2487158704689602e-05,
"loss": 0.9309,
"step": 200
},
{
"epoch": 0.47191011235955055,
"grad_norm": 21.059810638427734,
"learning_rate": 1.248446315045638e-05,
"loss": 1.0798,
"step": 210
},
{
"epoch": 0.4943820224719101,
"grad_norm": 17.537137985229492,
"learning_rate": 1.2481511321861762e-05,
"loss": 0.9138,
"step": 220
},
{
"epoch": 0.5168539325842697,
"grad_norm": 16.027517318725586,
"learning_rate": 1.2478303340243865e-05,
"loss": 0.9623,
"step": 230
},
{
"epoch": 0.5393258426966292,
"grad_norm": 18.933513641357422,
"learning_rate": 1.2474839337470245e-05,
"loss": 0.9063,
"step": 240
},
{
"epoch": 0.5617977528089888,
"grad_norm": 18.865337371826172,
"learning_rate": 1.2471119455932489e-05,
"loss": 0.8277,
"step": 250
},
{
"epoch": 0.5842696629213483,
"grad_norm": 17.30205535888672,
"learning_rate": 1.246714384854036e-05,
"loss": 0.7923,
"step": 260
},
{
"epoch": 0.6067415730337079,
"grad_norm": 14.635787963867188,
"learning_rate": 1.2462912678715502e-05,
"loss": 0.8241,
"step": 270
},
{
"epoch": 0.6292134831460674,
"grad_norm": 21.17339324951172,
"learning_rate": 1.245842612038474e-05,
"loss": 0.8488,
"step": 280
},
{
"epoch": 0.651685393258427,
"grad_norm": 19.38275909423828,
"learning_rate": 1.2453684357972907e-05,
"loss": 1.0164,
"step": 290
},
{
"epoch": 0.6741573033707865,
"grad_norm": 20.8955078125,
"learning_rate": 1.2448687586395288e-05,
"loss": 0.9468,
"step": 300
},
{
"epoch": 0.6741573033707865,
"eval_loss": 0.7623159885406494,
"eval_runtime": 28.3099,
"eval_samples_per_second": 13.988,
"eval_steps_per_second": 13.988,
"step": 300
},
{
"epoch": 0.6966292134831461,
"grad_norm": 14.662369728088379,
"learning_rate": 1.2443436011049593e-05,
"loss": 0.8178,
"step": 310
},
{
"epoch": 0.7191011235955056,
"grad_norm": 15.885370254516602,
"learning_rate": 1.2437929847807512e-05,
"loss": 0.9479,
"step": 320
},
{
"epoch": 0.7415730337078652,
"grad_norm": 14.278594970703125,
"learning_rate": 1.2432169323005851e-05,
"loss": 0.8635,
"step": 330
},
{
"epoch": 0.7640449438202247,
"grad_norm": 14.948090553283691,
"learning_rate": 1.2426154673437223e-05,
"loss": 0.7926,
"step": 340
},
{
"epoch": 0.7865168539325843,
"grad_norm": 20.756702423095703,
"learning_rate": 1.2419886146340315e-05,
"loss": 0.7433,
"step": 350
},
{
"epoch": 0.8089887640449438,
"grad_norm": 13.336278915405273,
"learning_rate": 1.2413363999389718e-05,
"loss": 0.9501,
"step": 360
},
{
"epoch": 0.8314606741573034,
"grad_norm": 18.14997100830078,
"learning_rate": 1.2406588500685356e-05,
"loss": 0.7787,
"step": 370
},
{
"epoch": 0.8539325842696629,
"grad_norm": 14.070672988891602,
"learning_rate": 1.2399559928741435e-05,
"loss": 0.8501,
"step": 380
},
{
"epoch": 0.8764044943820225,
"grad_norm": 12.321798324584961,
"learning_rate": 1.2392278572475025e-05,
"loss": 0.8531,
"step": 390
},
{
"epoch": 0.898876404494382,
"grad_norm": 13.217580795288086,
"learning_rate": 1.2384744731194159e-05,
"loss": 0.7066,
"step": 400
},
{
"epoch": 0.9213483146067416,
"grad_norm": 21.007793426513672,
"learning_rate": 1.2376958714585546e-05,
"loss": 0.7631,
"step": 410
},
{
"epoch": 0.9438202247191011,
"grad_norm": 25.68875503540039,
"learning_rate": 1.2368920842701831e-05,
"loss": 0.6831,
"step": 420
},
{
"epoch": 0.9662921348314607,
"grad_norm": 18.43454933166504,
"learning_rate": 1.2360631445948449e-05,
"loss": 0.7968,
"step": 430
},
{
"epoch": 0.9887640449438202,
"grad_norm": 11.963883399963379,
"learning_rate": 1.2352090865070027e-05,
"loss": 0.7182,
"step": 440
},
{
"epoch": 1.0112359550561798,
"grad_norm": 19.328462600708008,
"learning_rate": 1.2343299451136397e-05,
"loss": 0.7027,
"step": 450
},
{
"epoch": 1.0112359550561798,
"eval_loss": 0.6809844970703125,
"eval_runtime": 28.6076,
"eval_samples_per_second": 13.842,
"eval_steps_per_second": 13.842,
"step": 450
},
{
"epoch": 1.0337078651685394,
"grad_norm": 17.414684295654297,
"learning_rate": 1.2334257565528155e-05,
"loss": 0.5859,
"step": 460
},
{
"epoch": 1.0561797752808988,
"grad_norm": 15.297569274902344,
"learning_rate": 1.2324965579921801e-05,
"loss": 0.4641,
"step": 470
},
{
"epoch": 1.0786516853932584,
"grad_norm": 18.19464874267578,
"learning_rate": 1.2315423876274468e-05,
"loss": 0.5484,
"step": 480
},
{
"epoch": 1.101123595505618,
"grad_norm": 9.914576530456543,
"learning_rate": 1.2305632846808221e-05,
"loss": 0.6468,
"step": 490
},
{
"epoch": 1.1235955056179776,
"grad_norm": 14.114611625671387,
"learning_rate": 1.2295592893993934e-05,
"loss": 0.6038,
"step": 500
},
{
"epoch": 1.146067415730337,
"grad_norm": 16.208505630493164,
"learning_rate": 1.2285304430534745e-05,
"loss": 0.5434,
"step": 510
},
{
"epoch": 1.1685393258426966,
"grad_norm": 9.227933883666992,
"learning_rate": 1.2274767879349083e-05,
"loss": 0.5278,
"step": 520
},
{
"epoch": 1.1910112359550562,
"grad_norm": 13.076558113098145,
"learning_rate": 1.2263983673553307e-05,
"loss": 0.7342,
"step": 530
},
{
"epoch": 1.2134831460674158,
"grad_norm": 14.731674194335938,
"learning_rate": 1.2252952256443871e-05,
"loss": 0.5825,
"step": 540
},
{
"epoch": 1.2359550561797752,
"grad_norm": 12.931262016296387,
"learning_rate": 1.2241674081479129e-05,
"loss": 0.6817,
"step": 550
},
{
"epoch": 1.2584269662921348,
"grad_norm": 19.48564910888672,
"learning_rate": 1.223014961226068e-05,
"loss": 0.6012,
"step": 560
},
{
"epoch": 1.2808988764044944,
"grad_norm": 17.863492965698242,
"learning_rate": 1.2218379322514316e-05,
"loss": 0.6,
"step": 570
},
{
"epoch": 1.303370786516854,
"grad_norm": 16.681968688964844,
"learning_rate": 1.2206363696070545e-05,
"loss": 0.5687,
"step": 580
},
{
"epoch": 1.3258426966292136,
"grad_norm": 11.682393074035645,
"learning_rate": 1.219410322684471e-05,
"loss": 0.6333,
"step": 590
},
{
"epoch": 1.348314606741573,
"grad_norm": 10.291793823242188,
"learning_rate": 1.2181598418816679e-05,
"loss": 0.5697,
"step": 600
},
{
"epoch": 1.348314606741573,
"eval_loss": 0.648504376411438,
"eval_runtime": 28.6285,
"eval_samples_per_second": 13.832,
"eval_steps_per_second": 13.832,
"step": 600
},
{
"epoch": 1.3707865168539326,
"grad_norm": 18.982742309570312,
"learning_rate": 1.2168849786010134e-05,
"loss": 0.4987,
"step": 610
},
{
"epoch": 1.3932584269662922,
"grad_norm": 6.644986629486084,
"learning_rate": 1.2155857852471433e-05,
"loss": 0.6424,
"step": 620
},
{
"epoch": 1.4157303370786516,
"grad_norm": 10.34019947052002,
"learning_rate": 1.2142623152248081e-05,
"loss": 0.674,
"step": 630
},
{
"epoch": 1.4382022471910112,
"grad_norm": 20.887239456176758,
"learning_rate": 1.2129146229366767e-05,
"loss": 0.5793,
"step": 640
},
{
"epoch": 1.4606741573033708,
"grad_norm": 19.285852432250977,
"learning_rate": 1.2115427637811003e-05,
"loss": 0.5608,
"step": 650
},
{
"epoch": 1.4831460674157304,
"grad_norm": 14.167954444885254,
"learning_rate": 1.2101467941498358e-05,
"loss": 0.4507,
"step": 660
},
{
"epoch": 1.50561797752809,
"grad_norm": 17.428260803222656,
"learning_rate": 1.208726771425727e-05,
"loss": 0.4813,
"step": 670
},
{
"epoch": 1.5280898876404494,
"grad_norm": 14.576336860656738,
"learning_rate": 1.2072827539803463e-05,
"loss": 0.6145,
"step": 680
},
{
"epoch": 1.550561797752809,
"grad_norm": 11.203570365905762,
"learning_rate": 1.205814801171595e-05,
"loss": 0.4898,
"step": 690
},
{
"epoch": 1.5730337078651684,
"grad_norm": 16.303321838378906,
"learning_rate": 1.2043229733412637e-05,
"loss": 0.5359,
"step": 700
},
{
"epoch": 1.595505617977528,
"grad_norm": 12.990396499633789,
"learning_rate": 1.2028073318125511e-05,
"loss": 0.5608,
"step": 710
},
{
"epoch": 1.6179775280898876,
"grad_norm": 15.660808563232422,
"learning_rate": 1.2012679388875442e-05,
"loss": 0.5824,
"step": 720
},
{
"epoch": 1.6404494382022472,
"grad_norm": 20.047142028808594,
"learning_rate": 1.1997048578446569e-05,
"loss": 0.4947,
"step": 730
},
{
"epoch": 1.6629213483146068,
"grad_norm": 15.481894493103027,
"learning_rate": 1.1981181529360284e-05,
"loss": 0.6934,
"step": 740
},
{
"epoch": 1.6853932584269664,
"grad_norm": 13.069948196411133,
"learning_rate": 1.1965078893848829e-05,
"loss": 0.5284,
"step": 750
},
{
"epoch": 1.6853932584269664,
"eval_loss": 0.5964965224266052,
"eval_runtime": 28.8612,
"eval_samples_per_second": 13.721,
"eval_steps_per_second": 13.721,
"step": 750
},
{
"epoch": 1.7078651685393258,
"grad_norm": 15.860553741455078,
"learning_rate": 1.1948741333828482e-05,
"loss": 0.5074,
"step": 760
},
{
"epoch": 1.7303370786516854,
"grad_norm": 15.73456859588623,
"learning_rate": 1.1932169520872344e-05,
"loss": 0.5663,
"step": 770
},
{
"epoch": 1.7528089887640448,
"grad_norm": 15.158873558044434,
"learning_rate": 1.1915364136182738e-05,
"loss": 0.5084,
"step": 780
},
{
"epoch": 1.7752808988764044,
"grad_norm": 18.966686248779297,
"learning_rate": 1.189832587056321e-05,
"loss": 0.5863,
"step": 790
},
{
"epoch": 1.797752808988764,
"grad_norm": 16.93968391418457,
"learning_rate": 1.188105542439012e-05,
"loss": 0.4685,
"step": 800
},
{
"epoch": 1.8202247191011236,
"grad_norm": 15.844802856445312,
"learning_rate": 1.186355350758387e-05,
"loss": 0.4631,
"step": 810
},
{
"epoch": 1.8426966292134832,
"grad_norm": 6.261096000671387,
"learning_rate": 1.1845820839579707e-05,
"loss": 0.4791,
"step": 820
},
{
"epoch": 1.8651685393258428,
"grad_norm": 21.932397842407227,
"learning_rate": 1.1827858149298162e-05,
"loss": 0.5318,
"step": 830
},
{
"epoch": 1.8876404494382022,
"grad_norm": 20.80328369140625,
"learning_rate": 1.1809666175115075e-05,
"loss": 0.4769,
"step": 840
},
{
"epoch": 1.9101123595505618,
"grad_norm": 11.984331130981445,
"learning_rate": 1.1791245664831252e-05,
"loss": 0.4934,
"step": 850
},
{
"epoch": 1.9325842696629212,
"grad_norm": 15.739178657531738,
"learning_rate": 1.177259737564172e-05,
"loss": 0.546,
"step": 860
},
{
"epoch": 1.9550561797752808,
"grad_norm": 18.232080459594727,
"learning_rate": 1.1753722074104613e-05,
"loss": 0.5,
"step": 870
},
{
"epoch": 1.9775280898876404,
"grad_norm": 19.222999572753906,
"learning_rate": 1.1734620536109645e-05,
"loss": 0.5939,
"step": 880
},
{
"epoch": 2.0,
"grad_norm": 20.398212432861328,
"learning_rate": 1.1715293546846223e-05,
"loss": 0.5186,
"step": 890
},
{
"epoch": 2.0224719101123596,
"grad_norm": 15.60082721710205,
"learning_rate": 1.1695741900771185e-05,
"loss": 0.2977,
"step": 900
},
{
"epoch": 2.0224719101123596,
"eval_loss": 0.5883856415748596,
"eval_runtime": 28.4376,
"eval_samples_per_second": 13.925,
"eval_steps_per_second": 13.925,
"step": 900
},
{
"epoch": 2.044943820224719,
"grad_norm": 11.865720748901367,
"learning_rate": 1.1675966401576116e-05,
"loss": 0.3224,
"step": 910
},
{
"epoch": 2.067415730337079,
"grad_norm": 13.449054718017578,
"learning_rate": 1.1655967862154335e-05,
"loss": 0.3297,
"step": 920
},
{
"epoch": 2.0898876404494384,
"grad_norm": 18.293087005615234,
"learning_rate": 1.1635747104567469e-05,
"loss": 0.282,
"step": 930
},
{
"epoch": 2.1123595505617976,
"grad_norm": 15.209367752075195,
"learning_rate": 1.1615304960011663e-05,
"loss": 0.3504,
"step": 940
},
{
"epoch": 2.134831460674157,
"grad_norm": 17.038095474243164,
"learning_rate": 1.1594642268783414e-05,
"loss": 0.2422,
"step": 950
},
{
"epoch": 2.157303370786517,
"grad_norm": 14.031147956848145,
"learning_rate": 1.1573759880245028e-05,
"loss": 0.2884,
"step": 960
},
{
"epoch": 2.1797752808988764,
"grad_norm": 16.43686294555664,
"learning_rate": 1.1552658652789704e-05,
"loss": 0.2729,
"step": 970
},
{
"epoch": 2.202247191011236,
"grad_norm": 18.40167808532715,
"learning_rate": 1.153133945380626e-05,
"loss": 0.3775,
"step": 980
},
{
"epoch": 2.2247191011235956,
"grad_norm": 11.52310848236084,
"learning_rate": 1.1509803159643458e-05,
"loss": 0.4183,
"step": 990
},
{
"epoch": 2.247191011235955,
"grad_norm": 17.900732040405273,
"learning_rate": 1.1488050655574003e-05,
"loss": 0.3087,
"step": 1000
},
{
"epoch": 2.2696629213483144,
"grad_norm": 12.462130546569824,
"learning_rate": 1.1466082835758142e-05,
"loss": 0.371,
"step": 1010
},
{
"epoch": 2.292134831460674,
"grad_norm": 15.067594528198242,
"learning_rate": 1.1443900603206901e-05,
"loss": 0.2704,
"step": 1020
},
{
"epoch": 2.3146067415730336,
"grad_norm": 12.72071647644043,
"learning_rate": 1.1421504869744979e-05,
"loss": 0.2859,
"step": 1030
},
{
"epoch": 2.337078651685393,
"grad_norm": 19.164932250976562,
"learning_rate": 1.139889655597326e-05,
"loss": 0.3761,
"step": 1040
},
{
"epoch": 2.359550561797753,
"grad_norm": 12.603534698486328,
"learning_rate": 1.1376076591230975e-05,
"loss": 0.3465,
"step": 1050
},
{
"epoch": 2.359550561797753,
"eval_loss": 0.5863937735557556,
"eval_runtime": 28.6419,
"eval_samples_per_second": 13.826,
"eval_steps_per_second": 13.826,
"step": 1050
},
{
"epoch": 2.3820224719101124,
"grad_norm": 13.150344848632812,
"learning_rate": 1.1353045913557491e-05,
"loss": 0.3628,
"step": 1060
},
{
"epoch": 2.404494382022472,
"grad_norm": 13.253989219665527,
"learning_rate": 1.1329805469653767e-05,
"loss": 0.3521,
"step": 1070
},
{
"epoch": 2.4269662921348316,
"grad_norm": 15.47899055480957,
"learning_rate": 1.1306356214843423e-05,
"loss": 0.2289,
"step": 1080
},
{
"epoch": 2.449438202247191,
"grad_norm": 20.19160270690918,
"learning_rate": 1.1282699113033476e-05,
"loss": 0.2723,
"step": 1090
},
{
"epoch": 2.4719101123595504,
"grad_norm": 16.03728485107422,
"learning_rate": 1.125883513667473e-05,
"loss": 0.3398,
"step": 1100
},
{
"epoch": 2.49438202247191,
"grad_norm": 21.989660263061523,
"learning_rate": 1.123476526672178e-05,
"loss": 0.3926,
"step": 1110
},
{
"epoch": 2.5168539325842696,
"grad_norm": 16.33092498779297,
"learning_rate": 1.1210490492592705e-05,
"loss": 0.2243,
"step": 1120
},
{
"epoch": 2.539325842696629,
"grad_norm": 17.589994430541992,
"learning_rate": 1.118601181212839e-05,
"loss": 0.3433,
"step": 1130
},
{
"epoch": 2.561797752808989,
"grad_norm": 10.739886283874512,
"learning_rate": 1.1161330231551516e-05,
"loss": 0.2862,
"step": 1140
},
{
"epoch": 2.5842696629213484,
"grad_norm": 15.978007316589355,
"learning_rate": 1.1136446765425187e-05,
"loss": 0.2902,
"step": 1150
},
{
"epoch": 2.606741573033708,
"grad_norm": 12.640671730041504,
"learning_rate": 1.1111362436611233e-05,
"loss": 0.3325,
"step": 1160
},
{
"epoch": 2.629213483146067,
"grad_norm": 16.383241653442383,
"learning_rate": 1.1086078276228168e-05,
"loss": 0.3235,
"step": 1170
},
{
"epoch": 2.6516853932584272,
"grad_norm": 14.427544593811035,
"learning_rate": 1.1060595323608789e-05,
"loss": 0.3674,
"step": 1180
},
{
"epoch": 2.6741573033707864,
"grad_norm": 12.737131118774414,
"learning_rate": 1.1034914626257467e-05,
"loss": 0.2976,
"step": 1190
},
{
"epoch": 2.696629213483146,
"grad_norm": 13.494490623474121,
"learning_rate": 1.1009037239807091e-05,
"loss": 0.3386,
"step": 1200
},
{
"epoch": 2.696629213483146,
"eval_loss": 0.5752944946289062,
"eval_runtime": 28.5405,
"eval_samples_per_second": 13.875,
"eval_steps_per_second": 13.875,
"step": 1200
},
{
"epoch": 2.7191011235955056,
"grad_norm": 18.343557357788086,
"learning_rate": 1.098296422797566e-05,
"loss": 0.4059,
"step": 1210
},
{
"epoch": 2.741573033707865,
"grad_norm": 14.710756301879883,
"learning_rate": 1.095669666252257e-05,
"loss": 0.3214,
"step": 1220
},
{
"epoch": 2.764044943820225,
"grad_norm": 14.75973892211914,
"learning_rate": 1.0930235623204552e-05,
"loss": 0.3608,
"step": 1230
},
{
"epoch": 2.7865168539325844,
"grad_norm": 9.905698776245117,
"learning_rate": 1.0903582197731294e-05,
"loss": 0.299,
"step": 1240
},
{
"epoch": 2.808988764044944,
"grad_norm": 12.442407608032227,
"learning_rate": 1.0876737481720722e-05,
"loss": 0.4008,
"step": 1250
},
{
"epoch": 2.831460674157303,
"grad_norm": 11.765353202819824,
"learning_rate": 1.0849702578653969e-05,
"loss": 0.3743,
"step": 1260
},
{
"epoch": 2.853932584269663,
"grad_norm": 21.303213119506836,
"learning_rate": 1.0822478599830009e-05,
"loss": 0.4142,
"step": 1270
},
{
"epoch": 2.8764044943820224,
"grad_norm": 12.464083671569824,
"learning_rate": 1.0795066664319983e-05,
"loss": 0.3029,
"step": 1280
},
{
"epoch": 2.898876404494382,
"grad_norm": 20.486202239990234,
"learning_rate": 1.0767467898921198e-05,
"loss": 0.3827,
"step": 1290
},
{
"epoch": 2.9213483146067416,
"grad_norm": 16.888824462890625,
"learning_rate": 1.0739683438110799e-05,
"loss": 0.3854,
"step": 1300
},
{
"epoch": 2.943820224719101,
"grad_norm": 17.157941818237305,
"learning_rate": 1.0711714423999145e-05,
"loss": 0.3273,
"step": 1310
},
{
"epoch": 2.966292134831461,
"grad_norm": 16.582439422607422,
"learning_rate": 1.0683562006282862e-05,
"loss": 0.3334,
"step": 1320
},
{
"epoch": 2.98876404494382,
"grad_norm": 19.243640899658203,
"learning_rate": 1.0655227342197573e-05,
"loss": 0.3262,
"step": 1330
},
{
"epoch": 3.0112359550561796,
"grad_norm": 10.776522636413574,
"learning_rate": 1.0626711596470345e-05,
"loss": 0.2177,
"step": 1340
},
{
"epoch": 3.033707865168539,
"grad_norm": 13.728617668151855,
"learning_rate": 1.0598015941271792e-05,
"loss": 0.1772,
"step": 1350
},
{
"epoch": 3.033707865168539,
"eval_loss": 0.6071353554725647,
"eval_runtime": 28.1538,
"eval_samples_per_second": 14.066,
"eval_steps_per_second": 14.066,
"step": 1350
},
{
"epoch": 3.056179775280899,
"grad_norm": 15.800386428833008,
"learning_rate": 1.0569141556167905e-05,
"loss": 0.1571,
"step": 1360
},
{
"epoch": 3.0786516853932584,
"grad_norm": 11.595304489135742,
"learning_rate": 1.0540089628071565e-05,
"loss": 0.1715,
"step": 1370
},
{
"epoch": 3.101123595505618,
"grad_norm": 19.881999969482422,
"learning_rate": 1.0510861351193747e-05,
"loss": 0.1924,
"step": 1380
},
{
"epoch": 3.1235955056179776,
"grad_norm": 18.425518035888672,
"learning_rate": 1.0481457926994435e-05,
"loss": 0.1942,
"step": 1390
},
{
"epoch": 3.146067415730337,
"grad_norm": 19.82516098022461,
"learning_rate": 1.045188056413323e-05,
"loss": 0.1355,
"step": 1400
},
{
"epoch": 3.168539325842697,
"grad_norm": 21.702138900756836,
"learning_rate": 1.0422130478419676e-05,
"loss": 0.1985,
"step": 1410
},
{
"epoch": 3.191011235955056,
"grad_norm": 9.89587116241455,
"learning_rate": 1.0392208892763269e-05,
"loss": 0.1726,
"step": 1420
},
{
"epoch": 3.2134831460674156,
"grad_norm": 18.09695053100586,
"learning_rate": 1.0362117037123204e-05,
"loss": 0.2026,
"step": 1430
},
{
"epoch": 3.235955056179775,
"grad_norm": 18.88958168029785,
"learning_rate": 1.0331856148457804e-05,
"loss": 0.1631,
"step": 1440
},
{
"epoch": 3.258426966292135,
"grad_norm": 17.335662841796875,
"learning_rate": 1.030142747067368e-05,
"loss": 0.162,
"step": 1450
},
{
"epoch": 3.2808988764044944,
"grad_norm": 24.29629898071289,
"learning_rate": 1.027083225457459e-05,
"loss": 0.1695,
"step": 1460
},
{
"epoch": 3.303370786516854,
"grad_norm": 11.860310554504395,
"learning_rate": 1.0240071757810035e-05,
"loss": 0.1698,
"step": 1470
},
{
"epoch": 3.3258426966292136,
"grad_norm": 20.780662536621094,
"learning_rate": 1.0209147244823564e-05,
"loss": 0.1495,
"step": 1480
},
{
"epoch": 3.348314606741573,
"grad_norm": 21.13768768310547,
"learning_rate": 1.0178059986800773e-05,
"loss": 0.1736,
"step": 1490
},
{
"epoch": 3.370786516853933,
"grad_norm": 11.82013988494873,
"learning_rate": 1.0146811261617086e-05,
"loss": 0.2058,
"step": 1500
},
{
"epoch": 3.370786516853933,
"eval_loss": 0.5937667489051819,
"eval_runtime": 28.4093,
"eval_samples_per_second": 13.939,
"eval_steps_per_second": 13.939,
"step": 1500
},
{
"epoch": 3.393258426966292,
"grad_norm": 7.896106719970703,
"learning_rate": 1.0115402353785198e-05,
"loss": 0.1455,
"step": 1510
},
{
"epoch": 3.4157303370786516,
"grad_norm": 15.18211555480957,
"learning_rate": 1.0083834554402293e-05,
"loss": 0.1313,
"step": 1520
},
{
"epoch": 3.438202247191011,
"grad_norm": 14.026439666748047,
"learning_rate": 1.0052109161096959e-05,
"loss": 0.2089,
"step": 1530
},
{
"epoch": 3.460674157303371,
"grad_norm": 17.17043113708496,
"learning_rate": 1.0020227477975852e-05,
"loss": 0.214,
"step": 1540
},
{
"epoch": 3.4831460674157304,
"grad_norm": 16.3658447265625,
"learning_rate": 9.988190815570101e-06,
"loss": 0.2524,
"step": 1550
},
{
"epoch": 3.50561797752809,
"grad_norm": 13.139276504516602,
"learning_rate": 9.95600049078141e-06,
"loss": 0.1869,
"step": 1560
},
{
"epoch": 3.5280898876404496,
"grad_norm": 12.490981101989746,
"learning_rate": 9.923657826827957e-06,
"loss": 0.1826,
"step": 1570
},
{
"epoch": 3.550561797752809,
"grad_norm": 15.54690170288086,
"learning_rate": 9.891164153189975e-06,
"loss": 0.1897,
"step": 1580
},
{
"epoch": 3.5730337078651684,
"grad_norm": 15.49406623840332,
"learning_rate": 9.858520805555123e-06,
"loss": 0.1425,
"step": 1590
},
{
"epoch": 3.595505617977528,
"grad_norm": 15.071846008300781,
"learning_rate": 9.825729125763562e-06,
"loss": 0.2484,
"step": 1600
},
{
"epoch": 3.6179775280898876,
"grad_norm": 17.417387008666992,
"learning_rate": 9.792790461752813e-06,
"loss": 0.1701,
"step": 1610
},
{
"epoch": 3.640449438202247,
"grad_norm": 17.218463897705078,
"learning_rate": 9.759706167502343e-06,
"loss": 0.1796,
"step": 1620
},
{
"epoch": 3.662921348314607,
"grad_norm": 15.873698234558105,
"learning_rate": 9.726477602977906e-06,
"loss": 0.197,
"step": 1630
},
{
"epoch": 3.6853932584269664,
"grad_norm": 19.27431297302246,
"learning_rate": 9.693106134075641e-06,
"loss": 0.2239,
"step": 1640
},
{
"epoch": 3.7078651685393256,
"grad_norm": 16.935728073120117,
"learning_rate": 9.659593132565929e-06,
"loss": 0.1388,
"step": 1650
},
{
"epoch": 3.7078651685393256,
"eval_loss": 0.5907432436943054,
"eval_runtime": 28.0316,
"eval_samples_per_second": 14.127,
"eval_steps_per_second": 14.127,
"step": 1650
},
{
"epoch": 3.7303370786516856,
"grad_norm": 15.374393463134766,
"learning_rate": 9.625939976037002e-06,
"loss": 0.2022,
"step": 1660
},
{
"epoch": 3.752808988764045,
"grad_norm": 13.357222557067871,
"learning_rate": 9.59214804783831e-06,
"loss": 0.1715,
"step": 1670
},
{
"epoch": 3.7752808988764044,
"grad_norm": 15.346492767333984,
"learning_rate": 9.558218737023673e-06,
"loss": 0.2355,
"step": 1680
},
{
"epoch": 3.797752808988764,
"grad_norm": 8.775595664978027,
"learning_rate": 9.524153438294159e-06,
"loss": 0.192,
"step": 1690
},
{
"epoch": 3.8202247191011236,
"grad_norm": 18.73467254638672,
"learning_rate": 9.489953551940784e-06,
"loss": 0.226,
"step": 1700
},
{
"epoch": 3.842696629213483,
"grad_norm": 16.3373966217041,
"learning_rate": 9.455620483786914e-06,
"loss": 0.2295,
"step": 1710
},
{
"epoch": 3.865168539325843,
"grad_norm": 14.356298446655273,
"learning_rate": 9.421155645130514e-06,
"loss": 0.1508,
"step": 1720
},
{
"epoch": 3.8876404494382024,
"grad_norm": 20.167428970336914,
"learning_rate": 9.386560452686111e-06,
"loss": 0.2378,
"step": 1730
},
{
"epoch": 3.9101123595505616,
"grad_norm": 18.679561614990234,
"learning_rate": 9.351836328526564e-06,
"loss": 0.2386,
"step": 1740
},
{
"epoch": 3.932584269662921,
"grad_norm": 13.456331253051758,
"learning_rate": 9.316984700024613e-06,
"loss": 0.1977,
"step": 1750
},
{
"epoch": 3.955056179775281,
"grad_norm": 16.58353042602539,
"learning_rate": 9.282006999794201e-06,
"loss": 0.2215,
"step": 1760
},
{
"epoch": 3.9775280898876404,
"grad_norm": 12.72409439086914,
"learning_rate": 9.246904665631587e-06,
"loss": 0.166,
"step": 1770
},
{
"epoch": 4.0,
"grad_norm": 15.713534355163574,
"learning_rate": 9.211679140456241e-06,
"loss": 0.1595,
"step": 1780
},
{
"epoch": 4.022471910112359,
"grad_norm": 17.47097396850586,
"learning_rate": 9.176331872251538e-06,
"loss": 0.1038,
"step": 1790
},
{
"epoch": 4.044943820224719,
"grad_norm": 15.764228820800781,
"learning_rate": 9.140864314005223e-06,
"loss": 0.1084,
"step": 1800
},
{
"epoch": 4.044943820224719,
"eval_loss": 0.6473093032836914,
"eval_runtime": 28.5511,
"eval_samples_per_second": 13.87,
"eval_steps_per_second": 13.87,
"step": 1800
},
{
"epoch": 4.067415730337078,
"grad_norm": 19.521255493164062,
"learning_rate": 9.105277923649698e-06,
"loss": 0.1045,
"step": 1810
},
{
"epoch": 4.089887640449438,
"grad_norm": 6.521065711975098,
"learning_rate": 9.069574164002092e-06,
"loss": 0.0885,
"step": 1820
},
{
"epoch": 4.112359550561798,
"grad_norm": 10.468367576599121,
"learning_rate": 9.033754502704119e-06,
"loss": 0.0933,
"step": 1830
},
{
"epoch": 4.134831460674158,
"grad_norm": 8.916471481323242,
"learning_rate": 8.997820412161765e-06,
"loss": 0.1004,
"step": 1840
},
{
"epoch": 4.157303370786517,
"grad_norm": 7.306938171386719,
"learning_rate": 8.961773369484739e-06,
"loss": 0.1019,
"step": 1850
},
{
"epoch": 4.179775280898877,
"grad_norm": 16.291723251342773,
"learning_rate": 8.925614856425787e-06,
"loss": 0.1043,
"step": 1860
},
{
"epoch": 4.202247191011236,
"grad_norm": 11.017843246459961,
"learning_rate": 8.88934635931975e-06,
"loss": 0.1468,
"step": 1870
},
{
"epoch": 4.224719101123595,
"grad_norm": 11.328751564025879,
"learning_rate": 8.852969369022494e-06,
"loss": 0.1022,
"step": 1880
},
{
"epoch": 4.247191011235955,
"grad_norm": 10.77855396270752,
"learning_rate": 8.816485380849613e-06,
"loss": 0.1098,
"step": 1890
},
{
"epoch": 4.269662921348314,
"grad_norm": 21.963041305541992,
"learning_rate": 8.779895894514961e-06,
"loss": 0.0932,
"step": 1900
},
{
"epoch": 4.292134831460674,
"grad_norm": 10.421232223510742,
"learning_rate": 8.743202414069012e-06,
"loss": 0.1115,
"step": 1910
},
{
"epoch": 4.314606741573034,
"grad_norm": 12.438689231872559,
"learning_rate": 8.706406447837024e-06,
"loss": 0.093,
"step": 1920
},
{
"epoch": 4.337078651685394,
"grad_norm": 15.851973533630371,
"learning_rate": 8.669509508357052e-06,
"loss": 0.1099,
"step": 1930
},
{
"epoch": 4.359550561797753,
"grad_norm": 12.311159133911133,
"learning_rate": 8.632513112317761e-06,
"loss": 0.1131,
"step": 1940
},
{
"epoch": 4.382022471910112,
"grad_norm": 13.830821990966797,
"learning_rate": 8.59541878049609e-06,
"loss": 0.1418,
"step": 1950
},
{
"epoch": 4.382022471910112,
"eval_loss": 0.6235558390617371,
"eval_runtime": 28.0922,
"eval_samples_per_second": 14.096,
"eval_steps_per_second": 14.096,
"step": 1950
},
{
"epoch": 4.404494382022472,
"grad_norm": 20.932544708251953,
"learning_rate": 8.558228037694728e-06,
"loss": 0.0978,
"step": 1960
},
{
"epoch": 4.426966292134831,
"grad_norm": 17.910070419311523,
"learning_rate": 8.520942412679448e-06,
"loss": 0.1239,
"step": 1970
},
{
"epoch": 4.449438202247191,
"grad_norm": 13.0515718460083,
"learning_rate": 8.483563438116257e-06,
"loss": 0.0958,
"step": 1980
},
{
"epoch": 4.47191011235955,
"grad_norm": 14.775212287902832,
"learning_rate": 8.446092650508393e-06,
"loss": 0.0913,
"step": 1990
},
{
"epoch": 4.49438202247191,
"grad_norm": 14.85338306427002,
"learning_rate": 8.408531590133173e-06,
"loss": 0.1077,
"step": 2000
},
{
"epoch": 4.51685393258427,
"grad_norm": 10.560493469238281,
"learning_rate": 8.370881800978673e-06,
"loss": 0.1092,
"step": 2010
},
{
"epoch": 4.539325842696629,
"grad_norm": 11.044146537780762,
"learning_rate": 8.333144830680262e-06,
"loss": 0.1359,
"step": 2020
},
{
"epoch": 4.561797752808989,
"grad_norm": 10.464937210083008,
"learning_rate": 8.29532223045698e-06,
"loss": 0.0996,
"step": 2030
},
{
"epoch": 4.584269662921348,
"grad_norm": 11.029562950134277,
"learning_rate": 8.257415555047786e-06,
"loss": 0.1169,
"step": 2040
},
{
"epoch": 4.606741573033708,
"grad_norm": 7.897212028503418,
"learning_rate": 8.219426362647631e-06,
"loss": 0.101,
"step": 2050
},
{
"epoch": 4.629213483146067,
"grad_norm": 12.705263137817383,
"learning_rate": 8.181356214843423e-06,
"loss": 0.1036,
"step": 2060
},
{
"epoch": 4.651685393258427,
"grad_norm": 18.093433380126953,
"learning_rate": 8.143206676549826e-06,
"loss": 0.1008,
"step": 2070
},
{
"epoch": 4.674157303370786,
"grad_norm": 15.597516059875488,
"learning_rate": 8.104979315944941e-06,
"loss": 0.1057,
"step": 2080
},
{
"epoch": 4.696629213483146,
"grad_norm": 11.599696159362793,
"learning_rate": 8.066675704405837e-06,
"loss": 0.102,
"step": 2090
},
{
"epoch": 4.719101123595506,
"grad_norm": 13.57479190826416,
"learning_rate": 8.028297416443953e-06,
"loss": 0.1374,
"step": 2100
},
{
"epoch": 4.719101123595506,
"eval_loss": 0.6200308203697205,
"eval_runtime": 28.1044,
"eval_samples_per_second": 14.09,
"eval_steps_per_second": 14.09,
"step": 2100
}
],
"logging_steps": 10,
"max_steps": 5000,
"num_input_tokens_seen": 0,
"num_train_epochs": 12,
"save_steps": 150,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.0762916960406733e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}
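
The JSON above is the trainer_state.json that the Hugging Face Transformers Trainer writes alongside each checkpoint. Its log_history interleaves training-loss records (every 10 steps, per logging_steps) with evaluation records (every 150 steps, per eval_steps). A minimal Python sketch for inspecting the file follows; the file path is an assumption and should be adjusted to the actual checkpoint directory.

import json

# Load the checkpoint state (the path is an assumption, not taken from this repo's layout).
with open("checkpoint-2100/trainer_state.json") as f:
    state = json.load(f)

# log_history mixes two record types: training logs carry a "loss" key,
# evaluation logs carry an "eval_loss" key.
train_log = [e for e in state["log_history"] if "loss" in e]
eval_log = [e for e in state["log_history"] if "eval_loss" in e]

print("best eval loss:", state["best_metric"], "from", state["best_model_checkpoint"])
print("latest train loss:", train_log[-1]["loss"], "at step", train_log[-1]["step"])
print("latest eval loss:", eval_log[-1]["eval_loss"], "at step", eval_log[-1]["step"])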