{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 50,
"global_step": 2038,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004906771344455349,
"grad_norm": 0.1728515625,
"learning_rate": 5e-06,
"loss": 0.6955,
"step": 10
},
{
"epoch": 0.009813542688910697,
"grad_norm": 0.291015625,
"learning_rate": 1e-05,
"loss": 0.6952,
"step": 20
},
{
"epoch": 0.014720314033366046,
"grad_norm": 0.25,
"learning_rate": 1.5e-05,
"loss": 0.6964,
"step": 30
},
{
"epoch": 0.019627085377821395,
"grad_norm": 0.212890625,
"learning_rate": 2e-05,
"loss": 0.6966,
"step": 40
},
{
"epoch": 0.02453385672227674,
"grad_norm": 0.302734375,
"learning_rate": 2.5e-05,
"loss": 0.6944,
"step": 50
},
{
"epoch": 0.02453385672227674,
"eval_loss": 0.6937512159347534,
"eval_runtime": 28.3512,
"eval_samples_per_second": 7.054,
"eval_steps_per_second": 1.764,
"step": 50
},
{
"epoch": 0.029440628066732092,
"grad_norm": 0.2353515625,
"learning_rate": 3e-05,
"loss": 0.6944,
"step": 60
},
{
"epoch": 0.03434739941118744,
"grad_norm": 0.14453125,
"learning_rate": 3.5e-05,
"loss": 0.694,
"step": 70
},
{
"epoch": 0.03925417075564279,
"grad_norm": 0.26953125,
"learning_rate": 4e-05,
"loss": 0.6938,
"step": 80
},
{
"epoch": 0.04416094210009813,
"grad_norm": 0.279296875,
"learning_rate": 4.5e-05,
"loss": 0.6928,
"step": 90
},
{
"epoch": 0.04906771344455348,
"grad_norm": 0.2490234375,
"learning_rate": 5e-05,
"loss": 0.6928,
"step": 100
},
{
"epoch": 0.04906771344455348,
"eval_loss": 0.6929062008857727,
"eval_runtime": 28.6139,
"eval_samples_per_second": 6.99,
"eval_steps_per_second": 1.747,
"step": 100
},
{
"epoch": 0.053974484789008834,
"grad_norm": 0.296875,
"learning_rate": 4.974200206398349e-05,
"loss": 0.692,
"step": 110
},
{
"epoch": 0.058881256133464184,
"grad_norm": 0.33984375,
"learning_rate": 4.948400412796697e-05,
"loss": 0.6924,
"step": 120
},
{
"epoch": 0.06378802747791953,
"grad_norm": 0.3359375,
"learning_rate": 4.922600619195047e-05,
"loss": 0.6921,
"step": 130
},
{
"epoch": 0.06869479882237488,
"grad_norm": 0.267578125,
"learning_rate": 4.896800825593396e-05,
"loss": 0.6927,
"step": 140
},
{
"epoch": 0.07360157016683022,
"grad_norm": 0.2236328125,
"learning_rate": 4.8710010319917446e-05,
"loss": 0.6914,
"step": 150
},
{
"epoch": 0.07360157016683022,
"eval_loss": 0.6915245056152344,
"eval_runtime": 28.6003,
"eval_samples_per_second": 6.993,
"eval_steps_per_second": 1.748,
"step": 150
},
{
"epoch": 0.07850834151128558,
"grad_norm": 0.345703125,
"learning_rate": 4.845201238390093e-05,
"loss": 0.6923,
"step": 160
},
{
"epoch": 0.08341511285574092,
"grad_norm": 0.3046875,
"learning_rate": 4.819401444788442e-05,
"loss": 0.6893,
"step": 170
},
{
"epoch": 0.08832188420019627,
"grad_norm": 0.28125,
"learning_rate": 4.793601651186791e-05,
"loss": 0.6865,
"step": 180
},
{
"epoch": 0.09322865554465162,
"grad_norm": 0.43359375,
"learning_rate": 4.7678018575851394e-05,
"loss": 0.6877,
"step": 190
},
{
"epoch": 0.09813542688910697,
"grad_norm": 0.42578125,
"learning_rate": 4.742002063983488e-05,
"loss": 0.6852,
"step": 200
},
{
"epoch": 0.09813542688910697,
"eval_loss": 0.6849114894866943,
"eval_runtime": 28.416,
"eval_samples_per_second": 7.038,
"eval_steps_per_second": 1.76,
"step": 200
},
{
"epoch": 0.10304219823356231,
"grad_norm": 0.39453125,
"learning_rate": 4.716202270381837e-05,
"loss": 0.6767,
"step": 210
},
{
"epoch": 0.10794896957801767,
"grad_norm": 0.45703125,
"learning_rate": 4.690402476780186e-05,
"loss": 0.6804,
"step": 220
},
{
"epoch": 0.11285574092247301,
"grad_norm": 0.78125,
"learning_rate": 4.664602683178535e-05,
"loss": 0.6814,
"step": 230
},
{
"epoch": 0.11776251226692837,
"grad_norm": 2.046875,
"learning_rate": 4.638802889576884e-05,
"loss": 0.6586,
"step": 240
},
{
"epoch": 0.12266928361138371,
"grad_norm": 1.828125,
"learning_rate": 4.613003095975233e-05,
"loss": 0.6478,
"step": 250
},
{
"epoch": 0.12266928361138371,
"eval_loss": 0.6532555222511292,
"eval_runtime": 28.4259,
"eval_samples_per_second": 7.036,
"eval_steps_per_second": 1.759,
"step": 250
},
{
"epoch": 0.12757605495583907,
"grad_norm": 1.90625,
"learning_rate": 4.587203302373581e-05,
"loss": 0.6473,
"step": 260
},
{
"epoch": 0.1324828263002944,
"grad_norm": 1.84375,
"learning_rate": 4.56140350877193e-05,
"loss": 0.6397,
"step": 270
},
{
"epoch": 0.13738959764474976,
"grad_norm": 2.046875,
"learning_rate": 4.535603715170279e-05,
"loss": 0.6243,
"step": 280
},
{
"epoch": 0.1422963689892051,
"grad_norm": 1.640625,
"learning_rate": 4.5098039215686275e-05,
"loss": 0.6304,
"step": 290
},
{
"epoch": 0.14720314033366044,
"grad_norm": 1.84375,
"learning_rate": 4.4840041279669764e-05,
"loss": 0.6278,
"step": 300
},
{
"epoch": 0.14720314033366044,
"eval_loss": 0.6153883337974548,
"eval_runtime": 28.6968,
"eval_samples_per_second": 6.969,
"eval_steps_per_second": 1.742,
"step": 300
},
{
"epoch": 0.1521099116781158,
"grad_norm": 1.5,
"learning_rate": 4.458204334365325e-05,
"loss": 0.6067,
"step": 310
},
{
"epoch": 0.15701668302257116,
"grad_norm": 1.7734375,
"learning_rate": 4.432404540763674e-05,
"loss": 0.5956,
"step": 320
},
{
"epoch": 0.1619234543670265,
"grad_norm": 1.640625,
"learning_rate": 4.406604747162023e-05,
"loss": 0.6263,
"step": 330
},
{
"epoch": 0.16683022571148184,
"grad_norm": 1.5,
"learning_rate": 4.380804953560372e-05,
"loss": 0.5947,
"step": 340
},
{
"epoch": 0.1717369970559372,
"grad_norm": 1.0390625,
"learning_rate": 4.355005159958721e-05,
"loss": 0.609,
"step": 350
},
{
"epoch": 0.1717369970559372,
"eval_loss": 0.6010438799858093,
"eval_runtime": 28.6965,
"eval_samples_per_second": 6.969,
"eval_steps_per_second": 1.742,
"step": 350
},
{
"epoch": 0.17664376840039253,
"grad_norm": 1.90625,
"learning_rate": 4.329205366357069e-05,
"loss": 0.5948,
"step": 360
},
{
"epoch": 0.1815505397448479,
"grad_norm": 1.546875,
"learning_rate": 4.303405572755418e-05,
"loss": 0.6043,
"step": 370
},
{
"epoch": 0.18645731108930325,
"grad_norm": 1.71875,
"learning_rate": 4.2776057791537674e-05,
"loss": 0.5791,
"step": 380
},
{
"epoch": 0.19136408243375858,
"grad_norm": 1.2109375,
"learning_rate": 4.2518059855521156e-05,
"loss": 0.56,
"step": 390
},
{
"epoch": 0.19627085377821393,
"grad_norm": 1.7734375,
"learning_rate": 4.2260061919504645e-05,
"loss": 0.5801,
"step": 400
},
{
"epoch": 0.19627085377821393,
"eval_loss": 0.5847232937812805,
"eval_runtime": 28.6639,
"eval_samples_per_second": 6.977,
"eval_steps_per_second": 1.744,
"step": 400
},
{
"epoch": 0.2011776251226693,
"grad_norm": 1.703125,
"learning_rate": 4.200206398348813e-05,
"loss": 0.5517,
"step": 410
},
{
"epoch": 0.20608439646712462,
"grad_norm": 1.296875,
"learning_rate": 4.174406604747162e-05,
"loss": 0.5799,
"step": 420
},
{
"epoch": 0.21099116781157998,
"grad_norm": 1.1953125,
"learning_rate": 4.148606811145511e-05,
"loss": 0.54,
"step": 430
},
{
"epoch": 0.21589793915603533,
"grad_norm": 3.859375,
"learning_rate": 4.12280701754386e-05,
"loss": 0.5591,
"step": 440
},
{
"epoch": 0.22080471050049066,
"grad_norm": 1.1484375,
"learning_rate": 4.097007223942209e-05,
"loss": 0.5558,
"step": 450
},
{
"epoch": 0.22080471050049066,
"eval_loss": 0.5724604725837708,
"eval_runtime": 28.6438,
"eval_samples_per_second": 6.982,
"eval_steps_per_second": 1.746,
"step": 450
},
{
"epoch": 0.22571148184494602,
"grad_norm": 1.2265625,
"learning_rate": 4.071207430340557e-05,
"loss": 0.6203,
"step": 460
},
{
"epoch": 0.23061825318940138,
"grad_norm": 1.5234375,
"learning_rate": 4.0454076367389066e-05,
"loss": 0.5813,
"step": 470
},
{
"epoch": 0.23552502453385674,
"grad_norm": 2.5,
"learning_rate": 4.0196078431372555e-05,
"loss": 0.5946,
"step": 480
},
{
"epoch": 0.24043179587831207,
"grad_norm": 1.671875,
"learning_rate": 3.9938080495356037e-05,
"loss": 0.5635,
"step": 490
},
{
"epoch": 0.24533856722276742,
"grad_norm": 1.3125,
"learning_rate": 3.9680082559339525e-05,
"loss": 0.5625,
"step": 500
},
{
"epoch": 0.24533856722276742,
"eval_loss": 0.5732089877128601,
"eval_runtime": 28.6521,
"eval_samples_per_second": 6.98,
"eval_steps_per_second": 1.745,
"step": 500
},
{
"epoch": 0.25024533856722275,
"grad_norm": 1.453125,
"learning_rate": 3.9422084623323014e-05,
"loss": 0.5378,
"step": 510
},
{
"epoch": 0.25515210991167814,
"grad_norm": 1.046875,
"learning_rate": 3.91640866873065e-05,
"loss": 0.5908,
"step": 520
},
{
"epoch": 0.26005888125613347,
"grad_norm": 1.2421875,
"learning_rate": 3.890608875128999e-05,
"loss": 0.576,
"step": 530
},
{
"epoch": 0.2649656526005888,
"grad_norm": 0.87109375,
"learning_rate": 3.864809081527348e-05,
"loss": 0.5248,
"step": 540
},
{
"epoch": 0.2698724239450442,
"grad_norm": 2.046875,
"learning_rate": 3.839009287925697e-05,
"loss": 0.5448,
"step": 550
},
{
"epoch": 0.2698724239450442,
"eval_loss": 0.5607851147651672,
"eval_runtime": 28.5118,
"eval_samples_per_second": 7.015,
"eval_steps_per_second": 1.754,
"step": 550
},
{
"epoch": 0.2747791952894995,
"grad_norm": 1.828125,
"learning_rate": 3.813209494324045e-05,
"loss": 0.5693,
"step": 560
},
{
"epoch": 0.27968596663395484,
"grad_norm": 0.65234375,
"learning_rate": 3.7874097007223946e-05,
"loss": 0.5248,
"step": 570
},
{
"epoch": 0.2845927379784102,
"grad_norm": 1.4453125,
"learning_rate": 3.7616099071207435e-05,
"loss": 0.5815,
"step": 580
},
{
"epoch": 0.28949950932286556,
"grad_norm": 1.0078125,
"learning_rate": 3.735810113519092e-05,
"loss": 0.5524,
"step": 590
},
{
"epoch": 0.2944062806673209,
"grad_norm": 1.3984375,
"learning_rate": 3.7100103199174406e-05,
"loss": 0.5517,
"step": 600
},
{
"epoch": 0.2944062806673209,
"eval_loss": 0.5586492419242859,
"eval_runtime": 28.5258,
"eval_samples_per_second": 7.011,
"eval_steps_per_second": 1.753,
"step": 600
},
{
"epoch": 0.29931305201177627,
"grad_norm": 1.375,
"learning_rate": 3.6842105263157895e-05,
"loss": 0.5308,
"step": 610
},
{
"epoch": 0.3042198233562316,
"grad_norm": 1.3046875,
"learning_rate": 3.658410732714139e-05,
"loss": 0.5938,
"step": 620
},
{
"epoch": 0.30912659470068693,
"grad_norm": 1.78125,
"learning_rate": 3.632610939112487e-05,
"loss": 0.5611,
"step": 630
},
{
"epoch": 0.3140333660451423,
"grad_norm": 0.74609375,
"learning_rate": 3.606811145510836e-05,
"loss": 0.5709,
"step": 640
},
{
"epoch": 0.31894013738959764,
"grad_norm": 0.99609375,
"learning_rate": 3.581011351909185e-05,
"loss": 0.6033,
"step": 650
},
{
"epoch": 0.31894013738959764,
"eval_loss": 0.564706027507782,
"eval_runtime": 28.5712,
"eval_samples_per_second": 7.0,
"eval_steps_per_second": 1.75,
"step": 650
},
{
"epoch": 0.323846908734053,
"grad_norm": 1.2578125,
"learning_rate": 3.555211558307533e-05,
"loss": 0.5123,
"step": 660
},
{
"epoch": 0.32875368007850836,
"grad_norm": 0.97265625,
"learning_rate": 3.529411764705883e-05,
"loss": 0.5205,
"step": 670
},
{
"epoch": 0.3336604514229637,
"grad_norm": 1.6953125,
"learning_rate": 3.5036119711042316e-05,
"loss": 0.5854,
"step": 680
},
{
"epoch": 0.338567222767419,
"grad_norm": 1.453125,
"learning_rate": 3.4778121775025805e-05,
"loss": 0.5375,
"step": 690
},
{
"epoch": 0.3434739941118744,
"grad_norm": 1.03125,
"learning_rate": 3.452012383900929e-05,
"loss": 0.5336,
"step": 700
},
{
"epoch": 0.3434739941118744,
"eval_loss": 0.5557608008384705,
"eval_runtime": 28.4609,
"eval_samples_per_second": 7.027,
"eval_steps_per_second": 1.757,
"step": 700
},
{
"epoch": 0.34838076545632973,
"grad_norm": 1.5859375,
"learning_rate": 3.4262125902992775e-05,
"loss": 0.5533,
"step": 710
},
{
"epoch": 0.35328753680078506,
"grad_norm": 0.8984375,
"learning_rate": 3.400412796697627e-05,
"loss": 0.5333,
"step": 720
},
{
"epoch": 0.35819430814524045,
"grad_norm": 1.09375,
"learning_rate": 3.374613003095975e-05,
"loss": 0.5536,
"step": 730
},
{
"epoch": 0.3631010794896958,
"grad_norm": 1.1171875,
"learning_rate": 3.348813209494324e-05,
"loss": 0.5448,
"step": 740
},
{
"epoch": 0.3680078508341511,
"grad_norm": 0.96875,
"learning_rate": 3.323013415892673e-05,
"loss": 0.532,
"step": 750
},
{
"epoch": 0.3680078508341511,
"eval_loss": 0.5521541833877563,
"eval_runtime": 28.4595,
"eval_samples_per_second": 7.028,
"eval_steps_per_second": 1.757,
"step": 750
},
{
"epoch": 0.3729146221786065,
"grad_norm": 0.90625,
"learning_rate": 3.297213622291022e-05,
"loss": 0.565,
"step": 760
},
{
"epoch": 0.3778213935230618,
"grad_norm": 1.4140625,
"learning_rate": 3.271413828689371e-05,
"loss": 0.5278,
"step": 770
},
{
"epoch": 0.38272816486751715,
"grad_norm": 1.3359375,
"learning_rate": 3.24561403508772e-05,
"loss": 0.5582,
"step": 780
},
{
"epoch": 0.38763493621197254,
"grad_norm": 0.84375,
"learning_rate": 3.2198142414860685e-05,
"loss": 0.5253,
"step": 790
},
{
"epoch": 0.39254170755642787,
"grad_norm": 1.4296875,
"learning_rate": 3.194014447884417e-05,
"loss": 0.5596,
"step": 800
},
{
"epoch": 0.39254170755642787,
"eval_loss": 0.551950216293335,
"eval_runtime": 28.6541,
"eval_samples_per_second": 6.98,
"eval_steps_per_second": 1.745,
"step": 800
},
{
"epoch": 0.3974484789008832,
"grad_norm": 1.484375,
"learning_rate": 3.1682146542827656e-05,
"loss": 0.5513,
"step": 810
},
{
"epoch": 0.4023552502453386,
"grad_norm": 1.21875,
"learning_rate": 3.142414860681115e-05,
"loss": 0.5023,
"step": 820
},
{
"epoch": 0.4072620215897939,
"grad_norm": 1.109375,
"learning_rate": 3.1166150670794634e-05,
"loss": 0.5519,
"step": 830
},
{
"epoch": 0.41216879293424924,
"grad_norm": 1.3984375,
"learning_rate": 3.090815273477812e-05,
"loss": 0.51,
"step": 840
},
{
"epoch": 0.4170755642787046,
"grad_norm": 1.0859375,
"learning_rate": 3.065015479876161e-05,
"loss": 0.5496,
"step": 850
},
{
"epoch": 0.4170755642787046,
"eval_loss": 0.5526400208473206,
"eval_runtime": 28.572,
"eval_samples_per_second": 7.0,
"eval_steps_per_second": 1.75,
"step": 850
},
{
"epoch": 0.42198233562315995,
"grad_norm": 1.2265625,
"learning_rate": 3.0392156862745097e-05,
"loss": 0.5372,
"step": 860
},
{
"epoch": 0.4268891069676153,
"grad_norm": 1.9140625,
"learning_rate": 3.013415892672859e-05,
"loss": 0.5384,
"step": 870
},
{
"epoch": 0.43179587831207067,
"grad_norm": 0.99609375,
"learning_rate": 2.9876160990712077e-05,
"loss": 0.4739,
"step": 880
},
{
"epoch": 0.436702649656526,
"grad_norm": 1.15625,
"learning_rate": 2.9618163054695563e-05,
"loss": 0.4766,
"step": 890
},
{
"epoch": 0.44160942100098133,
"grad_norm": 1.7734375,
"learning_rate": 2.936016511867905e-05,
"loss": 0.5321,
"step": 900
},
{
"epoch": 0.44160942100098133,
"eval_loss": 0.5481389164924622,
"eval_runtime": 28.6453,
"eval_samples_per_second": 6.982,
"eval_steps_per_second": 1.745,
"step": 900
},
{
"epoch": 0.4465161923454367,
"grad_norm": 0.9453125,
"learning_rate": 2.9102167182662537e-05,
"loss": 0.5654,
"step": 910
},
{
"epoch": 0.45142296368989204,
"grad_norm": 1.09375,
"learning_rate": 2.884416924664603e-05,
"loss": 0.5514,
"step": 920
},
{
"epoch": 0.4563297350343474,
"grad_norm": 1.1875,
"learning_rate": 2.8586171310629518e-05,
"loss": 0.4833,
"step": 930
},
{
"epoch": 0.46123650637880276,
"grad_norm": 1.4609375,
"learning_rate": 2.8328173374613003e-05,
"loss": 0.549,
"step": 940
},
{
"epoch": 0.4661432777232581,
"grad_norm": 0.9453125,
"learning_rate": 2.8070175438596492e-05,
"loss": 0.5194,
"step": 950
},
{
"epoch": 0.4661432777232581,
"eval_loss": 0.5417376160621643,
"eval_runtime": 28.3949,
"eval_samples_per_second": 7.044,
"eval_steps_per_second": 1.761,
"step": 950
},
{
"epoch": 0.47105004906771347,
"grad_norm": 1.4296875,
"learning_rate": 2.7812177502579977e-05,
"loss": 0.5468,
"step": 960
},
{
"epoch": 0.4759568204121688,
"grad_norm": 0.859375,
"learning_rate": 2.755417956656347e-05,
"loss": 0.5307,
"step": 970
},
{
"epoch": 0.48086359175662413,
"grad_norm": 1.6640625,
"learning_rate": 2.7296181630546958e-05,
"loss": 0.5379,
"step": 980
},
{
"epoch": 0.4857703631010795,
"grad_norm": 1.8828125,
"learning_rate": 2.7038183694530443e-05,
"loss": 0.5197,
"step": 990
},
{
"epoch": 0.49067713444553485,
"grad_norm": 1.265625,
"learning_rate": 2.6780185758513932e-05,
"loss": 0.5557,
"step": 1000
},
{
"epoch": 0.49067713444553485,
"eval_loss": 0.5419167876243591,
"eval_runtime": 28.5704,
"eval_samples_per_second": 7.0,
"eval_steps_per_second": 1.75,
"step": 1000
},
{
"epoch": 0.4955839057899902,
"grad_norm": 0.984375,
"learning_rate": 2.6522187822497424e-05,
"loss": 0.5121,
"step": 1010
},
{
"epoch": 0.5004906771344455,
"grad_norm": 1.7421875,
"learning_rate": 2.626418988648091e-05,
"loss": 0.4932,
"step": 1020
},
{
"epoch": 0.5053974484789009,
"grad_norm": 1.1015625,
"learning_rate": 2.60061919504644e-05,
"loss": 0.5361,
"step": 1030
},
{
"epoch": 0.5103042198233563,
"grad_norm": 1.1015625,
"learning_rate": 2.5748194014447884e-05,
"loss": 0.5107,
"step": 1040
},
{
"epoch": 0.5152109911678115,
"grad_norm": 1.359375,
"learning_rate": 2.5490196078431373e-05,
"loss": 0.5111,
"step": 1050
},
{
"epoch": 0.5152109911678115,
"eval_loss": 0.5413140654563904,
"eval_runtime": 28.49,
"eval_samples_per_second": 7.02,
"eval_steps_per_second": 1.755,
"step": 1050
},
{
"epoch": 0.5201177625122669,
"grad_norm": 0.57421875,
"learning_rate": 2.5232198142414865e-05,
"loss": 0.5302,
"step": 1060
},
{
"epoch": 0.5250245338567223,
"grad_norm": 1.859375,
"learning_rate": 2.497420020639835e-05,
"loss": 0.5089,
"step": 1070
},
{
"epoch": 0.5299313052011776,
"grad_norm": 1.78125,
"learning_rate": 2.471620227038184e-05,
"loss": 0.5451,
"step": 1080
},
{
"epoch": 0.534838076545633,
"grad_norm": 1.0546875,
"learning_rate": 2.4458204334365324e-05,
"loss": 0.5736,
"step": 1090
},
{
"epoch": 0.5397448478900884,
"grad_norm": 1.28125,
"learning_rate": 2.4200206398348816e-05,
"loss": 0.5065,
"step": 1100
},
{
"epoch": 0.5397448478900884,
"eval_loss": 0.5397326350212097,
"eval_runtime": 28.806,
"eval_samples_per_second": 6.943,
"eval_steps_per_second": 1.736,
"step": 1100
},
{
"epoch": 0.5446516192345436,
"grad_norm": 1.046875,
"learning_rate": 2.39422084623323e-05,
"loss": 0.5719,
"step": 1110
},
{
"epoch": 0.549558390578999,
"grad_norm": 0.75390625,
"learning_rate": 2.368421052631579e-05,
"loss": 0.5343,
"step": 1120
},
{
"epoch": 0.5544651619234544,
"grad_norm": 1.6484375,
"learning_rate": 2.342621259029928e-05,
"loss": 0.523,
"step": 1130
},
{
"epoch": 0.5593719332679097,
"grad_norm": 0.9765625,
"learning_rate": 2.3168214654282765e-05,
"loss": 0.5345,
"step": 1140
},
{
"epoch": 0.5642787046123651,
"grad_norm": 0.8359375,
"learning_rate": 2.2910216718266257e-05,
"loss": 0.5264,
"step": 1150
},
{
"epoch": 0.5642787046123651,
"eval_loss": 0.5375524759292603,
"eval_runtime": 28.5508,
"eval_samples_per_second": 7.005,
"eval_steps_per_second": 1.751,
"step": 1150
},
{
"epoch": 0.5691854759568205,
"grad_norm": 0.9609375,
"learning_rate": 2.2652218782249742e-05,
"loss": 0.4863,
"step": 1160
},
{
"epoch": 0.5740922473012757,
"grad_norm": 1.6875,
"learning_rate": 2.2394220846233234e-05,
"loss": 0.4676,
"step": 1170
},
{
"epoch": 0.5789990186457311,
"grad_norm": 1.3984375,
"learning_rate": 2.213622291021672e-05,
"loss": 0.5751,
"step": 1180
},
{
"epoch": 0.5839057899901865,
"grad_norm": 2.15625,
"learning_rate": 2.1878224974200205e-05,
"loss": 0.5694,
"step": 1190
},
{
"epoch": 0.5888125613346418,
"grad_norm": 0.98828125,
"learning_rate": 2.1620227038183697e-05,
"loss": 0.5529,
"step": 1200
},
{
"epoch": 0.5888125613346418,
"eval_loss": 0.5384119153022766,
"eval_runtime": 28.624,
"eval_samples_per_second": 6.987,
"eval_steps_per_second": 1.747,
"step": 1200
},
{
"epoch": 0.5937193326790972,
"grad_norm": 1.984375,
"learning_rate": 2.1362229102167182e-05,
"loss": 0.5438,
"step": 1210
},
{
"epoch": 0.5986261040235525,
"grad_norm": 1.3203125,
"learning_rate": 2.1104231166150675e-05,
"loss": 0.5174,
"step": 1220
},
{
"epoch": 0.6035328753680078,
"grad_norm": 1.5078125,
"learning_rate": 2.084623323013416e-05,
"loss": 0.5421,
"step": 1230
},
{
"epoch": 0.6084396467124632,
"grad_norm": 1.234375,
"learning_rate": 2.058823529411765e-05,
"loss": 0.5427,
"step": 1240
},
{
"epoch": 0.6133464180569186,
"grad_norm": 0.95703125,
"learning_rate": 2.0330237358101137e-05,
"loss": 0.5356,
"step": 1250
},
{
"epoch": 0.6133464180569186,
"eval_loss": 0.5382110476493835,
"eval_runtime": 28.6288,
"eval_samples_per_second": 6.986,
"eval_steps_per_second": 1.746,
"step": 1250
},
{
"epoch": 0.6182531894013739,
"grad_norm": 1.53125,
"learning_rate": 2.0072239422084623e-05,
"loss": 0.5667,
"step": 1260
},
{
"epoch": 0.6231599607458292,
"grad_norm": 1.5078125,
"learning_rate": 1.9814241486068115e-05,
"loss": 0.5421,
"step": 1270
},
{
"epoch": 0.6280667320902846,
"grad_norm": 1.6015625,
"learning_rate": 1.95562435500516e-05,
"loss": 0.4955,
"step": 1280
},
{
"epoch": 0.6329735034347399,
"grad_norm": 1.2890625,
"learning_rate": 1.929824561403509e-05,
"loss": 0.4995,
"step": 1290
},
{
"epoch": 0.6378802747791953,
"grad_norm": 1.4140625,
"learning_rate": 1.9040247678018578e-05,
"loss": 0.5029,
"step": 1300
},
{
"epoch": 0.6378802747791953,
"eval_loss": 0.5337280035018921,
"eval_runtime": 28.4662,
"eval_samples_per_second": 7.026,
"eval_steps_per_second": 1.756,
"step": 1300
},
{
"epoch": 0.6427870461236507,
"grad_norm": 1.3203125,
"learning_rate": 1.8782249742002063e-05,
"loss": 0.5149,
"step": 1310
},
{
"epoch": 0.647693817468106,
"grad_norm": 0.91796875,
"learning_rate": 1.8524251805985555e-05,
"loss": 0.5011,
"step": 1320
},
{
"epoch": 0.6526005888125613,
"grad_norm": 1.2890625,
"learning_rate": 1.826625386996904e-05,
"loss": 0.5474,
"step": 1330
},
{
"epoch": 0.6575073601570167,
"grad_norm": 1.4375,
"learning_rate": 1.800825593395253e-05,
"loss": 0.5195,
"step": 1340
},
{
"epoch": 0.662414131501472,
"grad_norm": 1.375,
"learning_rate": 1.7750257997936018e-05,
"loss": 0.4843,
"step": 1350
},
{
"epoch": 0.662414131501472,
"eval_loss": 0.5355111360549927,
"eval_runtime": 28.5183,
"eval_samples_per_second": 7.013,
"eval_steps_per_second": 1.753,
"step": 1350
},
{
"epoch": 0.6673209028459274,
"grad_norm": 1.4140625,
"learning_rate": 1.7492260061919503e-05,
"loss": 0.5007,
"step": 1360
},
{
"epoch": 0.6722276741903828,
"grad_norm": 1.640625,
"learning_rate": 1.7234262125902996e-05,
"loss": 0.5328,
"step": 1370
},
{
"epoch": 0.677134445534838,
"grad_norm": 0.921875,
"learning_rate": 1.697626418988648e-05,
"loss": 0.5312,
"step": 1380
},
{
"epoch": 0.6820412168792934,
"grad_norm": 1.8359375,
"learning_rate": 1.671826625386997e-05,
"loss": 0.5863,
"step": 1390
},
{
"epoch": 0.6869479882237488,
"grad_norm": 0.87109375,
"learning_rate": 1.646026831785346e-05,
"loss": 0.5408,
"step": 1400
},
{
"epoch": 0.6869479882237488,
"eval_loss": 0.5326699614524841,
"eval_runtime": 28.6461,
"eval_samples_per_second": 6.982,
"eval_steps_per_second": 1.745,
"step": 1400
},
{
"epoch": 0.6918547595682041,
"grad_norm": 1.203125,
"learning_rate": 1.6202270381836944e-05,
"loss": 0.4963,
"step": 1410
},
{
"epoch": 0.6967615309126595,
"grad_norm": 1.2734375,
"learning_rate": 1.5944272445820436e-05,
"loss": 0.5244,
"step": 1420
},
{
"epoch": 0.7016683022571149,
"grad_norm": 1.328125,
"learning_rate": 1.568627450980392e-05,
"loss": 0.5301,
"step": 1430
},
{
"epoch": 0.7065750736015701,
"grad_norm": 0.8984375,
"learning_rate": 1.542827657378741e-05,
"loss": 0.4815,
"step": 1440
},
{
"epoch": 0.7114818449460255,
"grad_norm": 0.56640625,
"learning_rate": 1.5170278637770899e-05,
"loss": 0.5156,
"step": 1450
},
{
"epoch": 0.7114818449460255,
"eval_loss": 0.532850980758667,
"eval_runtime": 28.7098,
"eval_samples_per_second": 6.966,
"eval_steps_per_second": 1.742,
"step": 1450
},
{
"epoch": 0.7163886162904809,
"grad_norm": 1.203125,
"learning_rate": 1.4912280701754386e-05,
"loss": 0.5187,
"step": 1460
},
{
"epoch": 0.7212953876349362,
"grad_norm": 2.828125,
"learning_rate": 1.4654282765737876e-05,
"loss": 0.4884,
"step": 1470
},
{
"epoch": 0.7262021589793916,
"grad_norm": 1.2109375,
"learning_rate": 1.4396284829721363e-05,
"loss": 0.5434,
"step": 1480
},
{
"epoch": 0.7311089303238469,
"grad_norm": 0.96875,
"learning_rate": 1.4138286893704852e-05,
"loss": 0.5154,
"step": 1490
},
{
"epoch": 0.7360157016683022,
"grad_norm": 1.1640625,
"learning_rate": 1.388028895768834e-05,
"loss": 0.5312,
"step": 1500
},
{
"epoch": 0.7360157016683022,
"eval_loss": 0.5351821184158325,
"eval_runtime": 28.5695,
"eval_samples_per_second": 7.0,
"eval_steps_per_second": 1.75,
"step": 1500
},
{
"epoch": 0.7409224730127576,
"grad_norm": 1.328125,
"learning_rate": 1.3622291021671826e-05,
"loss": 0.5394,
"step": 1510
},
{
"epoch": 0.745829244357213,
"grad_norm": 2.078125,
"learning_rate": 1.3364293085655317e-05,
"loss": 0.5081,
"step": 1520
},
{
"epoch": 0.7507360157016683,
"grad_norm": 1.09375,
"learning_rate": 1.3106295149638804e-05,
"loss": 0.5498,
"step": 1530
},
{
"epoch": 0.7556427870461236,
"grad_norm": 0.71484375,
"learning_rate": 1.2848297213622292e-05,
"loss": 0.4704,
"step": 1540
},
{
"epoch": 0.760549558390579,
"grad_norm": 1.0078125,
"learning_rate": 1.259029927760578e-05,
"loss": 0.5095,
"step": 1550
},
{
"epoch": 0.760549558390579,
"eval_loss": 0.5329477787017822,
"eval_runtime": 28.5741,
"eval_samples_per_second": 6.999,
"eval_steps_per_second": 1.75,
"step": 1550
},
{
"epoch": 0.7654563297350343,
"grad_norm": 0.86328125,
"learning_rate": 1.2332301341589268e-05,
"loss": 0.5021,
"step": 1560
},
{
"epoch": 0.7703631010794897,
"grad_norm": 1.3671875,
"learning_rate": 1.2074303405572757e-05,
"loss": 0.5578,
"step": 1570
},
{
"epoch": 0.7752698724239451,
"grad_norm": 1.296875,
"learning_rate": 1.1816305469556244e-05,
"loss": 0.5031,
"step": 1580
},
{
"epoch": 0.7801766437684003,
"grad_norm": 0.87890625,
"learning_rate": 1.1558307533539733e-05,
"loss": 0.5225,
"step": 1590
},
{
"epoch": 0.7850834151128557,
"grad_norm": 1.34375,
"learning_rate": 1.130030959752322e-05,
"loss": 0.4909,
"step": 1600
},
{
"epoch": 0.7850834151128557,
"eval_loss": 0.5333244204521179,
"eval_runtime": 28.4593,
"eval_samples_per_second": 7.028,
"eval_steps_per_second": 1.757,
"step": 1600
},
{
"epoch": 0.7899901864573111,
"grad_norm": 1.2734375,
"learning_rate": 1.1042311661506709e-05,
"loss": 0.5154,
"step": 1610
},
{
"epoch": 0.7948969578017664,
"grad_norm": 2.0,
"learning_rate": 1.0784313725490197e-05,
"loss": 0.5048,
"step": 1620
},
{
"epoch": 0.7998037291462218,
"grad_norm": 1.078125,
"learning_rate": 1.0526315789473684e-05,
"loss": 0.4927,
"step": 1630
},
{
"epoch": 0.8047105004906772,
"grad_norm": 1.2265625,
"learning_rate": 1.0268317853457173e-05,
"loss": 0.5636,
"step": 1640
},
{
"epoch": 0.8096172718351324,
"grad_norm": 1.1796875,
"learning_rate": 1.001031991744066e-05,
"loss": 0.5423,
"step": 1650
},
{
"epoch": 0.8096172718351324,
"eval_loss": 0.5323337316513062,
"eval_runtime": 28.6741,
"eval_samples_per_second": 6.975,
"eval_steps_per_second": 1.744,
"step": 1650
},
{
"epoch": 0.8145240431795878,
"grad_norm": 1.546875,
"learning_rate": 9.752321981424149e-06,
"loss": 0.4747,
"step": 1660
},
{
"epoch": 0.8194308145240432,
"grad_norm": 1.0078125,
"learning_rate": 9.494324045407638e-06,
"loss": 0.4759,
"step": 1670
},
{
"epoch": 0.8243375858684985,
"grad_norm": 0.87890625,
"learning_rate": 9.236326109391125e-06,
"loss": 0.5072,
"step": 1680
},
{
"epoch": 0.8292443572129539,
"grad_norm": 0.83984375,
"learning_rate": 8.978328173374614e-06,
"loss": 0.4662,
"step": 1690
},
{
"epoch": 0.8341511285574092,
"grad_norm": 0.95703125,
"learning_rate": 8.7203302373581e-06,
"loss": 0.5161,
"step": 1700
},
{
"epoch": 0.8341511285574092,
"eval_loss": 0.5313096046447754,
"eval_runtime": 28.6753,
"eval_samples_per_second": 6.975,
"eval_steps_per_second": 1.744,
"step": 1700
},
{
"epoch": 0.8390578999018645,
"grad_norm": 1.3203125,
"learning_rate": 8.46233230134159e-06,
"loss": 0.4837,
"step": 1710
},
{
"epoch": 0.8439646712463199,
"grad_norm": 1.1171875,
"learning_rate": 8.204334365325078e-06,
"loss": 0.49,
"step": 1720
},
{
"epoch": 0.8488714425907753,
"grad_norm": 1.515625,
"learning_rate": 7.946336429308567e-06,
"loss": 0.5959,
"step": 1730
},
{
"epoch": 0.8537782139352306,
"grad_norm": 1.1171875,
"learning_rate": 7.688338493292054e-06,
"loss": 0.5113,
"step": 1740
},
{
"epoch": 0.858684985279686,
"grad_norm": 1.4375,
"learning_rate": 7.430340557275542e-06,
"loss": 0.5343,
"step": 1750
},
{
"epoch": 0.858684985279686,
"eval_loss": 0.530957043170929,
"eval_runtime": 28.7519,
"eval_samples_per_second": 6.956,
"eval_steps_per_second": 1.739,
"step": 1750
},
{
"epoch": 0.8635917566241413,
"grad_norm": 1.109375,
"learning_rate": 7.1723426212590306e-06,
"loss": 0.5282,
"step": 1760
},
{
"epoch": 0.8684985279685966,
"grad_norm": 1.5,
"learning_rate": 6.9143446852425185e-06,
"loss": 0.4993,
"step": 1770
},
{
"epoch": 0.873405299313052,
"grad_norm": 1.1328125,
"learning_rate": 6.656346749226007e-06,
"loss": 0.4911,
"step": 1780
},
{
"epoch": 0.8783120706575074,
"grad_norm": 1.1640625,
"learning_rate": 6.398348813209494e-06,
"loss": 0.4945,
"step": 1790
},
{
"epoch": 0.8832188420019627,
"grad_norm": 0.6796875,
"learning_rate": 6.140350877192982e-06,
"loss": 0.5435,
"step": 1800
},
{
"epoch": 0.8832188420019627,
"eval_loss": 0.5307140350341797,
"eval_runtime": 28.6268,
"eval_samples_per_second": 6.986,
"eval_steps_per_second": 1.747,
"step": 1800
},
{
"epoch": 0.888125613346418,
"grad_norm": 0.7890625,
"learning_rate": 5.882352941176471e-06,
"loss": 0.5001,
"step": 1810
},
{
"epoch": 0.8930323846908734,
"grad_norm": 1.265625,
"learning_rate": 5.624355005159959e-06,
"loss": 0.5042,
"step": 1820
},
{
"epoch": 0.8979391560353287,
"grad_norm": 1.1875,
"learning_rate": 5.366357069143447e-06,
"loss": 0.5642,
"step": 1830
},
{
"epoch": 0.9028459273797841,
"grad_norm": 1.2265625,
"learning_rate": 5.1083591331269355e-06,
"loss": 0.5587,
"step": 1840
},
{
"epoch": 0.9077526987242395,
"grad_norm": 1.921875,
"learning_rate": 4.850361197110423e-06,
"loss": 0.539,
"step": 1850
},
{
"epoch": 0.9077526987242395,
"eval_loss": 0.5304572582244873,
"eval_runtime": 28.4933,
"eval_samples_per_second": 7.019,
"eval_steps_per_second": 1.755,
"step": 1850
},
{
"epoch": 0.9126594700686947,
"grad_norm": 1.375,
"learning_rate": 4.592363261093911e-06,
"loss": 0.5004,
"step": 1860
},
{
"epoch": 0.9175662414131501,
"grad_norm": 1.234375,
"learning_rate": 4.3343653250774e-06,
"loss": 0.4948,
"step": 1870
},
{
"epoch": 0.9224730127576055,
"grad_norm": 1.2109375,
"learning_rate": 4.076367389060888e-06,
"loss": 0.5392,
"step": 1880
},
{
"epoch": 0.9273797841020608,
"grad_norm": 1.1171875,
"learning_rate": 3.818369453044376e-06,
"loss": 0.5441,
"step": 1890
},
{
"epoch": 0.9322865554465162,
"grad_norm": 2.65625,
"learning_rate": 3.560371517027864e-06,
"loss": 0.5096,
"step": 1900
},
{
"epoch": 0.9322865554465162,
"eval_loss": 0.5307794809341431,
"eval_runtime": 28.4936,
"eval_samples_per_second": 7.019,
"eval_steps_per_second": 1.755,
"step": 1900
},
{
"epoch": 0.9371933267909716,
"grad_norm": 0.46875,
"learning_rate": 3.3023735810113516e-06,
"loss": 0.5092,
"step": 1910
},
{
"epoch": 0.9421000981354269,
"grad_norm": 1.0546875,
"learning_rate": 3.0443756449948404e-06,
"loss": 0.5825,
"step": 1920
},
{
"epoch": 0.9470068694798822,
"grad_norm": 1.5078125,
"learning_rate": 2.7863777089783283e-06,
"loss": 0.5331,
"step": 1930
},
{
"epoch": 0.9519136408243376,
"grad_norm": 1.03125,
"learning_rate": 2.5283797729618166e-06,
"loss": 0.5464,
"step": 1940
},
{
"epoch": 0.956820412168793,
"grad_norm": 1.1640625,
"learning_rate": 2.2703818369453045e-06,
"loss": 0.5155,
"step": 1950
},
{
"epoch": 0.956820412168793,
"eval_loss": 0.5306495428085327,
"eval_runtime": 28.7357,
"eval_samples_per_second": 6.96,
"eval_steps_per_second": 1.74,
"step": 1950
},
{
"epoch": 0.9617271835132483,
"grad_norm": 0.9921875,
"learning_rate": 2.012383900928793e-06,
"loss": 0.5282,
"step": 1960
},
{
"epoch": 0.9666339548577036,
"grad_norm": 0.40234375,
"learning_rate": 1.7543859649122807e-06,
"loss": 0.5208,
"step": 1970
},
{
"epoch": 0.971540726202159,
"grad_norm": 0.78515625,
"learning_rate": 1.4963880288957689e-06,
"loss": 0.5216,
"step": 1980
},
{
"epoch": 0.9764474975466143,
"grad_norm": 1.3359375,
"learning_rate": 1.2383900928792572e-06,
"loss": 0.516,
"step": 1990
},
{
"epoch": 0.9813542688910697,
"grad_norm": 3.125,
"learning_rate": 9.80392156862745e-07,
"loss": 0.529,
"step": 2000
},
{
"epoch": 0.9813542688910697,
"eval_loss": 0.5309420228004456,
"eval_runtime": 28.7264,
"eval_samples_per_second": 6.962,
"eval_steps_per_second": 1.741,
"step": 2000
},
{
"epoch": 0.9862610402355251,
"grad_norm": 1.3046875,
"learning_rate": 7.223942208462333e-07,
"loss": 0.5101,
"step": 2010
},
{
"epoch": 0.9911678115799804,
"grad_norm": 1.109375,
"learning_rate": 4.6439628482972136e-07,
"loss": 0.4883,
"step": 2020
},
{
"epoch": 0.9960745829244357,
"grad_norm": 0.71484375,
"learning_rate": 2.0639834881320948e-07,
"loss": 0.5151,
"step": 2030
}
],
"logging_steps": 10,
"max_steps": 2038,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.79578628062624e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}