lapaliv-0002 / checkpoint-650 /trainer_state.json
lapaliv's picture
Upload folder using huggingface_hub
26a4129 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 9.950191570881227,
"eval_steps": 17,
"global_step": 650,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01532567049808429,
"grad_norm": 3.475003242492676,
"learning_rate": 2e-05,
"loss": 1.9507,
"step": 1
},
{
"epoch": 0.01532567049808429,
"eval_loss": 1.9943002462387085,
"eval_runtime": 10.4694,
"eval_samples_per_second": 9.552,
"eval_steps_per_second": 4.776,
"step": 1
},
{
"epoch": 0.03065134099616858,
"grad_norm": 3.6678824424743652,
"learning_rate": 4e-05,
"loss": 2.0639,
"step": 2
},
{
"epoch": 0.04597701149425287,
"grad_norm": 3.1201210021972656,
"learning_rate": 6e-05,
"loss": 1.8136,
"step": 3
},
{
"epoch": 0.06130268199233716,
"grad_norm": 3.606743574142456,
"learning_rate": 8e-05,
"loss": 1.9302,
"step": 4
},
{
"epoch": 0.07662835249042145,
"grad_norm": 3.096000909805298,
"learning_rate": 0.0001,
"loss": 1.9869,
"step": 5
},
{
"epoch": 0.09195402298850575,
"grad_norm": 2.841855049133301,
"learning_rate": 0.00012,
"loss": 1.7556,
"step": 6
},
{
"epoch": 0.10727969348659004,
"grad_norm": 2.7530441284179688,
"learning_rate": 0.00014,
"loss": 1.8622,
"step": 7
},
{
"epoch": 0.12260536398467432,
"grad_norm": 2.9382359981536865,
"learning_rate": 0.00016,
"loss": 1.7264,
"step": 8
},
{
"epoch": 0.13793103448275862,
"grad_norm": 2.9906227588653564,
"learning_rate": 0.00018,
"loss": 1.8225,
"step": 9
},
{
"epoch": 0.1532567049808429,
"grad_norm": 2.951603889465332,
"learning_rate": 0.0002,
"loss": 1.8434,
"step": 10
},
{
"epoch": 0.1685823754789272,
"grad_norm": 2.783867120742798,
"learning_rate": 0.00019999916768504724,
"loss": 1.6941,
"step": 11
},
{
"epoch": 0.1839080459770115,
"grad_norm": 2.7186167240142822,
"learning_rate": 0.00019999667075404383,
"loss": 1.8163,
"step": 12
},
{
"epoch": 0.19923371647509577,
"grad_norm": 2.33475661277771,
"learning_rate": 0.00019999250924855456,
"loss": 1.6088,
"step": 13
},
{
"epoch": 0.21455938697318008,
"grad_norm": 2.289853811264038,
"learning_rate": 0.00019998668323785296,
"loss": 1.6944,
"step": 14
},
{
"epoch": 0.22988505747126436,
"grad_norm": 2.4338462352752686,
"learning_rate": 0.00019997919281892067,
"loss": 1.7205,
"step": 15
},
{
"epoch": 0.24521072796934865,
"grad_norm": 2.6904211044311523,
"learning_rate": 0.00019997003811644533,
"loss": 1.8309,
"step": 16
},
{
"epoch": 0.26053639846743293,
"grad_norm": 2.0868079662323,
"learning_rate": 0.00019995921928281894,
"loss": 1.714,
"step": 17
},
{
"epoch": 0.26053639846743293,
"eval_loss": 1.71925687789917,
"eval_runtime": 10.4582,
"eval_samples_per_second": 9.562,
"eval_steps_per_second": 4.781,
"step": 17
},
{
"epoch": 0.27586206896551724,
"grad_norm": 2.312363862991333,
"learning_rate": 0.00019994673649813497,
"loss": 1.7437,
"step": 18
},
{
"epoch": 0.29118773946360155,
"grad_norm": 2.1838905811309814,
"learning_rate": 0.00019993258997018566,
"loss": 1.6337,
"step": 19
},
{
"epoch": 0.3065134099616858,
"grad_norm": 2.2951676845550537,
"learning_rate": 0.0001999167799344583,
"loss": 1.6456,
"step": 20
},
{
"epoch": 0.3218390804597701,
"grad_norm": 2.147050380706787,
"learning_rate": 0.00019989930665413147,
"loss": 1.5753,
"step": 21
},
{
"epoch": 0.3371647509578544,
"grad_norm": 2.214049816131592,
"learning_rate": 0.00019988017042007065,
"loss": 1.8861,
"step": 22
},
{
"epoch": 0.3524904214559387,
"grad_norm": 2.1761178970336914,
"learning_rate": 0.00019985937155082327,
"loss": 1.5181,
"step": 23
},
{
"epoch": 0.367816091954023,
"grad_norm": 2.7011399269104004,
"learning_rate": 0.00019983691039261357,
"loss": 1.6559,
"step": 24
},
{
"epoch": 0.3831417624521073,
"grad_norm": 2.0692250728607178,
"learning_rate": 0.0001998127873193367,
"loss": 1.6602,
"step": 25
},
{
"epoch": 0.39846743295019155,
"grad_norm": 2.190605640411377,
"learning_rate": 0.00019978700273255254,
"loss": 1.6678,
"step": 26
},
{
"epoch": 0.41379310344827586,
"grad_norm": 2.303030252456665,
"learning_rate": 0.000199759557061479,
"loss": 1.7287,
"step": 27
},
{
"epoch": 0.42911877394636017,
"grad_norm": 2.3805620670318604,
"learning_rate": 0.000199730450762985,
"loss": 1.6801,
"step": 28
},
{
"epoch": 0.4444444444444444,
"grad_norm": 1.9173905849456787,
"learning_rate": 0.00019969968432158265,
"loss": 1.6536,
"step": 29
},
{
"epoch": 0.45977011494252873,
"grad_norm": 1.9623961448669434,
"learning_rate": 0.00019966725824941932,
"loss": 1.5311,
"step": 30
},
{
"epoch": 0.47509578544061304,
"grad_norm": 2.2046408653259277,
"learning_rate": 0.00019963317308626914,
"loss": 1.7119,
"step": 31
},
{
"epoch": 0.4904214559386973,
"grad_norm": 2.034040927886963,
"learning_rate": 0.00019959742939952392,
"loss": 1.6249,
"step": 32
},
{
"epoch": 0.5057471264367817,
"grad_norm": 2.274533271789551,
"learning_rate": 0.00019956002778418372,
"loss": 1.6809,
"step": 33
},
{
"epoch": 0.5210727969348659,
"grad_norm": 1.9758435487747192,
"learning_rate": 0.0001995209688628471,
"loss": 1.5507,
"step": 34
},
{
"epoch": 0.5210727969348659,
"eval_loss": 1.7039636373519897,
"eval_runtime": 10.4847,
"eval_samples_per_second": 9.538,
"eval_steps_per_second": 4.769,
"step": 34
},
{
"epoch": 0.5363984674329502,
"grad_norm": 1.908996820449829,
"learning_rate": 0.00019948025328570042,
"loss": 1.668,
"step": 35
},
{
"epoch": 0.5517241379310345,
"grad_norm": 2.0340089797973633,
"learning_rate": 0.00019943788173050744,
"loss": 1.6788,
"step": 36
},
{
"epoch": 0.5670498084291188,
"grad_norm": 2.1147003173828125,
"learning_rate": 0.0001993938549025977,
"loss": 1.5346,
"step": 37
},
{
"epoch": 0.5823754789272031,
"grad_norm": 2.2234580516815186,
"learning_rate": 0.00019934817353485501,
"loss": 1.6118,
"step": 38
},
{
"epoch": 0.5977011494252874,
"grad_norm": 1.8898108005523682,
"learning_rate": 0.00019930083838770504,
"loss": 1.542,
"step": 39
},
{
"epoch": 0.6130268199233716,
"grad_norm": 1.947200894355774,
"learning_rate": 0.00019925185024910277,
"loss": 1.6701,
"step": 40
},
{
"epoch": 0.6283524904214559,
"grad_norm": 1.9336851835250854,
"learning_rate": 0.00019920120993451948,
"loss": 1.6159,
"step": 41
},
{
"epoch": 0.6436781609195402,
"grad_norm": 2.044646978378296,
"learning_rate": 0.00019914891828692888,
"loss": 1.6761,
"step": 42
},
{
"epoch": 0.6590038314176245,
"grad_norm": 1.9677635431289673,
"learning_rate": 0.00019909497617679348,
"loss": 1.7505,
"step": 43
},
{
"epoch": 0.6743295019157088,
"grad_norm": 1.887392282485962,
"learning_rate": 0.00019903938450204972,
"loss": 1.6804,
"step": 44
},
{
"epoch": 0.6896551724137931,
"grad_norm": 2.1503148078918457,
"learning_rate": 0.0001989821441880933,
"loss": 1.5835,
"step": 45
},
{
"epoch": 0.7049808429118773,
"grad_norm": 1.8051438331604004,
"learning_rate": 0.00019892325618776351,
"loss": 1.721,
"step": 46
},
{
"epoch": 0.7203065134099617,
"grad_norm": 1.8534125089645386,
"learning_rate": 0.0001988627214813277,
"loss": 1.6925,
"step": 47
},
{
"epoch": 0.735632183908046,
"grad_norm": 1.6843996047973633,
"learning_rate": 0.00019880054107646467,
"loss": 1.7291,
"step": 48
},
{
"epoch": 0.7509578544061303,
"grad_norm": 2.0053601264953613,
"learning_rate": 0.000198736716008248,
"loss": 1.6344,
"step": 49
},
{
"epoch": 0.7662835249042146,
"grad_norm": 1.9978563785552979,
"learning_rate": 0.0001986712473391289,
"loss": 1.5687,
"step": 50
},
{
"epoch": 0.7816091954022989,
"grad_norm": 1.6498862504959106,
"learning_rate": 0.0001986041361589184,
"loss": 1.6354,
"step": 51
},
{
"epoch": 0.7816091954022989,
"eval_loss": 1.6665664911270142,
"eval_runtime": 10.4646,
"eval_samples_per_second": 9.556,
"eval_steps_per_second": 4.778,
"step": 51
},
{
"epoch": 0.7969348659003831,
"grad_norm": 2.0754377841949463,
"learning_rate": 0.00019853538358476932,
"loss": 1.7128,
"step": 52
},
{
"epoch": 0.8122605363984674,
"grad_norm": 1.8503700494766235,
"learning_rate": 0.0001984649907611575,
"loss": 1.6028,
"step": 53
},
{
"epoch": 0.8275862068965517,
"grad_norm": 1.9877614974975586,
"learning_rate": 0.00019839295885986296,
"loss": 1.7578,
"step": 54
},
{
"epoch": 0.842911877394636,
"grad_norm": 1.9744536876678467,
"learning_rate": 0.0001983192890799503,
"loss": 1.6639,
"step": 55
},
{
"epoch": 0.8582375478927203,
"grad_norm": 1.9516663551330566,
"learning_rate": 0.00019824398264774867,
"loss": 1.6724,
"step": 56
},
{
"epoch": 0.8735632183908046,
"grad_norm": 1.8794466257095337,
"learning_rate": 0.0001981670408168315,
"loss": 1.5008,
"step": 57
},
{
"epoch": 0.8888888888888888,
"grad_norm": 1.7897112369537354,
"learning_rate": 0.0001980884648679955,
"loss": 1.5942,
"step": 58
},
{
"epoch": 0.9042145593869731,
"grad_norm": 1.776986002922058,
"learning_rate": 0.00019800825610923934,
"loss": 1.5893,
"step": 59
},
{
"epoch": 0.9195402298850575,
"grad_norm": 1.9505722522735596,
"learning_rate": 0.00019792641587574212,
"loss": 1.6273,
"step": 60
},
{
"epoch": 0.9348659003831418,
"grad_norm": 1.9335532188415527,
"learning_rate": 0.00019784294552984078,
"loss": 1.5953,
"step": 61
},
{
"epoch": 0.9501915708812261,
"grad_norm": 2.057013750076294,
"learning_rate": 0.0001977578464610077,
"loss": 1.6479,
"step": 62
},
{
"epoch": 0.9655172413793104,
"grad_norm": 1.838173508644104,
"learning_rate": 0.00019767112008582736,
"loss": 1.6264,
"step": 63
},
{
"epoch": 0.9808429118773946,
"grad_norm": 1.8121559619903564,
"learning_rate": 0.000197582767847973,
"loss": 1.5673,
"step": 64
},
{
"epoch": 0.9961685823754789,
"grad_norm": 1.8894027471542358,
"learning_rate": 0.00019749279121818235,
"loss": 1.6727,
"step": 65
},
{
"epoch": 1.0076628352490422,
"grad_norm": 3.277520179748535,
"learning_rate": 0.00019740119169423337,
"loss": 2.0471,
"step": 66
},
{
"epoch": 1.0229885057471264,
"grad_norm": 1.553820013999939,
"learning_rate": 0.00019730797080091904,
"loss": 0.9425,
"step": 67
},
{
"epoch": 1.0383141762452108,
"grad_norm": 1.5284228324890137,
"learning_rate": 0.00019721313009002226,
"loss": 0.9188,
"step": 68
},
{
"epoch": 1.0383141762452108,
"eval_loss": 1.6558603048324585,
"eval_runtime": 10.461,
"eval_samples_per_second": 9.559,
"eval_steps_per_second": 4.78,
"step": 68
},
{
"epoch": 1.053639846743295,
"grad_norm": 1.4431841373443604,
"learning_rate": 0.0001971166711402899,
"loss": 0.8091,
"step": 69
},
{
"epoch": 1.0689655172413792,
"grad_norm": 1.6087971925735474,
"learning_rate": 0.00019701859555740648,
"loss": 0.9413,
"step": 70
},
{
"epoch": 1.0842911877394636,
"grad_norm": 1.6617636680603027,
"learning_rate": 0.0001969189049739674,
"loss": 0.895,
"step": 71
},
{
"epoch": 1.0996168582375478,
"grad_norm": 1.606227159500122,
"learning_rate": 0.00019681760104945203,
"loss": 0.8442,
"step": 72
},
{
"epoch": 1.1149425287356323,
"grad_norm": 1.4187818765640259,
"learning_rate": 0.00019671468547019573,
"loss": 0.8078,
"step": 73
},
{
"epoch": 1.1302681992337165,
"grad_norm": 1.5401397943496704,
"learning_rate": 0.00019661015994936203,
"loss": 0.9093,
"step": 74
},
{
"epoch": 1.1455938697318007,
"grad_norm": 1.633941888809204,
"learning_rate": 0.000196504026226914,
"loss": 0.8941,
"step": 75
},
{
"epoch": 1.160919540229885,
"grad_norm": 1.551140308380127,
"learning_rate": 0.00019639628606958533,
"loss": 0.8318,
"step": 76
},
{
"epoch": 1.1762452107279693,
"grad_norm": 1.920763373374939,
"learning_rate": 0.00019628694127085092,
"loss": 0.8781,
"step": 77
},
{
"epoch": 1.1915708812260537,
"grad_norm": 1.802857518196106,
"learning_rate": 0.00019617599365089693,
"loss": 0.9417,
"step": 78
},
{
"epoch": 1.206896551724138,
"grad_norm": 1.5704469680786133,
"learning_rate": 0.0001960634450565907,
"loss": 0.8462,
"step": 79
},
{
"epoch": 1.2222222222222223,
"grad_norm": 1.67445969581604,
"learning_rate": 0.00019594929736144976,
"loss": 0.9293,
"step": 80
},
{
"epoch": 1.2375478927203065,
"grad_norm": 1.6255979537963867,
"learning_rate": 0.00019583355246561074,
"loss": 0.8358,
"step": 81
},
{
"epoch": 1.2528735632183907,
"grad_norm": 1.6431758403778076,
"learning_rate": 0.00019571621229579782,
"loss": 0.9362,
"step": 82
},
{
"epoch": 1.2681992337164751,
"grad_norm": 1.6321423053741455,
"learning_rate": 0.00019559727880529059,
"loss": 0.9574,
"step": 83
},
{
"epoch": 1.2835249042145593,
"grad_norm": 1.4820754528045654,
"learning_rate": 0.00019547675397389141,
"loss": 0.7697,
"step": 84
},
{
"epoch": 1.2988505747126438,
"grad_norm": 1.6704702377319336,
"learning_rate": 0.00019535463980789277,
"loss": 0.8897,
"step": 85
},
{
"epoch": 1.2988505747126438,
"eval_loss": 1.6953216791152954,
"eval_runtime": 10.5357,
"eval_samples_per_second": 9.492,
"eval_steps_per_second": 4.746,
"step": 85
},
{
"epoch": 1.314176245210728,
"grad_norm": 1.5606012344360352,
"learning_rate": 0.00019523093834004356,
"loss": 0.8687,
"step": 86
},
{
"epoch": 1.3295019157088124,
"grad_norm": 1.69247567653656,
"learning_rate": 0.00019510565162951537,
"loss": 0.962,
"step": 87
},
{
"epoch": 1.3448275862068966,
"grad_norm": 1.77336847782135,
"learning_rate": 0.00019497878176186827,
"loss": 0.8073,
"step": 88
},
{
"epoch": 1.3601532567049808,
"grad_norm": 1.6945431232452393,
"learning_rate": 0.00019485033084901606,
"loss": 0.9388,
"step": 89
},
{
"epoch": 1.3754789272030652,
"grad_norm": 1.8969769477844238,
"learning_rate": 0.000194720301029191,
"loss": 0.9693,
"step": 90
},
{
"epoch": 1.3908045977011494,
"grad_norm": 1.6189223527908325,
"learning_rate": 0.0001945886944669084,
"loss": 0.8052,
"step": 91
},
{
"epoch": 1.4061302681992336,
"grad_norm": 1.652786135673523,
"learning_rate": 0.0001944555133529304,
"loss": 0.9079,
"step": 92
},
{
"epoch": 1.421455938697318,
"grad_norm": 1.5484676361083984,
"learning_rate": 0.00019432075990422968,
"loss": 0.8395,
"step": 93
},
{
"epoch": 1.4367816091954024,
"grad_norm": 1.625877022743225,
"learning_rate": 0.00019418443636395248,
"loss": 0.876,
"step": 94
},
{
"epoch": 1.4521072796934866,
"grad_norm": 1.922146201133728,
"learning_rate": 0.00019404654500138117,
"loss": 0.8344,
"step": 95
},
{
"epoch": 1.4674329501915708,
"grad_norm": 1.6981974840164185,
"learning_rate": 0.0001939070881118966,
"loss": 0.8232,
"step": 96
},
{
"epoch": 1.4827586206896552,
"grad_norm": 1.7996752262115479,
"learning_rate": 0.0001937660680169399,
"loss": 0.9207,
"step": 97
},
{
"epoch": 1.4980842911877394,
"grad_norm": 1.784002423286438,
"learning_rate": 0.00019362348706397373,
"loss": 0.8402,
"step": 98
},
{
"epoch": 1.5134099616858236,
"grad_norm": 1.436486005783081,
"learning_rate": 0.00019347934762644326,
"loss": 0.7129,
"step": 99
},
{
"epoch": 1.528735632183908,
"grad_norm": 1.5737037658691406,
"learning_rate": 0.0001933336521037367,
"loss": 0.9158,
"step": 100
},
{
"epoch": 1.5440613026819925,
"grad_norm": 1.516647219657898,
"learning_rate": 0.00019318640292114524,
"loss": 0.8451,
"step": 101
},
{
"epoch": 1.5593869731800765,
"grad_norm": 1.6449085474014282,
"learning_rate": 0.00019303760252982287,
"loss": 0.9014,
"step": 102
},
{
"epoch": 1.5593869731800765,
"eval_loss": 1.7118545770645142,
"eval_runtime": 10.4529,
"eval_samples_per_second": 9.567,
"eval_steps_per_second": 4.783,
"step": 102
},
{
"epoch": 1.5747126436781609,
"grad_norm": 1.578679084777832,
"learning_rate": 0.00019288725340674536,
"loss": 0.8788,
"step": 103
},
{
"epoch": 1.5900383141762453,
"grad_norm": 1.635235071182251,
"learning_rate": 0.00019273535805466917,
"loss": 0.8992,
"step": 104
},
{
"epoch": 1.6053639846743295,
"grad_norm": 1.637152075767517,
"learning_rate": 0.0001925819190020898,
"loss": 0.8922,
"step": 105
},
{
"epoch": 1.6206896551724137,
"grad_norm": 1.5802862644195557,
"learning_rate": 0.0001924269388031996,
"loss": 0.822,
"step": 106
},
{
"epoch": 1.6360153256704981,
"grad_norm": 1.5077544450759888,
"learning_rate": 0.00019227042003784527,
"loss": 0.7743,
"step": 107
},
{
"epoch": 1.6513409961685823,
"grad_norm": 1.7062519788742065,
"learning_rate": 0.000192112365311485,
"loss": 0.8473,
"step": 108
},
{
"epoch": 1.6666666666666665,
"grad_norm": 1.676834225654602,
"learning_rate": 0.0001919527772551451,
"loss": 0.96,
"step": 109
},
{
"epoch": 1.681992337164751,
"grad_norm": 1.775424838066101,
"learning_rate": 0.00019179165852537596,
"loss": 0.8855,
"step": 110
},
{
"epoch": 1.6973180076628354,
"grad_norm": 1.5298705101013184,
"learning_rate": 0.0001916290118042082,
"loss": 0.7232,
"step": 111
},
{
"epoch": 1.7126436781609196,
"grad_norm": 1.5757646560668945,
"learning_rate": 0.0001914648397991078,
"loss": 0.9097,
"step": 112
},
{
"epoch": 1.7279693486590038,
"grad_norm": 1.5786842107772827,
"learning_rate": 0.00019129914524293102,
"loss": 0.8836,
"step": 113
},
{
"epoch": 1.7432950191570882,
"grad_norm": 1.8097132444381714,
"learning_rate": 0.00019113193089387903,
"loss": 0.938,
"step": 114
},
{
"epoch": 1.7586206896551724,
"grad_norm": 1.771764874458313,
"learning_rate": 0.00019096319953545185,
"loss": 0.8042,
"step": 115
},
{
"epoch": 1.7739463601532566,
"grad_norm": 1.8478142023086548,
"learning_rate": 0.00019079295397640215,
"loss": 0.9323,
"step": 116
},
{
"epoch": 1.789272030651341,
"grad_norm": 1.5792856216430664,
"learning_rate": 0.00019062119705068843,
"loss": 0.8917,
"step": 117
},
{
"epoch": 1.8045977011494254,
"grad_norm": 1.6793948411941528,
"learning_rate": 0.00019044793161742782,
"loss": 0.8495,
"step": 118
},
{
"epoch": 1.8199233716475096,
"grad_norm": 1.6884868144989014,
"learning_rate": 0.00019027316056084858,
"loss": 0.8517,
"step": 119
},
{
"epoch": 1.8199233716475096,
"eval_loss": 1.7208638191223145,
"eval_runtime": 10.4697,
"eval_samples_per_second": 9.551,
"eval_steps_per_second": 4.776,
"step": 119
},
{
"epoch": 1.8352490421455938,
"grad_norm": 1.740159511566162,
"learning_rate": 0.0001900968867902419,
"loss": 0.96,
"step": 120
},
{
"epoch": 1.8505747126436782,
"grad_norm": 1.6979262828826904,
"learning_rate": 0.0001899191132399138,
"loss": 0.8892,
"step": 121
},
{
"epoch": 1.8659003831417624,
"grad_norm": 1.7245821952819824,
"learning_rate": 0.00018973984286913584,
"loss": 0.8417,
"step": 122
},
{
"epoch": 1.8812260536398466,
"grad_norm": 1.8138068914413452,
"learning_rate": 0.0001895590786620963,
"loss": 0.9722,
"step": 123
},
{
"epoch": 1.896551724137931,
"grad_norm": 1.4977965354919434,
"learning_rate": 0.00018937682362785022,
"loss": 0.8512,
"step": 124
},
{
"epoch": 1.9118773946360155,
"grad_norm": 1.5849545001983643,
"learning_rate": 0.0001891930808002694,
"loss": 0.7628,
"step": 125
},
{
"epoch": 1.9272030651340997,
"grad_norm": 1.8099451065063477,
"learning_rate": 0.00018900785323799189,
"loss": 0.9171,
"step": 126
},
{
"epoch": 1.9425287356321839,
"grad_norm": 1.5819072723388672,
"learning_rate": 0.00018882114402437106,
"loss": 0.7413,
"step": 127
},
{
"epoch": 1.9578544061302683,
"grad_norm": 1.8191732168197632,
"learning_rate": 0.00018863295626742437,
"loss": 1.0208,
"step": 128
},
{
"epoch": 1.9731800766283525,
"grad_norm": 1.7665985822677612,
"learning_rate": 0.00018844329309978145,
"loss": 0.8426,
"step": 129
},
{
"epoch": 1.9885057471264367,
"grad_norm": 1.9029268026351929,
"learning_rate": 0.00018825215767863214,
"loss": 0.983,
"step": 130
},
{
"epoch": 2.007662835249042,
"grad_norm": 1.5204992294311523,
"learning_rate": 0.0001880595531856738,
"loss": 0.6558,
"step": 131
},
{
"epoch": 2.0229885057471266,
"grad_norm": 1.225983738899231,
"learning_rate": 0.00018786548282705848,
"loss": 0.3984,
"step": 132
},
{
"epoch": 2.0383141762452106,
"grad_norm": 1.2345383167266846,
"learning_rate": 0.0001876699498333393,
"loss": 0.4303,
"step": 133
},
{
"epoch": 2.053639846743295,
"grad_norm": 1.2123405933380127,
"learning_rate": 0.00018747295745941703,
"loss": 0.4609,
"step": 134
},
{
"epoch": 2.0689655172413794,
"grad_norm": 1.2038960456848145,
"learning_rate": 0.00018727450898448563,
"loss": 0.3909,
"step": 135
},
{
"epoch": 2.0842911877394634,
"grad_norm": 1.2191224098205566,
"learning_rate": 0.00018707460771197774,
"loss": 0.4448,
"step": 136
},
{
"epoch": 2.0842911877394634,
"eval_loss": 1.796938419342041,
"eval_runtime": 10.4571,
"eval_samples_per_second": 9.563,
"eval_steps_per_second": 4.781,
"step": 136
},
{
"epoch": 2.099616858237548,
"grad_norm": 1.3134615421295166,
"learning_rate": 0.00018687325696950972,
"loss": 0.5176,
"step": 137
},
{
"epoch": 2.1149425287356323,
"grad_norm": 1.39946448802948,
"learning_rate": 0.00018667046010882626,
"loss": 0.4207,
"step": 138
},
{
"epoch": 2.1302681992337167,
"grad_norm": 1.20857834815979,
"learning_rate": 0.00018646622050574454,
"loss": 0.3165,
"step": 139
},
{
"epoch": 2.1455938697318007,
"grad_norm": 1.4676852226257324,
"learning_rate": 0.00018626054156009806,
"loss": 0.4934,
"step": 140
},
{
"epoch": 2.160919540229885,
"grad_norm": 1.2490851879119873,
"learning_rate": 0.0001860534266956801,
"loss": 0.4454,
"step": 141
},
{
"epoch": 2.1762452107279695,
"grad_norm": 1.5670422315597534,
"learning_rate": 0.00018584487936018661,
"loss": 0.4259,
"step": 142
},
{
"epoch": 2.1915708812260535,
"grad_norm": 1.5839508771896362,
"learning_rate": 0.0001856349030251589,
"loss": 0.4459,
"step": 143
},
{
"epoch": 2.206896551724138,
"grad_norm": 1.4877279996871948,
"learning_rate": 0.00018542350118592584,
"loss": 0.4585,
"step": 144
},
{
"epoch": 2.2222222222222223,
"grad_norm": 1.292151927947998,
"learning_rate": 0.00018521067736154568,
"loss": 0.3635,
"step": 145
},
{
"epoch": 2.2375478927203067,
"grad_norm": 1.3014862537384033,
"learning_rate": 0.00018499643509474738,
"loss": 0.4268,
"step": 146
},
{
"epoch": 2.2528735632183907,
"grad_norm": 1.3445168733596802,
"learning_rate": 0.00018478077795187187,
"loss": 0.4178,
"step": 147
},
{
"epoch": 2.268199233716475,
"grad_norm": 1.2323206663131714,
"learning_rate": 0.0001845637095228124,
"loss": 0.3389,
"step": 148
},
{
"epoch": 2.2835249042145596,
"grad_norm": 1.321321725845337,
"learning_rate": 0.000184345233420955,
"loss": 0.394,
"step": 149
},
{
"epoch": 2.2988505747126435,
"grad_norm": 1.3308717012405396,
"learning_rate": 0.00018412535328311814,
"loss": 0.3768,
"step": 150
},
{
"epoch": 2.314176245210728,
"grad_norm": 1.4169113636016846,
"learning_rate": 0.00018390407276949234,
"loss": 0.4106,
"step": 151
},
{
"epoch": 2.3295019157088124,
"grad_norm": 1.4107593297958374,
"learning_rate": 0.00018368139556357928,
"loss": 0.3955,
"step": 152
},
{
"epoch": 2.344827586206897,
"grad_norm": 1.2308950424194336,
"learning_rate": 0.00018345732537213027,
"loss": 0.4053,
"step": 153
},
{
"epoch": 2.344827586206897,
"eval_loss": 1.8346749544143677,
"eval_runtime": 10.5405,
"eval_samples_per_second": 9.487,
"eval_steps_per_second": 4.744,
"step": 153
},
{
"epoch": 2.3601532567049808,
"grad_norm": 1.2049033641815186,
"learning_rate": 0.0001832318659250847,
"loss": 0.3675,
"step": 154
},
{
"epoch": 2.375478927203065,
"grad_norm": 1.35014009475708,
"learning_rate": 0.00018300502097550806,
"loss": 0.4565,
"step": 155
},
{
"epoch": 2.3908045977011496,
"grad_norm": 1.2926514148712158,
"learning_rate": 0.00018277679429952912,
"loss": 0.3887,
"step": 156
},
{
"epoch": 2.4061302681992336,
"grad_norm": 1.1395353078842163,
"learning_rate": 0.0001825471896962774,
"loss": 0.3469,
"step": 157
},
{
"epoch": 2.421455938697318,
"grad_norm": 1.2925468683242798,
"learning_rate": 0.00018231621098781982,
"loss": 0.3811,
"step": 158
},
{
"epoch": 2.4367816091954024,
"grad_norm": 1.2556133270263672,
"learning_rate": 0.00018208386201909698,
"loss": 0.3961,
"step": 159
},
{
"epoch": 2.4521072796934864,
"grad_norm": 3.042213201522827,
"learning_rate": 0.00018185014665785936,
"loss": 0.4634,
"step": 160
},
{
"epoch": 2.467432950191571,
"grad_norm": 7.5744099617004395,
"learning_rate": 0.00018161506879460273,
"loss": 0.5113,
"step": 161
},
{
"epoch": 2.4827586206896552,
"grad_norm": 1.288672685623169,
"learning_rate": 0.00018137863234250347,
"loss": 0.3684,
"step": 162
},
{
"epoch": 2.4980842911877392,
"grad_norm": 1.3630754947662354,
"learning_rate": 0.00018114084123735356,
"loss": 0.4277,
"step": 163
},
{
"epoch": 2.5134099616858236,
"grad_norm": 1.344976544380188,
"learning_rate": 0.00018090169943749476,
"loss": 0.3682,
"step": 164
},
{
"epoch": 2.528735632183908,
"grad_norm": 1.5814900398254395,
"learning_rate": 0.000180661210923753,
"loss": 0.4435,
"step": 165
},
{
"epoch": 2.5440613026819925,
"grad_norm": 1.3256701231002808,
"learning_rate": 0.00018041937969937206,
"loss": 0.3651,
"step": 166
},
{
"epoch": 2.5593869731800765,
"grad_norm": 1.1954660415649414,
"learning_rate": 0.00018017620978994677,
"loss": 0.3662,
"step": 167
},
{
"epoch": 2.574712643678161,
"grad_norm": 1.2444689273834229,
"learning_rate": 0.00017993170524335615,
"loss": 0.4181,
"step": 168
},
{
"epoch": 2.5900383141762453,
"grad_norm": 1.3350296020507812,
"learning_rate": 0.00017968587012969604,
"loss": 0.4437,
"step": 169
},
{
"epoch": 2.6053639846743293,
"grad_norm": 1.1780810356140137,
"learning_rate": 0.00017943870854121124,
"loss": 0.3723,
"step": 170
},
{
"epoch": 2.6053639846743293,
"eval_loss": 1.8776559829711914,
"eval_runtime": 10.4883,
"eval_samples_per_second": 9.534,
"eval_steps_per_second": 4.767,
"step": 170
},
{
"epoch": 2.6206896551724137,
"grad_norm": 1.3304461240768433,
"learning_rate": 0.00017919022459222752,
"loss": 0.4096,
"step": 171
},
{
"epoch": 2.636015325670498,
"grad_norm": 1.429721474647522,
"learning_rate": 0.00017894042241908294,
"loss": 0.4662,
"step": 172
},
{
"epoch": 2.6513409961685825,
"grad_norm": 1.160591959953308,
"learning_rate": 0.0001786893061800592,
"loss": 0.3493,
"step": 173
},
{
"epoch": 2.6666666666666665,
"grad_norm": 1.2618906497955322,
"learning_rate": 0.00017843688005531226,
"loss": 0.3734,
"step": 174
},
{
"epoch": 2.681992337164751,
"grad_norm": 1.3741453886032104,
"learning_rate": 0.000178183148246803,
"loss": 0.4422,
"step": 175
},
{
"epoch": 2.6973180076628354,
"grad_norm": 1.336128830909729,
"learning_rate": 0.0001779281149782269,
"loss": 0.4071,
"step": 176
},
{
"epoch": 2.7126436781609193,
"grad_norm": 1.5618481636047363,
"learning_rate": 0.000177671784494944,
"loss": 0.3985,
"step": 177
},
{
"epoch": 2.7279693486590038,
"grad_norm": 1.4244683980941772,
"learning_rate": 0.00017741416106390826,
"loss": 0.4876,
"step": 178
},
{
"epoch": 2.743295019157088,
"grad_norm": 1.4463664293289185,
"learning_rate": 0.0001771552489735963,
"loss": 0.4698,
"step": 179
},
{
"epoch": 2.7586206896551726,
"grad_norm": 1.3060929775238037,
"learning_rate": 0.0001768950525339362,
"loss": 0.376,
"step": 180
},
{
"epoch": 2.7739463601532566,
"grad_norm": 1.5133682489395142,
"learning_rate": 0.00017663357607623577,
"loss": 0.4139,
"step": 181
},
{
"epoch": 2.789272030651341,
"grad_norm": 1.4014631509780884,
"learning_rate": 0.00017637082395311024,
"loss": 0.4094,
"step": 182
},
{
"epoch": 2.8045977011494254,
"grad_norm": 1.4687765836715698,
"learning_rate": 0.00017610680053841007,
"loss": 0.4123,
"step": 183
},
{
"epoch": 2.8199233716475094,
"grad_norm": 1.336650013923645,
"learning_rate": 0.000175841510227148,
"loss": 0.3737,
"step": 184
},
{
"epoch": 2.835249042145594,
"grad_norm": 1.5005886554718018,
"learning_rate": 0.00017557495743542585,
"loss": 0.4835,
"step": 185
},
{
"epoch": 2.8505747126436782,
"grad_norm": 1.3977274894714355,
"learning_rate": 0.00017530714660036112,
"loss": 0.4989,
"step": 186
},
{
"epoch": 2.8659003831417627,
"grad_norm": 1.1647838354110718,
"learning_rate": 0.00017503808218001304,
"loss": 0.339,
"step": 187
},
{
"epoch": 2.8659003831417627,
"eval_loss": 1.875050663948059,
"eval_runtime": 10.5813,
"eval_samples_per_second": 9.451,
"eval_steps_per_second": 4.725,
"step": 187
},
{
"epoch": 2.8812260536398466,
"grad_norm": 1.4600085020065308,
"learning_rate": 0.00017476776865330847,
"loss": 0.4327,
"step": 188
},
{
"epoch": 2.896551724137931,
"grad_norm": 1.3009713888168335,
"learning_rate": 0.00017449621051996713,
"loss": 0.3969,
"step": 189
},
{
"epoch": 2.9118773946360155,
"grad_norm": 1.5662423372268677,
"learning_rate": 0.000174223412300427,
"loss": 0.4866,
"step": 190
},
{
"epoch": 2.9272030651340994,
"grad_norm": 1.1687737703323364,
"learning_rate": 0.00017394937853576877,
"loss": 0.3411,
"step": 191
},
{
"epoch": 2.942528735632184,
"grad_norm": 1.3152905702590942,
"learning_rate": 0.0001736741137876405,
"loss": 0.4294,
"step": 192
},
{
"epoch": 2.9578544061302683,
"grad_norm": 1.5262017250061035,
"learning_rate": 0.00017339762263818146,
"loss": 0.433,
"step": 193
},
{
"epoch": 2.9731800766283527,
"grad_norm": 1.2779839038848877,
"learning_rate": 0.000173119909689946,
"loss": 0.4334,
"step": 194
},
{
"epoch": 2.9885057471264367,
"grad_norm": 1.2895079851150513,
"learning_rate": 0.00017284097956582692,
"loss": 0.4393,
"step": 195
},
{
"epoch": 3.003831417624521,
"grad_norm": 5.897226810455322,
"learning_rate": 0.0001725608369089785,
"loss": 0.5205,
"step": 196
},
{
"epoch": 3.0191570881226055,
"grad_norm": 1.2967376708984375,
"learning_rate": 0.00017227948638273916,
"loss": 0.202,
"step": 197
},
{
"epoch": 3.0344827586206895,
"grad_norm": 1.050823450088501,
"learning_rate": 0.00017199693267055393,
"loss": 0.2219,
"step": 198
},
{
"epoch": 3.049808429118774,
"grad_norm": 0.8004248738288879,
"learning_rate": 0.00017171318047589637,
"loss": 0.1918,
"step": 199
},
{
"epoch": 3.0651340996168583,
"grad_norm": 0.9603090286254883,
"learning_rate": 0.00017142823452219038,
"loss": 0.1627,
"step": 200
},
{
"epoch": 3.0804597701149423,
"grad_norm": 1.0117729902267456,
"learning_rate": 0.00017114209955273153,
"loss": 0.1734,
"step": 201
},
{
"epoch": 3.0957854406130267,
"grad_norm": 1.150023102760315,
"learning_rate": 0.00017085478033060806,
"loss": 0.2105,
"step": 202
},
{
"epoch": 3.111111111111111,
"grad_norm": 1.2649832963943481,
"learning_rate": 0.00017056628163862172,
"loss": 0.1996,
"step": 203
},
{
"epoch": 3.1264367816091956,
"grad_norm": 1.1088045835494995,
"learning_rate": 0.00017027660827920798,
"loss": 0.1614,
"step": 204
},
{
"epoch": 3.1264367816091956,
"eval_loss": 2.065758466720581,
"eval_runtime": 10.4748,
"eval_samples_per_second": 9.547,
"eval_steps_per_second": 4.773,
"step": 204
},
{
"epoch": 3.1417624521072796,
"grad_norm": 1.1436564922332764,
"learning_rate": 0.00016998576507435618,
"loss": 0.1886,
"step": 205
},
{
"epoch": 3.157088122605364,
"grad_norm": 1.2624493837356567,
"learning_rate": 0.00016969375686552937,
"loss": 0.1792,
"step": 206
},
{
"epoch": 3.1724137931034484,
"grad_norm": 1.0960315465927124,
"learning_rate": 0.00016940058851358343,
"loss": 0.196,
"step": 207
},
{
"epoch": 3.1877394636015324,
"grad_norm": 1.062483549118042,
"learning_rate": 0.00016910626489868649,
"loss": 0.1577,
"step": 208
},
{
"epoch": 3.203065134099617,
"grad_norm": 1.0054856538772583,
"learning_rate": 0.0001688107909202374,
"loss": 0.1893,
"step": 209
},
{
"epoch": 3.218390804597701,
"grad_norm": 1.111485481262207,
"learning_rate": 0.00016851417149678444,
"loss": 0.1796,
"step": 210
},
{
"epoch": 3.2337164750957856,
"grad_norm": 1.009745478630066,
"learning_rate": 0.00016821641156594317,
"loss": 0.1523,
"step": 211
},
{
"epoch": 3.2490421455938696,
"grad_norm": 1.213293433189392,
"learning_rate": 0.0001679175160843145,
"loss": 0.1619,
"step": 212
},
{
"epoch": 3.264367816091954,
"grad_norm": 1.5143858194351196,
"learning_rate": 0.00016761749002740193,
"loss": 0.1609,
"step": 213
},
{
"epoch": 3.2796934865900385,
"grad_norm": 1.3771694898605347,
"learning_rate": 0.00016731633838952905,
"loss": 0.1671,
"step": 214
},
{
"epoch": 3.2950191570881224,
"grad_norm": 1.1563445329666138,
"learning_rate": 0.00016701406618375596,
"loss": 0.1885,
"step": 215
},
{
"epoch": 3.310344827586207,
"grad_norm": 1.0585676431655884,
"learning_rate": 0.00016671067844179627,
"loss": 0.1634,
"step": 216
},
{
"epoch": 3.3256704980842913,
"grad_norm": 1.1020563840866089,
"learning_rate": 0.00016640618021393304,
"loss": 0.1838,
"step": 217
},
{
"epoch": 3.3409961685823752,
"grad_norm": 0.9592476487159729,
"learning_rate": 0.00016610057656893482,
"loss": 0.179,
"step": 218
},
{
"epoch": 3.3563218390804597,
"grad_norm": 0.9426510334014893,
"learning_rate": 0.00016579387259397127,
"loss": 0.1581,
"step": 219
},
{
"epoch": 3.371647509578544,
"grad_norm": 1.2259931564331055,
"learning_rate": 0.00016548607339452853,
"loss": 0.2017,
"step": 220
},
{
"epoch": 3.3869731800766285,
"grad_norm": 1.2636795043945312,
"learning_rate": 0.00016517718409432406,
"loss": 0.1804,
"step": 221
},
{
"epoch": 3.3869731800766285,
"eval_loss": 2.0642523765563965,
"eval_runtime": 10.4896,
"eval_samples_per_second": 9.533,
"eval_steps_per_second": 4.767,
"step": 221
},
{
"epoch": 3.4022988505747125,
"grad_norm": 0.9591987729072571,
"learning_rate": 0.00016486720983522156,
"loss": 0.1653,
"step": 222
},
{
"epoch": 3.417624521072797,
"grad_norm": 0.9433954954147339,
"learning_rate": 0.00016455615577714528,
"loss": 0.1843,
"step": 223
},
{
"epoch": 3.4329501915708813,
"grad_norm": 1.0256028175354004,
"learning_rate": 0.00016424402709799404,
"loss": 0.1596,
"step": 224
},
{
"epoch": 3.4482758620689653,
"grad_norm": 1.0997707843780518,
"learning_rate": 0.00016393082899355516,
"loss": 0.1897,
"step": 225
},
{
"epoch": 3.4636015325670497,
"grad_norm": 1.6630239486694336,
"learning_rate": 0.00016361656667741802,
"loss": 0.2045,
"step": 226
},
{
"epoch": 3.478927203065134,
"grad_norm": 0.9956857562065125,
"learning_rate": 0.00016330124538088705,
"loss": 0.1653,
"step": 227
},
{
"epoch": 3.4942528735632186,
"grad_norm": 1.3272435665130615,
"learning_rate": 0.0001629848703528949,
"loss": 0.198,
"step": 228
},
{
"epoch": 3.5095785440613025,
"grad_norm": 8.141691207885742,
"learning_rate": 0.0001626674468599149,
"loss": 0.2591,
"step": 229
},
{
"epoch": 3.524904214559387,
"grad_norm": 0.9597133994102478,
"learning_rate": 0.00016234898018587337,
"loss": 0.1818,
"step": 230
},
{
"epoch": 3.5402298850574714,
"grad_norm": 0.949269711971283,
"learning_rate": 0.00016202947563206187,
"loss": 0.1675,
"step": 231
},
{
"epoch": 3.5555555555555554,
"grad_norm": 1.0063790082931519,
"learning_rate": 0.00016170893851704876,
"loss": 0.1875,
"step": 232
},
{
"epoch": 3.57088122605364,
"grad_norm": 1.2696994543075562,
"learning_rate": 0.00016138737417659068,
"loss": 0.1746,
"step": 233
},
{
"epoch": 3.586206896551724,
"grad_norm": 1.055250644683838,
"learning_rate": 0.00016106478796354382,
"loss": 0.1919,
"step": 234
},
{
"epoch": 3.6015325670498086,
"grad_norm": 0.9498022794723511,
"learning_rate": 0.00016074118524777477,
"loss": 0.1441,
"step": 235
},
{
"epoch": 3.6168582375478926,
"grad_norm": 1.0420253276824951,
"learning_rate": 0.00016041657141607107,
"loss": 0.1634,
"step": 236
},
{
"epoch": 3.632183908045977,
"grad_norm": 1.2098767757415771,
"learning_rate": 0.0001600909518720517,
"loss": 0.187,
"step": 237
},
{
"epoch": 3.6475095785440614,
"grad_norm": 1.2031207084655762,
"learning_rate": 0.0001597643320360769,
"loss": 0.1881,
"step": 238
},
{
"epoch": 3.6475095785440614,
"eval_loss": 2.092371940612793,
"eval_runtime": 10.4707,
"eval_samples_per_second": 9.551,
"eval_steps_per_second": 4.775,
"step": 238
},
{
"epoch": 3.6628352490421454,
"grad_norm": 1.0068916082382202,
"learning_rate": 0.0001594367173451582,
"loss": 0.1499,
"step": 239
},
{
"epoch": 3.67816091954023,
"grad_norm": 1.188425898551941,
"learning_rate": 0.00015910811325286768,
"loss": 0.1928,
"step": 240
},
{
"epoch": 3.6934865900383143,
"grad_norm": 1.054997205734253,
"learning_rate": 0.00015877852522924732,
"loss": 0.1726,
"step": 241
},
{
"epoch": 3.7088122605363987,
"grad_norm": 1.0925296545028687,
"learning_rate": 0.000158447958760718,
"loss": 0.2032,
"step": 242
},
{
"epoch": 3.7241379310344827,
"grad_norm": 1.2014827728271484,
"learning_rate": 0.0001581164193499879,
"loss": 0.1907,
"step": 243
},
{
"epoch": 3.739463601532567,
"grad_norm": 1.1900111436843872,
"learning_rate": 0.0001577839125159613,
"loss": 0.1977,
"step": 244
},
{
"epoch": 3.7547892720306515,
"grad_norm": 1.049250602722168,
"learning_rate": 0.00015745044379364634,
"loss": 0.1734,
"step": 245
},
{
"epoch": 3.7701149425287355,
"grad_norm": 1.1495704650878906,
"learning_rate": 0.00015711601873406313,
"loss": 0.2184,
"step": 246
},
{
"epoch": 3.78544061302682,
"grad_norm": 0.9893819689750671,
"learning_rate": 0.00015678064290415122,
"loss": 0.1594,
"step": 247
},
{
"epoch": 3.8007662835249043,
"grad_norm": 1.0403058528900146,
"learning_rate": 0.00015644432188667695,
"loss": 0.165,
"step": 248
},
{
"epoch": 3.8160919540229887,
"grad_norm": 1.1845136880874634,
"learning_rate": 0.00015610706128014055,
"loss": 0.204,
"step": 249
},
{
"epoch": 3.8314176245210727,
"grad_norm": 1.1242119073867798,
"learning_rate": 0.00015576886669868296,
"loss": 0.1861,
"step": 250
},
{
"epoch": 3.846743295019157,
"grad_norm": 1.0183254480361938,
"learning_rate": 0.0001554297437719923,
"loss": 0.18,
"step": 251
},
{
"epoch": 3.862068965517241,
"grad_norm": 1.0303974151611328,
"learning_rate": 0.00015508969814521025,
"loss": 0.1951,
"step": 252
},
{
"epoch": 3.8773946360153255,
"grad_norm": 1.1616798639297485,
"learning_rate": 0.000154748735478838,
"loss": 0.2126,
"step": 253
},
{
"epoch": 3.89272030651341,
"grad_norm": 1.1582714319229126,
"learning_rate": 0.00015440686144864207,
"loss": 0.1696,
"step": 254
},
{
"epoch": 3.9080459770114944,
"grad_norm": 1.0691121816635132,
"learning_rate": 0.00015406408174555976,
"loss": 0.1762,
"step": 255
},
{
"epoch": 3.9080459770114944,
"eval_loss": 2.062448501586914,
"eval_runtime": 10.503,
"eval_samples_per_second": 9.521,
"eval_steps_per_second": 4.761,
"step": 255
},
{
"epoch": 3.923371647509579,
"grad_norm": 1.0353065729141235,
"learning_rate": 0.00015372040207560457,
"loss": 0.1894,
"step": 256
},
{
"epoch": 3.9386973180076628,
"grad_norm": 1.1007777452468872,
"learning_rate": 0.00015337582815977104,
"loss": 0.1864,
"step": 257
},
{
"epoch": 3.954022988505747,
"grad_norm": 0.9735039472579956,
"learning_rate": 0.00015303036573393962,
"loss": 0.1716,
"step": 258
},
{
"epoch": 3.969348659003831,
"grad_norm": 1.0294030904769897,
"learning_rate": 0.00015268402054878117,
"loss": 0.1842,
"step": 259
},
{
"epoch": 3.9846743295019156,
"grad_norm": 1.0041604042053223,
"learning_rate": 0.00015233679836966122,
"loss": 0.1904,
"step": 260
},
{
"epoch": 4.0,
"grad_norm": 2.519958734512329,
"learning_rate": 0.00015198870497654395,
"loss": 0.4303,
"step": 261
},
{
"epoch": 4.015325670498084,
"grad_norm": 0.9649507999420166,
"learning_rate": 0.0001516397461638962,
"loss": 0.1039,
"step": 262
},
{
"epoch": 4.030651340996169,
"grad_norm": 0.6340312361717224,
"learning_rate": 0.00015128992774059063,
"loss": 0.0831,
"step": 263
},
{
"epoch": 4.045977011494253,
"grad_norm": 2.8160183429718018,
"learning_rate": 0.00015093925552980933,
"loss": 0.0998,
"step": 264
},
{
"epoch": 4.061302681992337,
"grad_norm": 0.9386498332023621,
"learning_rate": 0.00015058773536894685,
"loss": 0.0737,
"step": 265
},
{
"epoch": 4.076628352490421,
"grad_norm": 0.6389781832695007,
"learning_rate": 0.00015023537310951282,
"loss": 0.0714,
"step": 266
},
{
"epoch": 4.091954022988506,
"grad_norm": 0.6236942410469055,
"learning_rate": 0.0001498821746170349,
"loss": 0.0713,
"step": 267
},
{
"epoch": 4.10727969348659,
"grad_norm": 0.7775859236717224,
"learning_rate": 0.00014952814577096071,
"loss": 0.0723,
"step": 268
},
{
"epoch": 4.1226053639846745,
"grad_norm": 0.8838902711868286,
"learning_rate": 0.0001491732924645604,
"loss": 0.0806,
"step": 269
},
{
"epoch": 4.137931034482759,
"grad_norm": 0.8139066696166992,
"learning_rate": 0.00014881762060482814,
"loss": 0.0681,
"step": 270
},
{
"epoch": 4.153256704980843,
"grad_norm": 0.7435247302055359,
"learning_rate": 0.00014846113611238413,
"loss": 0.0727,
"step": 271
},
{
"epoch": 4.168582375478927,
"grad_norm": 8.997066497802734,
"learning_rate": 0.0001481038449213758,
"loss": 0.195,
"step": 272
},
{
"epoch": 4.168582375478927,
"eval_loss": 2.326845169067383,
"eval_runtime": 10.5534,
"eval_samples_per_second": 9.476,
"eval_steps_per_second": 4.738,
"step": 272
},
{
"epoch": 4.183908045977011,
"grad_norm": 0.7295827269554138,
"learning_rate": 0.0001477457529793792,
"loss": 0.0834,
"step": 273
},
{
"epoch": 4.199233716475096,
"grad_norm": 0.9554088711738586,
"learning_rate": 0.00014738686624729986,
"loss": 0.0966,
"step": 274
},
{
"epoch": 4.21455938697318,
"grad_norm": 0.709963858127594,
"learning_rate": 0.0001470271906992737,
"loss": 0.0573,
"step": 275
},
{
"epoch": 4.2298850574712645,
"grad_norm": 0.8901592493057251,
"learning_rate": 0.00014666673232256738,
"loss": 0.076,
"step": 276
},
{
"epoch": 4.245210727969349,
"grad_norm": 0.706717848777771,
"learning_rate": 0.00014630549711747888,
"loss": 0.0746,
"step": 277
},
{
"epoch": 4.260536398467433,
"grad_norm": 3.1939444541931152,
"learning_rate": 0.00014594349109723744,
"loss": 0.122,
"step": 278
},
{
"epoch": 4.275862068965517,
"grad_norm": 0.8928236961364746,
"learning_rate": 0.00014558072028790354,
"loss": 0.1025,
"step": 279
},
{
"epoch": 4.291187739463601,
"grad_norm": 0.7875874638557434,
"learning_rate": 0.00014521719072826858,
"loss": 0.0856,
"step": 280
},
{
"epoch": 4.306513409961686,
"grad_norm": 1.0411407947540283,
"learning_rate": 0.00014485290846975431,
"loss": 0.0819,
"step": 281
},
{
"epoch": 4.32183908045977,
"grad_norm": 0.8319458365440369,
"learning_rate": 0.0001444878795763121,
"loss": 0.0625,
"step": 282
},
{
"epoch": 4.337164750957855,
"grad_norm": 0.7555274963378906,
"learning_rate": 0.00014412211012432212,
"loss": 0.0831,
"step": 283
},
{
"epoch": 4.352490421455939,
"grad_norm": 0.7779274582862854,
"learning_rate": 0.0001437556062024921,
"loss": 0.0991,
"step": 284
},
{
"epoch": 4.3678160919540225,
"grad_norm": 1.9860173463821411,
"learning_rate": 0.00014338837391175582,
"loss": 0.0907,
"step": 285
},
{
"epoch": 4.383141762452107,
"grad_norm": 0.9153367280960083,
"learning_rate": 0.0001430204193651719,
"loss": 0.0957,
"step": 286
},
{
"epoch": 4.398467432950191,
"grad_norm": 1.0085121393203735,
"learning_rate": 0.0001426517486878217,
"loss": 0.1071,
"step": 287
},
{
"epoch": 4.413793103448276,
"grad_norm": 0.7043394446372986,
"learning_rate": 0.00014228236801670763,
"loss": 0.077,
"step": 288
},
{
"epoch": 4.42911877394636,
"grad_norm": 0.7112743854522705,
"learning_rate": 0.00014191228350065078,
"loss": 0.0649,
"step": 289
},
{
"epoch": 4.42911877394636,
"eval_loss": 2.271777868270874,
"eval_runtime": 10.4648,
"eval_samples_per_second": 9.556,
"eval_steps_per_second": 4.778,
"step": 289
},
{
"epoch": 4.444444444444445,
"grad_norm": 0.7803434729576111,
"learning_rate": 0.00014154150130018866,
"loss": 0.0704,
"step": 290
},
{
"epoch": 4.459770114942529,
"grad_norm": 0.7092854380607605,
"learning_rate": 0.00014117002758747268,
"loss": 0.0745,
"step": 291
},
{
"epoch": 4.4750957854406135,
"grad_norm": 0.7031986117362976,
"learning_rate": 0.00014079786854616537,
"loss": 0.0649,
"step": 292
},
{
"epoch": 4.490421455938697,
"grad_norm": 0.7902014255523682,
"learning_rate": 0.00014042503037133737,
"loss": 0.0908,
"step": 293
},
{
"epoch": 4.505747126436781,
"grad_norm": 1.1959948539733887,
"learning_rate": 0.00014005151926936452,
"loss": 0.0868,
"step": 294
},
{
"epoch": 4.521072796934866,
"grad_norm": 1.7838146686553955,
"learning_rate": 0.00013967734145782425,
"loss": 0.0785,
"step": 295
},
{
"epoch": 4.53639846743295,
"grad_norm": 1.0136120319366455,
"learning_rate": 0.00013930250316539238,
"loss": 0.1004,
"step": 296
},
{
"epoch": 4.551724137931035,
"grad_norm": 0.9047825932502747,
"learning_rate": 0.00013892701063173918,
"loss": 0.0902,
"step": 297
},
{
"epoch": 4.567049808429119,
"grad_norm": 0.7350003123283386,
"learning_rate": 0.00013855087010742562,
"loss": 0.0728,
"step": 298
},
{
"epoch": 4.582375478927203,
"grad_norm": 1.1646071672439575,
"learning_rate": 0.00013817408785379943,
"loss": 0.092,
"step": 299
},
{
"epoch": 4.597701149425287,
"grad_norm": 0.6288233399391174,
"learning_rate": 0.00013779667014289065,
"loss": 0.0678,
"step": 300
},
{
"epoch": 4.6130268199233715,
"grad_norm": 0.7127698063850403,
"learning_rate": 0.00013741862325730738,
"loss": 0.0921,
"step": 301
},
{
"epoch": 4.628352490421456,
"grad_norm": 0.8102079629898071,
"learning_rate": 0.00013703995349013113,
"loss": 0.0851,
"step": 302
},
{
"epoch": 4.64367816091954,
"grad_norm": 0.778022050857544,
"learning_rate": 0.00013666066714481206,
"loss": 0.0885,
"step": 303
},
{
"epoch": 4.659003831417625,
"grad_norm": 0.6419159770011902,
"learning_rate": 0.0001362807705350641,
"loss": 0.0736,
"step": 304
},
{
"epoch": 4.674329501915709,
"grad_norm": 0.7336333394050598,
"learning_rate": 0.00013590026998475986,
"loss": 0.0761,
"step": 305
},
{
"epoch": 4.689655172413794,
"grad_norm": 0.6584993600845337,
"learning_rate": 0.00013551917182782529,
"loss": 0.0786,
"step": 306
},
{
"epoch": 4.689655172413794,
"eval_loss": 2.256883144378662,
"eval_runtime": 10.5286,
"eval_samples_per_second": 9.498,
"eval_steps_per_second": 4.749,
"step": 306
},
{
"epoch": 4.704980842911877,
"grad_norm": 0.7220829725265503,
"learning_rate": 0.0001351374824081343,
"loss": 0.0737,
"step": 307
},
{
"epoch": 4.7203065134099615,
"grad_norm": 0.8544161319732666,
"learning_rate": 0.00013475520807940304,
"loss": 0.0839,
"step": 308
},
{
"epoch": 4.735632183908046,
"grad_norm": 0.9264532327651978,
"learning_rate": 0.00013437235520508432,
"loss": 0.0904,
"step": 309
},
{
"epoch": 4.75095785440613,
"grad_norm": 0.6544135212898254,
"learning_rate": 0.00013398893015826167,
"loss": 0.0692,
"step": 310
},
{
"epoch": 4.766283524904215,
"grad_norm": 0.6521825790405273,
"learning_rate": 0.00013360493932154302,
"loss": 0.0696,
"step": 311
},
{
"epoch": 4.781609195402299,
"grad_norm": 0.7229333519935608,
"learning_rate": 0.00013322038908695466,
"loss": 0.0811,
"step": 312
},
{
"epoch": 4.796934865900383,
"grad_norm": 0.8600510954856873,
"learning_rate": 0.00013283528585583484,
"loss": 0.0623,
"step": 313
},
{
"epoch": 4.812260536398467,
"grad_norm": 0.8433498740196228,
"learning_rate": 0.00013244963603872706,
"loss": 0.0805,
"step": 314
},
{
"epoch": 4.827586206896552,
"grad_norm": 1.2378168106079102,
"learning_rate": 0.00013206344605527355,
"loss": 0.0745,
"step": 315
},
{
"epoch": 4.842911877394636,
"grad_norm": 1.4228192567825317,
"learning_rate": 0.00013167672233410825,
"loss": 0.1218,
"step": 316
},
{
"epoch": 4.85823754789272,
"grad_norm": 0.7594043612480164,
"learning_rate": 0.00013128947131274988,
"loss": 0.0744,
"step": 317
},
{
"epoch": 4.873563218390805,
"grad_norm": 0.8461570739746094,
"learning_rate": 0.00013090169943749476,
"loss": 0.0907,
"step": 318
},
{
"epoch": 4.888888888888889,
"grad_norm": 0.8196818232536316,
"learning_rate": 0.00013051341316330946,
"loss": 0.0835,
"step": 319
},
{
"epoch": 4.904214559386973,
"grad_norm": 2.694230794906616,
"learning_rate": 0.00013012461895372344,
"loss": 0.0844,
"step": 320
},
{
"epoch": 4.919540229885057,
"grad_norm": 1.4861178398132324,
"learning_rate": 0.00012973532328072138,
"loss": 0.0782,
"step": 321
},
{
"epoch": 4.934865900383142,
"grad_norm": 0.9646175503730774,
"learning_rate": 0.00012934553262463548,
"loss": 0.069,
"step": 322
},
{
"epoch": 4.950191570881226,
"grad_norm": 0.7597980499267578,
"learning_rate": 0.00012895525347403756,
"loss": 0.0763,
"step": 323
},
{
"epoch": 4.950191570881226,
"eval_loss": 2.252124547958374,
"eval_runtime": 10.469,
"eval_samples_per_second": 9.552,
"eval_steps_per_second": 4.776,
"step": 323
},
{
"epoch": 4.9655172413793105,
"grad_norm": 0.7091509699821472,
"learning_rate": 0.0001285644923256311,
"loss": 0.0734,
"step": 324
},
{
"epoch": 4.980842911877395,
"grad_norm": 0.8412840366363525,
"learning_rate": 0.00012817325568414297,
"loss": 0.0982,
"step": 325
},
{
"epoch": 4.9961685823754785,
"grad_norm": 0.9467046856880188,
"learning_rate": 0.00012778155006221538,
"loss": 0.0725,
"step": 326
},
{
"epoch": 5.011494252873563,
"grad_norm": 1.2083613872528076,
"learning_rate": 0.00012738938198029724,
"loss": 0.0743,
"step": 327
},
{
"epoch": 5.026819923371647,
"grad_norm": 0.8673701882362366,
"learning_rate": 0.0001269967579665357,
"loss": 0.0423,
"step": 328
},
{
"epoch": 5.042145593869732,
"grad_norm": 0.36529555916786194,
"learning_rate": 0.00012660368455666752,
"loss": 0.027,
"step": 329
},
{
"epoch": 5.057471264367816,
"grad_norm": 0.44554996490478516,
"learning_rate": 0.00012621016829391022,
"loss": 0.0296,
"step": 330
},
{
"epoch": 5.0727969348659006,
"grad_norm": 0.9303228259086609,
"learning_rate": 0.00012581621572885321,
"loss": 0.0569,
"step": 331
},
{
"epoch": 5.088122605363985,
"grad_norm": 0.45792293548583984,
"learning_rate": 0.00012542183341934872,
"loss": 0.036,
"step": 332
},
{
"epoch": 5.103448275862069,
"grad_norm": 0.6033705472946167,
"learning_rate": 0.0001250270279304026,
"loss": 0.0409,
"step": 333
},
{
"epoch": 5.118773946360153,
"grad_norm": 0.5663286447525024,
"learning_rate": 0.000124631805834065,
"loss": 0.0258,
"step": 334
},
{
"epoch": 5.134099616858237,
"grad_norm": 0.6377267837524414,
"learning_rate": 0.00012423617370932127,
"loss": 0.039,
"step": 335
},
{
"epoch": 5.149425287356322,
"grad_norm": 0.4742782711982727,
"learning_rate": 0.00012384013814198196,
"loss": 0.0335,
"step": 336
},
{
"epoch": 5.164750957854406,
"grad_norm": 0.5032561421394348,
"learning_rate": 0.00012344370572457366,
"loss": 0.0269,
"step": 337
},
{
"epoch": 5.180076628352491,
"grad_norm": 0.4018470048904419,
"learning_rate": 0.0001230468830562289,
"loss": 0.0271,
"step": 338
},
{
"epoch": 5.195402298850575,
"grad_norm": 0.5031781196594238,
"learning_rate": 0.00012264967674257646,
"loss": 0.0252,
"step": 339
},
{
"epoch": 5.210727969348659,
"grad_norm": 0.6742706894874573,
"learning_rate": 0.00012225209339563145,
"loss": 0.0509,
"step": 340
},
{
"epoch": 5.210727969348659,
"eval_loss": 2.4545507431030273,
"eval_runtime": 10.7404,
"eval_samples_per_second": 9.311,
"eval_steps_per_second": 4.655,
"step": 340
},
{
"epoch": 5.226053639846743,
"grad_norm": 0.6078564524650574,
"learning_rate": 0.00012185413963368519,
"loss": 0.0453,
"step": 341
},
{
"epoch": 5.241379310344827,
"grad_norm": 0.5548681616783142,
"learning_rate": 0.00012145582208119497,
"loss": 0.031,
"step": 342
},
{
"epoch": 5.256704980842912,
"grad_norm": 0.5871354937553406,
"learning_rate": 0.00012105714736867391,
"loss": 0.0391,
"step": 343
},
{
"epoch": 5.272030651340996,
"grad_norm": 0.5070196986198425,
"learning_rate": 0.0001206581221325805,
"loss": 0.0282,
"step": 344
},
{
"epoch": 5.287356321839081,
"grad_norm": 0.6400995850563049,
"learning_rate": 0.0001202587530152081,
"loss": 0.0326,
"step": 345
},
{
"epoch": 5.302681992337165,
"grad_norm": 0.5636530518531799,
"learning_rate": 0.00011985904666457455,
"loss": 0.0341,
"step": 346
},
{
"epoch": 5.3180076628352495,
"grad_norm": 0.27172422409057617,
"learning_rate": 0.00011945900973431128,
"loss": 0.0226,
"step": 347
},
{
"epoch": 5.333333333333333,
"grad_norm": 0.41421565413475037,
"learning_rate": 0.00011905864888355263,
"loss": 0.0322,
"step": 348
},
{
"epoch": 5.3486590038314175,
"grad_norm": 0.444100022315979,
"learning_rate": 0.00011865797077682508,
"loss": 0.0262,
"step": 349
},
{
"epoch": 5.363984674329502,
"grad_norm": 0.5755631923675537,
"learning_rate": 0.00011825698208393619,
"loss": 0.0314,
"step": 350
},
{
"epoch": 5.379310344827586,
"grad_norm": 0.5454833507537842,
"learning_rate": 0.00011785568947986367,
"loss": 0.0336,
"step": 351
},
{
"epoch": 5.394636015325671,
"grad_norm": 1.3440561294555664,
"learning_rate": 0.00011745409964464424,
"loss": 0.0345,
"step": 352
},
{
"epoch": 5.409961685823755,
"grad_norm": 0.4198431670665741,
"learning_rate": 0.0001170522192632624,
"loss": 0.0276,
"step": 353
},
{
"epoch": 5.425287356321839,
"grad_norm": 0.4718680679798126,
"learning_rate": 0.00011665005502553911,
"loss": 0.0288,
"step": 354
},
{
"epoch": 5.440613026819923,
"grad_norm": 0.9051384329795837,
"learning_rate": 0.00011624761362602061,
"loss": 0.0444,
"step": 355
},
{
"epoch": 5.4559386973180075,
"grad_norm": 0.5586571097373962,
"learning_rate": 0.00011584490176386671,
"loss": 0.027,
"step": 356
},
{
"epoch": 5.471264367816092,
"grad_norm": 0.5432120561599731,
"learning_rate": 0.00011544192614273956,
"loss": 0.0374,
"step": 357
},
{
"epoch": 5.471264367816092,
"eval_loss": 2.4692599773406982,
"eval_runtime": 10.4877,
"eval_samples_per_second": 9.535,
"eval_steps_per_second": 4.768,
"step": 357
},
{
"epoch": 5.486590038314176,
"grad_norm": 0.884427547454834,
"learning_rate": 0.00011503869347069185,
"loss": 0.0558,
"step": 358
},
{
"epoch": 5.501915708812261,
"grad_norm": 0.43964701890945435,
"learning_rate": 0.00011463521046005523,
"loss": 0.0278,
"step": 359
},
{
"epoch": 5.517241379310345,
"grad_norm": 0.44980964064598083,
"learning_rate": 0.00011423148382732853,
"loss": 0.0275,
"step": 360
},
{
"epoch": 5.53256704980843,
"grad_norm": 0.40179964900016785,
"learning_rate": 0.00011382752029306604,
"loss": 0.0304,
"step": 361
},
{
"epoch": 5.547892720306513,
"grad_norm": 0.6193554401397705,
"learning_rate": 0.00011342332658176555,
"loss": 0.0305,
"step": 362
},
{
"epoch": 5.563218390804598,
"grad_norm": 0.4448515474796295,
"learning_rate": 0.00011301890942175648,
"loss": 0.0303,
"step": 363
},
{
"epoch": 5.578544061302682,
"grad_norm": 0.40030574798583984,
"learning_rate": 0.0001126142755450878,
"loss": 0.0263,
"step": 364
},
{
"epoch": 5.593869731800766,
"grad_norm": 0.5186451077461243,
"learning_rate": 0.000112209431687416,
"loss": 0.0278,
"step": 365
},
{
"epoch": 5.609195402298851,
"grad_norm": 0.5285075902938843,
"learning_rate": 0.00011180438458789304,
"loss": 0.0348,
"step": 366
},
{
"epoch": 5.624521072796935,
"grad_norm": 0.4877240061759949,
"learning_rate": 0.00011139914098905406,
"loss": 0.0386,
"step": 367
},
{
"epoch": 5.639846743295019,
"grad_norm": 0.5512449145317078,
"learning_rate": 0.00011099370763670523,
"loss": 0.0297,
"step": 368
},
{
"epoch": 5.655172413793103,
"grad_norm": 0.5295383334159851,
"learning_rate": 0.00011058809127981134,
"loss": 0.0344,
"step": 369
},
{
"epoch": 5.670498084291188,
"grad_norm": 0.5817351341247559,
"learning_rate": 0.00011018229867038356,
"loss": 0.0363,
"step": 370
},
{
"epoch": 5.685823754789272,
"grad_norm": 0.3530018627643585,
"learning_rate": 0.00010977633656336706,
"loss": 0.0212,
"step": 371
},
{
"epoch": 5.7011494252873565,
"grad_norm": 2.2889881134033203,
"learning_rate": 0.00010937021171652841,
"loss": 0.0352,
"step": 372
},
{
"epoch": 5.716475095785441,
"grad_norm": 0.846163809299469,
"learning_rate": 0.00010896393089034336,
"loss": 0.0477,
"step": 373
},
{
"epoch": 5.731800766283525,
"grad_norm": 0.31894299387931824,
"learning_rate": 0.00010855750084788398,
"loss": 0.0216,
"step": 374
},
{
"epoch": 5.731800766283525,
"eval_loss": 2.4762635231018066,
"eval_runtime": 10.4616,
"eval_samples_per_second": 9.559,
"eval_steps_per_second": 4.779,
"step": 374
},
{
"epoch": 5.747126436781609,
"grad_norm": 0.6521170139312744,
"learning_rate": 0.00010815092835470633,
"loss": 0.0268,
"step": 375
},
{
"epoch": 5.762452107279693,
"grad_norm": 0.2925560772418976,
"learning_rate": 0.00010774422017873771,
"loss": 0.0223,
"step": 376
},
{
"epoch": 5.777777777777778,
"grad_norm": 0.7669603824615479,
"learning_rate": 0.00010733738309016401,
"loss": 0.027,
"step": 377
},
{
"epoch": 5.793103448275862,
"grad_norm": 0.30490854382514954,
"learning_rate": 0.00010693042386131713,
"loss": 0.02,
"step": 378
},
{
"epoch": 5.8084291187739465,
"grad_norm": 0.456485390663147,
"learning_rate": 0.00010652334926656209,
"loss": 0.0278,
"step": 379
},
{
"epoch": 5.823754789272031,
"grad_norm": 0.5804373621940613,
"learning_rate": 0.00010611616608218429,
"loss": 0.0347,
"step": 380
},
{
"epoch": 5.8390804597701145,
"grad_norm": 1.551376461982727,
"learning_rate": 0.00010570888108627681,
"loss": 0.0274,
"step": 381
},
{
"epoch": 5.854406130268199,
"grad_norm": 0.7403205037117004,
"learning_rate": 0.00010530150105862748,
"loss": 0.0285,
"step": 382
},
{
"epoch": 5.869731800766283,
"grad_norm": 0.7229623794555664,
"learning_rate": 0.00010489403278060613,
"loss": 0.0391,
"step": 383
},
{
"epoch": 5.885057471264368,
"grad_norm": 0.3897419571876526,
"learning_rate": 0.00010448648303505151,
"loss": 0.0231,
"step": 384
},
{
"epoch": 5.900383141762452,
"grad_norm": 0.5959421396255493,
"learning_rate": 0.00010407885860615859,
"loss": 0.0309,
"step": 385
},
{
"epoch": 5.915708812260537,
"grad_norm": 0.7538139224052429,
"learning_rate": 0.00010367116627936548,
"loss": 0.0306,
"step": 386
},
{
"epoch": 5.931034482758621,
"grad_norm": 0.46324053406715393,
"learning_rate": 0.00010326341284124061,
"loss": 0.0293,
"step": 387
},
{
"epoch": 5.946360153256705,
"grad_norm": 1.4018464088439941,
"learning_rate": 0.00010285560507936961,
"loss": 0.0393,
"step": 388
},
{
"epoch": 5.961685823754789,
"grad_norm": 0.5677470564842224,
"learning_rate": 0.00010244774978224254,
"loss": 0.0361,
"step": 389
},
{
"epoch": 5.977011494252873,
"grad_norm": 0.35945063829421997,
"learning_rate": 0.00010203985373914056,
"loss": 0.0206,
"step": 390
},
{
"epoch": 5.992337164750958,
"grad_norm": 0.35713624954223633,
"learning_rate": 0.0001016319237400232,
"loss": 0.0272,
"step": 391
},
{
"epoch": 5.992337164750958,
"eval_loss": 2.511009454727173,
"eval_runtime": 10.521,
"eval_samples_per_second": 9.505,
"eval_steps_per_second": 4.752,
"step": 391
},
{
"epoch": 6.003831417624521,
"grad_norm": 0.6757388114929199,
"learning_rate": 0.00010122396657541522,
"loss": 0.035,
"step": 392
},
{
"epoch": 6.019157088122605,
"grad_norm": 0.3791247010231018,
"learning_rate": 0.0001008159890362936,
"loss": 0.0174,
"step": 393
},
{
"epoch": 6.0344827586206895,
"grad_norm": 0.19176137447357178,
"learning_rate": 0.00010040799791397444,
"loss": 0.0146,
"step": 394
},
{
"epoch": 6.049808429118774,
"grad_norm": 0.16038718819618225,
"learning_rate": 0.0001,
"loss": 0.0118,
"step": 395
},
{
"epoch": 6.065134099616858,
"grad_norm": 0.14217466115951538,
"learning_rate": 9.95920020860256e-05,
"loss": 0.009,
"step": 396
},
{
"epoch": 6.080459770114943,
"grad_norm": 0.19670097529888153,
"learning_rate": 9.918401096370644e-05,
"loss": 0.0134,
"step": 397
},
{
"epoch": 6.095785440613027,
"grad_norm": 0.7063495516777039,
"learning_rate": 9.877603342458483e-05,
"loss": 0.0186,
"step": 398
},
{
"epoch": 6.111111111111111,
"grad_norm": 0.27073654532432556,
"learning_rate": 9.836807625997683e-05,
"loss": 0.0123,
"step": 399
},
{
"epoch": 6.126436781609195,
"grad_norm": 0.34357860684394836,
"learning_rate": 9.79601462608595e-05,
"loss": 0.0224,
"step": 400
},
{
"epoch": 6.14176245210728,
"grad_norm": 1.0311784744262695,
"learning_rate": 9.755225021775749e-05,
"loss": 0.0122,
"step": 401
},
{
"epoch": 6.157088122605364,
"grad_norm": 0.12156683206558228,
"learning_rate": 9.71443949206304e-05,
"loss": 0.011,
"step": 402
},
{
"epoch": 6.172413793103448,
"grad_norm": 0.15306659042835236,
"learning_rate": 9.67365871587594e-05,
"loss": 0.0101,
"step": 403
},
{
"epoch": 6.187739463601533,
"grad_norm": 0.40619829297065735,
"learning_rate": 9.632883372063457e-05,
"loss": 0.0124,
"step": 404
},
{
"epoch": 6.203065134099617,
"grad_norm": 0.2220255583524704,
"learning_rate": 9.592114139384145e-05,
"loss": 0.0115,
"step": 405
},
{
"epoch": 6.218390804597701,
"grad_norm": 0.36143144965171814,
"learning_rate": 9.551351696494854e-05,
"loss": 0.0143,
"step": 406
},
{
"epoch": 6.233716475095785,
"grad_norm": 0.19601793587207794,
"learning_rate": 9.51059672193939e-05,
"loss": 0.0121,
"step": 407
},
{
"epoch": 6.24904214559387,
"grad_norm": 0.17943957448005676,
"learning_rate": 9.469849894137253e-05,
"loss": 0.0117,
"step": 408
},
{
"epoch": 6.24904214559387,
"eval_loss": 2.7329955101013184,
"eval_runtime": 10.5244,
"eval_samples_per_second": 9.502,
"eval_steps_per_second": 4.751,
"step": 408
},
{
"epoch": 6.264367816091954,
"grad_norm": 0.19360607862472534,
"learning_rate": 9.42911189137232e-05,
"loss": 0.0095,
"step": 409
},
{
"epoch": 6.2796934865900385,
"grad_norm": 0.24287296831607819,
"learning_rate": 9.388383391781575e-05,
"loss": 0.0116,
"step": 410
},
{
"epoch": 6.295019157088123,
"grad_norm": 0.554787814617157,
"learning_rate": 9.347665073343794e-05,
"loss": 0.0138,
"step": 411
},
{
"epoch": 6.310344827586207,
"grad_norm": 0.23142507672309875,
"learning_rate": 9.306957613868292e-05,
"loss": 0.0131,
"step": 412
},
{
"epoch": 6.325670498084291,
"grad_norm": 0.2346455603837967,
"learning_rate": 9.266261690983602e-05,
"loss": 0.011,
"step": 413
},
{
"epoch": 6.340996168582375,
"grad_norm": 0.8730548620223999,
"learning_rate": 9.225577982126234e-05,
"loss": 0.0151,
"step": 414
},
{
"epoch": 6.35632183908046,
"grad_norm": 0.3552612364292145,
"learning_rate": 9.184907164529368e-05,
"loss": 0.0232,
"step": 415
},
{
"epoch": 6.371647509578544,
"grad_norm": 0.22842758893966675,
"learning_rate": 9.144249915211605e-05,
"loss": 0.0153,
"step": 416
},
{
"epoch": 6.3869731800766285,
"grad_norm": 0.20680157840251923,
"learning_rate": 9.103606910965666e-05,
"loss": 0.0128,
"step": 417
},
{
"epoch": 6.402298850574713,
"grad_norm": 0.4528963565826416,
"learning_rate": 9.062978828347161e-05,
"loss": 0.0222,
"step": 418
},
{
"epoch": 6.417624521072797,
"grad_norm": 0.298604816198349,
"learning_rate": 9.022366343663298e-05,
"loss": 0.0168,
"step": 419
},
{
"epoch": 6.432950191570881,
"grad_norm": 0.11246322840452194,
"learning_rate": 8.981770132961649e-05,
"loss": 0.0089,
"step": 420
},
{
"epoch": 6.448275862068965,
"grad_norm": 0.2391061782836914,
"learning_rate": 8.94119087201887e-05,
"loss": 0.0105,
"step": 421
},
{
"epoch": 6.46360153256705,
"grad_norm": 0.10826307535171509,
"learning_rate": 8.900629236329482e-05,
"loss": 0.0089,
"step": 422
},
{
"epoch": 6.478927203065134,
"grad_norm": 0.18837091326713562,
"learning_rate": 8.860085901094595e-05,
"loss": 0.0117,
"step": 423
},
{
"epoch": 6.494252873563219,
"grad_norm": 0.24223893880844116,
"learning_rate": 8.819561541210698e-05,
"loss": 0.0109,
"step": 424
},
{
"epoch": 6.509578544061303,
"grad_norm": 0.38215088844299316,
"learning_rate": 8.779056831258402e-05,
"loss": 0.0115,
"step": 425
},
{
"epoch": 6.509578544061303,
"eval_loss": 2.640347480773926,
"eval_runtime": 10.5535,
"eval_samples_per_second": 9.475,
"eval_steps_per_second": 4.738,
"step": 425
},
{
"epoch": 6.5249042145593865,
"grad_norm": 0.4854836165904999,
"learning_rate": 8.738572445491226e-05,
"loss": 0.0168,
"step": 426
},
{
"epoch": 6.540229885057471,
"grad_norm": 0.20515725016593933,
"learning_rate": 8.698109057824354e-05,
"loss": 0.0128,
"step": 427
},
{
"epoch": 6.555555555555555,
"grad_norm": 0.21756961941719055,
"learning_rate": 8.657667341823448e-05,
"loss": 0.0114,
"step": 428
},
{
"epoch": 6.57088122605364,
"grad_norm": 0.18275758624076843,
"learning_rate": 8.617247970693398e-05,
"loss": 0.0105,
"step": 429
},
{
"epoch": 6.586206896551724,
"grad_norm": 0.175423264503479,
"learning_rate": 8.57685161726715e-05,
"loss": 0.0102,
"step": 430
},
{
"epoch": 6.601532567049809,
"grad_norm": 0.3893040418624878,
"learning_rate": 8.53647895399448e-05,
"loss": 0.0151,
"step": 431
},
{
"epoch": 6.616858237547893,
"grad_norm": 0.3841419816017151,
"learning_rate": 8.496130652930818e-05,
"loss": 0.0135,
"step": 432
},
{
"epoch": 6.6321839080459775,
"grad_norm": 0.1184447631239891,
"learning_rate": 8.455807385726046e-05,
"loss": 0.0096,
"step": 433
},
{
"epoch": 6.647509578544061,
"grad_norm": 0.11839904636144638,
"learning_rate": 8.415509823613331e-05,
"loss": 0.0087,
"step": 434
},
{
"epoch": 6.662835249042145,
"grad_norm": 0.27116042375564575,
"learning_rate": 8.375238637397942e-05,
"loss": 0.0134,
"step": 435
},
{
"epoch": 6.67816091954023,
"grad_norm": 0.1837141215801239,
"learning_rate": 8.334994497446091e-05,
"loss": 0.0102,
"step": 436
},
{
"epoch": 6.693486590038314,
"grad_norm": 0.14119590818881989,
"learning_rate": 8.294778073673762e-05,
"loss": 0.0103,
"step": 437
},
{
"epoch": 6.708812260536399,
"grad_norm": 0.38409751653671265,
"learning_rate": 8.254590035535579e-05,
"loss": 0.0146,
"step": 438
},
{
"epoch": 6.724137931034483,
"grad_norm": 0.1519305408000946,
"learning_rate": 8.214431052013634e-05,
"loss": 0.0097,
"step": 439
},
{
"epoch": 6.739463601532567,
"grad_norm": 0.2955567240715027,
"learning_rate": 8.174301791606385e-05,
"loss": 0.0114,
"step": 440
},
{
"epoch": 6.754789272030651,
"grad_norm": 0.2837064862251282,
"learning_rate": 8.134202922317495e-05,
"loss": 0.0134,
"step": 441
},
{
"epoch": 6.7701149425287355,
"grad_norm": 0.13082526624202728,
"learning_rate": 8.094135111644742e-05,
"loss": 0.0092,
"step": 442
},
{
"epoch": 6.7701149425287355,
"eval_loss": 2.7746777534484863,
"eval_runtime": 10.5408,
"eval_samples_per_second": 9.487,
"eval_steps_per_second": 4.743,
"step": 442
},
{
"epoch": 6.78544061302682,
"grad_norm": 0.5769606232643127,
"learning_rate": 8.054099026568874e-05,
"loss": 0.0147,
"step": 443
},
{
"epoch": 6.800766283524904,
"grad_norm": 0.1398877650499344,
"learning_rate": 8.014095333542548e-05,
"loss": 0.0098,
"step": 444
},
{
"epoch": 6.816091954022989,
"grad_norm": 0.16053611040115356,
"learning_rate": 7.974124698479192e-05,
"loss": 0.0074,
"step": 445
},
{
"epoch": 6.831417624521073,
"grad_norm": 0.27454668283462524,
"learning_rate": 7.934187786741956e-05,
"loss": 0.0103,
"step": 446
},
{
"epoch": 6.846743295019158,
"grad_norm": 0.36763104796409607,
"learning_rate": 7.894285263132612e-05,
"loss": 0.0153,
"step": 447
},
{
"epoch": 6.862068965517241,
"grad_norm": 0.21019311249256134,
"learning_rate": 7.854417791880507e-05,
"loss": 0.013,
"step": 448
},
{
"epoch": 6.8773946360153255,
"grad_norm": 0.2829742133617401,
"learning_rate": 7.814586036631483e-05,
"loss": 0.0118,
"step": 449
},
{
"epoch": 6.89272030651341,
"grad_norm": 0.30828389525413513,
"learning_rate": 7.774790660436858e-05,
"loss": 0.011,
"step": 450
},
{
"epoch": 6.908045977011494,
"grad_norm": 0.6878758072853088,
"learning_rate": 7.735032325742355e-05,
"loss": 0.0293,
"step": 451
},
{
"epoch": 6.923371647509579,
"grad_norm": 0.15684568881988525,
"learning_rate": 7.695311694377115e-05,
"loss": 0.01,
"step": 452
},
{
"epoch": 6.938697318007663,
"grad_norm": 0.32623958587646484,
"learning_rate": 7.655629427542635e-05,
"loss": 0.0117,
"step": 453
},
{
"epoch": 6.954022988505747,
"grad_norm": 0.10675598680973053,
"learning_rate": 7.615986185801807e-05,
"loss": 0.0077,
"step": 454
},
{
"epoch": 6.969348659003831,
"grad_norm": 0.3139125406742096,
"learning_rate": 7.576382629067877e-05,
"loss": 0.0134,
"step": 455
},
{
"epoch": 6.984674329501916,
"grad_norm": 0.37668049335479736,
"learning_rate": 7.536819416593504e-05,
"loss": 0.011,
"step": 456
},
{
"epoch": 7.0,
"grad_norm": 0.15798693895339966,
"learning_rate": 7.497297206959746e-05,
"loss": 0.0093,
"step": 457
},
{
"epoch": 7.011494252873563,
"grad_norm": 0.3846645653247833,
"learning_rate": 7.457816658065134e-05,
"loss": 0.0108,
"step": 458
},
{
"epoch": 7.026819923371647,
"grad_norm": 0.05968603119254112,
"learning_rate": 7.41837842711468e-05,
"loss": 0.0064,
"step": 459
},
{
"epoch": 7.026819923371647,
"eval_loss": 2.7342193126678467,
"eval_runtime": 10.5281,
"eval_samples_per_second": 9.498,
"eval_steps_per_second": 4.749,
"step": 459
},
{
"epoch": 7.042145593869732,
"grad_norm": 0.05475788936018944,
"learning_rate": 7.378983170608982e-05,
"loss": 0.0054,
"step": 460
},
{
"epoch": 7.057471264367816,
"grad_norm": 0.055521685630083084,
"learning_rate": 7.339631544333249e-05,
"loss": 0.0057,
"step": 461
},
{
"epoch": 7.0727969348659006,
"grad_norm": 0.06325386464595795,
"learning_rate": 7.300324203346431e-05,
"loss": 0.0061,
"step": 462
},
{
"epoch": 7.088122605363985,
"grad_norm": 0.5059542655944824,
"learning_rate": 7.261061801970277e-05,
"loss": 0.0079,
"step": 463
},
{
"epoch": 7.103448275862069,
"grad_norm": 0.06388293951749802,
"learning_rate": 7.221844993778464e-05,
"loss": 0.0056,
"step": 464
},
{
"epoch": 7.118773946360153,
"grad_norm": 0.07516956329345703,
"learning_rate": 7.182674431585704e-05,
"loss": 0.006,
"step": 465
},
{
"epoch": 7.134099616858237,
"grad_norm": 0.14318601787090302,
"learning_rate": 7.143550767436894e-05,
"loss": 0.0067,
"step": 466
},
{
"epoch": 7.149425287356322,
"grad_norm": 0.1426093429327011,
"learning_rate": 7.104474652596245e-05,
"loss": 0.0079,
"step": 467
},
{
"epoch": 7.164750957854406,
"grad_norm": 0.05885975807905197,
"learning_rate": 7.065446737536456e-05,
"loss": 0.0055,
"step": 468
},
{
"epoch": 7.180076628352491,
"grad_norm": 0.06351395696401596,
"learning_rate": 7.026467671927863e-05,
"loss": 0.0059,
"step": 469
},
{
"epoch": 7.195402298850575,
"grad_norm": 0.0676102414727211,
"learning_rate": 6.98753810462766e-05,
"loss": 0.0062,
"step": 470
},
{
"epoch": 7.210727969348659,
"grad_norm": 0.07731365412473679,
"learning_rate": 6.948658683669056e-05,
"loss": 0.0058,
"step": 471
},
{
"epoch": 7.226053639846743,
"grad_norm": 0.06487540900707245,
"learning_rate": 6.909830056250527e-05,
"loss": 0.0061,
"step": 472
},
{
"epoch": 7.241379310344827,
"grad_norm": 0.09343966096639633,
"learning_rate": 6.871052868725012e-05,
"loss": 0.0062,
"step": 473
},
{
"epoch": 7.256704980842912,
"grad_norm": 0.1045990064740181,
"learning_rate": 6.832327766589177e-05,
"loss": 0.0063,
"step": 474
},
{
"epoch": 7.272030651340996,
"grad_norm": 0.05801545828580856,
"learning_rate": 6.793655394472644e-05,
"loss": 0.0057,
"step": 475
},
{
"epoch": 7.287356321839081,
"grad_norm": 0.06868793070316315,
"learning_rate": 6.755036396127296e-05,
"loss": 0.0059,
"step": 476
},
{
"epoch": 7.287356321839081,
"eval_loss": 2.8930225372314453,
"eval_runtime": 10.5758,
"eval_samples_per_second": 9.456,
"eval_steps_per_second": 4.728,
"step": 476
},
{
"epoch": 7.302681992337165,
"grad_norm": 0.08218348026275635,
"learning_rate": 6.716471414416519e-05,
"loss": 0.0075,
"step": 477
},
{
"epoch": 7.3180076628352495,
"grad_norm": 0.08141635358333588,
"learning_rate": 6.677961091304535e-05,
"loss": 0.0061,
"step": 478
},
{
"epoch": 7.333333333333333,
"grad_norm": 0.05970093235373497,
"learning_rate": 6.639506067845697e-05,
"loss": 0.006,
"step": 479
},
{
"epoch": 7.3486590038314175,
"grad_norm": 0.07674306631088257,
"learning_rate": 6.601106984173835e-05,
"loss": 0.0058,
"step": 480
},
{
"epoch": 7.363984674329502,
"grad_norm": 0.07168275862932205,
"learning_rate": 6.562764479491565e-05,
"loss": 0.0054,
"step": 481
},
{
"epoch": 7.379310344827586,
"grad_norm": 0.06897211819887161,
"learning_rate": 6.524479192059698e-05,
"loss": 0.0059,
"step": 482
},
{
"epoch": 7.394636015325671,
"grad_norm": 0.5173123478889465,
"learning_rate": 6.486251759186572e-05,
"loss": 0.008,
"step": 483
},
{
"epoch": 7.409961685823755,
"grad_norm": 0.05815713480114937,
"learning_rate": 6.448082817217471e-05,
"loss": 0.0052,
"step": 484
},
{
"epoch": 7.425287356321839,
"grad_norm": 0.08304629474878311,
"learning_rate": 6.409973001524012e-05,
"loss": 0.0058,
"step": 485
},
{
"epoch": 7.440613026819923,
"grad_norm": 0.10966533422470093,
"learning_rate": 6.371922946493591e-05,
"loss": 0.0058,
"step": 486
},
{
"epoch": 7.4559386973180075,
"grad_norm": 0.06352514773607254,
"learning_rate": 6.333933285518796e-05,
"loss": 0.0054,
"step": 487
},
{
"epoch": 7.471264367816092,
"grad_norm": 0.16141043603420258,
"learning_rate": 6.29600465098689e-05,
"loss": 0.0106,
"step": 488
},
{
"epoch": 7.486590038314176,
"grad_norm": 0.06440207362174988,
"learning_rate": 6.258137674269261e-05,
"loss": 0.006,
"step": 489
},
{
"epoch": 7.501915708812261,
"grad_norm": 0.08629340678453445,
"learning_rate": 6.220332985710936e-05,
"loss": 0.0073,
"step": 490
},
{
"epoch": 7.517241379310345,
"grad_norm": 0.06371556222438812,
"learning_rate": 6.182591214620057e-05,
"loss": 0.006,
"step": 491
},
{
"epoch": 7.53256704980843,
"grad_norm": 0.08433310687541962,
"learning_rate": 6.144912989257441e-05,
"loss": 0.006,
"step": 492
},
{
"epoch": 7.547892720306513,
"grad_norm": 0.08213558048009872,
"learning_rate": 6.107298936826086e-05,
"loss": 0.0065,
"step": 493
},
{
"epoch": 7.547892720306513,
"eval_loss": 2.91325306892395,
"eval_runtime": 10.6133,
"eval_samples_per_second": 9.422,
"eval_steps_per_second": 4.711,
"step": 493
},
{
"epoch": 7.563218390804598,
"grad_norm": 0.059887565672397614,
"learning_rate": 6.069749683460765e-05,
"loss": 0.0055,
"step": 494
},
{
"epoch": 7.578544061302682,
"grad_norm": 0.06606566160917282,
"learning_rate": 6.0322658542175736e-05,
"loss": 0.0045,
"step": 495
},
{
"epoch": 7.593869731800766,
"grad_norm": 0.076997309923172,
"learning_rate": 5.994848073063551e-05,
"loss": 0.0059,
"step": 496
},
{
"epoch": 7.609195402298851,
"grad_norm": 0.0730021744966507,
"learning_rate": 5.957496962866262e-05,
"loss": 0.0053,
"step": 497
},
{
"epoch": 7.624521072796935,
"grad_norm": 0.05936294421553612,
"learning_rate": 5.920213145383466e-05,
"loss": 0.0054,
"step": 498
},
{
"epoch": 7.639846743295019,
"grad_norm": 0.14003659784793854,
"learning_rate": 5.8829972412527327e-05,
"loss": 0.0073,
"step": 499
},
{
"epoch": 7.655172413793103,
"grad_norm": 0.05907728150486946,
"learning_rate": 5.845849869981137e-05,
"loss": 0.0042,
"step": 500
},
{
"epoch": 7.670498084291188,
"grad_norm": 0.057687729597091675,
"learning_rate": 5.808771649934923e-05,
"loss": 0.0052,
"step": 501
},
{
"epoch": 7.685823754789272,
"grad_norm": 0.09928648918867111,
"learning_rate": 5.7717631983292375e-05,
"loss": 0.0055,
"step": 502
},
{
"epoch": 7.7011494252873565,
"grad_norm": 0.07954944670200348,
"learning_rate": 5.73482513121783e-05,
"loss": 0.0057,
"step": 503
},
{
"epoch": 7.716475095785441,
"grad_norm": 0.06073677912354469,
"learning_rate": 5.6979580634828125e-05,
"loss": 0.0059,
"step": 504
},
{
"epoch": 7.731800766283525,
"grad_norm": 0.06618310511112213,
"learning_rate": 5.6611626088244194e-05,
"loss": 0.0056,
"step": 505
},
{
"epoch": 7.747126436781609,
"grad_norm": 0.06377172470092773,
"learning_rate": 5.624439379750794e-05,
"loss": 0.0053,
"step": 506
},
{
"epoch": 7.762452107279693,
"grad_norm": 0.06222354248166084,
"learning_rate": 5.5877889875677845e-05,
"loss": 0.0054,
"step": 507
},
{
"epoch": 7.777777777777778,
"grad_norm": 0.06755752861499786,
"learning_rate": 5.551212042368792e-05,
"loss": 0.0069,
"step": 508
},
{
"epoch": 7.793103448275862,
"grad_norm": 0.23886863887310028,
"learning_rate": 5.514709153024571e-05,
"loss": 0.007,
"step": 509
},
{
"epoch": 7.8084291187739465,
"grad_norm": 0.06176340579986572,
"learning_rate": 5.478280927173145e-05,
"loss": 0.0059,
"step": 510
},
{
"epoch": 7.8084291187739465,
"eval_loss": 2.921626091003418,
"eval_runtime": 10.5435,
"eval_samples_per_second": 9.485,
"eval_steps_per_second": 4.742,
"step": 510
},
{
"epoch": 7.823754789272031,
"grad_norm": 0.056606221944093704,
"learning_rate": 5.4419279712096437e-05,
"loss": 0.0049,
"step": 511
},
{
"epoch": 7.8390804597701145,
"grad_norm": 0.06514956057071686,
"learning_rate": 5.405650890276255e-05,
"loss": 0.0061,
"step": 512
},
{
"epoch": 7.854406130268199,
"grad_norm": 0.05932604894042015,
"learning_rate": 5.3694502882521125e-05,
"loss": 0.0058,
"step": 513
},
{
"epoch": 7.869731800766283,
"grad_norm": 0.06986385583877563,
"learning_rate": 5.333326767743263e-05,
"loss": 0.0048,
"step": 514
},
{
"epoch": 7.885057471264368,
"grad_norm": 0.07194341719150543,
"learning_rate": 5.297280930072632e-05,
"loss": 0.0065,
"step": 515
},
{
"epoch": 7.900383141762452,
"grad_norm": 0.12007016688585281,
"learning_rate": 5.261313375270014e-05,
"loss": 0.0068,
"step": 516
},
{
"epoch": 7.915708812260537,
"grad_norm": 0.05479056015610695,
"learning_rate": 5.2254247020620814e-05,
"loss": 0.0052,
"step": 517
},
{
"epoch": 7.931034482758621,
"grad_norm": 0.18069668114185333,
"learning_rate": 5.189615507862422e-05,
"loss": 0.0077,
"step": 518
},
{
"epoch": 7.946360153256705,
"grad_norm": 0.08876926451921463,
"learning_rate": 5.153886388761586e-05,
"loss": 0.0063,
"step": 519
},
{
"epoch": 7.961685823754789,
"grad_norm": 0.05993456766009331,
"learning_rate": 5.11823793951719e-05,
"loss": 0.0048,
"step": 520
},
{
"epoch": 7.977011494252873,
"grad_norm": 0.05695677176117897,
"learning_rate": 5.082670753543961e-05,
"loss": 0.0049,
"step": 521
},
{
"epoch": 7.992337164750958,
"grad_norm": 0.0639839619398117,
"learning_rate": 5.047185422903928e-05,
"loss": 0.0054,
"step": 522
},
{
"epoch": 8.007662835249041,
"grad_norm": 0.1566697508096695,
"learning_rate": 5.011782538296512e-05,
"loss": 0.0103,
"step": 523
},
{
"epoch": 8.022988505747126,
"grad_norm": 0.0462418757379055,
"learning_rate": 4.976462689048717e-05,
"loss": 0.0043,
"step": 524
},
{
"epoch": 8.03831417624521,
"grad_norm": 0.046641357243061066,
"learning_rate": 4.9412264631053216e-05,
"loss": 0.0048,
"step": 525
},
{
"epoch": 8.053639846743295,
"grad_norm": 0.04404853284358978,
"learning_rate": 4.9060744470190676e-05,
"loss": 0.0044,
"step": 526
},
{
"epoch": 8.068965517241379,
"grad_norm": 0.053229521960020065,
"learning_rate": 4.87100722594094e-05,
"loss": 0.0058,
"step": 527
},
{
"epoch": 8.068965517241379,
"eval_loss": 2.9435019493103027,
"eval_runtime": 10.5293,
"eval_samples_per_second": 9.497,
"eval_steps_per_second": 4.749,
"step": 527
},
{
"epoch": 8.084291187739463,
"grad_norm": 0.039271771907806396,
"learning_rate": 4.836025383610382e-05,
"loss": 0.0035,
"step": 528
},
{
"epoch": 8.099616858237548,
"grad_norm": 0.0491085946559906,
"learning_rate": 4.801129502345605e-05,
"loss": 0.0048,
"step": 529
},
{
"epoch": 8.114942528735632,
"grad_norm": 0.03886023536324501,
"learning_rate": 4.7663201630338816e-05,
"loss": 0.004,
"step": 530
},
{
"epoch": 8.130268199233717,
"grad_norm": 0.04504215344786644,
"learning_rate": 4.7315979451218864e-05,
"loss": 0.0047,
"step": 531
},
{
"epoch": 8.145593869731801,
"grad_norm": 0.05867081508040428,
"learning_rate": 4.696963426606041e-05,
"loss": 0.0058,
"step": 532
},
{
"epoch": 8.160919540229886,
"grad_norm": 0.0445120669901371,
"learning_rate": 4.6624171840229e-05,
"loss": 0.0043,
"step": 533
},
{
"epoch": 8.17624521072797,
"grad_norm": 0.05101229250431061,
"learning_rate": 4.6279597924395436e-05,
"loss": 0.0044,
"step": 534
},
{
"epoch": 8.191570881226054,
"grad_norm": 0.04617276415228844,
"learning_rate": 4.593591825444028e-05,
"loss": 0.0045,
"step": 535
},
{
"epoch": 8.206896551724139,
"grad_norm": 0.048301588743925095,
"learning_rate": 4.559313855135795e-05,
"loss": 0.0046,
"step": 536
},
{
"epoch": 8.222222222222221,
"grad_norm": 0.05069313570857048,
"learning_rate": 4.5251264521162005e-05,
"loss": 0.005,
"step": 537
},
{
"epoch": 8.237547892720306,
"grad_norm": 0.04811912775039673,
"learning_rate": 4.491030185478976e-05,
"loss": 0.0045,
"step": 538
},
{
"epoch": 8.25287356321839,
"grad_norm": 0.04650574177503586,
"learning_rate": 4.457025622800771e-05,
"loss": 0.0049,
"step": 539
},
{
"epoch": 8.268199233716475,
"grad_norm": 0.038902636617422104,
"learning_rate": 4.423113330131707e-05,
"loss": 0.0037,
"step": 540
},
{
"epoch": 8.28352490421456,
"grad_norm": 0.0576075054705143,
"learning_rate": 4.389293871985949e-05,
"loss": 0.0066,
"step": 541
},
{
"epoch": 8.298850574712644,
"grad_norm": 0.051424864679574966,
"learning_rate": 4.355567811332311e-05,
"loss": 0.0053,
"step": 542
},
{
"epoch": 8.314176245210728,
"grad_norm": 0.040568236261606216,
"learning_rate": 4.3219357095848836e-05,
"loss": 0.0038,
"step": 543
},
{
"epoch": 8.329501915708812,
"grad_norm": 0.051232922822237015,
"learning_rate": 4.2883981265936876e-05,
"loss": 0.0046,
"step": 544
},
{
"epoch": 8.329501915708812,
"eval_loss": 3.006831169128418,
"eval_runtime": 10.5212,
"eval_samples_per_second": 9.505,
"eval_steps_per_second": 4.752,
"step": 544
},
{
"epoch": 8.344827586206897,
"grad_norm": 0.04653798043727875,
"learning_rate": 4.25495562063537e-05,
"loss": 0.0048,
"step": 545
},
{
"epoch": 8.360153256704981,
"grad_norm": 0.04423636198043823,
"learning_rate": 4.2216087484038714e-05,
"loss": 0.0038,
"step": 546
},
{
"epoch": 8.375478927203066,
"grad_norm": 0.04573935642838478,
"learning_rate": 4.188358065001215e-05,
"loss": 0.0045,
"step": 547
},
{
"epoch": 8.39080459770115,
"grad_norm": 0.044406238943338394,
"learning_rate": 4.155204123928205e-05,
"loss": 0.0041,
"step": 548
},
{
"epoch": 8.406130268199234,
"grad_norm": 0.044500816613435745,
"learning_rate": 4.12214747707527e-05,
"loss": 0.0044,
"step": 549
},
{
"epoch": 8.421455938697317,
"grad_norm": 0.039383914321660995,
"learning_rate": 4.089188674713236e-05,
"loss": 0.0038,
"step": 550
},
{
"epoch": 8.436781609195402,
"grad_norm": 0.04521704837679863,
"learning_rate": 4.056328265484184e-05,
"loss": 0.0046,
"step": 551
},
{
"epoch": 8.452107279693486,
"grad_norm": 0.047671083360910416,
"learning_rate": 4.023566796392313e-05,
"loss": 0.0042,
"step": 552
},
{
"epoch": 8.46743295019157,
"grad_norm": 0.04466583952307701,
"learning_rate": 3.990904812794834e-05,
"loss": 0.0043,
"step": 553
},
{
"epoch": 8.482758620689655,
"grad_norm": 0.05882612615823746,
"learning_rate": 3.958342858392893e-05,
"loss": 0.0059,
"step": 554
},
{
"epoch": 8.49808429118774,
"grad_norm": 0.048001233488321304,
"learning_rate": 3.9258814752225284e-05,
"loss": 0.0042,
"step": 555
},
{
"epoch": 8.513409961685824,
"grad_norm": 0.06287714838981628,
"learning_rate": 3.893521203645618e-05,
"loss": 0.0053,
"step": 556
},
{
"epoch": 8.528735632183908,
"grad_norm": 0.047715529799461365,
"learning_rate": 3.8612625823409366e-05,
"loss": 0.0041,
"step": 557
},
{
"epoch": 8.544061302681992,
"grad_norm": 0.05052071437239647,
"learning_rate": 3.829106148295126e-05,
"loss": 0.0046,
"step": 558
},
{
"epoch": 8.559386973180077,
"grad_norm": 0.24502001702785492,
"learning_rate": 3.797052436793814e-05,
"loss": 0.0066,
"step": 559
},
{
"epoch": 8.574712643678161,
"grad_norm": 0.046199604868888855,
"learning_rate": 3.7651019814126654e-05,
"loss": 0.0045,
"step": 560
},
{
"epoch": 8.590038314176246,
"grad_norm": 0.049519941210746765,
"learning_rate": 3.7332553140085155e-05,
"loss": 0.0051,
"step": 561
},
{
"epoch": 8.590038314176246,
"eval_loss": 3.0260815620422363,
"eval_runtime": 10.5212,
"eval_samples_per_second": 9.505,
"eval_steps_per_second": 4.752,
"step": 561
},
{
"epoch": 8.60536398467433,
"grad_norm": 0.053081195801496506,
"learning_rate": 3.701512964710513e-05,
"loss": 0.0046,
"step": 562
},
{
"epoch": 8.620689655172415,
"grad_norm": 0.041760966181755066,
"learning_rate": 3.669875461911297e-05,
"loss": 0.0036,
"step": 563
},
{
"epoch": 8.636015325670499,
"grad_norm": 0.05594363436102867,
"learning_rate": 3.638343332258203e-05,
"loss": 0.0052,
"step": 564
},
{
"epoch": 8.651340996168582,
"grad_norm": 0.04741170257329941,
"learning_rate": 3.606917100644488e-05,
"loss": 0.0039,
"step": 565
},
{
"epoch": 8.666666666666666,
"grad_norm": 0.1333678662776947,
"learning_rate": 3.5755972902005987e-05,
"loss": 0.0048,
"step": 566
},
{
"epoch": 8.68199233716475,
"grad_norm": 0.060406796634197235,
"learning_rate": 3.544384422285477e-05,
"loss": 0.0056,
"step": 567
},
{
"epoch": 8.697318007662835,
"grad_norm": 0.04437935724854469,
"learning_rate": 3.513279016477844e-05,
"loss": 0.004,
"step": 568
},
{
"epoch": 8.71264367816092,
"grad_norm": 0.04306851327419281,
"learning_rate": 3.4822815905675954e-05,
"loss": 0.0043,
"step": 569
},
{
"epoch": 8.727969348659004,
"grad_norm": 0.049886684864759445,
"learning_rate": 3.45139266054715e-05,
"loss": 0.0054,
"step": 570
},
{
"epoch": 8.743295019157088,
"grad_norm": 0.039504941552877426,
"learning_rate": 3.4206127406028745e-05,
"loss": 0.0036,
"step": 571
},
{
"epoch": 8.758620689655173,
"grad_norm": 0.05250853672623634,
"learning_rate": 3.389942343106522e-05,
"loss": 0.0055,
"step": 572
},
{
"epoch": 8.773946360153257,
"grad_norm": 0.06467723846435547,
"learning_rate": 3.359381978606701e-05,
"loss": 0.0046,
"step": 573
},
{
"epoch": 8.789272030651341,
"grad_norm": 0.04862450435757637,
"learning_rate": 3.328932155820377e-05,
"loss": 0.0045,
"step": 574
},
{
"epoch": 8.804597701149426,
"grad_norm": 0.04701303318142891,
"learning_rate": 3.298593381624406e-05,
"loss": 0.0045,
"step": 575
},
{
"epoch": 8.81992337164751,
"grad_norm": 0.04837154597043991,
"learning_rate": 3.2683661610470963e-05,
"loss": 0.0039,
"step": 576
},
{
"epoch": 8.835249042145595,
"grad_norm": 0.04792990908026695,
"learning_rate": 3.238250997259808e-05,
"loss": 0.0041,
"step": 577
},
{
"epoch": 8.850574712643677,
"grad_norm": 0.04371470585465431,
"learning_rate": 3.208248391568553e-05,
"loss": 0.0044,
"step": 578
},
{
"epoch": 8.850574712643677,
"eval_loss": 3.0277657508850098,
"eval_runtime": 10.5822,
"eval_samples_per_second": 9.45,
"eval_steps_per_second": 4.725,
"step": 578
},
{
"epoch": 8.865900383141762,
"grad_norm": 0.048086583614349365,
"learning_rate": 3.178358843405684e-05,
"loss": 0.0043,
"step": 579
},
{
"epoch": 8.881226053639846,
"grad_norm": 0.0496319979429245,
"learning_rate": 3.1485828503215585e-05,
"loss": 0.0047,
"step": 580
},
{
"epoch": 8.89655172413793,
"grad_norm": 0.05418609455227852,
"learning_rate": 3.1189209079762607e-05,
"loss": 0.0045,
"step": 581
},
{
"epoch": 8.911877394636015,
"grad_norm": 0.046972278505563736,
"learning_rate": 3.089373510131354e-05,
"loss": 0.0046,
"step": 582
},
{
"epoch": 8.9272030651341,
"grad_norm": 0.043504588305950165,
"learning_rate": 3.0599411486416585e-05,
"loss": 0.0039,
"step": 583
},
{
"epoch": 8.942528735632184,
"grad_norm": 0.05620258301496506,
"learning_rate": 3.030624313447067e-05,
"loss": 0.0048,
"step": 584
},
{
"epoch": 8.957854406130268,
"grad_norm": 0.05009399726986885,
"learning_rate": 3.0014234925643837e-05,
"loss": 0.0049,
"step": 585
},
{
"epoch": 8.973180076628353,
"grad_norm": 0.04514235258102417,
"learning_rate": 2.9723391720792037e-05,
"loss": 0.0043,
"step": 586
},
{
"epoch": 8.988505747126437,
"grad_norm": 0.04640582203865051,
"learning_rate": 2.9433718361378325e-05,
"loss": 0.0049,
"step": 587
},
{
"epoch": 9.003831417624522,
"grad_norm": 0.05993952602148056,
"learning_rate": 2.9145219669391943e-05,
"loss": 0.0058,
"step": 588
},
{
"epoch": 9.015325670498084,
"grad_norm": 0.0431952066719532,
"learning_rate": 2.8857900447268528e-05,
"loss": 0.004,
"step": 589
},
{
"epoch": 9.030651340996169,
"grad_norm": 0.049201883375644684,
"learning_rate": 2.8571765477809643e-05,
"loss": 0.0044,
"step": 590
},
{
"epoch": 9.045977011494253,
"grad_norm": 0.04409557208418846,
"learning_rate": 2.828681952410366e-05,
"loss": 0.0045,
"step": 591
},
{
"epoch": 9.061302681992338,
"grad_norm": 0.03789050877094269,
"learning_rate": 2.80030673294461e-05,
"loss": 0.0042,
"step": 592
},
{
"epoch": 9.076628352490422,
"grad_norm": 0.04339877888560295,
"learning_rate": 2.7720513617260856e-05,
"loss": 0.0041,
"step": 593
},
{
"epoch": 9.091954022988507,
"grad_norm": 0.04477155953645706,
"learning_rate": 2.7439163091021525e-05,
"loss": 0.0045,
"step": 594
},
{
"epoch": 9.10727969348659,
"grad_norm": 0.0375545509159565,
"learning_rate": 2.71590204341731e-05,
"loss": 0.0035,
"step": 595
},
{
"epoch": 9.10727969348659,
"eval_loss": 3.0368361473083496,
"eval_runtime": 10.5214,
"eval_samples_per_second": 9.504,
"eval_steps_per_second": 4.752,
"step": 595
},
{
"epoch": 9.122605363984674,
"grad_norm": 0.05114487558603287,
"learning_rate": 2.6880090310054028e-05,
"loss": 0.004,
"step": 596
},
{
"epoch": 9.137931034482758,
"grad_norm": 0.03906643018126488,
"learning_rate": 2.6602377361818575e-05,
"loss": 0.0042,
"step": 597
},
{
"epoch": 9.153256704980842,
"grad_norm": 0.04675779864192009,
"learning_rate": 2.6325886212359498e-05,
"loss": 0.0046,
"step": 598
},
{
"epoch": 9.168582375478927,
"grad_norm": 0.04050876200199127,
"learning_rate": 2.605062146423124e-05,
"loss": 0.0041,
"step": 599
},
{
"epoch": 9.183908045977011,
"grad_norm": 0.040845900774002075,
"learning_rate": 2.5776587699573006e-05,
"loss": 0.0047,
"step": 600
},
{
"epoch": 9.199233716475096,
"grad_norm": 0.03970637172460556,
"learning_rate": 2.5503789480032868e-05,
"loss": 0.004,
"step": 601
},
{
"epoch": 9.21455938697318,
"grad_norm": 0.03865237534046173,
"learning_rate": 2.523223134669157e-05,
"loss": 0.0038,
"step": 602
},
{
"epoch": 9.229885057471265,
"grad_norm": 0.04276614263653755,
"learning_rate": 2.496191781998698e-05,
"loss": 0.0041,
"step": 603
},
{
"epoch": 9.245210727969349,
"grad_norm": 0.04257293418049812,
"learning_rate": 2.4692853399638917e-05,
"loss": 0.0039,
"step": 604
},
{
"epoch": 9.260536398467433,
"grad_norm": 0.039596524089574814,
"learning_rate": 2.4425042564574184e-05,
"loss": 0.0041,
"step": 605
},
{
"epoch": 9.275862068965518,
"grad_norm": 0.045230794697999954,
"learning_rate": 2.4158489772852034e-05,
"loss": 0.0041,
"step": 606
},
{
"epoch": 9.291187739463602,
"grad_norm": 0.04807334393262863,
"learning_rate": 2.3893199461589945e-05,
"loss": 0.0044,
"step": 607
},
{
"epoch": 9.306513409961687,
"grad_norm": 0.04473911598324776,
"learning_rate": 2.3629176046889757e-05,
"loss": 0.0044,
"step": 608
},
{
"epoch": 9.32183908045977,
"grad_norm": 0.042184460908174515,
"learning_rate": 2.336642392376427e-05,
"loss": 0.0048,
"step": 609
},
{
"epoch": 9.337164750957854,
"grad_norm": 0.04541192203760147,
"learning_rate": 2.3104947466063787e-05,
"loss": 0.0038,
"step": 610
},
{
"epoch": 9.352490421455938,
"grad_norm": 0.035622596740722656,
"learning_rate": 2.284475102640371e-05,
"loss": 0.0037,
"step": 611
},
{
"epoch": 9.367816091954023,
"grad_norm": 0.036873120814561844,
"learning_rate": 2.2585838936091754e-05,
"loss": 0.0038,
"step": 612
},
{
"epoch": 9.367816091954023,
"eval_loss": 3.0577399730682373,
"eval_runtime": 10.637,
"eval_samples_per_second": 9.401,
"eval_steps_per_second": 4.701,
"step": 612
},
{
"epoch": 9.383141762452107,
"grad_norm": 0.04417318478226662,
"learning_rate": 2.2328215505056004e-05,
"loss": 0.0042,
"step": 613
},
{
"epoch": 9.398467432950191,
"grad_norm": 0.04099538177251816,
"learning_rate": 2.207188502177313e-05,
"loss": 0.0041,
"step": 614
},
{
"epoch": 9.413793103448276,
"grad_norm": 0.04924609512090683,
"learning_rate": 2.181685175319702e-05,
"loss": 0.0056,
"step": 615
},
{
"epoch": 9.42911877394636,
"grad_norm": 0.04036853834986687,
"learning_rate": 2.1563119944687737e-05,
"loss": 0.0039,
"step": 616
},
{
"epoch": 9.444444444444445,
"grad_norm": 0.04601878300309181,
"learning_rate": 2.1310693819940842e-05,
"loss": 0.0046,
"step": 617
},
{
"epoch": 9.459770114942529,
"grad_norm": 0.044013988226652145,
"learning_rate": 2.1059577580917067e-05,
"loss": 0.0046,
"step": 618
},
{
"epoch": 9.475095785440613,
"grad_norm": 0.03659258037805557,
"learning_rate": 2.0809775407772503e-05,
"loss": 0.0035,
"step": 619
},
{
"epoch": 9.490421455938698,
"grad_norm": 0.04221741855144501,
"learning_rate": 2.0561291458788733e-05,
"loss": 0.0037,
"step": 620
},
{
"epoch": 9.505747126436782,
"grad_norm": 0.043971508741378784,
"learning_rate": 2.0314129870303977e-05,
"loss": 0.0045,
"step": 621
},
{
"epoch": 9.521072796934867,
"grad_norm": 0.03597636520862579,
"learning_rate": 2.0068294756643845e-05,
"loss": 0.0032,
"step": 622
},
{
"epoch": 9.53639846743295,
"grad_norm": 0.04181092977523804,
"learning_rate": 1.9823790210053252e-05,
"loss": 0.0042,
"step": 623
},
{
"epoch": 9.551724137931034,
"grad_norm": 0.04154861345887184,
"learning_rate": 1.958062030062795e-05,
"loss": 0.0036,
"step": 624
},
{
"epoch": 9.567049808429118,
"grad_norm": 0.04263344407081604,
"learning_rate": 1.9338789076247e-05,
"loss": 0.0039,
"step": 625
},
{
"epoch": 9.582375478927203,
"grad_norm": 0.04241356998682022,
"learning_rate": 1.9098300562505266e-05,
"loss": 0.0043,
"step": 626
},
{
"epoch": 9.597701149425287,
"grad_norm": 0.04476002976298332,
"learning_rate": 1.8859158762646466e-05,
"loss": 0.0043,
"step": 627
},
{
"epoch": 9.613026819923371,
"grad_norm": 0.04713902622461319,
"learning_rate": 1.8621367657496502e-05,
"loss": 0.004,
"step": 628
},
{
"epoch": 9.628352490421456,
"grad_norm": 0.04231436178088188,
"learning_rate": 1.8384931205397303e-05,
"loss": 0.004,
"step": 629
},
{
"epoch": 9.628352490421456,
"eval_loss": 3.070976495742798,
"eval_runtime": 10.581,
"eval_samples_per_second": 9.451,
"eval_steps_per_second": 4.725,
"step": 629
},
{
"epoch": 9.64367816091954,
"grad_norm": 0.03969426453113556,
"learning_rate": 1.8149853342140645e-05,
"loss": 0.0038,
"step": 630
},
{
"epoch": 9.659003831417625,
"grad_norm": 0.04556899145245552,
"learning_rate": 1.7916137980903046e-05,
"loss": 0.0039,
"step": 631
},
{
"epoch": 9.67432950191571,
"grad_norm": 0.04505952075123787,
"learning_rate": 1.7683789012180196e-05,
"loss": 0.0042,
"step": 632
},
{
"epoch": 9.689655172413794,
"grad_norm": 0.0395471565425396,
"learning_rate": 1.74528103037226e-05,
"loss": 0.0037,
"step": 633
},
{
"epoch": 9.704980842911878,
"grad_norm": 0.0387556366622448,
"learning_rate": 1.722320570047089e-05,
"loss": 0.0041,
"step": 634
},
{
"epoch": 9.720306513409962,
"grad_norm": 0.04286782816052437,
"learning_rate": 1.6994979024491942e-05,
"loss": 0.004,
"step": 635
},
{
"epoch": 9.735632183908045,
"grad_norm": 0.043354280292987823,
"learning_rate": 1.6768134074915276e-05,
"loss": 0.0038,
"step": 636
},
{
"epoch": 9.75095785440613,
"grad_norm": 0.04409995302557945,
"learning_rate": 1.6542674627869737e-05,
"loss": 0.0043,
"step": 637
},
{
"epoch": 9.766283524904214,
"grad_norm": 0.05120624974370003,
"learning_rate": 1.6318604436420737e-05,
"loss": 0.0041,
"step": 638
},
{
"epoch": 9.781609195402298,
"grad_norm": 0.04400256276130676,
"learning_rate": 1.6095927230507667e-05,
"loss": 0.0043,
"step": 639
},
{
"epoch": 9.796934865900383,
"grad_norm": 0.03750475123524666,
"learning_rate": 1.587464671688187e-05,
"loss": 0.0035,
"step": 640
},
{
"epoch": 9.812260536398467,
"grad_norm": 0.03617061302065849,
"learning_rate": 1.5654766579045033e-05,
"loss": 0.0035,
"step": 641
},
{
"epoch": 9.827586206896552,
"grad_norm": 0.04300917312502861,
"learning_rate": 1.5436290477187587e-05,
"loss": 0.0038,
"step": 642
},
{
"epoch": 9.842911877394636,
"grad_norm": 0.043261539191007614,
"learning_rate": 1.5219222048128124e-05,
"loss": 0.0042,
"step": 643
},
{
"epoch": 9.85823754789272,
"grad_norm": 0.05182840675115585,
"learning_rate": 1.500356490525261e-05,
"loss": 0.0051,
"step": 644
},
{
"epoch": 9.873563218390805,
"grad_norm": 0.035250503569841385,
"learning_rate": 1.4789322638454351e-05,
"loss": 0.0035,
"step": 645
},
{
"epoch": 9.88888888888889,
"grad_norm": 0.043576598167419434,
"learning_rate": 1.4576498814074168e-05,
"loss": 0.0041,
"step": 646
},
{
"epoch": 9.88888888888889,
"eval_loss": 3.0796117782592773,
"eval_runtime": 10.5517,
"eval_samples_per_second": 9.477,
"eval_steps_per_second": 4.739,
"step": 646
},
{
"epoch": 9.904214559386974,
"grad_norm": 0.04328146204352379,
"learning_rate": 1.4365096974841108e-05,
"loss": 0.0038,
"step": 647
},
{
"epoch": 9.919540229885058,
"grad_norm": 0.04611522704362869,
"learning_rate": 1.415512063981339e-05,
"loss": 0.0044,
"step": 648
},
{
"epoch": 9.934865900383143,
"grad_norm": 0.047622717916965485,
"learning_rate": 1.3946573304319899e-05,
"loss": 0.0041,
"step": 649
},
{
"epoch": 9.950191570881227,
"grad_norm": 0.04016837850213051,
"learning_rate": 1.373945843990192e-05,
"loss": 0.0042,
"step": 650
}
],
"logging_steps": 1,
"max_steps": 780,
"num_input_tokens_seen": 0,
"num_train_epochs": 12,
"save_steps": 65,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.166280912599777e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}