{
"best_metric": 0.9611303806304932,
"best_model_checkpoint": "miner_id_24/checkpoint-200",
"epoch": 0.10722962218312809,
"eval_steps": 25,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0005361481109156405,
"grad_norm": 2.792823553085327,
"learning_rate": 2.9999999999999997e-05,
"loss": 33.294,
"step": 1
},
{
"epoch": 0.0005361481109156405,
"eval_loss": 1.278591275215149,
"eval_runtime": 6.8803,
"eval_samples_per_second": 7.267,
"eval_steps_per_second": 7.267,
"step": 1
},
{
"epoch": 0.001072296221831281,
"grad_norm": 2.6599700450897217,
"learning_rate": 5.9999999999999995e-05,
"loss": 35.668,
"step": 2
},
{
"epoch": 0.0016084443327469213,
"grad_norm": 2.681920051574707,
"learning_rate": 8.999999999999999e-05,
"loss": 35.5497,
"step": 3
},
{
"epoch": 0.002144592443662562,
"grad_norm": 2.969249725341797,
"learning_rate": 0.00011999999999999999,
"loss": 35.8928,
"step": 4
},
{
"epoch": 0.0026807405545782024,
"grad_norm": 3.7171504497528076,
"learning_rate": 0.00015,
"loss": 37.1297,
"step": 5
},
{
"epoch": 0.0032168886654938425,
"grad_norm": 4.583807468414307,
"learning_rate": 0.00017999999999999998,
"loss": 38.0923,
"step": 6
},
{
"epoch": 0.003753036776409483,
"grad_norm": 5.622003078460693,
"learning_rate": 0.00020999999999999998,
"loss": 37.8901,
"step": 7
},
{
"epoch": 0.004289184887325124,
"grad_norm": 4.869115352630615,
"learning_rate": 0.00023999999999999998,
"loss": 37.1484,
"step": 8
},
{
"epoch": 0.004825332998240764,
"grad_norm": 4.734254360198975,
"learning_rate": 0.00027,
"loss": 36.4778,
"step": 9
},
{
"epoch": 0.005361481109156405,
"grad_norm": 11.934779167175293,
"learning_rate": 0.0003,
"loss": 36.416,
"step": 10
},
{
"epoch": 0.0058976292200720445,
"grad_norm": 10.776172637939453,
"learning_rate": 0.0002999794957488703,
"loss": 34.7884,
"step": 11
},
{
"epoch": 0.006433777330987685,
"grad_norm": 9.7468900680542,
"learning_rate": 0.0002999179886011389,
"loss": 36.1845,
"step": 12
},
{
"epoch": 0.006969925441903326,
"grad_norm": 6.64066219329834,
"learning_rate": 0.0002998154953722457,
"loss": 32.117,
"step": 13
},
{
"epoch": 0.007506073552818966,
"grad_norm": 5.012538909912109,
"learning_rate": 0.00029967204408281613,
"loss": 33.6469,
"step": 14
},
{
"epoch": 0.008042221663734606,
"grad_norm": 3.7766289710998535,
"learning_rate": 0.00029948767395100045,
"loss": 34.7003,
"step": 15
},
{
"epoch": 0.008578369774650247,
"grad_norm": 3.6432321071624756,
"learning_rate": 0.0002992624353817517,
"loss": 33.0312,
"step": 16
},
{
"epoch": 0.009114517885565887,
"grad_norm": 4.158324241638184,
"learning_rate": 0.0002989963899530457,
"loss": 32.2584,
"step": 17
},
{
"epoch": 0.009650665996481528,
"grad_norm": 3.888638973236084,
"learning_rate": 0.00029868961039904624,
"loss": 34.2098,
"step": 18
},
{
"epoch": 0.010186814107397168,
"grad_norm": 4.01094388961792,
"learning_rate": 0.00029834218059022024,
"loss": 33.4237,
"step": 19
},
{
"epoch": 0.01072296221831281,
"grad_norm": 4.373755931854248,
"learning_rate": 0.00029795419551040833,
"loss": 34.2133,
"step": 20
},
{
"epoch": 0.01125911032922845,
"grad_norm": 4.608726978302002,
"learning_rate": 0.00029752576123085736,
"loss": 33.9538,
"step": 21
},
{
"epoch": 0.011795258440144089,
"grad_norm": 5.16507625579834,
"learning_rate": 0.0002970569948812214,
"loss": 33.7207,
"step": 22
},
{
"epoch": 0.01233140655105973,
"grad_norm": 5.377164840698242,
"learning_rate": 0.0002965480246175399,
"loss": 31.0621,
"step": 23
},
{
"epoch": 0.01286755466197537,
"grad_norm": 4.942190170288086,
"learning_rate": 0.0002959989895872009,
"loss": 32.3631,
"step": 24
},
{
"epoch": 0.013403702772891012,
"grad_norm": 5.400623321533203,
"learning_rate": 0.0002954100398908995,
"loss": 30.7856,
"step": 25
},
{
"epoch": 0.013403702772891012,
"eval_loss": 1.07772958278656,
"eval_runtime": 7.0053,
"eval_samples_per_second": 7.137,
"eval_steps_per_second": 7.137,
"step": 25
},
{
"epoch": 0.013939850883806651,
"grad_norm": 4.848288536071777,
"learning_rate": 0.0002947813365416023,
"loss": 34.2735,
"step": 26
},
{
"epoch": 0.014475998994722293,
"grad_norm": 4.633642196655273,
"learning_rate": 0.0002941130514205272,
"loss": 33.778,
"step": 27
},
{
"epoch": 0.015012147105637932,
"grad_norm": 4.757195472717285,
"learning_rate": 0.0002934053672301536,
"loss": 33.5148,
"step": 28
},
{
"epoch": 0.015548295216553572,
"grad_norm": 4.664140224456787,
"learning_rate": 0.00029265847744427303,
"loss": 31.7672,
"step": 29
},
{
"epoch": 0.016084443327469212,
"grad_norm": 4.638533592224121,
"learning_rate": 0.00029187258625509513,
"loss": 32.445,
"step": 30
},
{
"epoch": 0.016620591438384853,
"grad_norm": 5.77179479598999,
"learning_rate": 0.00029104790851742417,
"loss": 33.1689,
"step": 31
},
{
"epoch": 0.017156739549300495,
"grad_norm": 5.126926422119141,
"learning_rate": 0.0002901846696899191,
"loss": 33.1114,
"step": 32
},
{
"epoch": 0.017692887660216136,
"grad_norm": 5.307356357574463,
"learning_rate": 0.00028928310577345606,
"loss": 32.6517,
"step": 33
},
{
"epoch": 0.018229035771131774,
"grad_norm": 6.363701820373535,
"learning_rate": 0.0002883434632466077,
"loss": 33.6168,
"step": 34
},
{
"epoch": 0.018765183882047416,
"grad_norm": 6.03190279006958,
"learning_rate": 0.00028736599899825856,
"loss": 32.2326,
"step": 35
},
{
"epoch": 0.019301331992963057,
"grad_norm": 6.035442352294922,
"learning_rate": 0.00028635098025737434,
"loss": 32.6694,
"step": 36
},
{
"epoch": 0.019837480103878695,
"grad_norm": 5.9874467849731445,
"learning_rate": 0.00028529868451994384,
"loss": 31.2628,
"step": 37
},
{
"epoch": 0.020373628214794336,
"grad_norm": 6.513183116912842,
"learning_rate": 0.0002842093994731145,
"loss": 34.0463,
"step": 38
},
{
"epoch": 0.020909776325709978,
"grad_norm": 6.865087032318115,
"learning_rate": 0.00028308342291654174,
"loss": 36.2589,
"step": 39
},
{
"epoch": 0.02144592443662562,
"grad_norm": 6.68407678604126,
"learning_rate": 0.00028192106268097334,
"loss": 33.967,
"step": 40
},
{
"epoch": 0.021982072547541257,
"grad_norm": 7.59697961807251,
"learning_rate": 0.00028072263654409154,
"loss": 33.708,
"step": 41
},
{
"epoch": 0.0225182206584569,
"grad_norm": 7.609118461608887,
"learning_rate": 0.0002794884721436361,
"loss": 34.0083,
"step": 42
},
{
"epoch": 0.02305436876937254,
"grad_norm": 8.011795997619629,
"learning_rate": 0.00027821890688783083,
"loss": 35.5668,
"step": 43
},
{
"epoch": 0.023590516880288178,
"grad_norm": 9.242958068847656,
"learning_rate": 0.0002769142878631403,
"loss": 35.7281,
"step": 44
},
{
"epoch": 0.02412666499120382,
"grad_norm": 9.058021545410156,
"learning_rate": 0.00027557497173937923,
"loss": 38.0303,
"step": 45
},
{
"epoch": 0.02466281310211946,
"grad_norm": 12.166667938232422,
"learning_rate": 0.000274201324672203,
"loss": 38.1286,
"step": 46
},
{
"epoch": 0.025198961213035102,
"grad_norm": 10.742341041564941,
"learning_rate": 0.00027279372220300385,
"loss": 33.2518,
"step": 47
},
{
"epoch": 0.02573510932395074,
"grad_norm": 16.458505630493164,
"learning_rate": 0.0002713525491562421,
"loss": 38.5736,
"step": 48
},
{
"epoch": 0.026271257434866382,
"grad_norm": 28.211767196655273,
"learning_rate": 0.00026987819953423867,
"loss": 37.296,
"step": 49
},
{
"epoch": 0.026807405545782023,
"grad_norm": 51.30426025390625,
"learning_rate": 0.00026837107640945905,
"loss": 33.5111,
"step": 50
},
{
"epoch": 0.026807405545782023,
"eval_loss": 1.0827397108078003,
"eval_runtime": 7.0038,
"eval_samples_per_second": 7.139,
"eval_steps_per_second": 7.139,
"step": 50
},
{
"epoch": 0.02734355365669766,
"grad_norm": 16.112340927124023,
"learning_rate": 0.0002668315918143169,
"loss": 31.1619,
"step": 51
},
{
"epoch": 0.027879701767613303,
"grad_norm": 12.760345458984375,
"learning_rate": 0.00026526016662852886,
"loss": 29.4258,
"step": 52
},
{
"epoch": 0.028415849878528944,
"grad_norm": 11.710972785949707,
"learning_rate": 0.00026365723046405023,
"loss": 30.2184,
"step": 53
},
{
"epoch": 0.028951997989444585,
"grad_norm": 7.269360065460205,
"learning_rate": 0.0002620232215476231,
"loss": 31.87,
"step": 54
},
{
"epoch": 0.029488146100360223,
"grad_norm": 5.4253034591674805,
"learning_rate": 0.0002603585866009697,
"loss": 33.2204,
"step": 55
},
{
"epoch": 0.030024294211275865,
"grad_norm": 4.678703784942627,
"learning_rate": 0.00025866378071866334,
"loss": 29.7763,
"step": 56
},
{
"epoch": 0.030560442322191506,
"grad_norm": 4.662464618682861,
"learning_rate": 0.00025693926724370956,
"loss": 30.3025,
"step": 57
},
{
"epoch": 0.031096590433107144,
"grad_norm": 4.8608198165893555,
"learning_rate": 0.00025518551764087326,
"loss": 30.4402,
"step": 58
},
{
"epoch": 0.031632738544022786,
"grad_norm": 4.3967366218566895,
"learning_rate": 0.00025340301136778483,
"loss": 30.7072,
"step": 59
},
{
"epoch": 0.032168886654938424,
"grad_norm": 4.5696892738342285,
"learning_rate": 0.00025159223574386114,
"loss": 31.4662,
"step": 60
},
{
"epoch": 0.03270503476585407,
"grad_norm": 4.4062323570251465,
"learning_rate": 0.0002497536858170772,
"loss": 32.4237,
"step": 61
},
{
"epoch": 0.033241182876769707,
"grad_norm": 4.4083757400512695,
"learning_rate": 0.00024788786422862526,
"loss": 31.296,
"step": 62
},
{
"epoch": 0.03377733098768535,
"grad_norm": 4.396991729736328,
"learning_rate": 0.00024599528107549745,
"loss": 30.3045,
"step": 63
},
{
"epoch": 0.03431347909860099,
"grad_norm": 4.459329605102539,
"learning_rate": 0.00024407645377103054,
"loss": 32.7868,
"step": 64
},
{
"epoch": 0.03484962720951663,
"grad_norm": 4.469038486480713,
"learning_rate": 0.00024213190690345018,
"loss": 32.613,
"step": 65
},
{
"epoch": 0.03538577532043227,
"grad_norm": 4.406140327453613,
"learning_rate": 0.00024016217209245374,
"loss": 32.0919,
"step": 66
},
{
"epoch": 0.03592192343134791,
"grad_norm": 4.390615463256836,
"learning_rate": 0.00023816778784387094,
"loss": 30.3288,
"step": 67
},
{
"epoch": 0.03645807154226355,
"grad_norm": 4.351038932800293,
"learning_rate": 0.0002361492994024415,
"loss": 31.0161,
"step": 68
},
{
"epoch": 0.03699421965317919,
"grad_norm": 4.775008201599121,
"learning_rate": 0.0002341072586027509,
"loss": 31.068,
"step": 69
},
{
"epoch": 0.03753036776409483,
"grad_norm": 5.006083011627197,
"learning_rate": 0.00023204222371836405,
"loss": 32.1227,
"step": 70
},
{
"epoch": 0.03806651587501047,
"grad_norm": 4.528618335723877,
"learning_rate": 0.00022995475930919905,
"loss": 30.1112,
"step": 71
},
{
"epoch": 0.038602663985926114,
"grad_norm": 4.488797664642334,
"learning_rate": 0.00022784543606718227,
"loss": 31.3176,
"step": 72
},
{
"epoch": 0.03913881209684175,
"grad_norm": 5.192445755004883,
"learning_rate": 0.00022571483066022657,
"loss": 30.9757,
"step": 73
},
{
"epoch": 0.03967496020775739,
"grad_norm": 5.133602619171143,
"learning_rate": 0.0002235635255745762,
"loss": 31.7632,
"step": 74
},
{
"epoch": 0.040211108318673035,
"grad_norm": 4.8014373779296875,
"learning_rate": 0.00022139210895556104,
"loss": 31.0323,
"step": 75
},
{
"epoch": 0.040211108318673035,
"eval_loss": 1.000903606414795,
"eval_runtime": 7.0115,
"eval_samples_per_second": 7.131,
"eval_steps_per_second": 7.131,
"step": 75
},
{
"epoch": 0.04074725642958867,
"grad_norm": 4.980635166168213,
"learning_rate": 0.00021920117444680317,
"loss": 31.2209,
"step": 76
},
{
"epoch": 0.04128340454050432,
"grad_norm": 4.976080417633057,
"learning_rate": 0.00021699132102792097,
"loss": 31.3572,
"step": 77
},
{
"epoch": 0.041819552651419956,
"grad_norm": 5.553629398345947,
"learning_rate": 0.0002147631528507739,
"loss": 33.044,
"step": 78
},
{
"epoch": 0.042355700762335594,
"grad_norm": 5.107100963592529,
"learning_rate": 0.00021251727907429355,
"loss": 30.9737,
"step": 79
},
{
"epoch": 0.04289184887325124,
"grad_norm": 6.064231872558594,
"learning_rate": 0.0002102543136979454,
"loss": 30.7711,
"step": 80
},
{
"epoch": 0.043427996984166876,
"grad_norm": 5.368616104125977,
"learning_rate": 0.0002079748753938678,
"loss": 32.0713,
"step": 81
},
{
"epoch": 0.043964145095082514,
"grad_norm": 5.044811248779297,
"learning_rate": 0.0002056795873377331,
"loss": 32.1515,
"step": 82
},
{
"epoch": 0.04450029320599816,
"grad_norm": 5.057069301605225,
"learning_rate": 0.00020336907703837748,
"loss": 32.276,
"step": 83
},
{
"epoch": 0.0450364413169138,
"grad_norm": 5.206183910369873,
"learning_rate": 0.00020104397616624645,
"loss": 33.1993,
"step": 84
},
{
"epoch": 0.045572589427829435,
"grad_norm": 5.805713176727295,
"learning_rate": 0.00019870492038070252,
"loss": 29.9394,
"step": 85
},
{
"epoch": 0.04610873753874508,
"grad_norm": 5.780794620513916,
"learning_rate": 0.0001963525491562421,
"loss": 33.6122,
"step": 86
},
{
"epoch": 0.04664488564966072,
"grad_norm": 5.779490947723389,
"learning_rate": 0.0001939875056076697,
"loss": 32.9998,
"step": 87
},
{
"epoch": 0.047181033760576356,
"grad_norm": 5.634477615356445,
"learning_rate": 0.00019161043631427666,
"loss": 32.954,
"step": 88
},
{
"epoch": 0.047717181871492,
"grad_norm": 5.964652061462402,
"learning_rate": 0.00018922199114307294,
"loss": 33.3383,
"step": 89
},
{
"epoch": 0.04825332998240764,
"grad_norm": 6.110783100128174,
"learning_rate": 0.00018682282307111987,
"loss": 30.9541,
"step": 90
},
{
"epoch": 0.048789478093323284,
"grad_norm": 6.400683403015137,
"learning_rate": 0.00018441358800701273,
"loss": 34.3879,
"step": 91
},
{
"epoch": 0.04932562620423892,
"grad_norm": 8.689764022827148,
"learning_rate": 0.00018199494461156203,
"loss": 31.8942,
"step": 92
},
{
"epoch": 0.04986177431515456,
"grad_norm": 7.461714744567871,
"learning_rate": 0.000179567554117722,
"loss": 30.9819,
"step": 93
},
{
"epoch": 0.050397922426070205,
"grad_norm": 7.799310684204102,
"learning_rate": 0.00017713208014981648,
"loss": 33.8376,
"step": 94
},
{
"epoch": 0.05093407053698584,
"grad_norm": 9.082171440124512,
"learning_rate": 0.00017468918854211007,
"loss": 34.8842,
"step": 95
},
{
"epoch": 0.05147021864790148,
"grad_norm": 9.59943675994873,
"learning_rate": 0.00017223954715677627,
"loss": 35.6568,
"step": 96
},
{
"epoch": 0.052006366758817126,
"grad_norm": 10.860755920410156,
"learning_rate": 0.00016978382570131034,
"loss": 34.7432,
"step": 97
},
{
"epoch": 0.052542514869732763,
"grad_norm": 18.212682723999023,
"learning_rate": 0.00016732269554543794,
"loss": 32.6667,
"step": 98
},
{
"epoch": 0.0530786629806484,
"grad_norm": 19.332014083862305,
"learning_rate": 0.00016485682953756942,
"loss": 30.8138,
"step": 99
},
{
"epoch": 0.053614811091564046,
"grad_norm": 65.70189666748047,
"learning_rate": 0.00016238690182084986,
"loss": 32.2653,
"step": 100
},
{
"epoch": 0.053614811091564046,
"eval_loss": 1.016136646270752,
"eval_runtime": 7.0094,
"eval_samples_per_second": 7.133,
"eval_steps_per_second": 7.133,
"step": 100
},
{
"epoch": 0.054150959202479684,
"grad_norm": 9.373055458068848,
"learning_rate": 0.0001599135876488549,
"loss": 28.2683,
"step": 101
},
{
"epoch": 0.05468710731339532,
"grad_norm": 8.830703735351562,
"learning_rate": 0.00015743756320098332,
"loss": 30.2945,
"step": 102
},
{
"epoch": 0.05522325542431097,
"grad_norm": 7.033578395843506,
"learning_rate": 0.0001549595053975962,
"loss": 30.4363,
"step": 103
},
{
"epoch": 0.055759403535226605,
"grad_norm": 6.227023601531982,
"learning_rate": 0.00015248009171495378,
"loss": 29.8592,
"step": 104
},
{
"epoch": 0.05629555164614225,
"grad_norm": 5.5991339683532715,
"learning_rate": 0.00015,
"loss": 28.823,
"step": 105
},
{
"epoch": 0.05683169975705789,
"grad_norm": 4.878828525543213,
"learning_rate": 0.00014751990828504622,
"loss": 30.4694,
"step": 106
},
{
"epoch": 0.057367847867973526,
"grad_norm": 4.614971160888672,
"learning_rate": 0.00014504049460240375,
"loss": 28.4186,
"step": 107
},
{
"epoch": 0.05790399597888917,
"grad_norm": 4.517513275146484,
"learning_rate": 0.00014256243679901663,
"loss": 29.0258,
"step": 108
},
{
"epoch": 0.05844014408980481,
"grad_norm": 8.218019485473633,
"learning_rate": 0.00014008641235114508,
"loss": 28.0963,
"step": 109
},
{
"epoch": 0.05897629220072045,
"grad_norm": 4.709647178649902,
"learning_rate": 0.00013761309817915014,
"loss": 30.3963,
"step": 110
},
{
"epoch": 0.05951244031163609,
"grad_norm": 4.416621685028076,
"learning_rate": 0.00013514317046243058,
"loss": 29.8401,
"step": 111
},
{
"epoch": 0.06004858842255173,
"grad_norm": 4.569868564605713,
"learning_rate": 0.00013267730445456208,
"loss": 31.1935,
"step": 112
},
{
"epoch": 0.06058473653346737,
"grad_norm": 4.648908615112305,
"learning_rate": 0.00013021617429868963,
"loss": 30.0785,
"step": 113
},
{
"epoch": 0.06112088464438301,
"grad_norm": 4.735060691833496,
"learning_rate": 0.00012776045284322368,
"loss": 30.8758,
"step": 114
},
{
"epoch": 0.06165703275529865,
"grad_norm": 4.528501510620117,
"learning_rate": 0.00012531081145788987,
"loss": 30.1864,
"step": 115
},
{
"epoch": 0.06219318086621429,
"grad_norm": 4.81594181060791,
"learning_rate": 0.00012286791985018355,
"loss": 31.0008,
"step": 116
},
{
"epoch": 0.06272932897712993,
"grad_norm": 4.743056774139404,
"learning_rate": 0.00012043244588227796,
"loss": 30.7498,
"step": 117
},
{
"epoch": 0.06326547708804557,
"grad_norm": 4.997926712036133,
"learning_rate": 0.00011800505538843798,
"loss": 32.3375,
"step": 118
},
{
"epoch": 0.06380162519896121,
"grad_norm": 4.894601345062256,
"learning_rate": 0.00011558641199298727,
"loss": 30.8206,
"step": 119
},
{
"epoch": 0.06433777330987685,
"grad_norm": 4.590635776519775,
"learning_rate": 0.00011317717692888012,
"loss": 29.6827,
"step": 120
},
{
"epoch": 0.0648739214207925,
"grad_norm": 4.70611572265625,
"learning_rate": 0.00011077800885692702,
"loss": 29.7052,
"step": 121
},
{
"epoch": 0.06541006953170814,
"grad_norm": 4.513106822967529,
"learning_rate": 0.00010838956368572334,
"loss": 29.9822,
"step": 122
},
{
"epoch": 0.06594621764262378,
"grad_norm": 4.913265705108643,
"learning_rate": 0.0001060124943923303,
"loss": 30.3049,
"step": 123
},
{
"epoch": 0.06648236575353941,
"grad_norm": 4.700603485107422,
"learning_rate": 0.0001036474508437579,
"loss": 29.5673,
"step": 124
},
{
"epoch": 0.06701851386445505,
"grad_norm": 4.874575138092041,
"learning_rate": 0.00010129507961929748,
"loss": 30.9478,
"step": 125
},
{
"epoch": 0.06701851386445505,
"eval_loss": 0.9712469577789307,
"eval_runtime": 7.0116,
"eval_samples_per_second": 7.131,
"eval_steps_per_second": 7.131,
"step": 125
},
{
"epoch": 0.0675546619753707,
"grad_norm": 5.066522598266602,
"learning_rate": 9.895602383375353e-05,
"loss": 30.832,
"step": 126
},
{
"epoch": 0.06809081008628634,
"grad_norm": 5.205512046813965,
"learning_rate": 9.663092296162251e-05,
"loss": 30.9786,
"step": 127
},
{
"epoch": 0.06862695819720198,
"grad_norm": 4.869602203369141,
"learning_rate": 9.432041266226686e-05,
"loss": 29.142,
"step": 128
},
{
"epoch": 0.06916310630811762,
"grad_norm": 4.945410251617432,
"learning_rate": 9.202512460613219e-05,
"loss": 30.8244,
"step": 129
},
{
"epoch": 0.06969925441903325,
"grad_norm": 5.29721736907959,
"learning_rate": 8.97456863020546e-05,
"loss": 32.1392,
"step": 130
},
{
"epoch": 0.07023540252994889,
"grad_norm": 6.640650749206543,
"learning_rate": 8.748272092570646e-05,
"loss": 32.1368,
"step": 131
},
{
"epoch": 0.07077155064086454,
"grad_norm": 5.233391284942627,
"learning_rate": 8.523684714922608e-05,
"loss": 31.8081,
"step": 132
},
{
"epoch": 0.07130769875178018,
"grad_norm": 5.334178924560547,
"learning_rate": 8.300867897207903e-05,
"loss": 32.2981,
"step": 133
},
{
"epoch": 0.07184384686269582,
"grad_norm": 5.664828777313232,
"learning_rate": 8.079882555319684e-05,
"loss": 30.6256,
"step": 134
},
{
"epoch": 0.07237999497361146,
"grad_norm": 5.590938091278076,
"learning_rate": 7.860789104443896e-05,
"loss": 31.1287,
"step": 135
},
{
"epoch": 0.0729161430845271,
"grad_norm": 5.823013782501221,
"learning_rate": 7.643647442542382e-05,
"loss": 31.7947,
"step": 136
},
{
"epoch": 0.07345229119544273,
"grad_norm": 5.71795129776001,
"learning_rate": 7.428516933977347e-05,
"loss": 31.4032,
"step": 137
},
{
"epoch": 0.07398843930635839,
"grad_norm": 6.240416049957275,
"learning_rate": 7.215456393281776e-05,
"loss": 31.7812,
"step": 138
},
{
"epoch": 0.07452458741727402,
"grad_norm": 5.8073410987854,
"learning_rate": 7.004524069080096e-05,
"loss": 31.8186,
"step": 139
},
{
"epoch": 0.07506073552818966,
"grad_norm": 6.44705057144165,
"learning_rate": 6.795777628163599e-05,
"loss": 33.4398,
"step": 140
},
{
"epoch": 0.0755968836391053,
"grad_norm": 6.330421447753906,
"learning_rate": 6.58927413972491e-05,
"loss": 31.8283,
"step": 141
},
{
"epoch": 0.07613303175002094,
"grad_norm": 6.591586589813232,
"learning_rate": 6.385070059755846e-05,
"loss": 33.1672,
"step": 142
},
{
"epoch": 0.07666917986093659,
"grad_norm": 8.423770904541016,
"learning_rate": 6.183221215612904e-05,
"loss": 33.6284,
"step": 143
},
{
"epoch": 0.07720532797185223,
"grad_norm": 9.330811500549316,
"learning_rate": 5.983782790754623e-05,
"loss": 35.6624,
"step": 144
},
{
"epoch": 0.07774147608276787,
"grad_norm": 9.339548110961914,
"learning_rate": 5.786809309654982e-05,
"loss": 34.5517,
"step": 145
},
{
"epoch": 0.0782776241936835,
"grad_norm": 12.109213829040527,
"learning_rate": 5.592354622896944e-05,
"loss": 35.4903,
"step": 146
},
{
"epoch": 0.07881377230459914,
"grad_norm": 11.704776763916016,
"learning_rate": 5.40047189245025e-05,
"loss": 36.6097,
"step": 147
},
{
"epoch": 0.07934992041551478,
"grad_norm": 16.527082443237305,
"learning_rate": 5.211213577137469e-05,
"loss": 33.7769,
"step": 148
},
{
"epoch": 0.07988606852643043,
"grad_norm": 16.686304092407227,
"learning_rate": 5.024631418292274e-05,
"loss": 31.3678,
"step": 149
},
{
"epoch": 0.08042221663734607,
"grad_norm": 35.215946197509766,
"learning_rate": 4.840776425613886e-05,
"loss": 29.0595,
"step": 150
},
{
"epoch": 0.08042221663734607,
"eval_loss": 0.9701613783836365,
"eval_runtime": 7.002,
"eval_samples_per_second": 7.141,
"eval_steps_per_second": 7.141,
"step": 150
},
{
"epoch": 0.08095836474826171,
"grad_norm": 5.198462009429932,
"learning_rate": 4.659698863221513e-05,
"loss": 24.5072,
"step": 151
},
{
"epoch": 0.08149451285917735,
"grad_norm": 4.549412250518799,
"learning_rate": 4.481448235912671e-05,
"loss": 26.2581,
"step": 152
},
{
"epoch": 0.08203066097009298,
"grad_norm": 4.362401485443115,
"learning_rate": 4.306073275629044e-05,
"loss": 27.6788,
"step": 153
},
{
"epoch": 0.08256680908100864,
"grad_norm": 5.346713066101074,
"learning_rate": 4.133621928133665e-05,
"loss": 30.5177,
"step": 154
},
{
"epoch": 0.08310295719192427,
"grad_norm": 4.8502702713012695,
"learning_rate": 3.964141339903026e-05,
"loss": 27.5227,
"step": 155
},
{
"epoch": 0.08363910530283991,
"grad_norm": 4.808586120605469,
"learning_rate": 3.797677845237696e-05,
"loss": 30.8657,
"step": 156
},
{
"epoch": 0.08417525341375555,
"grad_norm": 4.663094997406006,
"learning_rate": 3.634276953594982e-05,
"loss": 29.7888,
"step": 157
},
{
"epoch": 0.08471140152467119,
"grad_norm": 5.056007385253906,
"learning_rate": 3.473983337147118e-05,
"loss": 29.6446,
"step": 158
},
{
"epoch": 0.08524754963558683,
"grad_norm": 4.635434150695801,
"learning_rate": 3.316840818568315e-05,
"loss": 29.1428,
"step": 159
},
{
"epoch": 0.08578369774650248,
"grad_norm": 4.840639591217041,
"learning_rate": 3.162892359054098e-05,
"loss": 29.1692,
"step": 160
},
{
"epoch": 0.08631984585741811,
"grad_norm": 4.706408977508545,
"learning_rate": 3.0121800465761293e-05,
"loss": 30.8735,
"step": 161
},
{
"epoch": 0.08685599396833375,
"grad_norm": 4.438312530517578,
"learning_rate": 2.8647450843757897e-05,
"loss": 29.7526,
"step": 162
},
{
"epoch": 0.08739214207924939,
"grad_norm": 4.957245349884033,
"learning_rate": 2.7206277796996144e-05,
"loss": 31.258,
"step": 163
},
{
"epoch": 0.08792829019016503,
"grad_norm": 4.573652267456055,
"learning_rate": 2.5798675327796993e-05,
"loss": 30.8789,
"step": 164
},
{
"epoch": 0.08846443830108067,
"grad_norm": 4.697335243225098,
"learning_rate": 2.4425028260620715e-05,
"loss": 30.2755,
"step": 165
},
{
"epoch": 0.08900058641199632,
"grad_norm": 4.690494060516357,
"learning_rate": 2.3085712136859668e-05,
"loss": 29.9391,
"step": 166
},
{
"epoch": 0.08953673452291196,
"grad_norm": 4.9841694831848145,
"learning_rate": 2.178109311216913e-05,
"loss": 27.8766,
"step": 167
},
{
"epoch": 0.0900728826338276,
"grad_norm": 5.224708557128906,
"learning_rate": 2.0511527856363912e-05,
"loss": 29.0932,
"step": 168
},
{
"epoch": 0.09060903074474323,
"grad_norm": 4.802592754364014,
"learning_rate": 1.927736345590839e-05,
"loss": 30.957,
"step": 169
},
{
"epoch": 0.09114517885565887,
"grad_norm": 4.843587398529053,
"learning_rate": 1.8078937319026654e-05,
"loss": 30.3239,
"step": 170
},
{
"epoch": 0.09168132696657452,
"grad_norm": 4.917239665985107,
"learning_rate": 1.6916577083458228e-05,
"loss": 30.4517,
"step": 171
},
{
"epoch": 0.09221747507749016,
"grad_norm": 4.622511863708496,
"learning_rate": 1.579060052688548e-05,
"loss": 29.1561,
"step": 172
},
{
"epoch": 0.0927536231884058,
"grad_norm": 5.174027919769287,
"learning_rate": 1.4701315480056164e-05,
"loss": 28.5216,
"step": 173
},
{
"epoch": 0.09328977129932144,
"grad_norm": 5.131156921386719,
"learning_rate": 1.3649019742625623e-05,
"loss": 30.5295,
"step": 174
},
{
"epoch": 0.09382591941023707,
"grad_norm": 5.090980052947998,
"learning_rate": 1.2634001001741373e-05,
"loss": 30.1167,
"step": 175
},
{
"epoch": 0.09382591941023707,
"eval_loss": 0.961780846118927,
"eval_runtime": 7.0068,
"eval_samples_per_second": 7.136,
"eval_steps_per_second": 7.136,
"step": 175
},
{
"epoch": 0.09436206752115271,
"grad_norm": 5.299232006072998,
"learning_rate": 1.1656536753392287e-05,
"loss": 29.9242,
"step": 176
},
{
"epoch": 0.09489821563206836,
"grad_norm": 5.412871837615967,
"learning_rate": 1.0716894226543953e-05,
"loss": 31.6097,
"step": 177
},
{
"epoch": 0.095434363742984,
"grad_norm": 5.443716526031494,
"learning_rate": 9.815330310080887e-06,
"loss": 32.2806,
"step": 178
},
{
"epoch": 0.09597051185389964,
"grad_norm": 5.2226080894470215,
"learning_rate": 8.952091482575824e-06,
"loss": 32.0395,
"step": 179
},
{
"epoch": 0.09650665996481528,
"grad_norm": 5.585422039031982,
"learning_rate": 8.127413744904804e-06,
"loss": 32.5046,
"step": 180
},
{
"epoch": 0.09704280807573092,
"grad_norm": 5.256342887878418,
"learning_rate": 7.34152255572697e-06,
"loss": 29.7976,
"step": 181
},
{
"epoch": 0.09757895618664657,
"grad_norm": 5.678742408752441,
"learning_rate": 6.594632769846353e-06,
"loss": 31.166,
"step": 182
},
{
"epoch": 0.0981151042975622,
"grad_norm": 5.608844757080078,
"learning_rate": 5.886948579472778e-06,
"loss": 31.462,
"step": 183
},
{
"epoch": 0.09865125240847784,
"grad_norm": 5.735723495483398,
"learning_rate": 5.218663458397715e-06,
"loss": 32.3888,
"step": 184
},
{
"epoch": 0.09918740051939348,
"grad_norm": 5.686238765716553,
"learning_rate": 4.589960109100444e-06,
"loss": 32.2446,
"step": 185
},
{
"epoch": 0.09972354863030912,
"grad_norm": 6.456745147705078,
"learning_rate": 4.001010412799138e-06,
"loss": 31.8302,
"step": 186
},
{
"epoch": 0.10025969674122476,
"grad_norm": 6.2624192237854,
"learning_rate": 3.451975382460109e-06,
"loss": 32.1222,
"step": 187
},
{
"epoch": 0.10079584485214041,
"grad_norm": 6.031360149383545,
"learning_rate": 2.9430051187785962e-06,
"loss": 30.3575,
"step": 188
},
{
"epoch": 0.10133199296305605,
"grad_norm": 6.787166595458984,
"learning_rate": 2.4742387691426445e-06,
"loss": 33.5298,
"step": 189
},
{
"epoch": 0.10186814107397169,
"grad_norm": 7.786647796630859,
"learning_rate": 2.0458044895916513e-06,
"loss": 33.5317,
"step": 190
},
{
"epoch": 0.10240428918488732,
"grad_norm": 7.0574631690979,
"learning_rate": 1.6578194097797258e-06,
"loss": 30.7742,
"step": 191
},
{
"epoch": 0.10294043729580296,
"grad_norm": 7.725040912628174,
"learning_rate": 1.3103896009537207e-06,
"loss": 31.2916,
"step": 192
},
{
"epoch": 0.1034765854067186,
"grad_norm": 8.722898483276367,
"learning_rate": 1.0036100469542786e-06,
"loss": 33.509,
"step": 193
},
{
"epoch": 0.10401273351763425,
"grad_norm": 9.124086380004883,
"learning_rate": 7.375646182482875e-07,
"loss": 32.766,
"step": 194
},
{
"epoch": 0.10454888162854989,
"grad_norm": 12.139370918273926,
"learning_rate": 5.123260489995229e-07,
"loss": 35.1079,
"step": 195
},
{
"epoch": 0.10508502973946553,
"grad_norm": 13.486305236816406,
"learning_rate": 3.2795591718381975e-07,
"loss": 33.708,
"step": 196
},
{
"epoch": 0.10562117785038116,
"grad_norm": 15.585397720336914,
"learning_rate": 1.8450462775428942e-07,
"loss": 35.6728,
"step": 197
},
{
"epoch": 0.1061573259612968,
"grad_norm": 17.088029861450195,
"learning_rate": 8.201139886109264e-08,
"loss": 33.6193,
"step": 198
},
{
"epoch": 0.10669347407221245,
"grad_norm": 17.898561477661133,
"learning_rate": 2.0504251129649374e-08,
"loss": 27.5111,
"step": 199
},
{
"epoch": 0.10722962218312809,
"grad_norm": 28.369626998901367,
"learning_rate": 0.0,
"loss": 21.8853,
"step": 200
},
{
"epoch": 0.10722962218312809,
"eval_loss": 0.9611303806304932,
"eval_runtime": 7.0035,
"eval_samples_per_second": 7.139,
"eval_steps_per_second": 7.139,
"step": 200
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 1,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.784601741716357e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}