{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9987239472564866, "eval_steps": 500, "global_step": 587, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0017014036580178648, "grad_norm": 2.168562940412461, "learning_rate": 1.111111111111111e-06, "loss": 1.1424, "step": 1 }, { "epoch": 0.0034028073160357296, "grad_norm": 2.250032849091299, "learning_rate": 2.222222222222222e-06, "loss": 1.1697, "step": 2 }, { "epoch": 0.005104210974053594, "grad_norm": 2.124744531823449, "learning_rate": 3.3333333333333333e-06, "loss": 1.1364, "step": 3 }, { "epoch": 0.006805614632071459, "grad_norm": 2.060484893473413, "learning_rate": 4.444444444444444e-06, "loss": 1.1471, "step": 4 }, { "epoch": 0.008507018290089324, "grad_norm": 1.89749522427158, "learning_rate": 5.555555555555557e-06, "loss": 1.1181, "step": 5 }, { "epoch": 0.010208421948107189, "grad_norm": 1.3357851941439673, "learning_rate": 6.666666666666667e-06, "loss": 1.0893, "step": 6 }, { "epoch": 0.011909825606125054, "grad_norm": 1.2850663628144225, "learning_rate": 7.77777777777778e-06, "loss": 1.0484, "step": 7 }, { "epoch": 0.013611229264142918, "grad_norm": 1.636445160753731, "learning_rate": 8.888888888888888e-06, "loss": 1.0308, "step": 8 }, { "epoch": 0.015312632922160783, "grad_norm": 1.3002228280073427, "learning_rate": 1e-05, "loss": 1.0234, "step": 9 }, { "epoch": 0.017014036580178648, "grad_norm": 1.2380680351381246, "learning_rate": 1.1111111111111113e-05, "loss": 0.9064, "step": 10 }, { "epoch": 0.01871544023819651, "grad_norm": 1.2765329463296096, "learning_rate": 1.2222222222222224e-05, "loss": 0.8696, "step": 11 }, { "epoch": 0.020416843896214378, "grad_norm": 1.060585818467504, "learning_rate": 1.3333333333333333e-05, "loss": 0.8444, "step": 12 }, { "epoch": 0.02211824755423224, "grad_norm": 1.1232952035148502, "learning_rate": 1.4444444444444446e-05, "loss": 0.841, "step": 13 }, { "epoch": 0.023819651212250107, "grad_norm": 1.107943114775762, "learning_rate": 1.555555555555556e-05, "loss": 0.7655, "step": 14 }, { "epoch": 0.02552105487026797, "grad_norm": 1.0059483053370937, "learning_rate": 1.6666666666666667e-05, "loss": 0.7063, "step": 15 }, { "epoch": 0.027222458528285837, "grad_norm": 0.9803969915500654, "learning_rate": 1.7777777777777777e-05, "loss": 0.6458, "step": 16 }, { "epoch": 0.0289238621863037, "grad_norm": 0.9105498405702808, "learning_rate": 1.888888888888889e-05, "loss": 0.6321, "step": 17 }, { "epoch": 0.030625265844321566, "grad_norm": 0.9433546278214147, "learning_rate": 2e-05, "loss": 0.493, "step": 18 }, { "epoch": 0.03232666950233943, "grad_norm": 0.8625027370409255, "learning_rate": 1.9999847579243196e-05, "loss": 0.5339, "step": 19 }, { "epoch": 0.034028073160357296, "grad_norm": 0.8101152462345085, "learning_rate": 1.9999390321619196e-05, "loss": 0.4566, "step": 20 }, { "epoch": 0.03572947681837516, "grad_norm": 0.7923424067716904, "learning_rate": 1.9998628241067113e-05, "loss": 0.4507, "step": 21 }, { "epoch": 0.03743088047639302, "grad_norm": 0.7681394677537327, "learning_rate": 1.9997561360818322e-05, "loss": 0.4265, "step": 22 }, { "epoch": 0.03913228413441089, "grad_norm": 0.7031739222010267, "learning_rate": 1.999618971339577e-05, "loss": 0.4174, "step": 23 }, { "epoch": 0.040833687792428755, "grad_norm": 0.6603135403892707, "learning_rate": 1.9994513340612957e-05, "loss": 0.3599, "step": 24 }, { "epoch": 0.04253509145044662, "grad_norm": 0.7260664469398356, "learning_rate": 1.9992532293572688e-05, "loss": 0.3398, "step": 25 }, { "epoch": 0.04423649510846448, "grad_norm": 0.590669951494315, "learning_rate": 1.9990246632665503e-05, "loss": 0.2616, "step": 26 }, { "epoch": 0.04593789876648235, "grad_norm": 0.6319158523814317, "learning_rate": 1.998765642756783e-05, "loss": 0.3082, "step": 27 }, { "epoch": 0.047639302424500214, "grad_norm": 0.6331615276638973, "learning_rate": 1.9984761757239878e-05, "loss": 0.2933, "step": 28 }, { "epoch": 0.04934070608251808, "grad_norm": 0.6376428367891515, "learning_rate": 1.998156270992321e-05, "loss": 0.2612, "step": 29 }, { "epoch": 0.05104210974053594, "grad_norm": 0.6322085618139611, "learning_rate": 1.9978059383138073e-05, "loss": 0.2397, "step": 30 }, { "epoch": 0.05274351339855381, "grad_norm": 0.6174252074442552, "learning_rate": 1.997425188368041e-05, "loss": 0.2566, "step": 31 }, { "epoch": 0.05444491705657167, "grad_norm": 0.5327916753070241, "learning_rate": 1.9970140327618612e-05, "loss": 0.2416, "step": 32 }, { "epoch": 0.05614632071458953, "grad_norm": 0.5479078907290673, "learning_rate": 1.9965724840289972e-05, "loss": 0.1781, "step": 33 }, { "epoch": 0.0578477243726074, "grad_norm": 0.5315756790547577, "learning_rate": 1.9961005556296875e-05, "loss": 0.2258, "step": 34 }, { "epoch": 0.059549128030625266, "grad_norm": 0.48386295113930106, "learning_rate": 1.9955982619502693e-05, "loss": 0.1679, "step": 35 }, { "epoch": 0.06125053168864313, "grad_norm": 0.5185655350233463, "learning_rate": 1.9950656183027392e-05, "loss": 0.2022, "step": 36 }, { "epoch": 0.06295193534666099, "grad_norm": 0.5425773293966369, "learning_rate": 1.994502640924286e-05, "loss": 0.2144, "step": 37 }, { "epoch": 0.06465333900467886, "grad_norm": 0.39700624229353654, "learning_rate": 1.993909346976798e-05, "loss": 0.1361, "step": 38 }, { "epoch": 0.06635474266269673, "grad_norm": 0.5024564814818023, "learning_rate": 1.993285754546338e-05, "loss": 0.1618, "step": 39 }, { "epoch": 0.06805614632071459, "grad_norm": 0.4298514607710804, "learning_rate": 1.9926318826425905e-05, "loss": 0.1801, "step": 40 }, { "epoch": 0.06975754997873246, "grad_norm": 0.4631733150959672, "learning_rate": 1.9919477511982873e-05, "loss": 0.1212, "step": 41 }, { "epoch": 0.07145895363675032, "grad_norm": 0.46629563189251916, "learning_rate": 1.991233381068594e-05, "loss": 0.1538, "step": 42 }, { "epoch": 0.07316035729476818, "grad_norm": 0.4077353003444054, "learning_rate": 1.990488794030478e-05, "loss": 0.1601, "step": 43 }, { "epoch": 0.07486176095278604, "grad_norm": 0.3874990470468177, "learning_rate": 1.9897140127820432e-05, "loss": 0.125, "step": 44 }, { "epoch": 0.07656316461080391, "grad_norm": 0.3638523211055093, "learning_rate": 1.9889090609418384e-05, "loss": 0.1366, "step": 45 }, { "epoch": 0.07826456826882178, "grad_norm": 0.5006533990444182, "learning_rate": 1.9880739630481376e-05, "loss": 0.1466, "step": 46 }, { "epoch": 0.07996597192683964, "grad_norm": 0.4320089119157406, "learning_rate": 1.9872087445581912e-05, "loss": 0.1256, "step": 47 }, { "epoch": 0.08166737558485751, "grad_norm": 0.3330675873783333, "learning_rate": 1.9863134318474504e-05, "loss": 0.1066, "step": 48 }, { "epoch": 0.08336877924287538, "grad_norm": 0.4410537592519413, "learning_rate": 1.985388052208764e-05, "loss": 0.1381, "step": 49 }, { "epoch": 0.08507018290089324, "grad_norm": 0.4003560082493989, "learning_rate": 1.9844326338515444e-05, "loss": 0.1202, "step": 50 }, { "epoch": 0.0867715865589111, "grad_norm": 0.41012939531437076, "learning_rate": 1.9834472059009097e-05, "loss": 0.1246, "step": 51 }, { "epoch": 0.08847299021692896, "grad_norm": 0.408091381990657, "learning_rate": 1.982431798396794e-05, "loss": 0.1289, "step": 52 }, { "epoch": 0.09017439387494683, "grad_norm": 0.4821914250638501, "learning_rate": 1.9813864422930345e-05, "loss": 0.1312, "step": 53 }, { "epoch": 0.0918757975329647, "grad_norm": 0.40793016164627693, "learning_rate": 1.9803111694564246e-05, "loss": 0.1237, "step": 54 }, { "epoch": 0.09357720119098256, "grad_norm": 0.3734013818822545, "learning_rate": 1.9792060126657437e-05, "loss": 0.1049, "step": 55 }, { "epoch": 0.09527860484900043, "grad_norm": 0.3186480775193613, "learning_rate": 1.9780710056107587e-05, "loss": 0.0911, "step": 56 }, { "epoch": 0.0969800085070183, "grad_norm": 0.39667217504563396, "learning_rate": 1.976906182891197e-05, "loss": 0.0918, "step": 57 }, { "epoch": 0.09868141216503616, "grad_norm": 0.4482874203710604, "learning_rate": 1.97571158001569e-05, "loss": 0.1209, "step": 58 }, { "epoch": 0.10038281582305401, "grad_norm": 0.2965972804159924, "learning_rate": 1.9744872334006936e-05, "loss": 0.0932, "step": 59 }, { "epoch": 0.10208421948107188, "grad_norm": 0.3566484676555622, "learning_rate": 1.973233180369374e-05, "loss": 0.1248, "step": 60 }, { "epoch": 0.10378562313908975, "grad_norm": 0.30896394609488814, "learning_rate": 1.9719494591504747e-05, "loss": 0.0851, "step": 61 }, { "epoch": 0.10548702679710761, "grad_norm": 0.2903927724315073, "learning_rate": 1.9706361088771474e-05, "loss": 0.0763, "step": 62 }, { "epoch": 0.10718843045512548, "grad_norm": 0.3405843767078732, "learning_rate": 1.96929316958576e-05, "loss": 0.103, "step": 63 }, { "epoch": 0.10888983411314335, "grad_norm": 0.40152278275783043, "learning_rate": 1.9679206822146776e-05, "loss": 0.1084, "step": 64 }, { "epoch": 0.11059123777116121, "grad_norm": 0.3372032580489575, "learning_rate": 1.9665186886030135e-05, "loss": 0.1022, "step": 65 }, { "epoch": 0.11229264142917907, "grad_norm": 0.332966030135067, "learning_rate": 1.9650872314893523e-05, "loss": 0.0891, "step": 66 }, { "epoch": 0.11399404508719693, "grad_norm": 0.3832837550171733, "learning_rate": 1.9636263545104498e-05, "loss": 0.1181, "step": 67 }, { "epoch": 0.1156954487452148, "grad_norm": 0.40356329916123346, "learning_rate": 1.962136102199901e-05, "loss": 0.1262, "step": 68 }, { "epoch": 0.11739685240323267, "grad_norm": 0.3689423450550846, "learning_rate": 1.9606165199867822e-05, "loss": 0.0826, "step": 69 }, { "epoch": 0.11909825606125053, "grad_norm": 0.3457217662571401, "learning_rate": 1.959067654194268e-05, "loss": 0.1007, "step": 70 }, { "epoch": 0.1207996597192684, "grad_norm": 0.3511006571491925, "learning_rate": 1.9574895520382183e-05, "loss": 0.1263, "step": 71 }, { "epoch": 0.12250106337728627, "grad_norm": 0.36819206266348997, "learning_rate": 1.955882261625737e-05, "loss": 0.1137, "step": 72 }, { "epoch": 0.12420246703530413, "grad_norm": 0.2607004742274578, "learning_rate": 1.9542458319537094e-05, "loss": 0.0914, "step": 73 }, { "epoch": 0.12590387069332198, "grad_norm": 0.2773076381582836, "learning_rate": 1.9525803129073046e-05, "loss": 0.0592, "step": 74 }, { "epoch": 0.12760527435133986, "grad_norm": 0.28951070788934496, "learning_rate": 1.9508857552584574e-05, "loss": 0.0928, "step": 75 }, { "epoch": 0.12930667800935772, "grad_norm": 0.35718485474187367, "learning_rate": 1.9491622106643195e-05, "loss": 0.0952, "step": 76 }, { "epoch": 0.1310080816673756, "grad_norm": 0.34891501382529494, "learning_rate": 1.9474097316656856e-05, "loss": 0.1004, "step": 77 }, { "epoch": 0.13270948532539345, "grad_norm": 0.28733346641687174, "learning_rate": 1.9456283716853906e-05, "loss": 0.0823, "step": 78 }, { "epoch": 0.1344108889834113, "grad_norm": 0.3735237899238901, "learning_rate": 1.9438181850266815e-05, "loss": 0.121, "step": 79 }, { "epoch": 0.13611229264142918, "grad_norm": 0.2904974285120679, "learning_rate": 1.941979226871563e-05, "loss": 0.0768, "step": 80 }, { "epoch": 0.13781369629944704, "grad_norm": 0.32117427662146836, "learning_rate": 1.9401115532791134e-05, "loss": 0.0837, "step": 81 }, { "epoch": 0.13951509995746492, "grad_norm": 0.40967624537549835, "learning_rate": 1.938215221183777e-05, "loss": 0.1061, "step": 82 }, { "epoch": 0.14121650361548277, "grad_norm": 0.33629708001931885, "learning_rate": 1.936290288393629e-05, "loss": 0.0892, "step": 83 }, { "epoch": 0.14291790727350065, "grad_norm": 0.31834590604771723, "learning_rate": 1.9343368135886112e-05, "loss": 0.1088, "step": 84 }, { "epoch": 0.1446193109315185, "grad_norm": 0.30988146926514176, "learning_rate": 1.932354856318746e-05, "loss": 0.0864, "step": 85 }, { "epoch": 0.14632071458953635, "grad_norm": 0.41148714128819724, "learning_rate": 1.9303444770023184e-05, "loss": 0.0867, "step": 86 }, { "epoch": 0.14802211824755424, "grad_norm": 0.3641579696865324, "learning_rate": 1.9283057369240358e-05, "loss": 0.1149, "step": 87 }, { "epoch": 0.1497235219055721, "grad_norm": 0.3359968169832728, "learning_rate": 1.9262386982331596e-05, "loss": 0.0959, "step": 88 }, { "epoch": 0.15142492556358997, "grad_norm": 0.2849324477084618, "learning_rate": 1.9241434239416093e-05, "loss": 0.0957, "step": 89 }, { "epoch": 0.15312632922160782, "grad_norm": 0.29673544787967593, "learning_rate": 1.922019977922045e-05, "loss": 0.0823, "step": 90 }, { "epoch": 0.1548277328796257, "grad_norm": 0.28046315847608455, "learning_rate": 1.919868424905915e-05, "loss": 0.0885, "step": 91 }, { "epoch": 0.15652913653764355, "grad_norm": 0.3024116439443639, "learning_rate": 1.9176888304814882e-05, "loss": 0.0919, "step": 92 }, { "epoch": 0.15823054019566143, "grad_norm": 0.4139703560637819, "learning_rate": 1.9154812610918503e-05, "loss": 0.0938, "step": 93 }, { "epoch": 0.1599319438536793, "grad_norm": 0.2618424473915207, "learning_rate": 1.913245784032881e-05, "loss": 0.0684, "step": 94 }, { "epoch": 0.16163334751169714, "grad_norm": 0.31266175001831703, "learning_rate": 1.9109824674512014e-05, "loss": 0.0673, "step": 95 }, { "epoch": 0.16333475116971502, "grad_norm": 0.3741829507920355, "learning_rate": 1.9086913803420966e-05, "loss": 0.1097, "step": 96 }, { "epoch": 0.16503615482773287, "grad_norm": 0.3356628202803492, "learning_rate": 1.906372592547413e-05, "loss": 0.1028, "step": 97 }, { "epoch": 0.16673755848575075, "grad_norm": 0.34106497626751014, "learning_rate": 1.9040261747534282e-05, "loss": 0.0989, "step": 98 }, { "epoch": 0.1684389621437686, "grad_norm": 0.29746452263621875, "learning_rate": 1.9016521984886984e-05, "loss": 0.0718, "step": 99 }, { "epoch": 0.1701403658017865, "grad_norm": 0.2990092253885084, "learning_rate": 1.8992507361218743e-05, "loss": 0.0765, "step": 100 }, { "epoch": 0.17184176945980434, "grad_norm": 0.2177337033832474, "learning_rate": 1.8968218608594987e-05, "loss": 0.056, "step": 101 }, { "epoch": 0.1735431731178222, "grad_norm": 0.39840165284276846, "learning_rate": 1.8943656467437726e-05, "loss": 0.1111, "step": 102 }, { "epoch": 0.17524457677584007, "grad_norm": 0.3378745899319231, "learning_rate": 1.8918821686502992e-05, "loss": 0.0931, "step": 103 }, { "epoch": 0.17694598043385792, "grad_norm": 0.2637083690262847, "learning_rate": 1.8893715022858e-05, "loss": 0.0759, "step": 104 }, { "epoch": 0.1786473840918758, "grad_norm": 0.36645660481595893, "learning_rate": 1.886833724185809e-05, "loss": 0.0956, "step": 105 }, { "epoch": 0.18034878774989366, "grad_norm": 0.2626991205572612, "learning_rate": 1.8842689117123377e-05, "loss": 0.0622, "step": 106 }, { "epoch": 0.18205019140791154, "grad_norm": 0.28778996504448945, "learning_rate": 1.8816771430515178e-05, "loss": 0.0778, "step": 107 }, { "epoch": 0.1837515950659294, "grad_norm": 0.30016452527135956, "learning_rate": 1.8790584972112174e-05, "loss": 0.0658, "step": 108 }, { "epoch": 0.18545299872394724, "grad_norm": 0.5203118503020492, "learning_rate": 1.876413054018633e-05, "loss": 0.146, "step": 109 }, { "epoch": 0.18715440238196512, "grad_norm": 0.30134320864935904, "learning_rate": 1.873740894117854e-05, "loss": 0.0821, "step": 110 }, { "epoch": 0.18885580603998298, "grad_norm": 0.34344915555537253, "learning_rate": 1.8710420989674093e-05, "loss": 0.075, "step": 111 }, { "epoch": 0.19055720969800086, "grad_norm": 0.2458501444323483, "learning_rate": 1.8683167508377775e-05, "loss": 0.0596, "step": 112 }, { "epoch": 0.1922586133560187, "grad_norm": 0.2651178569441844, "learning_rate": 1.8655649328088836e-05, "loss": 0.0674, "step": 113 }, { "epoch": 0.1939600170140366, "grad_norm": 0.2662036987842302, "learning_rate": 1.862786728767565e-05, "loss": 0.0769, "step": 114 }, { "epoch": 0.19566142067205444, "grad_norm": 0.28498333925442115, "learning_rate": 1.8599822234050143e-05, "loss": 0.0778, "step": 115 }, { "epoch": 0.19736282433007232, "grad_norm": 0.26717816913116776, "learning_rate": 1.8571515022141974e-05, "loss": 0.0896, "step": 116 }, { "epoch": 0.19906422798809018, "grad_norm": 0.3374842493414297, "learning_rate": 1.8542946514872478e-05, "loss": 0.0992, "step": 117 }, { "epoch": 0.20076563164610803, "grad_norm": 0.21117730752547184, "learning_rate": 1.851411758312835e-05, "loss": 0.0454, "step": 118 }, { "epoch": 0.2024670353041259, "grad_norm": 0.19203553955386238, "learning_rate": 1.8485029105735112e-05, "loss": 0.0611, "step": 119 }, { "epoch": 0.20416843896214376, "grad_norm": 0.22445524786962626, "learning_rate": 1.8455681969430307e-05, "loss": 0.0584, "step": 120 }, { "epoch": 0.20586984262016164, "grad_norm": 0.23148081400071185, "learning_rate": 1.8426077068836487e-05, "loss": 0.0629, "step": 121 }, { "epoch": 0.2075712462781795, "grad_norm": 0.17709972528549517, "learning_rate": 1.839621530643392e-05, "loss": 0.0563, "step": 122 }, { "epoch": 0.20927264993619737, "grad_norm": 0.3048264391849215, "learning_rate": 1.8366097592533095e-05, "loss": 0.0778, "step": 123 }, { "epoch": 0.21097405359421523, "grad_norm": 0.32208607157512736, "learning_rate": 1.8335724845246948e-05, "loss": 0.1028, "step": 124 }, { "epoch": 0.21267545725223308, "grad_norm": 0.23725153079241285, "learning_rate": 1.830509799046292e-05, "loss": 0.0803, "step": 125 }, { "epoch": 0.21437686091025096, "grad_norm": 0.2996136224560952, "learning_rate": 1.8274217961814682e-05, "loss": 0.0718, "step": 126 }, { "epoch": 0.2160782645682688, "grad_norm": 0.23529202737787724, "learning_rate": 1.8243085700653698e-05, "loss": 0.058, "step": 127 }, { "epoch": 0.2177796682262867, "grad_norm": 0.2602674833841576, "learning_rate": 1.821170215602053e-05, "loss": 0.0847, "step": 128 }, { "epoch": 0.21948107188430455, "grad_norm": 0.23134307276310323, "learning_rate": 1.818006828461591e-05, "loss": 0.0568, "step": 129 }, { "epoch": 0.22118247554232243, "grad_norm": 0.26453961985179086, "learning_rate": 1.8148185050771554e-05, "loss": 0.0801, "step": 130 }, { "epoch": 0.22288387920034028, "grad_norm": 0.25318378680529585, "learning_rate": 1.8116053426420793e-05, "loss": 0.0749, "step": 131 }, { "epoch": 0.22458528285835813, "grad_norm": 0.20502925194857724, "learning_rate": 1.8083674391068925e-05, "loss": 0.0629, "step": 132 }, { "epoch": 0.226286686516376, "grad_norm": 0.19279005026370763, "learning_rate": 1.8051048931763366e-05, "loss": 0.0463, "step": 133 }, { "epoch": 0.22798809017439386, "grad_norm": 0.2986931928862026, "learning_rate": 1.8018178043063554e-05, "loss": 0.0869, "step": 134 }, { "epoch": 0.22968949383241175, "grad_norm": 0.2580936970306385, "learning_rate": 1.798506272701064e-05, "loss": 0.0689, "step": 135 }, { "epoch": 0.2313908974904296, "grad_norm": 0.35433155424672486, "learning_rate": 1.795170399309692e-05, "loss": 0.0889, "step": 136 }, { "epoch": 0.23309230114844748, "grad_norm": 0.3342563856245416, "learning_rate": 1.7918102858235103e-05, "loss": 0.1088, "step": 137 }, { "epoch": 0.23479370480646533, "grad_norm": 0.2658901230535673, "learning_rate": 1.7884260346727257e-05, "loss": 0.0967, "step": 138 }, { "epoch": 0.2364951084644832, "grad_norm": 0.3116747759735481, "learning_rate": 1.7850177490233635e-05, "loss": 0.0763, "step": 139 }, { "epoch": 0.23819651212250106, "grad_norm": 0.2561525037232419, "learning_rate": 1.7815855327741185e-05, "loss": 0.0956, "step": 140 }, { "epoch": 0.23989791578051892, "grad_norm": 0.2246820236163285, "learning_rate": 1.7781294905531908e-05, "loss": 0.0792, "step": 141 }, { "epoch": 0.2415993194385368, "grad_norm": 0.3264214270885714, "learning_rate": 1.774649727715094e-05, "loss": 0.0792, "step": 142 }, { "epoch": 0.24330072309655465, "grad_norm": 0.19102790309255646, "learning_rate": 1.7711463503374466e-05, "loss": 0.0627, "step": 143 }, { "epoch": 0.24500212675457253, "grad_norm": 0.29766375426258634, "learning_rate": 1.7676194652177333e-05, "loss": 0.0529, "step": 144 }, { "epoch": 0.24670353041259038, "grad_norm": 0.29002494096896503, "learning_rate": 1.764069179870055e-05, "loss": 0.0873, "step": 145 }, { "epoch": 0.24840493407060826, "grad_norm": 0.1961062743167957, "learning_rate": 1.760495602521847e-05, "loss": 0.0549, "step": 146 }, { "epoch": 0.2501063377286261, "grad_norm": 0.3391386450150704, "learning_rate": 1.756898842110582e-05, "loss": 0.0855, "step": 147 }, { "epoch": 0.25180774138664397, "grad_norm": 0.2237841919957392, "learning_rate": 1.753279008280449e-05, "loss": 0.0611, "step": 148 }, { "epoch": 0.2535091450446618, "grad_norm": 0.1863997964345379, "learning_rate": 1.74963621137901e-05, "loss": 0.0557, "step": 149 }, { "epoch": 0.25521054870267973, "grad_norm": 0.2322748935860178, "learning_rate": 1.7459705624538383e-05, "loss": 0.0744, "step": 150 }, { "epoch": 0.2569119523606976, "grad_norm": 0.2811884338929891, "learning_rate": 1.7422821732491297e-05, "loss": 0.0869, "step": 151 }, { "epoch": 0.25861335601871543, "grad_norm": 0.2336367022405289, "learning_rate": 1.7385711562022988e-05, "loss": 0.0662, "step": 152 }, { "epoch": 0.2603147596767333, "grad_norm": 0.2525517790918003, "learning_rate": 1.734837624440551e-05, "loss": 0.0709, "step": 153 }, { "epoch": 0.2620161633347512, "grad_norm": 0.25674452456443186, "learning_rate": 1.731081691777434e-05, "loss": 0.0536, "step": 154 }, { "epoch": 0.26371756699276905, "grad_norm": 0.2380952734859674, "learning_rate": 1.7273034727093677e-05, "loss": 0.0797, "step": 155 }, { "epoch": 0.2654189706507869, "grad_norm": 0.22013348660975807, "learning_rate": 1.7235030824121542e-05, "loss": 0.0608, "step": 156 }, { "epoch": 0.26712037430880475, "grad_norm": 0.22281523456342697, "learning_rate": 1.7196806367374656e-05, "loss": 0.0635, "step": 157 }, { "epoch": 0.2688217779668226, "grad_norm": 0.2102268959812806, "learning_rate": 1.7158362522093153e-05, "loss": 0.0682, "step": 158 }, { "epoch": 0.2705231816248405, "grad_norm": 0.2981020750225053, "learning_rate": 1.7119700460205026e-05, "loss": 0.0748, "step": 159 }, { "epoch": 0.27222458528285837, "grad_norm": 0.23396857925074813, "learning_rate": 1.7080821360290426e-05, "loss": 0.0668, "step": 160 }, { "epoch": 0.2739259889408762, "grad_norm": 0.14039625188691493, "learning_rate": 1.7041726407545716e-05, "loss": 0.0334, "step": 161 }, { "epoch": 0.27562739259889407, "grad_norm": 0.2591445596342466, "learning_rate": 1.7002416793747354e-05, "loss": 0.0613, "step": 162 }, { "epoch": 0.2773287962569119, "grad_norm": 0.29367133177482335, "learning_rate": 1.696289371721556e-05, "loss": 0.0713, "step": 163 }, { "epoch": 0.27903019991492983, "grad_norm": 0.3690305678977906, "learning_rate": 1.692315838277778e-05, "loss": 0.1274, "step": 164 }, { "epoch": 0.2807316035729477, "grad_norm": 0.22214467316639894, "learning_rate": 1.6883212001731956e-05, "loss": 0.0655, "step": 165 }, { "epoch": 0.28243300723096554, "grad_norm": 0.1574976967869379, "learning_rate": 1.6843055791809623e-05, "loss": 0.0322, "step": 166 }, { "epoch": 0.2841344108889834, "grad_norm": 0.27654091402143877, "learning_rate": 1.680269097713876e-05, "loss": 0.0922, "step": 167 }, { "epoch": 0.2858358145470013, "grad_norm": 0.28597652739783896, "learning_rate": 1.6762118788206488e-05, "loss": 0.079, "step": 168 }, { "epoch": 0.28753721820501915, "grad_norm": 0.26938794460799176, "learning_rate": 1.6721340461821555e-05, "loss": 0.0871, "step": 169 }, { "epoch": 0.289238621863037, "grad_norm": 0.32524563962835895, "learning_rate": 1.6680357241076632e-05, "loss": 0.1052, "step": 170 }, { "epoch": 0.29094002552105486, "grad_norm": 0.3017728585235927, "learning_rate": 1.6639170375310422e-05, "loss": 0.0705, "step": 171 }, { "epoch": 0.2926414291790727, "grad_norm": 0.23611080555948813, "learning_rate": 1.6597781120069584e-05, "loss": 0.0735, "step": 172 }, { "epoch": 0.2943428328370906, "grad_norm": 0.2824684340323033, "learning_rate": 1.655619073707043e-05, "loss": 0.0957, "step": 173 }, { "epoch": 0.29604423649510847, "grad_norm": 0.22439711399094156, "learning_rate": 1.6514400494160498e-05, "loss": 0.0572, "step": 174 }, { "epoch": 0.2977456401531263, "grad_norm": 0.18939390048237154, "learning_rate": 1.6472411665279872e-05, "loss": 0.057, "step": 175 }, { "epoch": 0.2994470438111442, "grad_norm": 0.22456676896884215, "learning_rate": 1.643022553042237e-05, "loss": 0.0557, "step": 176 }, { "epoch": 0.3011484474691621, "grad_norm": 0.18664835436430186, "learning_rate": 1.6387843375596513e-05, "loss": 0.0494, "step": 177 }, { "epoch": 0.30284985112717994, "grad_norm": 0.23489112753979385, "learning_rate": 1.634526649278632e-05, "loss": 0.0821, "step": 178 }, { "epoch": 0.3045512547851978, "grad_norm": 0.2194109402248629, "learning_rate": 1.630249617991194e-05, "loss": 0.0672, "step": 179 }, { "epoch": 0.30625265844321564, "grad_norm": 0.3321350768766143, "learning_rate": 1.6259533740790055e-05, "loss": 0.1135, "step": 180 }, { "epoch": 0.3079540621012335, "grad_norm": 0.2502151994073321, "learning_rate": 1.6216380485094164e-05, "loss": 0.0718, "step": 181 }, { "epoch": 0.3096554657592514, "grad_norm": 0.2670386782819874, "learning_rate": 1.617303772831465e-05, "loss": 0.0728, "step": 182 }, { "epoch": 0.31135686941726926, "grad_norm": 0.26272098469205685, "learning_rate": 1.6129506791718665e-05, "loss": 0.0832, "step": 183 }, { "epoch": 0.3130582730752871, "grad_norm": 0.2246929104575992, "learning_rate": 1.6085789002309873e-05, "loss": 0.0631, "step": 184 }, { "epoch": 0.31475967673330496, "grad_norm": 0.21270610161779943, "learning_rate": 1.6041885692787985e-05, "loss": 0.067, "step": 185 }, { "epoch": 0.31646108039132287, "grad_norm": 0.21242584741643203, "learning_rate": 1.599779820150813e-05, "loss": 0.068, "step": 186 }, { "epoch": 0.3181624840493407, "grad_norm": 0.20419485410339863, "learning_rate": 1.5953527872440063e-05, "loss": 0.0757, "step": 187 }, { "epoch": 0.3198638877073586, "grad_norm": 0.27210472135834674, "learning_rate": 1.5909076055127202e-05, "loss": 0.0853, "step": 188 }, { "epoch": 0.3215652913653764, "grad_norm": 0.22338089059411625, "learning_rate": 1.5864444104645473e-05, "loss": 0.0742, "step": 189 }, { "epoch": 0.3232666950233943, "grad_norm": 0.2732168335391855, "learning_rate": 1.581963338156201e-05, "loss": 0.0908, "step": 190 }, { "epoch": 0.3249680986814122, "grad_norm": 0.2467079865485088, "learning_rate": 1.5774645251893673e-05, "loss": 0.0591, "step": 191 }, { "epoch": 0.32666950233943004, "grad_norm": 0.2313614767876932, "learning_rate": 1.5729481087065423e-05, "loss": 0.0892, "step": 192 }, { "epoch": 0.3283709059974479, "grad_norm": 0.2023763116089166, "learning_rate": 1.5684142263868493e-05, "loss": 0.0672, "step": 193 }, { "epoch": 0.33007230965546575, "grad_norm": 0.31753011619888016, "learning_rate": 1.5638630164418435e-05, "loss": 0.0712, "step": 194 }, { "epoch": 0.3317737133134836, "grad_norm": 0.2757080957715778, "learning_rate": 1.5592946176112973e-05, "loss": 0.1015, "step": 195 }, { "epoch": 0.3334751169715015, "grad_norm": 0.2505973926615177, "learning_rate": 1.554709169158972e-05, "loss": 0.0971, "step": 196 }, { "epoch": 0.33517652062951936, "grad_norm": 0.2226907461950715, "learning_rate": 1.550106810868373e-05, "loss": 0.0513, "step": 197 }, { "epoch": 0.3368779242875372, "grad_norm": 0.15446367505194006, "learning_rate": 1.5454876830384868e-05, "loss": 0.057, "step": 198 }, { "epoch": 0.33857932794555506, "grad_norm": 0.30643512505114406, "learning_rate": 1.540851926479505e-05, "loss": 0.0975, "step": 199 }, { "epoch": 0.340280731603573, "grad_norm": 0.25762810344960363, "learning_rate": 1.536199682508533e-05, "loss": 0.0633, "step": 200 }, { "epoch": 0.3419821352615908, "grad_norm": 0.19158691969988545, "learning_rate": 1.531531092945279e-05, "loss": 0.0569, "step": 201 }, { "epoch": 0.3436835389196087, "grad_norm": 0.2834266199692826, "learning_rate": 1.526846300107734e-05, "loss": 0.0988, "step": 202 }, { "epoch": 0.34538494257762653, "grad_norm": 0.2556087083863337, "learning_rate": 1.5221454468078336e-05, "loss": 0.0689, "step": 203 }, { "epoch": 0.3470863462356444, "grad_norm": 0.2438742301334132, "learning_rate": 1.5174286763470995e-05, "loss": 0.0715, "step": 204 }, { "epoch": 0.3487877498936623, "grad_norm": 0.21904569801568857, "learning_rate": 1.5126961325122773e-05, "loss": 0.0715, "step": 205 }, { "epoch": 0.35048915355168014, "grad_norm": 0.32118887699890014, "learning_rate": 1.5079479595709493e-05, "loss": 0.1042, "step": 206 }, { "epoch": 0.352190557209698, "grad_norm": 0.22503680461445366, "learning_rate": 1.5031843022671377e-05, "loss": 0.0516, "step": 207 }, { "epoch": 0.35389196086771585, "grad_norm": 0.20476968089029912, "learning_rate": 1.4984053058168936e-05, "loss": 0.0651, "step": 208 }, { "epoch": 0.35559336452573376, "grad_norm": 0.26260720806295024, "learning_rate": 1.4936111159038677e-05, "loss": 0.078, "step": 209 }, { "epoch": 0.3572947681837516, "grad_norm": 0.21335908324799727, "learning_rate": 1.4888018786748713e-05, "loss": 0.0531, "step": 210 }, { "epoch": 0.35899617184176946, "grad_norm": 0.24776977384172608, "learning_rate": 1.4839777407354194e-05, "loss": 0.0711, "step": 211 }, { "epoch": 0.3606975754997873, "grad_norm": 0.26283265478228607, "learning_rate": 1.4791388491452637e-05, "loss": 0.1005, "step": 212 }, { "epoch": 0.36239897915780517, "grad_norm": 0.21912083728071574, "learning_rate": 1.4742853514139076e-05, "loss": 0.0759, "step": 213 }, { "epoch": 0.3641003828158231, "grad_norm": 0.30538258609344554, "learning_rate": 1.4694173954961105e-05, "loss": 0.1009, "step": 214 }, { "epoch": 0.36580178647384093, "grad_norm": 0.26764402263401943, "learning_rate": 1.4645351297873774e-05, "loss": 0.0792, "step": 215 }, { "epoch": 0.3675031901318588, "grad_norm": 0.23675986028315632, "learning_rate": 1.4596387031194354e-05, "loss": 0.0743, "step": 216 }, { "epoch": 0.36920459378987663, "grad_norm": 0.223485373234782, "learning_rate": 1.4547282647556964e-05, "loss": 0.0929, "step": 217 }, { "epoch": 0.3709059974478945, "grad_norm": 0.29133370479541304, "learning_rate": 1.449803964386706e-05, "loss": 0.0798, "step": 218 }, { "epoch": 0.3726074011059124, "grad_norm": 0.18643880537818264, "learning_rate": 1.4448659521255823e-05, "loss": 0.0569, "step": 219 }, { "epoch": 0.37430880476393025, "grad_norm": 0.280997156813028, "learning_rate": 1.4399143785034388e-05, "loss": 0.0999, "step": 220 }, { "epoch": 0.3760102084219481, "grad_norm": 0.24715686808729184, "learning_rate": 1.4349493944647953e-05, "loss": 0.0627, "step": 221 }, { "epoch": 0.37771161207996595, "grad_norm": 0.2653638113311677, "learning_rate": 1.4299711513629759e-05, "loss": 0.0863, "step": 222 }, { "epoch": 0.37941301573798386, "grad_norm": 0.3152170570199493, "learning_rate": 1.4249798009554979e-05, "loss": 0.0962, "step": 223 }, { "epoch": 0.3811144193960017, "grad_norm": 0.25435032990077316, "learning_rate": 1.419975495399442e-05, "loss": 0.0937, "step": 224 }, { "epoch": 0.38281582305401957, "grad_norm": 0.15047418159614007, "learning_rate": 1.4149583872468165e-05, "loss": 0.0482, "step": 225 }, { "epoch": 0.3845172267120374, "grad_norm": 0.15295980990137148, "learning_rate": 1.4099286294399051e-05, "loss": 0.0382, "step": 226 }, { "epoch": 0.38621863037005527, "grad_norm": 0.2533178513009288, "learning_rate": 1.404886375306607e-05, "loss": 0.0948, "step": 227 }, { "epoch": 0.3879200340280732, "grad_norm": 0.23107415073266793, "learning_rate": 1.3998317785557597e-05, "loss": 0.0556, "step": 228 }, { "epoch": 0.38962143768609103, "grad_norm": 0.22917339521301278, "learning_rate": 1.3947649932724563e-05, "loss": 0.0843, "step": 229 }, { "epoch": 0.3913228413441089, "grad_norm": 0.17860874606051796, "learning_rate": 1.3896861739133456e-05, "loss": 0.0488, "step": 230 }, { "epoch": 0.39302424500212674, "grad_norm": 0.20110618464098665, "learning_rate": 1.384595475301926e-05, "loss": 0.0707, "step": 231 }, { "epoch": 0.39472564866014465, "grad_norm": 0.20490708885008196, "learning_rate": 1.3794930526238246e-05, "loss": 0.0638, "step": 232 }, { "epoch": 0.3964270523181625, "grad_norm": 0.23510478688343137, "learning_rate": 1.3743790614220664e-05, "loss": 0.0795, "step": 233 }, { "epoch": 0.39812845597618035, "grad_norm": 0.23100407798838418, "learning_rate": 1.3692536575923334e-05, "loss": 0.0761, "step": 234 }, { "epoch": 0.3998298596341982, "grad_norm": 0.256927774293719, "learning_rate": 1.3641169973782117e-05, "loss": 0.0669, "step": 235 }, { "epoch": 0.40153126329221606, "grad_norm": 0.265960997604412, "learning_rate": 1.3589692373664288e-05, "loss": 0.0792, "step": 236 }, { "epoch": 0.40323266695023396, "grad_norm": 0.2915525993978868, "learning_rate": 1.3538105344820798e-05, "loss": 0.0995, "step": 237 }, { "epoch": 0.4049340706082518, "grad_norm": 0.2578778351844946, "learning_rate": 1.3486410459838448e-05, "loss": 0.0719, "step": 238 }, { "epoch": 0.40663547426626967, "grad_norm": 0.23384387017059058, "learning_rate": 1.343460929459193e-05, "loss": 0.0712, "step": 239 }, { "epoch": 0.4083368779242875, "grad_norm": 0.3019784671101925, "learning_rate": 1.3382703428195812e-05, "loss": 0.1115, "step": 240 }, { "epoch": 0.4100382815823054, "grad_norm": 0.1352734841435768, "learning_rate": 1.3330694442956376e-05, "loss": 0.0464, "step": 241 }, { "epoch": 0.4117396852403233, "grad_norm": 0.17202787234184866, "learning_rate": 1.3278583924323405e-05, "loss": 0.0454, "step": 242 }, { "epoch": 0.41344108889834114, "grad_norm": 0.197390798554472, "learning_rate": 1.3226373460841835e-05, "loss": 0.0643, "step": 243 }, { "epoch": 0.415142492556359, "grad_norm": 0.21059005106394152, "learning_rate": 1.3174064644103334e-05, "loss": 0.0619, "step": 244 }, { "epoch": 0.41684389621437684, "grad_norm": 0.1722308196492277, "learning_rate": 1.3121659068697797e-05, "loss": 0.0454, "step": 245 }, { "epoch": 0.41854529987239475, "grad_norm": 0.32958443926088915, "learning_rate": 1.306915833216471e-05, "loss": 0.1037, "step": 246 }, { "epoch": 0.4202467035304126, "grad_norm": 0.2544357662171513, "learning_rate": 1.3016564034944473e-05, "loss": 0.0706, "step": 247 }, { "epoch": 0.42194810718843045, "grad_norm": 0.2585510606952109, "learning_rate": 1.29638777803296e-05, "loss": 0.0825, "step": 248 }, { "epoch": 0.4236495108464483, "grad_norm": 0.20275321109441732, "learning_rate": 1.2911101174415861e-05, "loss": 0.0526, "step": 249 }, { "epoch": 0.42535091450446616, "grad_norm": 0.233981891948638, "learning_rate": 1.2858235826053294e-05, "loss": 0.0695, "step": 250 }, { "epoch": 0.42705231816248407, "grad_norm": 0.21493578674491315, "learning_rate": 1.2805283346797179e-05, "loss": 0.0653, "step": 251 }, { "epoch": 0.4287537218205019, "grad_norm": 0.24141724836500014, "learning_rate": 1.2752245350858905e-05, "loss": 0.0797, "step": 252 }, { "epoch": 0.4304551254785198, "grad_norm": 0.1489771140649588, "learning_rate": 1.2699123455056777e-05, "loss": 0.03, "step": 253 }, { "epoch": 0.4321565291365376, "grad_norm": 0.24828101075324488, "learning_rate": 1.26459192787667e-05, "loss": 0.0819, "step": 254 }, { "epoch": 0.43385793279455553, "grad_norm": 0.17372013690514643, "learning_rate": 1.2592634443872842e-05, "loss": 0.0461, "step": 255 }, { "epoch": 0.4355593364525734, "grad_norm": 0.2764346314356569, "learning_rate": 1.2539270574718172e-05, "loss": 0.0806, "step": 256 }, { "epoch": 0.43726074011059124, "grad_norm": 0.29987546473911214, "learning_rate": 1.2485829298054952e-05, "loss": 0.0846, "step": 257 }, { "epoch": 0.4389621437686091, "grad_norm": 0.31175687442320515, "learning_rate": 1.2432312242995158e-05, "loss": 0.0971, "step": 258 }, { "epoch": 0.44066354742662694, "grad_norm": 0.3009904680059143, "learning_rate": 1.2378721040960788e-05, "loss": 0.0994, "step": 259 }, { "epoch": 0.44236495108464485, "grad_norm": 0.2562384582969849, "learning_rate": 1.232505732563416e-05, "loss": 0.0759, "step": 260 }, { "epoch": 0.4440663547426627, "grad_norm": 0.2550190452410635, "learning_rate": 1.2271322732908091e-05, "loss": 0.0733, "step": 261 }, { "epoch": 0.44576775840068056, "grad_norm": 0.2515270792806656, "learning_rate": 1.2217518900836045e-05, "loss": 0.0708, "step": 262 }, { "epoch": 0.4474691620586984, "grad_norm": 0.18357019713578807, "learning_rate": 1.2163647469582181e-05, "loss": 0.0515, "step": 263 }, { "epoch": 0.44917056571671626, "grad_norm": 0.2671034404389676, "learning_rate": 1.210971008137136e-05, "loss": 0.0825, "step": 264 }, { "epoch": 0.45087196937473417, "grad_norm": 0.2571681277129728, "learning_rate": 1.2055708380439089e-05, "loss": 0.1042, "step": 265 }, { "epoch": 0.452573373032752, "grad_norm": 0.21793020282041717, "learning_rate": 1.2001644012981392e-05, "loss": 0.0672, "step": 266 }, { "epoch": 0.4542747766907699, "grad_norm": 0.3515840793799933, "learning_rate": 1.1947518627104637e-05, "loss": 0.1232, "step": 267 }, { "epoch": 0.45597618034878773, "grad_norm": 0.22270456303325817, "learning_rate": 1.1893333872775275e-05, "loss": 0.084, "step": 268 }, { "epoch": 0.45767758400680564, "grad_norm": 0.1770352144116934, "learning_rate": 1.1839091401769559e-05, "loss": 0.0435, "step": 269 }, { "epoch": 0.4593789876648235, "grad_norm": 0.19476419487806804, "learning_rate": 1.1784792867623179e-05, "loss": 0.0535, "step": 270 }, { "epoch": 0.46108039132284134, "grad_norm": 0.2961488962332661, "learning_rate": 1.1730439925580876e-05, "loss": 0.1054, "step": 271 }, { "epoch": 0.4627817949808592, "grad_norm": 0.23209294477341702, "learning_rate": 1.1676034232545963e-05, "loss": 0.0898, "step": 272 }, { "epoch": 0.46448319863887705, "grad_norm": 0.3192995286716727, "learning_rate": 1.1621577447029816e-05, "loss": 0.0864, "step": 273 }, { "epoch": 0.46618460229689496, "grad_norm": 0.22033834311649156, "learning_rate": 1.1567071229101332e-05, "loss": 0.061, "step": 274 }, { "epoch": 0.4678860059549128, "grad_norm": 0.24431011933295152, "learning_rate": 1.1512517240336304e-05, "loss": 0.05, "step": 275 }, { "epoch": 0.46958740961293066, "grad_norm": 0.2501002015877452, "learning_rate": 1.1457917143766786e-05, "loss": 0.0811, "step": 276 }, { "epoch": 0.4712888132709485, "grad_norm": 0.15591637050035256, "learning_rate": 1.1403272603830384e-05, "loss": 0.0439, "step": 277 }, { "epoch": 0.4729902169289664, "grad_norm": 0.23445891677475122, "learning_rate": 1.1348585286319529e-05, "loss": 0.0562, "step": 278 }, { "epoch": 0.4746916205869843, "grad_norm": 0.22953431606642624, "learning_rate": 1.1293856858330678e-05, "loss": 0.0712, "step": 279 }, { "epoch": 0.47639302424500213, "grad_norm": 0.24410355979016798, "learning_rate": 1.1239088988213522e-05, "loss": 0.0652, "step": 280 }, { "epoch": 0.47809442790302, "grad_norm": 0.18183702432279936, "learning_rate": 1.11842833455201e-05, "loss": 0.0464, "step": 281 }, { "epoch": 0.47979583156103783, "grad_norm": 0.3068246013758465, "learning_rate": 1.1129441600953916e-05, "loss": 0.101, "step": 282 }, { "epoch": 0.48149723521905574, "grad_norm": 0.2725071360381625, "learning_rate": 1.1074565426319014e-05, "loss": 0.0906, "step": 283 }, { "epoch": 0.4831986388770736, "grad_norm": 0.23078769715046143, "learning_rate": 1.101965649446901e-05, "loss": 0.0659, "step": 284 }, { "epoch": 0.48490004253509145, "grad_norm": 0.24607556757801813, "learning_rate": 1.0964716479256094e-05, "loss": 0.0581, "step": 285 }, { "epoch": 0.4866014461931093, "grad_norm": 0.2039783514676341, "learning_rate": 1.0909747055480004e-05, "loss": 0.042, "step": 286 }, { "epoch": 0.4883028498511272, "grad_norm": 0.2132647216715679, "learning_rate": 1.0854749898836974e-05, "loss": 0.042, "step": 287 }, { "epoch": 0.49000425350914506, "grad_norm": 0.17146104368359683, "learning_rate": 1.0799726685868648e-05, "loss": 0.0486, "step": 288 }, { "epoch": 0.4917056571671629, "grad_norm": 0.3101626941591736, "learning_rate": 1.0744679093910987e-05, "loss": 0.0855, "step": 289 }, { "epoch": 0.49340706082518077, "grad_norm": 0.26359884054130955, "learning_rate": 1.0689608801043107e-05, "loss": 0.0671, "step": 290 }, { "epoch": 0.4951084644831986, "grad_norm": 0.21556126895353891, "learning_rate": 1.063451748603616e-05, "loss": 0.076, "step": 291 }, { "epoch": 0.4968098681412165, "grad_norm": 0.27181671931172413, "learning_rate": 1.0579406828302124e-05, "loss": 0.0847, "step": 292 }, { "epoch": 0.4985112717992344, "grad_norm": 0.31490941982013704, "learning_rate": 1.0524278507842637e-05, "loss": 0.1254, "step": 293 }, { "epoch": 0.5002126754572522, "grad_norm": 0.28738576717755915, "learning_rate": 1.0469134205197762e-05, "loss": 0.0741, "step": 294 }, { "epoch": 0.5019140791152701, "grad_norm": 0.24967744618311208, "learning_rate": 1.0413975601394765e-05, "loss": 0.0952, "step": 295 }, { "epoch": 0.5036154827732879, "grad_norm": 0.18653535652798736, "learning_rate": 1.0358804377896876e-05, "loss": 0.0666, "step": 296 }, { "epoch": 0.5053168864313058, "grad_norm": 0.31524703768298645, "learning_rate": 1.0303622216552022e-05, "loss": 0.0821, "step": 297 }, { "epoch": 0.5070182900893236, "grad_norm": 0.19693363449938797, "learning_rate": 1.0248430799541564e-05, "loss": 0.0486, "step": 298 }, { "epoch": 0.5087196937473416, "grad_norm": 0.3100958390915376, "learning_rate": 1.019323180932901e-05, "loss": 0.075, "step": 299 }, { "epoch": 0.5104210974053595, "grad_norm": 0.2263867395185222, "learning_rate": 1.013802692860873e-05, "loss": 0.0729, "step": 300 }, { "epoch": 0.5121225010633773, "grad_norm": 0.30499024558777427, "learning_rate": 1.0082817840254667e-05, "loss": 0.0949, "step": 301 }, { "epoch": 0.5138239047213952, "grad_norm": 0.27412297007732506, "learning_rate": 1.0027606227269026e-05, "loss": 0.0711, "step": 302 }, { "epoch": 0.515525308379413, "grad_norm": 0.2236619712267448, "learning_rate": 9.972393772730975e-06, "loss": 0.0711, "step": 303 }, { "epoch": 0.5172267120374309, "grad_norm": 0.21645526531787626, "learning_rate": 9.917182159745335e-06, "loss": 0.0696, "step": 304 }, { "epoch": 0.5189281156954487, "grad_norm": 0.19297627616781193, "learning_rate": 9.861973071391272e-06, "loss": 0.0723, "step": 305 }, { "epoch": 0.5206295193534666, "grad_norm": 0.17935274621615926, "learning_rate": 9.806768190670994e-06, "loss": 0.0603, "step": 306 }, { "epoch": 0.5223309230114844, "grad_norm": 0.36516731843883593, "learning_rate": 9.751569200458438e-06, "loss": 0.1183, "step": 307 }, { "epoch": 0.5240323266695024, "grad_norm": 0.2666543854374252, "learning_rate": 9.69637778344798e-06, "loss": 0.0683, "step": 308 }, { "epoch": 0.5257337303275202, "grad_norm": 0.1559589176353223, "learning_rate": 9.641195622103126e-06, "loss": 0.0457, "step": 309 }, { "epoch": 0.5274351339855381, "grad_norm": 0.21822959242881637, "learning_rate": 9.586024398605238e-06, "loss": 0.0728, "step": 310 }, { "epoch": 0.529136537643556, "grad_norm": 0.1910067933489864, "learning_rate": 9.530865794802243e-06, "loss": 0.0518, "step": 311 }, { "epoch": 0.5308379413015738, "grad_norm": 0.2286981911174439, "learning_rate": 9.475721492157365e-06, "loss": 0.0538, "step": 312 }, { "epoch": 0.5325393449595917, "grad_norm": 0.2649695673029832, "learning_rate": 9.420593171697876e-06, "loss": 0.086, "step": 313 }, { "epoch": 0.5342407486176095, "grad_norm": 0.2420342613285877, "learning_rate": 9.365482513963844e-06, "loss": 0.0972, "step": 314 }, { "epoch": 0.5359421522756274, "grad_norm": 0.2855237701478863, "learning_rate": 9.310391198956896e-06, "loss": 0.0795, "step": 315 }, { "epoch": 0.5376435559336452, "grad_norm": 0.19062890218128994, "learning_rate": 9.255320906089017e-06, "loss": 0.0385, "step": 316 }, { "epoch": 0.5393449595916632, "grad_norm": 0.12818263127205254, "learning_rate": 9.200273314131356e-06, "loss": 0.0358, "step": 317 }, { "epoch": 0.541046363249681, "grad_norm": 0.21271005750639677, "learning_rate": 9.145250101163032e-06, "loss": 0.0511, "step": 318 }, { "epoch": 0.5427477669076989, "grad_norm": 0.3230145385857894, "learning_rate": 9.090252944520002e-06, "loss": 0.1249, "step": 319 }, { "epoch": 0.5444491705657167, "grad_norm": 0.17129666760859652, "learning_rate": 9.035283520743911e-06, "loss": 0.0473, "step": 320 }, { "epoch": 0.5461505742237346, "grad_norm": 0.25326344089197617, "learning_rate": 8.980343505530988e-06, "loss": 0.0613, "step": 321 }, { "epoch": 0.5478519778817524, "grad_norm": 0.2847520686151069, "learning_rate": 8.925434573680986e-06, "loss": 0.0883, "step": 322 }, { "epoch": 0.5495533815397703, "grad_norm": 0.358576412450163, "learning_rate": 8.870558399046086e-06, "loss": 0.1097, "step": 323 }, { "epoch": 0.5512547851977881, "grad_norm": 0.21816479879448794, "learning_rate": 8.815716654479903e-06, "loss": 0.0766, "step": 324 }, { "epoch": 0.552956188855806, "grad_norm": 0.27098416317022683, "learning_rate": 8.76091101178648e-06, "loss": 0.0959, "step": 325 }, { "epoch": 0.5546575925138238, "grad_norm": 0.16228239529758662, "learning_rate": 8.706143141669324e-06, "loss": 0.0427, "step": 326 }, { "epoch": 0.5563589961718418, "grad_norm": 0.19457073616768888, "learning_rate": 8.651414713680474e-06, "loss": 0.0674, "step": 327 }, { "epoch": 0.5580603998298597, "grad_norm": 0.21506185063350097, "learning_rate": 8.59672739616962e-06, "loss": 0.0725, "step": 328 }, { "epoch": 0.5597618034878775, "grad_norm": 0.25928980929110046, "learning_rate": 8.542082856233216e-06, "loss": 0.0926, "step": 329 }, { "epoch": 0.5614632071458954, "grad_norm": 0.19301809133671421, "learning_rate": 8.487482759663696e-06, "loss": 0.0661, "step": 330 }, { "epoch": 0.5631646108039132, "grad_norm": 0.19336796199191325, "learning_rate": 8.43292877089867e-06, "loss": 0.0694, "step": 331 }, { "epoch": 0.5648660144619311, "grad_norm": 0.2300001624245782, "learning_rate": 8.378422552970185e-06, "loss": 0.0746, "step": 332 }, { "epoch": 0.5665674181199489, "grad_norm": 0.23998094239502984, "learning_rate": 8.32396576745404e-06, "loss": 0.0696, "step": 333 }, { "epoch": 0.5682688217779668, "grad_norm": 0.2052361707208072, "learning_rate": 8.269560074419126e-06, "loss": 0.0624, "step": 334 }, { "epoch": 0.5699702254359846, "grad_norm": 0.2962367385563096, "learning_rate": 8.215207132376824e-06, "loss": 0.124, "step": 335 }, { "epoch": 0.5716716290940026, "grad_norm": 0.24752974776344203, "learning_rate": 8.160908598230448e-06, "loss": 0.0653, "step": 336 }, { "epoch": 0.5733730327520205, "grad_norm": 0.18127533151541284, "learning_rate": 8.10666612722473e-06, "loss": 0.0591, "step": 337 }, { "epoch": 0.5750744364100383, "grad_norm": 0.18346580605719615, "learning_rate": 8.052481372895363e-06, "loss": 0.0488, "step": 338 }, { "epoch": 0.5767758400680562, "grad_norm": 0.31877710293947625, "learning_rate": 7.998355987018606e-06, "loss": 0.0872, "step": 339 }, { "epoch": 0.578477243726074, "grad_norm": 0.1344651628761348, "learning_rate": 7.944291619560914e-06, "loss": 0.0403, "step": 340 }, { "epoch": 0.5801786473840919, "grad_norm": 0.13295165420726127, "learning_rate": 7.890289918628644e-06, "loss": 0.0476, "step": 341 }, { "epoch": 0.5818800510421097, "grad_norm": 0.27781942031149137, "learning_rate": 7.836352530417824e-06, "loss": 0.0925, "step": 342 }, { "epoch": 0.5835814547001276, "grad_norm": 0.2923332990407699, "learning_rate": 7.782481099163958e-06, "loss": 0.1173, "step": 343 }, { "epoch": 0.5852828583581454, "grad_norm": 0.20398201081622527, "learning_rate": 7.728677267091912e-06, "loss": 0.0712, "step": 344 }, { "epoch": 0.5869842620161634, "grad_norm": 0.1827445671136079, "learning_rate": 7.674942674365847e-06, "loss": 0.0588, "step": 345 }, { "epoch": 0.5886856656741812, "grad_norm": 0.16240437969547905, "learning_rate": 7.621278959039217e-06, "loss": 0.0637, "step": 346 }, { "epoch": 0.5903870693321991, "grad_norm": 0.16062106806552065, "learning_rate": 7.567687757004843e-06, "loss": 0.0414, "step": 347 }, { "epoch": 0.5920884729902169, "grad_norm": 0.2689581192862772, "learning_rate": 7.514170701945047e-06, "loss": 0.0897, "step": 348 }, { "epoch": 0.5937898766482348, "grad_norm": 0.2681187781079101, "learning_rate": 7.460729425281831e-06, "loss": 0.0709, "step": 349 }, { "epoch": 0.5954912803062526, "grad_norm": 0.17526558434710648, "learning_rate": 7.407365556127162e-06, "loss": 0.0539, "step": 350 }, { "epoch": 0.5971926839642705, "grad_norm": 0.1648773603270098, "learning_rate": 7.354080721233303e-06, "loss": 0.0503, "step": 351 }, { "epoch": 0.5988940876222884, "grad_norm": 0.17999080092184985, "learning_rate": 7.300876544943227e-06, "loss": 0.0605, "step": 352 }, { "epoch": 0.6005954912803062, "grad_norm": 0.20308998126444186, "learning_rate": 7.247754649141097e-06, "loss": 0.0769, "step": 353 }, { "epoch": 0.6022968949383242, "grad_norm": 0.16018163485236867, "learning_rate": 7.194716653202826e-06, "loss": 0.0545, "step": 354 }, { "epoch": 0.603998298596342, "grad_norm": 0.24744753769790884, "learning_rate": 7.1417641739467104e-06, "loss": 0.0776, "step": 355 }, { "epoch": 0.6056997022543599, "grad_norm": 0.2298041880240223, "learning_rate": 7.088898825584139e-06, "loss": 0.0674, "step": 356 }, { "epoch": 0.6074011059123777, "grad_norm": 0.19841362318124559, "learning_rate": 7.036122219670398e-06, "loss": 0.0635, "step": 357 }, { "epoch": 0.6091025095703956, "grad_norm": 0.18877644807321198, "learning_rate": 6.9834359650555305e-06, "loss": 0.0777, "step": 358 }, { "epoch": 0.6108039132284134, "grad_norm": 0.16102641349173863, "learning_rate": 6.930841667835295e-06, "loss": 0.0576, "step": 359 }, { "epoch": 0.6125053168864313, "grad_norm": 0.20224797100905906, "learning_rate": 6.878340931302208e-06, "loss": 0.0754, "step": 360 }, { "epoch": 0.6142067205444491, "grad_norm": 0.2857194289415506, "learning_rate": 6.825935355896669e-06, "loss": 0.1052, "step": 361 }, { "epoch": 0.615908124202467, "grad_norm": 0.20616924754434873, "learning_rate": 6.773626539158171e-06, "loss": 0.0716, "step": 362 }, { "epoch": 0.617609527860485, "grad_norm": 0.23846455066099467, "learning_rate": 6.721416075676601e-06, "loss": 0.0847, "step": 363 }, { "epoch": 0.6193109315185028, "grad_norm": 0.14989055759308637, "learning_rate": 6.669305557043626e-06, "loss": 0.0371, "step": 364 }, { "epoch": 0.6210123351765207, "grad_norm": 0.25146318527723016, "learning_rate": 6.617296571804191e-06, "loss": 0.0938, "step": 365 }, { "epoch": 0.6227137388345385, "grad_norm": 0.2795331744757292, "learning_rate": 6.565390705408072e-06, "loss": 0.0503, "step": 366 }, { "epoch": 0.6244151424925564, "grad_norm": 0.17139219570184439, "learning_rate": 6.513589540161556e-06, "loss": 0.0578, "step": 367 }, { "epoch": 0.6261165461505742, "grad_norm": 0.157560985399721, "learning_rate": 6.461894655179204e-06, "loss": 0.0582, "step": 368 }, { "epoch": 0.6278179498085921, "grad_norm": 0.22752999438352467, "learning_rate": 6.410307626335717e-06, "loss": 0.0779, "step": 369 }, { "epoch": 0.6295193534666099, "grad_norm": 0.27521908636927156, "learning_rate": 6.358830026217887e-06, "loss": 0.0826, "step": 370 }, { "epoch": 0.6312207571246278, "grad_norm": 0.21611309006232896, "learning_rate": 6.30746342407667e-06, "loss": 0.0575, "step": 371 }, { "epoch": 0.6329221607826457, "grad_norm": 0.2022628238984182, "learning_rate": 6.256209385779341e-06, "loss": 0.0597, "step": 372 }, { "epoch": 0.6346235644406636, "grad_norm": 0.20087283890633761, "learning_rate": 6.205069473761756e-06, "loss": 0.0565, "step": 373 }, { "epoch": 0.6363249680986814, "grad_norm": 0.27973559478745097, "learning_rate": 6.154045246980742e-06, "loss": 0.0777, "step": 374 }, { "epoch": 0.6380263717566993, "grad_norm": 0.2153647046344647, "learning_rate": 6.1031382608665456e-06, "loss": 0.065, "step": 375 }, { "epoch": 0.6397277754147171, "grad_norm": 0.15832658385378948, "learning_rate": 6.052350067275441e-06, "loss": 0.0463, "step": 376 }, { "epoch": 0.641429179072735, "grad_norm": 0.24889956825525697, "learning_rate": 6.001682214442406e-06, "loss": 0.0868, "step": 377 }, { "epoch": 0.6431305827307529, "grad_norm": 0.2537522589782198, "learning_rate": 5.951136246933933e-06, "loss": 0.0771, "step": 378 }, { "epoch": 0.6448319863887707, "grad_norm": 0.35384965176549915, "learning_rate": 5.900713705600951e-06, "loss": 0.0885, "step": 379 }, { "epoch": 0.6465333900467886, "grad_norm": 0.24583176378622248, "learning_rate": 5.850416127531841e-06, "loss": 0.076, "step": 380 }, { "epoch": 0.6482347937048064, "grad_norm": 0.19401849479737754, "learning_rate": 5.800245046005585e-06, "loss": 0.055, "step": 381 }, { "epoch": 0.6499361973628244, "grad_norm": 0.22292321754995165, "learning_rate": 5.750201990445024e-06, "loss": 0.0837, "step": 382 }, { "epoch": 0.6516376010208422, "grad_norm": 0.15980625639550533, "learning_rate": 5.70028848637024e-06, "loss": 0.053, "step": 383 }, { "epoch": 0.6533390046788601, "grad_norm": 0.17476218437373806, "learning_rate": 5.650506055352052e-06, "loss": 0.047, "step": 384 }, { "epoch": 0.6550404083368779, "grad_norm": 0.26159431180356163, "learning_rate": 5.600856214965613e-06, "loss": 0.075, "step": 385 }, { "epoch": 0.6567418119948958, "grad_norm": 0.24983048217170784, "learning_rate": 5.551340478744176e-06, "loss": 0.0819, "step": 386 }, { "epoch": 0.6584432156529136, "grad_norm": 0.277677983790708, "learning_rate": 5.501960356132945e-06, "loss": 0.0743, "step": 387 }, { "epoch": 0.6601446193109315, "grad_norm": 0.2687619515031017, "learning_rate": 5.4527173524430395e-06, "loss": 0.076, "step": 388 }, { "epoch": 0.6618460229689493, "grad_norm": 0.18825889492381687, "learning_rate": 5.403612968805649e-06, "loss": 0.0533, "step": 389 }, { "epoch": 0.6635474266269672, "grad_norm": 0.22722813251197366, "learning_rate": 5.354648702126229e-06, "loss": 0.0669, "step": 390 }, { "epoch": 0.6652488302849852, "grad_norm": 0.15428942499610793, "learning_rate": 5.305826045038899e-06, "loss": 0.0496, "step": 391 }, { "epoch": 0.666950233943003, "grad_norm": 0.29481966419649847, "learning_rate": 5.257146485860927e-06, "loss": 0.0871, "step": 392 }, { "epoch": 0.6686516376010209, "grad_norm": 0.2132988743676148, "learning_rate": 5.208611508547367e-06, "loss": 0.072, "step": 393 }, { "epoch": 0.6703530412590387, "grad_norm": 0.1932211736507852, "learning_rate": 5.160222592645808e-06, "loss": 0.0672, "step": 394 }, { "epoch": 0.6720544449170566, "grad_norm": 0.32017863808058095, "learning_rate": 5.111981213251293e-06, "loss": 0.0996, "step": 395 }, { "epoch": 0.6737558485750744, "grad_norm": 0.33076228774617505, "learning_rate": 5.063888840961325e-06, "loss": 0.1062, "step": 396 }, { "epoch": 0.6754572522330923, "grad_norm": 0.2152145333210106, "learning_rate": 5.015946941831064e-06, "loss": 0.0682, "step": 397 }, { "epoch": 0.6771586558911101, "grad_norm": 0.21543841192984545, "learning_rate": 4.968156977328626e-06, "loss": 0.0572, "step": 398 }, { "epoch": 0.678860059549128, "grad_norm": 0.16950132912260057, "learning_rate": 4.920520404290512e-06, "loss": 0.0577, "step": 399 }, { "epoch": 0.680561463207146, "grad_norm": 0.20787322030508298, "learning_rate": 4.87303867487723e-06, "loss": 0.0561, "step": 400 }, { "epoch": 0.6822628668651638, "grad_norm": 0.1533621298140527, "learning_rate": 4.825713236529005e-06, "loss": 0.0435, "step": 401 }, { "epoch": 0.6839642705231816, "grad_norm": 0.18296820958014204, "learning_rate": 4.778545531921668e-06, "loss": 0.0538, "step": 402 }, { "epoch": 0.6856656741811995, "grad_norm": 0.24677189398080018, "learning_rate": 4.731536998922657e-06, "loss": 0.0715, "step": 403 }, { "epoch": 0.6873670778392174, "grad_norm": 0.18381971512083234, "learning_rate": 4.684689070547216e-06, "loss": 0.0589, "step": 404 }, { "epoch": 0.6890684814972352, "grad_norm": 0.19563535366138982, "learning_rate": 4.638003174914675e-06, "loss": 0.0375, "step": 405 }, { "epoch": 0.6907698851552531, "grad_norm": 0.27920320369616836, "learning_rate": 4.591480735204953e-06, "loss": 0.0657, "step": 406 }, { "epoch": 0.6924712888132709, "grad_norm": 0.22501473081164228, "learning_rate": 4.545123169615134e-06, "loss": 0.0754, "step": 407 }, { "epoch": 0.6941726924712888, "grad_norm": 0.18685344173190274, "learning_rate": 4.49893189131627e-06, "loss": 0.0753, "step": 408 }, { "epoch": 0.6958740961293067, "grad_norm": 0.17185385088024444, "learning_rate": 4.45290830841028e-06, "loss": 0.0514, "step": 409 }, { "epoch": 0.6975754997873246, "grad_norm": 0.2702549233525611, "learning_rate": 4.407053823887033e-06, "loss": 0.0833, "step": 410 }, { "epoch": 0.6992769034453424, "grad_norm": 0.23362806883478313, "learning_rate": 4.361369835581569e-06, "loss": 0.0769, "step": 411 }, { "epoch": 0.7009783071033603, "grad_norm": 0.2101592538580294, "learning_rate": 4.315857736131508e-06, "loss": 0.0602, "step": 412 }, { "epoch": 0.7026797107613781, "grad_norm": 0.20842960868238944, "learning_rate": 4.2705189129345814e-06, "loss": 0.074, "step": 413 }, { "epoch": 0.704381114419396, "grad_norm": 0.18803427484767865, "learning_rate": 4.225354748106328e-06, "loss": 0.07, "step": 414 }, { "epoch": 0.7060825180774138, "grad_norm": 0.3066569805512131, "learning_rate": 4.180366618437996e-06, "loss": 0.093, "step": 415 }, { "epoch": 0.7077839217354317, "grad_norm": 0.1744953221856188, "learning_rate": 4.13555589535453e-06, "loss": 0.0555, "step": 416 }, { "epoch": 0.7094853253934496, "grad_norm": 0.166243605934049, "learning_rate": 4.0909239448727985e-06, "loss": 0.061, "step": 417 }, { "epoch": 0.7111867290514675, "grad_norm": 0.20351482232627222, "learning_rate": 4.046472127559937e-06, "loss": 0.0715, "step": 418 }, { "epoch": 0.7128881327094854, "grad_norm": 0.21225234915881963, "learning_rate": 4.002201798491875e-06, "loss": 0.0502, "step": 419 }, { "epoch": 0.7145895363675032, "grad_norm": 0.20113105383651986, "learning_rate": 3.958114307212018e-06, "loss": 0.0645, "step": 420 }, { "epoch": 0.7162909400255211, "grad_norm": 0.18912603242706336, "learning_rate": 3.91421099769013e-06, "loss": 0.0642, "step": 421 }, { "epoch": 0.7179923436835389, "grad_norm": 0.1879026045376002, "learning_rate": 3.870493208281337e-06, "loss": 0.0479, "step": 422 }, { "epoch": 0.7196937473415568, "grad_norm": 0.26477442490530756, "learning_rate": 3.826962271685351e-06, "loss": 0.0831, "step": 423 }, { "epoch": 0.7213951509995746, "grad_norm": 0.29428954187807327, "learning_rate": 3.7836195149058386e-06, "loss": 0.0724, "step": 424 }, { "epoch": 0.7230965546575925, "grad_norm": 0.23675032676032767, "learning_rate": 3.7404662592099483e-06, "loss": 0.0854, "step": 425 }, { "epoch": 0.7247979583156103, "grad_norm": 0.283635317471, "learning_rate": 3.697503820088063e-06, "loss": 0.0805, "step": 426 }, { "epoch": 0.7264993619736282, "grad_norm": 0.20299536681350067, "learning_rate": 3.654733507213678e-06, "loss": 0.0629, "step": 427 }, { "epoch": 0.7282007656316462, "grad_norm": 0.2737835974179114, "learning_rate": 3.61215662440349e-06, "loss": 0.0813, "step": 428 }, { "epoch": 0.729902169289664, "grad_norm": 0.2292958769762407, "learning_rate": 3.5697744695776326e-06, "loss": 0.0625, "step": 429 }, { "epoch": 0.7316035729476819, "grad_norm": 0.2690574028374729, "learning_rate": 3.5275883347201336e-06, "loss": 0.0895, "step": 430 }, { "epoch": 0.7333049766056997, "grad_norm": 0.2612064488237033, "learning_rate": 3.4855995058395066e-06, "loss": 0.076, "step": 431 }, { "epoch": 0.7350063802637176, "grad_norm": 0.20866850048700003, "learning_rate": 3.443809262929575e-06, "loss": 0.0719, "step": 432 }, { "epoch": 0.7367077839217354, "grad_norm": 0.2274495669126989, "learning_rate": 3.4022188799304214e-06, "loss": 0.0754, "step": 433 }, { "epoch": 0.7384091875797533, "grad_norm": 0.21436057034429756, "learning_rate": 3.36082962468958e-06, "loss": 0.0634, "step": 434 }, { "epoch": 0.7401105912377711, "grad_norm": 0.2122758067179719, "learning_rate": 3.3196427589233725e-06, "loss": 0.0605, "step": 435 }, { "epoch": 0.741811994895789, "grad_norm": 0.2387395787166068, "learning_rate": 3.2786595381784512e-06, "loss": 0.0679, "step": 436 }, { "epoch": 0.7435133985538069, "grad_norm": 0.31722612816219875, "learning_rate": 3.2378812117935154e-06, "loss": 0.1076, "step": 437 }, { "epoch": 0.7452148022118248, "grad_norm": 0.34631520054017056, "learning_rate": 3.1973090228612404e-06, "loss": 0.1121, "step": 438 }, { "epoch": 0.7469162058698426, "grad_norm": 0.23043404441429802, "learning_rate": 3.15694420819038e-06, "loss": 0.0877, "step": 439 }, { "epoch": 0.7486176095278605, "grad_norm": 0.09929972754097662, "learning_rate": 3.116787998268046e-06, "loss": 0.0281, "step": 440 }, { "epoch": 0.7503190131858783, "grad_norm": 0.2540228794638467, "learning_rate": 3.076841617222228e-06, "loss": 0.1016, "step": 441 }, { "epoch": 0.7520204168438962, "grad_norm": 0.20530486566659917, "learning_rate": 3.0371062827844434e-06, "loss": 0.0759, "step": 442 }, { "epoch": 0.753721820501914, "grad_norm": 0.22889025759698128, "learning_rate": 2.997583206252647e-06, "loss": 0.0641, "step": 443 }, { "epoch": 0.7554232241599319, "grad_norm": 0.22376344098617418, "learning_rate": 2.958273592454285e-06, "loss": 0.0696, "step": 444 }, { "epoch": 0.7571246278179498, "grad_norm": 0.29335139294143503, "learning_rate": 2.9191786397095778e-06, "loss": 0.0722, "step": 445 }, { "epoch": 0.7588260314759677, "grad_norm": 0.19904638249374088, "learning_rate": 2.880299539794975e-06, "loss": 0.0644, "step": 446 }, { "epoch": 0.7605274351339856, "grad_norm": 0.27398415500191214, "learning_rate": 2.841637477906851e-06, "loss": 0.097, "step": 447 }, { "epoch": 0.7622288387920034, "grad_norm": 0.1909061169980495, "learning_rate": 2.803193632625346e-06, "loss": 0.0653, "step": 448 }, { "epoch": 0.7639302424500213, "grad_norm": 0.1839577194240098, "learning_rate": 2.7649691758784603e-06, "loss": 0.0612, "step": 449 }, { "epoch": 0.7656316461080391, "grad_norm": 0.18749598930597564, "learning_rate": 2.7269652729063233e-06, "loss": 0.0586, "step": 450 }, { "epoch": 0.767333049766057, "grad_norm": 0.2550319493358391, "learning_rate": 2.689183082225659e-06, "loss": 0.0784, "step": 451 }, { "epoch": 0.7690344534240748, "grad_norm": 0.2702034699283639, "learning_rate": 2.65162375559449e-06, "loss": 0.1012, "step": 452 }, { "epoch": 0.7707358570820927, "grad_norm": 0.22699731806268653, "learning_rate": 2.614288437977014e-06, "loss": 0.08, "step": 453 }, { "epoch": 0.7724372607401105, "grad_norm": 0.28841442095335584, "learning_rate": 2.5771782675087078e-06, "loss": 0.105, "step": 454 }, { "epoch": 0.7741386643981285, "grad_norm": 0.2545180037798505, "learning_rate": 2.5402943754616182e-06, "loss": 0.0847, "step": 455 }, { "epoch": 0.7758400680561464, "grad_norm": 0.16486311867632228, "learning_rate": 2.5036378862099e-06, "loss": 0.0409, "step": 456 }, { "epoch": 0.7775414717141642, "grad_norm": 0.263761370929647, "learning_rate": 2.467209917195513e-06, "loss": 0.096, "step": 457 }, { "epoch": 0.7792428753721821, "grad_norm": 0.235530147590817, "learning_rate": 2.4310115788941855e-06, "loss": 0.0595, "step": 458 }, { "epoch": 0.7809442790301999, "grad_norm": 0.2110709448726579, "learning_rate": 2.3950439747815357e-06, "loss": 0.07, "step": 459 }, { "epoch": 0.7826456826882178, "grad_norm": 0.1763868737174647, "learning_rate": 2.359308201299454e-06, "loss": 0.0586, "step": 460 }, { "epoch": 0.7843470863462356, "grad_norm": 0.16676117425431294, "learning_rate": 2.3238053478226665e-06, "loss": 0.0492, "step": 461 }, { "epoch": 0.7860484900042535, "grad_norm": 0.15717970250735389, "learning_rate": 2.2885364966255372e-06, "loss": 0.0487, "step": 462 }, { "epoch": 0.7877498936622713, "grad_norm": 0.28197077618286126, "learning_rate": 2.2535027228490582e-06, "loss": 0.0857, "step": 463 }, { "epoch": 0.7894512973202893, "grad_norm": 0.264862322279995, "learning_rate": 2.2187050944680942e-06, "loss": 0.0937, "step": 464 }, { "epoch": 0.7911527009783071, "grad_norm": 0.22066167775922854, "learning_rate": 2.18414467225882e-06, "loss": 0.0642, "step": 465 }, { "epoch": 0.792854104636325, "grad_norm": 0.2250702122829751, "learning_rate": 2.1498225097663695e-06, "loss": 0.0831, "step": 466 }, { "epoch": 0.7945555082943428, "grad_norm": 0.22295479048611572, "learning_rate": 2.115739653272747e-06, "loss": 0.0631, "step": 467 }, { "epoch": 0.7962569119523607, "grad_norm": 0.24242984035739493, "learning_rate": 2.0818971417649013e-06, "loss": 0.0591, "step": 468 }, { "epoch": 0.7979583156103786, "grad_norm": 0.2612637093823693, "learning_rate": 2.048296006903081e-06, "loss": 0.1046, "step": 469 }, { "epoch": 0.7996597192683964, "grad_norm": 0.1792782441746806, "learning_rate": 2.0149372729893646e-06, "loss": 0.0445, "step": 470 }, { "epoch": 0.8013611229264143, "grad_norm": 0.29350099593257656, "learning_rate": 1.981821956936448e-06, "loss": 0.0804, "step": 471 }, { "epoch": 0.8030625265844321, "grad_norm": 0.22341777662676934, "learning_rate": 1.9489510682366363e-06, "loss": 0.0745, "step": 472 }, { "epoch": 0.8047639302424501, "grad_norm": 0.1589102792801742, "learning_rate": 1.916325608931079e-06, "loss": 0.047, "step": 473 }, { "epoch": 0.8064653339004679, "grad_norm": 0.2345326238035068, "learning_rate": 1.8839465735792095e-06, "loss": 0.0572, "step": 474 }, { "epoch": 0.8081667375584858, "grad_norm": 0.24004839314637838, "learning_rate": 1.8518149492284477e-06, "loss": 0.0884, "step": 475 }, { "epoch": 0.8098681412165036, "grad_norm": 0.29421822191095054, "learning_rate": 1.8199317153840933e-06, "loss": 0.0887, "step": 476 }, { "epoch": 0.8115695448745215, "grad_norm": 0.29025654033124915, "learning_rate": 1.7882978439794708e-06, "loss": 0.1021, "step": 477 }, { "epoch": 0.8132709485325393, "grad_norm": 0.25878138649406646, "learning_rate": 1.756914299346304e-06, "loss": 0.0616, "step": 478 }, { "epoch": 0.8149723521905572, "grad_norm": 0.2102766744469287, "learning_rate": 1.7257820381853197e-06, "loss": 0.0627, "step": 479 }, { "epoch": 0.816673755848575, "grad_norm": 0.24781694684746497, "learning_rate": 1.6949020095370816e-06, "loss": 0.0766, "step": 480 }, { "epoch": 0.8183751595065929, "grad_norm": 0.1734085990747018, "learning_rate": 1.6642751547530512e-06, "loss": 0.0514, "step": 481 }, { "epoch": 0.8200765631646108, "grad_norm": 0.2117204201108364, "learning_rate": 1.6339024074669107e-06, "loss": 0.0717, "step": 482 }, { "epoch": 0.8217779668226287, "grad_norm": 0.23022449445835655, "learning_rate": 1.6037846935660807e-06, "loss": 0.0697, "step": 483 }, { "epoch": 0.8234793704806466, "grad_norm": 0.2031147008426011, "learning_rate": 1.5739229311635152e-06, "loss": 0.0647, "step": 484 }, { "epoch": 0.8251807741386644, "grad_norm": 0.16452534724080284, "learning_rate": 1.5443180305696948e-06, "loss": 0.0477, "step": 485 }, { "epoch": 0.8268821777966823, "grad_norm": 0.22807646976291562, "learning_rate": 1.5149708942648922e-06, "loss": 0.0814, "step": 486 }, { "epoch": 0.8285835814547001, "grad_norm": 0.24827387251547514, "learning_rate": 1.4858824168716524e-06, "loss": 0.0755, "step": 487 }, { "epoch": 0.830284985112718, "grad_norm": 0.22526986180364844, "learning_rate": 1.4570534851275241e-06, "loss": 0.076, "step": 488 }, { "epoch": 0.8319863887707358, "grad_norm": 0.20155351877240635, "learning_rate": 1.4284849778580279e-06, "loss": 0.0698, "step": 489 }, { "epoch": 0.8336877924287537, "grad_norm": 0.1495610561199564, "learning_rate": 1.4001777659498584e-06, "loss": 0.04, "step": 490 }, { "epoch": 0.8353891960867715, "grad_norm": 0.22042874488356587, "learning_rate": 1.3721327123243533e-06, "loss": 0.0696, "step": 491 }, { "epoch": 0.8370905997447895, "grad_norm": 0.217650029772456, "learning_rate": 1.3443506719111666e-06, "loss": 0.0499, "step": 492 }, { "epoch": 0.8387920034028074, "grad_norm": 0.28478642607874244, "learning_rate": 1.3168324916222296e-06, "loss": 0.1052, "step": 493 }, { "epoch": 0.8404934070608252, "grad_norm": 0.2847136340573529, "learning_rate": 1.28957901032591e-06, "loss": 0.0772, "step": 494 }, { "epoch": 0.8421948107188431, "grad_norm": 0.21335659065873505, "learning_rate": 1.2625910588214608e-06, "loss": 0.0651, "step": 495 }, { "epoch": 0.8438962143768609, "grad_norm": 0.2007624647622523, "learning_rate": 1.2358694598136755e-06, "loss": 0.0579, "step": 496 }, { "epoch": 0.8455976180348788, "grad_norm": 0.22052289165556443, "learning_rate": 1.2094150278878303e-06, "loss": 0.0564, "step": 497 }, { "epoch": 0.8472990216928966, "grad_norm": 0.23003203856097848, "learning_rate": 1.1832285694848255e-06, "loss": 0.0604, "step": 498 }, { "epoch": 0.8490004253509145, "grad_norm": 0.18674042534277024, "learning_rate": 1.1573108828766255e-06, "loss": 0.0442, "step": 499 }, { "epoch": 0.8507018290089323, "grad_norm": 0.21647795156393285, "learning_rate": 1.1316627581419137e-06, "loss": 0.0535, "step": 500 }, { "epoch": 0.8524032326669503, "grad_norm": 0.3119714991626263, "learning_rate": 1.1062849771420025e-06, "loss": 0.1191, "step": 501 }, { "epoch": 0.8541046363249681, "grad_norm": 0.21164862051641084, "learning_rate": 1.0811783134970132e-06, "loss": 0.0658, "step": 502 }, { "epoch": 0.855806039982986, "grad_norm": 0.2724719475504451, "learning_rate": 1.0563435325622762e-06, "loss": 0.0736, "step": 503 }, { "epoch": 0.8575074436410038, "grad_norm": 0.22358105859093347, "learning_rate": 1.0317813914050157e-06, "loss": 0.0711, "step": 504 }, { "epoch": 0.8592088472990217, "grad_norm": 0.2978912375008609, "learning_rate": 1.007492638781259e-06, "loss": 0.0895, "step": 505 }, { "epoch": 0.8609102509570395, "grad_norm": 0.209790599151719, "learning_rate": 9.834780151130196e-07, "loss": 0.0718, "step": 506 }, { "epoch": 0.8626116546150574, "grad_norm": 0.1681086042265431, "learning_rate": 9.597382524657173e-07, "loss": 0.0592, "step": 507 }, { "epoch": 0.8643130582730753, "grad_norm": 0.20035287400774016, "learning_rate": 9.362740745258736e-07, "loss": 0.074, "step": 508 }, { "epoch": 0.8660144619310931, "grad_norm": 0.17882197027245497, "learning_rate": 9.13086196579035e-07, "loss": 0.0481, "step": 509 }, { "epoch": 0.8677158655891111, "grad_norm": 0.2010107765861354, "learning_rate": 8.901753254879885e-07, "loss": 0.0599, "step": 510 }, { "epoch": 0.8694172692471289, "grad_norm": 0.1784357377756698, "learning_rate": 8.67542159671192e-07, "loss": 0.0422, "step": 511 }, { "epoch": 0.8711186729051468, "grad_norm": 0.2595041553389473, "learning_rate": 8.451873890814988e-07, "loss": 0.0834, "step": 512 }, { "epoch": 0.8728200765631646, "grad_norm": 0.18720029506475785, "learning_rate": 8.231116951851204e-07, "loss": 0.0441, "step": 513 }, { "epoch": 0.8745214802211825, "grad_norm": 0.13605762437865937, "learning_rate": 8.013157509408509e-07, "loss": 0.0499, "step": 514 }, { "epoch": 0.8762228838792003, "grad_norm": 0.3392074002609155, "learning_rate": 7.79800220779554e-07, "loss": 0.0935, "step": 515 }, { "epoch": 0.8779242875372182, "grad_norm": 0.27563422248531844, "learning_rate": 7.585657605839059e-07, "loss": 0.0749, "step": 516 }, { "epoch": 0.879625691195236, "grad_norm": 0.2928759924757975, "learning_rate": 7.376130176684082e-07, "loss": 0.107, "step": 517 }, { "epoch": 0.8813270948532539, "grad_norm": 0.20820494565138964, "learning_rate": 7.169426307596428e-07, "loss": 0.0711, "step": 518 }, { "epoch": 0.8830284985112719, "grad_norm": 0.1778541297140114, "learning_rate": 6.965552299768186e-07, "loss": 0.0548, "step": 519 }, { "epoch": 0.8847299021692897, "grad_norm": 0.18347737015999377, "learning_rate": 6.764514368125419e-07, "loss": 0.0468, "step": 520 }, { "epoch": 0.8864313058273076, "grad_norm": 0.22402928085104057, "learning_rate": 6.566318641138902e-07, "loss": 0.0819, "step": 521 }, { "epoch": 0.8881327094853254, "grad_norm": 0.10087048048840686, "learning_rate": 6.370971160637129e-07, "loss": 0.0257, "step": 522 }, { "epoch": 0.8898341131433433, "grad_norm": 0.24244337938824437, "learning_rate": 6.178477881622325e-07, "loss": 0.0929, "step": 523 }, { "epoch": 0.8915355168013611, "grad_norm": 0.2215797215803862, "learning_rate": 5.98884467208869e-07, "loss": 0.0707, "step": 524 }, { "epoch": 0.893236920459379, "grad_norm": 0.21837004790407136, "learning_rate": 5.802077312843723e-07, "loss": 0.0601, "step": 525 }, { "epoch": 0.8949383241173968, "grad_norm": 0.13798341621078078, "learning_rate": 5.618181497331865e-07, "loss": 0.0387, "step": 526 }, { "epoch": 0.8966397277754147, "grad_norm": 0.18913792414386757, "learning_rate": 5.437162831460962e-07, "loss": 0.0498, "step": 527 }, { "epoch": 0.8983411314334325, "grad_norm": 0.22877307977873534, "learning_rate": 5.259026833431468e-07, "loss": 0.0704, "step": 528 }, { "epoch": 0.9000425350914505, "grad_norm": 0.20234348876984656, "learning_rate": 5.083778933568073e-07, "loss": 0.0649, "step": 529 }, { "epoch": 0.9017439387494683, "grad_norm": 0.2531050005740147, "learning_rate": 4.911424474154314e-07, "loss": 0.0878, "step": 530 }, { "epoch": 0.9034453424074862, "grad_norm": 0.24193903233807051, "learning_rate": 4.741968709269573e-07, "loss": 0.073, "step": 531 }, { "epoch": 0.905146746065504, "grad_norm": 0.2541146802125869, "learning_rate": 4.575416804629085e-07, "loss": 0.0563, "step": 532 }, { "epoch": 0.9068481497235219, "grad_norm": 0.16762038698102058, "learning_rate": 4.411773837426303e-07, "loss": 0.053, "step": 533 }, { "epoch": 0.9085495533815398, "grad_norm": 0.23076714189266884, "learning_rate": 4.2510447961782055e-07, "loss": 0.0687, "step": 534 }, { "epoch": 0.9102509570395576, "grad_norm": 0.26029701749366, "learning_rate": 4.093234580573202e-07, "loss": 0.0765, "step": 535 }, { "epoch": 0.9119523606975755, "grad_norm": 0.2130287074707999, "learning_rate": 3.938348001321812e-07, "loss": 0.062, "step": 536 }, { "epoch": 0.9136537643555933, "grad_norm": 0.18154744728141692, "learning_rate": 3.786389780009958e-07, "loss": 0.0521, "step": 537 }, { "epoch": 0.9153551680136113, "grad_norm": 0.15469238363767565, "learning_rate": 3.637364548955047e-07, "loss": 0.0358, "step": 538 }, { "epoch": 0.9170565716716291, "grad_norm": 0.3678961865523596, "learning_rate": 3.491276851064784e-07, "loss": 0.0881, "step": 539 }, { "epoch": 0.918757975329647, "grad_norm": 0.23428870195639928, "learning_rate": 3.3481311396986626e-07, "loss": 0.0844, "step": 540 }, { "epoch": 0.9204593789876648, "grad_norm": 0.2491236742624812, "learning_rate": 3.2079317785322363e-07, "loss": 0.0767, "step": 541 }, { "epoch": 0.9221607826456827, "grad_norm": 0.27866990645467393, "learning_rate": 3.0706830414240164e-07, "loss": 0.0862, "step": 542 }, { "epoch": 0.9238621863037005, "grad_norm": 0.13956980085260687, "learning_rate": 2.9363891122853097e-07, "loss": 0.0437, "step": 543 }, { "epoch": 0.9255635899617184, "grad_norm": 0.1975336620060032, "learning_rate": 2.805054084952552e-07, "loss": 0.076, "step": 544 }, { "epoch": 0.9272649936197362, "grad_norm": 0.22649643287994412, "learning_rate": 2.6766819630626216e-07, "loss": 0.0647, "step": 545 }, { "epoch": 0.9289663972777541, "grad_norm": 0.1932416005125246, "learning_rate": 2.5512766599306903e-07, "loss": 0.0642, "step": 546 }, { "epoch": 0.9306678009357721, "grad_norm": 0.1779695034006232, "learning_rate": 2.4288419984310086e-07, "loss": 0.0439, "step": 547 }, { "epoch": 0.9323692045937899, "grad_norm": 0.18689194535029088, "learning_rate": 2.3093817108803318e-07, "loss": 0.0761, "step": 548 }, { "epoch": 0.9340706082518078, "grad_norm": 0.13597627329781425, "learning_rate": 2.1928994389241454e-07, "loss": 0.0369, "step": 549 }, { "epoch": 0.9357720119098256, "grad_norm": 0.21989658377102142, "learning_rate": 2.0793987334256637e-07, "loss": 0.0625, "step": 550 }, { "epoch": 0.9374734155678435, "grad_norm": 0.21266747025038635, "learning_rate": 1.968883054357562e-07, "loss": 0.0689, "step": 551 }, { "epoch": 0.9391748192258613, "grad_norm": 0.25928931998255494, "learning_rate": 1.861355770696549e-07, "loss": 0.1025, "step": 552 }, { "epoch": 0.9408762228838792, "grad_norm": 0.2801452061064216, "learning_rate": 1.7568201603205827e-07, "loss": 0.0869, "step": 553 }, { "epoch": 0.942577626541897, "grad_norm": 0.3090393640358726, "learning_rate": 1.6552794099090718e-07, "loss": 0.1212, "step": 554 }, { "epoch": 0.9442790301999149, "grad_norm": 0.17720681659522422, "learning_rate": 1.5567366148455887e-07, "loss": 0.0355, "step": 555 }, { "epoch": 0.9459804338579328, "grad_norm": 0.15472295591692306, "learning_rate": 1.4611947791236314e-07, "loss": 0.0395, "step": 556 }, { "epoch": 0.9476818375159507, "grad_norm": 0.26730416240977917, "learning_rate": 1.3686568152549539e-07, "loss": 0.0595, "step": 557 }, { "epoch": 0.9493832411739686, "grad_norm": 0.21978673843573918, "learning_rate": 1.2791255441809037e-07, "loss": 0.064, "step": 558 }, { "epoch": 0.9510846448319864, "grad_norm": 0.19683818868694625, "learning_rate": 1.1926036951862563e-07, "loss": 0.0672, "step": 559 }, { "epoch": 0.9527860484900043, "grad_norm": 0.19189970865124908, "learning_rate": 1.109093905816172e-07, "loss": 0.0569, "step": 560 }, { "epoch": 0.9544874521480221, "grad_norm": 0.22200476537406752, "learning_rate": 1.0285987217957038e-07, "loss": 0.0761, "step": 561 }, { "epoch": 0.95618885580604, "grad_norm": 0.2253757142644252, "learning_rate": 9.511205969522263e-08, "loss": 0.0645, "step": 562 }, { "epoch": 0.9578902594640578, "grad_norm": 0.218059392950448, "learning_rate": 8.76661893140629e-08, "loss": 0.0682, "step": 563 }, { "epoch": 0.9595916631220757, "grad_norm": 0.2649165636045151, "learning_rate": 8.052248801712958e-08, "loss": 0.0912, "step": 564 }, { "epoch": 0.9612930667800936, "grad_norm": 0.20253828553816175, "learning_rate": 7.36811735740961e-08, "loss": 0.0571, "step": 565 }, { "epoch": 0.9629944704381115, "grad_norm": 0.19699087263694953, "learning_rate": 6.714245453662504e-08, "loss": 0.0457, "step": 566 }, { "epoch": 0.9646958740961293, "grad_norm": 0.27159809390838907, "learning_rate": 6.090653023201997e-08, "loss": 0.1057, "step": 567 }, { "epoch": 0.9663972777541472, "grad_norm": 0.2589995420361442, "learning_rate": 5.497359075714026e-08, "loss": 0.102, "step": 568 }, { "epoch": 0.968098681412165, "grad_norm": 0.19790945958126713, "learning_rate": 4.934381697261015e-08, "loss": 0.0608, "step": 569 }, { "epoch": 0.9698000850701829, "grad_norm": 0.24043673016274164, "learning_rate": 4.401738049730653e-08, "loss": 0.0609, "step": 570 }, { "epoch": 0.9715014887282007, "grad_norm": 0.2690784354034665, "learning_rate": 3.899444370312533e-08, "loss": 0.0861, "step": 571 }, { "epoch": 0.9732028923862186, "grad_norm": 0.22841785491856118, "learning_rate": 3.4275159710032146e-08, "loss": 0.0712, "step": 572 }, { "epoch": 0.9749042960442365, "grad_norm": 0.1950332471845292, "learning_rate": 2.9859672381392644e-08, "loss": 0.0532, "step": 573 }, { "epoch": 0.9766056997022544, "grad_norm": 0.22678060127201066, "learning_rate": 2.574811631959273e-08, "loss": 0.0771, "step": 574 }, { "epoch": 0.9783071033602723, "grad_norm": 0.3698823942978684, "learning_rate": 2.1940616861929608e-08, "loss": 0.0693, "step": 575 }, { "epoch": 0.9800085070182901, "grad_norm": 0.23298311393693824, "learning_rate": 1.8437290076792624e-08, "loss": 0.0725, "step": 576 }, { "epoch": 0.981709910676308, "grad_norm": 0.2028362088180894, "learning_rate": 1.5238242760126088e-08, "loss": 0.0756, "step": 577 }, { "epoch": 0.9834113143343258, "grad_norm": 0.29483561036206646, "learning_rate": 1.234357243217188e-08, "loss": 0.0988, "step": 578 }, { "epoch": 0.9851127179923437, "grad_norm": 0.2525424387277673, "learning_rate": 9.753367334499608e-09, "loss": 0.0771, "step": 579 }, { "epoch": 0.9868141216503615, "grad_norm": 0.26576715050467775, "learning_rate": 7.467706427312093e-09, "loss": 0.0612, "step": 580 }, { "epoch": 0.9885155253083794, "grad_norm": 0.1729974193289849, "learning_rate": 5.486659387043958e-09, "loss": 0.0371, "step": 581 }, { "epoch": 0.9902169289663972, "grad_norm": 0.19337221194799736, "learning_rate": 3.810286604232216e-09, "loss": 0.0548, "step": 582 }, { "epoch": 0.9919183326244151, "grad_norm": 0.2881735818611327, "learning_rate": 2.4386391816777488e-09, "loss": 0.0866, "step": 583 }, { "epoch": 0.993619736282433, "grad_norm": 0.25389258821605704, "learning_rate": 1.3717589328898773e-09, "loss": 0.0442, "step": 584 }, { "epoch": 0.9953211399404509, "grad_norm": 0.2721835551378759, "learning_rate": 6.096783808062778e-10, "loss": 0.0977, "step": 585 }, { "epoch": 0.9970225435984688, "grad_norm": 0.2313901379023938, "learning_rate": 1.524207568059932e-10, "loss": 0.0628, "step": 586 }, { "epoch": 0.9987239472564866, "grad_norm": 0.24018936080778358, "learning_rate": 0.0, "loss": 0.0614, "step": 587 }, { "epoch": 0.9987239472564866, "step": 587, "total_flos": 1551551083839488.0, "train_loss": 0.1089990799881683, "train_runtime": 5367.2694, "train_samples_per_second": 14.016, "train_steps_per_second": 0.109 } ], "logging_steps": 1.0, "max_steps": 587, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1551551083839488.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }