diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4151 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9987239472564866, + "eval_steps": 500, + "global_step": 587, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0017014036580178648, + "grad_norm": 5.63924253730727, + "learning_rate": 1.111111111111111e-06, + "loss": 2.0374, + "step": 1 + }, + { + "epoch": 0.0034028073160357296, + "grad_norm": 5.632927224201576, + "learning_rate": 2.222222222222222e-06, + "loss": 2.0866, + "step": 2 + }, + { + "epoch": 0.005104210974053594, + "grad_norm": 5.479642165547945, + "learning_rate": 3.3333333333333333e-06, + "loss": 2.0376, + "step": 3 + }, + { + "epoch": 0.006805614632071459, + "grad_norm": 5.471589021050632, + "learning_rate": 4.444444444444444e-06, + "loss": 2.1061, + "step": 4 + }, + { + "epoch": 0.008507018290089324, + "grad_norm": 5.452260559713389, + "learning_rate": 5.555555555555557e-06, + "loss": 2.0813, + "step": 5 + }, + { + "epoch": 0.010208421948107189, + "grad_norm": 3.9931952519584444, + "learning_rate": 6.666666666666667e-06, + "loss": 2.0435, + "step": 6 + }, + { + "epoch": 0.011909825606125054, + "grad_norm": 2.946163093695019, + "learning_rate": 7.77777777777778e-06, + "loss": 1.916, + "step": 7 + }, + { + "epoch": 0.013611229264142918, + "grad_norm": 2.9796331190994465, + "learning_rate": 8.888888888888888e-06, + "loss": 1.9134, + "step": 8 + }, + { + "epoch": 0.015312632922160783, + "grad_norm": 2.8280631710237794, + "learning_rate": 1e-05, + "loss": 1.8869, + "step": 9 + }, + { + "epoch": 0.017014036580178648, + "grad_norm": 4.671284274036202, + "learning_rate": 1.1111111111111113e-05, + "loss": 1.789, + "step": 10 + }, + { + "epoch": 0.01871544023819651, + "grad_norm": 4.159264796058962, + "learning_rate": 1.2222222222222224e-05, + "loss": 1.7813, + "step": 11 + }, + { + "epoch": 0.020416843896214378, + "grad_norm": 3.3595037621909585, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.7648, + "step": 12 + }, + { + "epoch": 0.02211824755423224, + "grad_norm": 2.8360337563680154, + "learning_rate": 1.4444444444444446e-05, + "loss": 1.7677, + "step": 13 + }, + { + "epoch": 0.023819651212250107, + "grad_norm": 2.9812972795523827, + "learning_rate": 1.555555555555556e-05, + "loss": 1.6811, + "step": 14 + }, + { + "epoch": 0.02552105487026797, + "grad_norm": 2.8779465341667416, + "learning_rate": 1.6666666666666667e-05, + "loss": 1.6153, + "step": 15 + }, + { + "epoch": 0.027222458528285837, + "grad_norm": 2.8657952631273798, + "learning_rate": 1.7777777777777777e-05, + "loss": 1.5663, + "step": 16 + }, + { + "epoch": 0.0289238621863037, + "grad_norm": 2.4000313234772443, + "learning_rate": 1.888888888888889e-05, + "loss": 1.5878, + "step": 17 + }, + { + "epoch": 0.030625265844321566, + "grad_norm": 2.5195012364274283, + "learning_rate": 2e-05, + "loss": 1.4538, + "step": 18 + }, + { + "epoch": 0.03232666950233943, + "grad_norm": 2.7457190633780955, + "learning_rate": 1.9999847579243196e-05, + "loss": 1.4576, + "step": 19 + }, + { + "epoch": 0.034028073160357296, + "grad_norm": 2.5130398824731204, + "learning_rate": 1.9999390321619196e-05, + "loss": 1.3711, + "step": 20 + }, + { + "epoch": 0.03572947681837516, + "grad_norm": 2.2632102571908086, + "learning_rate": 1.9998628241067113e-05, + "loss": 1.3483, + "step": 21 + }, + { + "epoch": 0.03743088047639302, + "grad_norm": 2.1464189612501583, + "learning_rate": 1.9997561360818322e-05, + "loss": 1.3179, + "step": 22 + }, + { + "epoch": 0.03913228413441089, + "grad_norm": 2.185526820010283, + "learning_rate": 1.999618971339577e-05, + "loss": 1.2857, + "step": 23 + }, + { + "epoch": 0.040833687792428755, + "grad_norm": 2.038229812617752, + "learning_rate": 1.9994513340612957e-05, + "loss": 1.2006, + "step": 24 + }, + { + "epoch": 0.04253509145044662, + "grad_norm": 2.1776347650433183, + "learning_rate": 1.9992532293572688e-05, + "loss": 1.2273, + "step": 25 + }, + { + "epoch": 0.04423649510846448, + "grad_norm": 2.079366983059656, + "learning_rate": 1.9990246632665503e-05, + "loss": 1.0571, + "step": 26 + }, + { + "epoch": 0.04593789876648235, + "grad_norm": 1.979679204723909, + "learning_rate": 1.998765642756783e-05, + "loss": 1.1059, + "step": 27 + }, + { + "epoch": 0.047639302424500214, + "grad_norm": 1.946272753302321, + "learning_rate": 1.9984761757239878e-05, + "loss": 1.0799, + "step": 28 + }, + { + "epoch": 0.04934070608251808, + "grad_norm": 2.0714174634215814, + "learning_rate": 1.998156270992321e-05, + "loss": 1.0714, + "step": 29 + }, + { + "epoch": 0.05104210974053594, + "grad_norm": 1.916092537194068, + "learning_rate": 1.9978059383138073e-05, + "loss": 1.0315, + "step": 30 + }, + { + "epoch": 0.05274351339855381, + "grad_norm": 1.9160611807015002, + "learning_rate": 1.997425188368041e-05, + "loss": 0.998, + "step": 31 + }, + { + "epoch": 0.05444491705657167, + "grad_norm": 1.9303848798477186, + "learning_rate": 1.9970140327618612e-05, + "loss": 0.9, + "step": 32 + }, + { + "epoch": 0.05614632071458953, + "grad_norm": 2.0185535489578292, + "learning_rate": 1.9965724840289972e-05, + "loss": 0.8651, + "step": 33 + }, + { + "epoch": 0.0578477243726074, + "grad_norm": 1.8001076615593201, + "learning_rate": 1.9961005556296875e-05, + "loss": 0.9509, + "step": 34 + }, + { + "epoch": 0.059549128030625266, + "grad_norm": 1.8623852421957452, + "learning_rate": 1.9955982619502693e-05, + "loss": 0.8698, + "step": 35 + }, + { + "epoch": 0.06125053168864313, + "grad_norm": 1.8163115386172757, + "learning_rate": 1.9950656183027392e-05, + "loss": 0.8275, + "step": 36 + }, + { + "epoch": 0.06295193534666099, + "grad_norm": 1.916794143703785, + "learning_rate": 1.994502640924286e-05, + "loss": 0.8285, + "step": 37 + }, + { + "epoch": 0.06465333900467886, + "grad_norm": 1.653184767202805, + "learning_rate": 1.993909346976798e-05, + "loss": 0.7227, + "step": 38 + }, + { + "epoch": 0.06635474266269673, + "grad_norm": 1.6621341278775057, + "learning_rate": 1.993285754546338e-05, + "loss": 0.7275, + "step": 39 + }, + { + "epoch": 0.06805614632071459, + "grad_norm": 1.632197605528049, + "learning_rate": 1.9926318826425905e-05, + "loss": 0.6896, + "step": 40 + }, + { + "epoch": 0.06975754997873246, + "grad_norm": 1.7234721697892237, + "learning_rate": 1.9919477511982873e-05, + "loss": 0.6754, + "step": 41 + }, + { + "epoch": 0.07145895363675032, + "grad_norm": 1.7148101817526775, + "learning_rate": 1.991233381068594e-05, + "loss": 0.6953, + "step": 42 + }, + { + "epoch": 0.07316035729476818, + "grad_norm": 1.7455307347725455, + "learning_rate": 1.990488794030478e-05, + "loss": 0.6602, + "step": 43 + }, + { + "epoch": 0.07486176095278604, + "grad_norm": 1.5636042552262228, + "learning_rate": 1.9897140127820432e-05, + "loss": 0.5694, + "step": 44 + }, + { + "epoch": 0.07656316461080391, + "grad_norm": 1.4856335508449605, + "learning_rate": 1.9889090609418384e-05, + "loss": 0.5319, + "step": 45 + }, + { + "epoch": 0.07826456826882178, + "grad_norm": 1.7253954681014994, + "learning_rate": 1.9880739630481376e-05, + "loss": 0.5794, + "step": 46 + }, + { + "epoch": 0.07996597192683964, + "grad_norm": 1.5857215457182516, + "learning_rate": 1.9872087445581912e-05, + "loss": 0.5964, + "step": 47 + }, + { + "epoch": 0.08166737558485751, + "grad_norm": 1.5931220420137733, + "learning_rate": 1.9863134318474504e-05, + "loss": 0.456, + "step": 48 + }, + { + "epoch": 0.08336877924287538, + "grad_norm": 1.4862808758180759, + "learning_rate": 1.985388052208764e-05, + "loss": 0.552, + "step": 49 + }, + { + "epoch": 0.08507018290089324, + "grad_norm": 1.6111023128163313, + "learning_rate": 1.9844326338515444e-05, + "loss": 0.4635, + "step": 50 + }, + { + "epoch": 0.0867715865589111, + "grad_norm": 1.5692010467971949, + "learning_rate": 1.9834472059009097e-05, + "loss": 0.5204, + "step": 51 + }, + { + "epoch": 0.08847299021692896, + "grad_norm": 1.35999078119841, + "learning_rate": 1.982431798396794e-05, + "loss": 0.4738, + "step": 52 + }, + { + "epoch": 0.09017439387494683, + "grad_norm": 1.724311652044838, + "learning_rate": 1.9813864422930345e-05, + "loss": 0.5453, + "step": 53 + }, + { + "epoch": 0.0918757975329647, + "grad_norm": 1.4794560380467412, + "learning_rate": 1.9803111694564246e-05, + "loss": 0.4516, + "step": 54 + }, + { + "epoch": 0.09357720119098256, + "grad_norm": 1.4492255731919181, + "learning_rate": 1.9792060126657437e-05, + "loss": 0.4085, + "step": 55 + }, + { + "epoch": 0.09527860484900043, + "grad_norm": 1.225858498755291, + "learning_rate": 1.9780710056107587e-05, + "loss": 0.417, + "step": 56 + }, + { + "epoch": 0.0969800085070183, + "grad_norm": 1.25238752929569, + "learning_rate": 1.976906182891197e-05, + "loss": 0.3714, + "step": 57 + }, + { + "epoch": 0.09868141216503616, + "grad_norm": 1.5035257308896186, + "learning_rate": 1.97571158001569e-05, + "loss": 0.479, + "step": 58 + }, + { + "epoch": 0.10038281582305401, + "grad_norm": 1.43439830862149, + "learning_rate": 1.9744872334006936e-05, + "loss": 0.3673, + "step": 59 + }, + { + "epoch": 0.10208421948107188, + "grad_norm": 1.3765449932113782, + "learning_rate": 1.973233180369374e-05, + "loss": 0.4469, + "step": 60 + }, + { + "epoch": 0.10378562313908975, + "grad_norm": 1.4400621685399293, + "learning_rate": 1.9719494591504747e-05, + "loss": 0.4076, + "step": 61 + }, + { + "epoch": 0.10548702679710761, + "grad_norm": 1.359923297097399, + "learning_rate": 1.9706361088771474e-05, + "loss": 0.3256, + "step": 62 + }, + { + "epoch": 0.10718843045512548, + "grad_norm": 1.3228876262805203, + "learning_rate": 1.96929316958576e-05, + "loss": 0.4022, + "step": 63 + }, + { + "epoch": 0.10888983411314335, + "grad_norm": 1.462008074682739, + "learning_rate": 1.9679206822146776e-05, + "loss": 0.3708, + "step": 64 + }, + { + "epoch": 0.11059123777116121, + "grad_norm": 1.2655667904871533, + "learning_rate": 1.9665186886030135e-05, + "loss": 0.319, + "step": 65 + }, + { + "epoch": 0.11229264142917907, + "grad_norm": 1.2402190577080747, + "learning_rate": 1.9650872314893523e-05, + "loss": 0.3527, + "step": 66 + }, + { + "epoch": 0.11399404508719693, + "grad_norm": 1.2679970115630648, + "learning_rate": 1.9636263545104498e-05, + "loss": 0.3614, + "step": 67 + }, + { + "epoch": 0.1156954487452148, + "grad_norm": 1.3799286618561433, + "learning_rate": 1.962136102199901e-05, + "loss": 0.4032, + "step": 68 + }, + { + "epoch": 0.11739685240323267, + "grad_norm": 1.4584330418276699, + "learning_rate": 1.9606165199867822e-05, + "loss": 0.3315, + "step": 69 + }, + { + "epoch": 0.11909825606125053, + "grad_norm": 1.2522747225476394, + "learning_rate": 1.959067654194268e-05, + "loss": 0.3451, + "step": 70 + }, + { + "epoch": 0.1207996597192684, + "grad_norm": 1.2450797837241314, + "learning_rate": 1.9574895520382183e-05, + "loss": 0.3892, + "step": 71 + }, + { + "epoch": 0.12250106337728627, + "grad_norm": 1.5181322357352012, + "learning_rate": 1.955882261625737e-05, + "loss": 0.347, + "step": 72 + }, + { + "epoch": 0.12420246703530413, + "grad_norm": 1.2905603961922367, + "learning_rate": 1.9542458319537094e-05, + "loss": 0.2405, + "step": 73 + }, + { + "epoch": 0.12590387069332198, + "grad_norm": 1.2179775856907882, + "learning_rate": 1.9525803129073046e-05, + "loss": 0.2806, + "step": 74 + }, + { + "epoch": 0.12760527435133986, + "grad_norm": 1.2539388678563226, + "learning_rate": 1.9508857552584574e-05, + "loss": 0.33, + "step": 75 + }, + { + "epoch": 0.12930667800935772, + "grad_norm": 1.2346640445330164, + "learning_rate": 1.9491622106643195e-05, + "loss": 0.303, + "step": 76 + }, + { + "epoch": 0.1310080816673756, + "grad_norm": 1.14678973623224, + "learning_rate": 1.9474097316656856e-05, + "loss": 0.275, + "step": 77 + }, + { + "epoch": 0.13270948532539345, + "grad_norm": 1.0642556255301903, + "learning_rate": 1.9456283716853906e-05, + "loss": 0.2856, + "step": 78 + }, + { + "epoch": 0.1344108889834113, + "grad_norm": 1.302725831732861, + "learning_rate": 1.9438181850266815e-05, + "loss": 0.3307, + "step": 79 + }, + { + "epoch": 0.13611229264142918, + "grad_norm": 1.168246728422401, + "learning_rate": 1.941979226871563e-05, + "loss": 0.2401, + "step": 80 + }, + { + "epoch": 0.13781369629944704, + "grad_norm": 1.1820257294269807, + "learning_rate": 1.9401115532791134e-05, + "loss": 0.2873, + "step": 81 + }, + { + "epoch": 0.13951509995746492, + "grad_norm": 1.1784513558059517, + "learning_rate": 1.938215221183777e-05, + "loss": 0.2626, + "step": 82 + }, + { + "epoch": 0.14121650361548277, + "grad_norm": 1.2289419951789742, + "learning_rate": 1.936290288393629e-05, + "loss": 0.298, + "step": 83 + }, + { + "epoch": 0.14291790727350065, + "grad_norm": 1.4000578395186851, + "learning_rate": 1.9343368135886112e-05, + "loss": 0.3732, + "step": 84 + }, + { + "epoch": 0.1446193109315185, + "grad_norm": 1.0439182155840876, + "learning_rate": 1.932354856318746e-05, + "loss": 0.2588, + "step": 85 + }, + { + "epoch": 0.14632071458953635, + "grad_norm": 1.207165657752028, + "learning_rate": 1.9303444770023184e-05, + "loss": 0.2891, + "step": 86 + }, + { + "epoch": 0.14802211824755424, + "grad_norm": 1.2726095386172456, + "learning_rate": 1.9283057369240358e-05, + "loss": 0.3119, + "step": 87 + }, + { + "epoch": 0.1497235219055721, + "grad_norm": 1.2276163436209269, + "learning_rate": 1.9262386982331596e-05, + "loss": 0.2943, + "step": 88 + }, + { + "epoch": 0.15142492556358997, + "grad_norm": 1.0651841685636916, + "learning_rate": 1.9241434239416093e-05, + "loss": 0.2522, + "step": 89 + }, + { + "epoch": 0.15312632922160782, + "grad_norm": 1.230238605055406, + "learning_rate": 1.922019977922045e-05, + "loss": 0.2479, + "step": 90 + }, + { + "epoch": 0.1548277328796257, + "grad_norm": 1.2642140097550183, + "learning_rate": 1.919868424905915e-05, + "loss": 0.2472, + "step": 91 + }, + { + "epoch": 0.15652913653764355, + "grad_norm": 1.018041040748929, + "learning_rate": 1.9176888304814882e-05, + "loss": 0.2236, + "step": 92 + }, + { + "epoch": 0.15823054019566143, + "grad_norm": 1.174538871643398, + "learning_rate": 1.9154812610918503e-05, + "loss": 0.3216, + "step": 93 + }, + { + "epoch": 0.1599319438536793, + "grad_norm": 0.9925502075427942, + "learning_rate": 1.913245784032881e-05, + "loss": 0.2289, + "step": 94 + }, + { + "epoch": 0.16163334751169714, + "grad_norm": 0.9522124139838903, + "learning_rate": 1.9109824674512014e-05, + "loss": 0.1739, + "step": 95 + }, + { + "epoch": 0.16333475116971502, + "grad_norm": 1.1326192543397258, + "learning_rate": 1.9086913803420966e-05, + "loss": 0.2815, + "step": 96 + }, + { + "epoch": 0.16503615482773287, + "grad_norm": 1.380751756029331, + "learning_rate": 1.906372592547413e-05, + "loss": 0.2946, + "step": 97 + }, + { + "epoch": 0.16673755848575075, + "grad_norm": 1.0115472196543838, + "learning_rate": 1.9040261747534282e-05, + "loss": 0.2672, + "step": 98 + }, + { + "epoch": 0.1684389621437686, + "grad_norm": 1.0273767032266958, + "learning_rate": 1.9016521984886984e-05, + "loss": 0.2289, + "step": 99 + }, + { + "epoch": 0.1701403658017865, + "grad_norm": 1.0020635757019345, + "learning_rate": 1.8992507361218743e-05, + "loss": 0.2177, + "step": 100 + }, + { + "epoch": 0.17184176945980434, + "grad_norm": 0.9555306909281526, + "learning_rate": 1.8968218608594987e-05, + "loss": 0.1387, + "step": 101 + }, + { + "epoch": 0.1735431731178222, + "grad_norm": 1.2265198659870458, + "learning_rate": 1.8943656467437726e-05, + "loss": 0.3012, + "step": 102 + }, + { + "epoch": 0.17524457677584007, + "grad_norm": 1.1317317444482722, + "learning_rate": 1.8918821686502992e-05, + "loss": 0.2343, + "step": 103 + }, + { + "epoch": 0.17694598043385792, + "grad_norm": 0.9412615307783455, + "learning_rate": 1.8893715022858e-05, + "loss": 0.2143, + "step": 104 + }, + { + "epoch": 0.1786473840918758, + "grad_norm": 1.3681381109162778, + "learning_rate": 1.886833724185809e-05, + "loss": 0.2595, + "step": 105 + }, + { + "epoch": 0.18034878774989366, + "grad_norm": 0.9136867250269601, + "learning_rate": 1.8842689117123377e-05, + "loss": 0.1873, + "step": 106 + }, + { + "epoch": 0.18205019140791154, + "grad_norm": 1.013964380859395, + "learning_rate": 1.8816771430515178e-05, + "loss": 0.2273, + "step": 107 + }, + { + "epoch": 0.1837515950659294, + "grad_norm": 0.8892129875538982, + "learning_rate": 1.8790584972112174e-05, + "loss": 0.1588, + "step": 108 + }, + { + "epoch": 0.18545299872394724, + "grad_norm": 1.2815250199423684, + "learning_rate": 1.876413054018633e-05, + "loss": 0.3445, + "step": 109 + }, + { + "epoch": 0.18715440238196512, + "grad_norm": 0.9605582574779087, + "learning_rate": 1.873740894117854e-05, + "loss": 0.2057, + "step": 110 + }, + { + "epoch": 0.18885580603998298, + "grad_norm": 0.9353117937734325, + "learning_rate": 1.8710420989674093e-05, + "loss": 0.1847, + "step": 111 + }, + { + "epoch": 0.19055720969800086, + "grad_norm": 0.7696371461985803, + "learning_rate": 1.8683167508377775e-05, + "loss": 0.1749, + "step": 112 + }, + { + "epoch": 0.1922586133560187, + "grad_norm": 0.9969695101515251, + "learning_rate": 1.8655649328088836e-05, + "loss": 0.1832, + "step": 113 + }, + { + "epoch": 0.1939600170140366, + "grad_norm": 0.9743343159621933, + "learning_rate": 1.862786728767565e-05, + "loss": 0.1746, + "step": 114 + }, + { + "epoch": 0.19566142067205444, + "grad_norm": 1.0842774951363159, + "learning_rate": 1.8599822234050143e-05, + "loss": 0.243, + "step": 115 + }, + { + "epoch": 0.19736282433007232, + "grad_norm": 0.9022835246921371, + "learning_rate": 1.8571515022141974e-05, + "loss": 0.1842, + "step": 116 + }, + { + "epoch": 0.19906422798809018, + "grad_norm": 1.245953837027519, + "learning_rate": 1.8542946514872478e-05, + "loss": 0.2936, + "step": 117 + }, + { + "epoch": 0.20076563164610803, + "grad_norm": 0.8267840779011822, + "learning_rate": 1.851411758312835e-05, + "loss": 0.1626, + "step": 118 + }, + { + "epoch": 0.2024670353041259, + "grad_norm": 0.6842267496476119, + "learning_rate": 1.8485029105735112e-05, + "loss": 0.1392, + "step": 119 + }, + { + "epoch": 0.20416843896214376, + "grad_norm": 0.7737780933096716, + "learning_rate": 1.8455681969430307e-05, + "loss": 0.1709, + "step": 120 + }, + { + "epoch": 0.20586984262016164, + "grad_norm": 0.8184241013364575, + "learning_rate": 1.8426077068836487e-05, + "loss": 0.1885, + "step": 121 + }, + { + "epoch": 0.2075712462781795, + "grad_norm": 0.7745435868860747, + "learning_rate": 1.839621530643392e-05, + "loss": 0.1276, + "step": 122 + }, + { + "epoch": 0.20927264993619737, + "grad_norm": 1.0779421618581002, + "learning_rate": 1.8366097592533095e-05, + "loss": 0.2365, + "step": 123 + }, + { + "epoch": 0.21097405359421523, + "grad_norm": 1.1551262329182066, + "learning_rate": 1.8335724845246948e-05, + "loss": 0.2601, + "step": 124 + }, + { + "epoch": 0.21267545725223308, + "grad_norm": 0.8001035987730307, + "learning_rate": 1.830509799046292e-05, + "loss": 0.1469, + "step": 125 + }, + { + "epoch": 0.21437686091025096, + "grad_norm": 0.9511305022845807, + "learning_rate": 1.8274217961814682e-05, + "loss": 0.1802, + "step": 126 + }, + { + "epoch": 0.2160782645682688, + "grad_norm": 0.9271183565448525, + "learning_rate": 1.8243085700653698e-05, + "loss": 0.1828, + "step": 127 + }, + { + "epoch": 0.2177796682262867, + "grad_norm": 0.8735913168213245, + "learning_rate": 1.821170215602053e-05, + "loss": 0.2023, + "step": 128 + }, + { + "epoch": 0.21948107188430455, + "grad_norm": 0.8521894299941564, + "learning_rate": 1.818006828461591e-05, + "loss": 0.1761, + "step": 129 + }, + { + "epoch": 0.22118247554232243, + "grad_norm": 0.9792443208887637, + "learning_rate": 1.8148185050771554e-05, + "loss": 0.2576, + "step": 130 + }, + { + "epoch": 0.22288387920034028, + "grad_norm": 0.8404358211352582, + "learning_rate": 1.8116053426420793e-05, + "loss": 0.1603, + "step": 131 + }, + { + "epoch": 0.22458528285835813, + "grad_norm": 1.1623588238490783, + "learning_rate": 1.8083674391068925e-05, + "loss": 0.1154, + "step": 132 + }, + { + "epoch": 0.226286686516376, + "grad_norm": 0.6801957584586958, + "learning_rate": 1.8051048931763366e-05, + "loss": 0.1363, + "step": 133 + }, + { + "epoch": 0.22798809017439386, + "grad_norm": 0.7696431934514459, + "learning_rate": 1.8018178043063554e-05, + "loss": 0.1676, + "step": 134 + }, + { + "epoch": 0.22968949383241175, + "grad_norm": 0.922953677009161, + "learning_rate": 1.798506272701064e-05, + "loss": 0.1374, + "step": 135 + }, + { + "epoch": 0.2313908974904296, + "grad_norm": 1.2672781712438308, + "learning_rate": 1.795170399309692e-05, + "loss": 0.2058, + "step": 136 + }, + { + "epoch": 0.23309230114844748, + "grad_norm": 1.0966993213933063, + "learning_rate": 1.7918102858235103e-05, + "loss": 0.2572, + "step": 137 + }, + { + "epoch": 0.23479370480646533, + "grad_norm": 0.8987225953268668, + "learning_rate": 1.7884260346727257e-05, + "loss": 0.1801, + "step": 138 + }, + { + "epoch": 0.2364951084644832, + "grad_norm": 0.7280344498235162, + "learning_rate": 1.7850177490233635e-05, + "loss": 0.1646, + "step": 139 + }, + { + "epoch": 0.23819651212250106, + "grad_norm": 1.0079365876774031, + "learning_rate": 1.7815855327741185e-05, + "loss": 0.197, + "step": 140 + }, + { + "epoch": 0.23989791578051892, + "grad_norm": 0.818601852598636, + "learning_rate": 1.7781294905531908e-05, + "loss": 0.162, + "step": 141 + }, + { + "epoch": 0.2415993194385368, + "grad_norm": 1.5018742709785817, + "learning_rate": 1.774649727715094e-05, + "loss": 0.2039, + "step": 142 + }, + { + "epoch": 0.24330072309655465, + "grad_norm": 0.8214557387451863, + "learning_rate": 1.7711463503374466e-05, + "loss": 0.145, + "step": 143 + }, + { + "epoch": 0.24500212675457253, + "grad_norm": 0.8994136385878274, + "learning_rate": 1.7676194652177333e-05, + "loss": 0.1896, + "step": 144 + }, + { + "epoch": 0.24670353041259038, + "grad_norm": 0.943397470687139, + "learning_rate": 1.764069179870055e-05, + "loss": 0.1891, + "step": 145 + }, + { + "epoch": 0.24840493407060826, + "grad_norm": 0.8816465935595688, + "learning_rate": 1.760495602521847e-05, + "loss": 0.1656, + "step": 146 + }, + { + "epoch": 0.2501063377286261, + "grad_norm": 1.0078909042279265, + "learning_rate": 1.756898842110582e-05, + "loss": 0.2096, + "step": 147 + }, + { + "epoch": 0.25180774138664397, + "grad_norm": 0.7926147606398589, + "learning_rate": 1.753279008280449e-05, + "loss": 0.1338, + "step": 148 + }, + { + "epoch": 0.2535091450446618, + "grad_norm": 0.7439412092510606, + "learning_rate": 1.74963621137901e-05, + "loss": 0.1305, + "step": 149 + }, + { + "epoch": 0.25521054870267973, + "grad_norm": 0.8302484661622505, + "learning_rate": 1.7459705624538383e-05, + "loss": 0.1764, + "step": 150 + }, + { + "epoch": 0.2569119523606976, + "grad_norm": 0.8234450008369374, + "learning_rate": 1.7422821732491297e-05, + "loss": 0.1476, + "step": 151 + }, + { + "epoch": 0.25861335601871543, + "grad_norm": 0.8905216062958827, + "learning_rate": 1.7385711562022988e-05, + "loss": 0.1529, + "step": 152 + }, + { + "epoch": 0.2603147596767333, + "grad_norm": 0.8532444685014303, + "learning_rate": 1.734837624440551e-05, + "loss": 0.1626, + "step": 153 + }, + { + "epoch": 0.2620161633347512, + "grad_norm": 1.0574209295177952, + "learning_rate": 1.731081691777434e-05, + "loss": 0.1528, + "step": 154 + }, + { + "epoch": 0.26371756699276905, + "grad_norm": 0.7830462257448565, + "learning_rate": 1.7273034727093677e-05, + "loss": 0.2023, + "step": 155 + }, + { + "epoch": 0.2654189706507869, + "grad_norm": 0.812611079400798, + "learning_rate": 1.7235030824121542e-05, + "loss": 0.1365, + "step": 156 + }, + { + "epoch": 0.26712037430880475, + "grad_norm": 0.8001398058542689, + "learning_rate": 1.7196806367374656e-05, + "loss": 0.1759, + "step": 157 + }, + { + "epoch": 0.2688217779668226, + "grad_norm": 0.6971533495374251, + "learning_rate": 1.7158362522093153e-05, + "loss": 0.1336, + "step": 158 + }, + { + "epoch": 0.2705231816248405, + "grad_norm": 0.7992525448829314, + "learning_rate": 1.7119700460205026e-05, + "loss": 0.1746, + "step": 159 + }, + { + "epoch": 0.27222458528285837, + "grad_norm": 0.7250271941684896, + "learning_rate": 1.7080821360290426e-05, + "loss": 0.134, + "step": 160 + }, + { + "epoch": 0.2739259889408762, + "grad_norm": 0.6303617488032851, + "learning_rate": 1.7041726407545716e-05, + "loss": 0.0926, + "step": 161 + }, + { + "epoch": 0.27562739259889407, + "grad_norm": 0.8109547254061638, + "learning_rate": 1.7002416793747354e-05, + "loss": 0.1715, + "step": 162 + }, + { + "epoch": 0.2773287962569119, + "grad_norm": 0.828546316056403, + "learning_rate": 1.696289371721556e-05, + "loss": 0.1525, + "step": 163 + }, + { + "epoch": 0.27903019991492983, + "grad_norm": 1.055650387994432, + "learning_rate": 1.692315838277778e-05, + "loss": 0.2507, + "step": 164 + }, + { + "epoch": 0.2807316035729477, + "grad_norm": 0.8913907724701664, + "learning_rate": 1.6883212001731956e-05, + "loss": 0.1654, + "step": 165 + }, + { + "epoch": 0.28243300723096554, + "grad_norm": 0.635346435774696, + "learning_rate": 1.6843055791809623e-05, + "loss": 0.1115, + "step": 166 + }, + { + "epoch": 0.2841344108889834, + "grad_norm": 0.8656049770061993, + "learning_rate": 1.680269097713876e-05, + "loss": 0.2065, + "step": 167 + }, + { + "epoch": 0.2858358145470013, + "grad_norm": 0.7810406489451938, + "learning_rate": 1.6762118788206488e-05, + "loss": 0.129, + "step": 168 + }, + { + "epoch": 0.28753721820501915, + "grad_norm": 0.9878211467165512, + "learning_rate": 1.6721340461821555e-05, + "loss": 0.2221, + "step": 169 + }, + { + "epoch": 0.289238621863037, + "grad_norm": 0.934668471476854, + "learning_rate": 1.6680357241076632e-05, + "loss": 0.1796, + "step": 170 + }, + { + "epoch": 0.29094002552105486, + "grad_norm": 0.792988141592149, + "learning_rate": 1.6639170375310422e-05, + "loss": 0.1502, + "step": 171 + }, + { + "epoch": 0.2926414291790727, + "grad_norm": 0.8353958606861209, + "learning_rate": 1.6597781120069584e-05, + "loss": 0.1635, + "step": 172 + }, + { + "epoch": 0.2943428328370906, + "grad_norm": 0.8625422111702601, + "learning_rate": 1.655619073707043e-05, + "loss": 0.2002, + "step": 173 + }, + { + "epoch": 0.29604423649510847, + "grad_norm": 0.6618845100953816, + "learning_rate": 1.6514400494160498e-05, + "loss": 0.1119, + "step": 174 + }, + { + "epoch": 0.2977456401531263, + "grad_norm": 0.8273120959049606, + "learning_rate": 1.6472411665279872e-05, + "loss": 0.1654, + "step": 175 + }, + { + "epoch": 0.2994470438111442, + "grad_norm": 0.9810865058054768, + "learning_rate": 1.643022553042237e-05, + "loss": 0.201, + "step": 176 + }, + { + "epoch": 0.3011484474691621, + "grad_norm": 1.0484005920330577, + "learning_rate": 1.6387843375596513e-05, + "loss": 0.1409, + "step": 177 + }, + { + "epoch": 0.30284985112717994, + "grad_norm": 0.8127379666282436, + "learning_rate": 1.634526649278632e-05, + "loss": 0.1461, + "step": 178 + }, + { + "epoch": 0.3045512547851978, + "grad_norm": 0.8425016382304042, + "learning_rate": 1.630249617991194e-05, + "loss": 0.1608, + "step": 179 + }, + { + "epoch": 0.30625265844321564, + "grad_norm": 1.0046916865286986, + "learning_rate": 1.6259533740790055e-05, + "loss": 0.2019, + "step": 180 + }, + { + "epoch": 0.3079540621012335, + "grad_norm": 0.8492772399549352, + "learning_rate": 1.6216380485094164e-05, + "loss": 0.1914, + "step": 181 + }, + { + "epoch": 0.3096554657592514, + "grad_norm": 0.9960915228518785, + "learning_rate": 1.617303772831465e-05, + "loss": 0.2093, + "step": 182 + }, + { + "epoch": 0.31135686941726926, + "grad_norm": 0.9247387332446851, + "learning_rate": 1.6129506791718665e-05, + "loss": 0.1922, + "step": 183 + }, + { + "epoch": 0.3130582730752871, + "grad_norm": 0.740020053586076, + "learning_rate": 1.6085789002309873e-05, + "loss": 0.1393, + "step": 184 + }, + { + "epoch": 0.31475967673330496, + "grad_norm": 0.8528999348725693, + "learning_rate": 1.6041885692787985e-05, + "loss": 0.1799, + "step": 185 + }, + { + "epoch": 0.31646108039132287, + "grad_norm": 0.7155070165296488, + "learning_rate": 1.599779820150813e-05, + "loss": 0.16, + "step": 186 + }, + { + "epoch": 0.3181624840493407, + "grad_norm": 0.7690189603854283, + "learning_rate": 1.5953527872440063e-05, + "loss": 0.1775, + "step": 187 + }, + { + "epoch": 0.3198638877073586, + "grad_norm": 0.8259674786974674, + "learning_rate": 1.5909076055127202e-05, + "loss": 0.1824, + "step": 188 + }, + { + "epoch": 0.3215652913653764, + "grad_norm": 0.7876574119052634, + "learning_rate": 1.5864444104645473e-05, + "loss": 0.1722, + "step": 189 + }, + { + "epoch": 0.3232666950233943, + "grad_norm": 0.9030577969032576, + "learning_rate": 1.581963338156201e-05, + "loss": 0.1842, + "step": 190 + }, + { + "epoch": 0.3249680986814122, + "grad_norm": 0.9289586180781787, + "learning_rate": 1.5774645251893673e-05, + "loss": 0.1779, + "step": 191 + }, + { + "epoch": 0.32666950233943004, + "grad_norm": 0.8770757118969684, + "learning_rate": 1.5729481087065423e-05, + "loss": 0.1773, + "step": 192 + }, + { + "epoch": 0.3283709059974479, + "grad_norm": 0.6802158710923389, + "learning_rate": 1.5684142263868493e-05, + "loss": 0.1521, + "step": 193 + }, + { + "epoch": 0.33007230965546575, + "grad_norm": 1.0997705739794172, + "learning_rate": 1.5638630164418435e-05, + "loss": 0.1662, + "step": 194 + }, + { + "epoch": 0.3317737133134836, + "grad_norm": 0.7808826414072878, + "learning_rate": 1.5592946176112973e-05, + "loss": 0.1638, + "step": 195 + }, + { + "epoch": 0.3334751169715015, + "grad_norm": 0.7903040135430873, + "learning_rate": 1.554709169158972e-05, + "loss": 0.1692, + "step": 196 + }, + { + "epoch": 0.33517652062951936, + "grad_norm": 0.9177342284846766, + "learning_rate": 1.550106810868373e-05, + "loss": 0.1468, + "step": 197 + }, + { + "epoch": 0.3368779242875372, + "grad_norm": 0.5274979224341121, + "learning_rate": 1.5454876830384868e-05, + "loss": 0.123, + "step": 198 + }, + { + "epoch": 0.33857932794555506, + "grad_norm": 0.8158770161825017, + "learning_rate": 1.540851926479505e-05, + "loss": 0.1798, + "step": 199 + }, + { + "epoch": 0.340280731603573, + "grad_norm": 0.9022128712443924, + "learning_rate": 1.536199682508533e-05, + "loss": 0.147, + "step": 200 + }, + { + "epoch": 0.3419821352615908, + "grad_norm": 0.8069172017703115, + "learning_rate": 1.531531092945279e-05, + "loss": 0.1107, + "step": 201 + }, + { + "epoch": 0.3436835389196087, + "grad_norm": 0.9674518316083621, + "learning_rate": 1.526846300107734e-05, + "loss": 0.1702, + "step": 202 + }, + { + "epoch": 0.34538494257762653, + "grad_norm": 0.9803430491871968, + "learning_rate": 1.5221454468078336e-05, + "loss": 0.1958, + "step": 203 + }, + { + "epoch": 0.3470863462356444, + "grad_norm": 0.8275995588819659, + "learning_rate": 1.5174286763470995e-05, + "loss": 0.1594, + "step": 204 + }, + { + "epoch": 0.3487877498936623, + "grad_norm": 0.7460380626298814, + "learning_rate": 1.5126961325122773e-05, + "loss": 0.1517, + "step": 205 + }, + { + "epoch": 0.35048915355168014, + "grad_norm": 1.1000698478259734, + "learning_rate": 1.5079479595709493e-05, + "loss": 0.2305, + "step": 206 + }, + { + "epoch": 0.352190557209698, + "grad_norm": 0.7123013685624445, + "learning_rate": 1.5031843022671377e-05, + "loss": 0.1183, + "step": 207 + }, + { + "epoch": 0.35389196086771585, + "grad_norm": 0.8897276447764233, + "learning_rate": 1.4984053058168936e-05, + "loss": 0.1387, + "step": 208 + }, + { + "epoch": 0.35559336452573376, + "grad_norm": 0.9340710987029324, + "learning_rate": 1.4936111159038677e-05, + "loss": 0.1819, + "step": 209 + }, + { + "epoch": 0.3572947681837516, + "grad_norm": 0.7812715701004525, + "learning_rate": 1.4888018786748713e-05, + "loss": 0.1323, + "step": 210 + }, + { + "epoch": 0.35899617184176946, + "grad_norm": 0.9043387320413069, + "learning_rate": 1.4839777407354194e-05, + "loss": 0.1684, + "step": 211 + }, + { + "epoch": 0.3606975754997873, + "grad_norm": 0.8665338567342898, + "learning_rate": 1.4791388491452637e-05, + "loss": 0.2239, + "step": 212 + }, + { + "epoch": 0.36239897915780517, + "grad_norm": 0.838015121253979, + "learning_rate": 1.4742853514139076e-05, + "loss": 0.1737, + "step": 213 + }, + { + "epoch": 0.3641003828158231, + "grad_norm": 0.9515230799951901, + "learning_rate": 1.4694173954961105e-05, + "loss": 0.2423, + "step": 214 + }, + { + "epoch": 0.36580178647384093, + "grad_norm": 0.8323204743497482, + "learning_rate": 1.4645351297873774e-05, + "loss": 0.1813, + "step": 215 + }, + { + "epoch": 0.3675031901318588, + "grad_norm": 0.8485426786033755, + "learning_rate": 1.4596387031194354e-05, + "loss": 0.1386, + "step": 216 + }, + { + "epoch": 0.36920459378987663, + "grad_norm": 0.7040504585869962, + "learning_rate": 1.4547282647556964e-05, + "loss": 0.1817, + "step": 217 + }, + { + "epoch": 0.3709059974478945, + "grad_norm": 0.7787423361091415, + "learning_rate": 1.449803964386706e-05, + "loss": 0.1288, + "step": 218 + }, + { + "epoch": 0.3726074011059124, + "grad_norm": 0.8126666620020254, + "learning_rate": 1.4448659521255823e-05, + "loss": 0.1281, + "step": 219 + }, + { + "epoch": 0.37430880476393025, + "grad_norm": 0.7828387937846378, + "learning_rate": 1.4399143785034388e-05, + "loss": 0.171, + "step": 220 + }, + { + "epoch": 0.3760102084219481, + "grad_norm": 0.8286959735291576, + "learning_rate": 1.4349493944647953e-05, + "loss": 0.1553, + "step": 221 + }, + { + "epoch": 0.37771161207996595, + "grad_norm": 0.904473637938933, + "learning_rate": 1.4299711513629759e-05, + "loss": 0.1606, + "step": 222 + }, + { + "epoch": 0.37941301573798386, + "grad_norm": 1.0162200820417262, + "learning_rate": 1.4249798009554979e-05, + "loss": 0.1813, + "step": 223 + }, + { + "epoch": 0.3811144193960017, + "grad_norm": 0.8122233208254028, + "learning_rate": 1.419975495399442e-05, + "loss": 0.1848, + "step": 224 + }, + { + "epoch": 0.38281582305401957, + "grad_norm": 0.8622628685111067, + "learning_rate": 1.4149583872468165e-05, + "loss": 0.1004, + "step": 225 + }, + { + "epoch": 0.3845172267120374, + "grad_norm": 0.5688342877258121, + "learning_rate": 1.4099286294399051e-05, + "loss": 0.0903, + "step": 226 + }, + { + "epoch": 0.38621863037005527, + "grad_norm": 0.822964781871569, + "learning_rate": 1.404886375306607e-05, + "loss": 0.199, + "step": 227 + }, + { + "epoch": 0.3879200340280732, + "grad_norm": 0.931682446123269, + "learning_rate": 1.3998317785557597e-05, + "loss": 0.1274, + "step": 228 + }, + { + "epoch": 0.38962143768609103, + "grad_norm": 0.7527830215736461, + "learning_rate": 1.3947649932724563e-05, + "loss": 0.1941, + "step": 229 + }, + { + "epoch": 0.3913228413441089, + "grad_norm": 0.6091396020178222, + "learning_rate": 1.3896861739133456e-05, + "loss": 0.1061, + "step": 230 + }, + { + "epoch": 0.39302424500212674, + "grad_norm": 0.8209321222721515, + "learning_rate": 1.384595475301926e-05, + "loss": 0.1996, + "step": 231 + }, + { + "epoch": 0.39472564866014465, + "grad_norm": 0.6713408596564003, + "learning_rate": 1.3794930526238246e-05, + "loss": 0.1328, + "step": 232 + }, + { + "epoch": 0.3964270523181625, + "grad_norm": 0.7565726144729797, + "learning_rate": 1.3743790614220664e-05, + "loss": 0.2015, + "step": 233 + }, + { + "epoch": 0.39812845597618035, + "grad_norm": 0.708765632998576, + "learning_rate": 1.3692536575923334e-05, + "loss": 0.1456, + "step": 234 + }, + { + "epoch": 0.3998298596341982, + "grad_norm": 0.9072359171152111, + "learning_rate": 1.3641169973782117e-05, + "loss": 0.191, + "step": 235 + }, + { + "epoch": 0.40153126329221606, + "grad_norm": 0.6763550961602007, + "learning_rate": 1.3589692373664288e-05, + "loss": 0.1363, + "step": 236 + }, + { + "epoch": 0.40323266695023396, + "grad_norm": 0.98468671509315, + "learning_rate": 1.3538105344820798e-05, + "loss": 0.2171, + "step": 237 + }, + { + "epoch": 0.4049340706082518, + "grad_norm": 0.9074606290226381, + "learning_rate": 1.3486410459838448e-05, + "loss": 0.1416, + "step": 238 + }, + { + "epoch": 0.40663547426626967, + "grad_norm": 0.7908812264203224, + "learning_rate": 1.343460929459193e-05, + "loss": 0.1775, + "step": 239 + }, + { + "epoch": 0.4083368779242875, + "grad_norm": 0.948316525140999, + "learning_rate": 1.3382703428195812e-05, + "loss": 0.211, + "step": 240 + }, + { + "epoch": 0.4100382815823054, + "grad_norm": 0.6944274025232345, + "learning_rate": 1.3330694442956376e-05, + "loss": 0.1366, + "step": 241 + }, + { + "epoch": 0.4117396852403233, + "grad_norm": 0.5046768645162275, + "learning_rate": 1.3278583924323405e-05, + "loss": 0.0944, + "step": 242 + }, + { + "epoch": 0.41344108889834114, + "grad_norm": 0.631382851446314, + "learning_rate": 1.3226373460841835e-05, + "loss": 0.103, + "step": 243 + }, + { + "epoch": 0.415142492556359, + "grad_norm": 0.7547332778838409, + "learning_rate": 1.3174064644103334e-05, + "loss": 0.1366, + "step": 244 + }, + { + "epoch": 0.41684389621437684, + "grad_norm": 0.6985608520862292, + "learning_rate": 1.3121659068697797e-05, + "loss": 0.1021, + "step": 245 + }, + { + "epoch": 0.41854529987239475, + "grad_norm": 0.8945841940781856, + "learning_rate": 1.306915833216471e-05, + "loss": 0.2017, + "step": 246 + }, + { + "epoch": 0.4202467035304126, + "grad_norm": 0.7358062069482544, + "learning_rate": 1.3016564034944473e-05, + "loss": 0.1449, + "step": 247 + }, + { + "epoch": 0.42194810718843045, + "grad_norm": 0.8368287218297127, + "learning_rate": 1.29638777803296e-05, + "loss": 0.2142, + "step": 248 + }, + { + "epoch": 0.4236495108464483, + "grad_norm": 0.7385930596738185, + "learning_rate": 1.2911101174415861e-05, + "loss": 0.1515, + "step": 249 + }, + { + "epoch": 0.42535091450446616, + "grad_norm": 0.6124696818525555, + "learning_rate": 1.2858235826053294e-05, + "loss": 0.1026, + "step": 250 + }, + { + "epoch": 0.42705231816248407, + "grad_norm": 0.6585064118039888, + "learning_rate": 1.2805283346797179e-05, + "loss": 0.1548, + "step": 251 + }, + { + "epoch": 0.4287537218205019, + "grad_norm": 0.664663102051292, + "learning_rate": 1.2752245350858905e-05, + "loss": 0.1376, + "step": 252 + }, + { + "epoch": 0.4304551254785198, + "grad_norm": 0.6355614253492793, + "learning_rate": 1.2699123455056777e-05, + "loss": 0.076, + "step": 253 + }, + { + "epoch": 0.4321565291365376, + "grad_norm": 0.7913737160646693, + "learning_rate": 1.26459192787667e-05, + "loss": 0.1627, + "step": 254 + }, + { + "epoch": 0.43385793279455553, + "grad_norm": 0.5981363088751483, + "learning_rate": 1.2592634443872842e-05, + "loss": 0.125, + "step": 255 + }, + { + "epoch": 0.4355593364525734, + "grad_norm": 0.8497296660782837, + "learning_rate": 1.2539270574718172e-05, + "loss": 0.1686, + "step": 256 + }, + { + "epoch": 0.43726074011059124, + "grad_norm": 0.9578984008697402, + "learning_rate": 1.2485829298054952e-05, + "loss": 0.2193, + "step": 257 + }, + { + "epoch": 0.4389621437686091, + "grad_norm": 0.7681561666765131, + "learning_rate": 1.2432312242995158e-05, + "loss": 0.191, + "step": 258 + }, + { + "epoch": 0.44066354742662694, + "grad_norm": 0.8288141616096146, + "learning_rate": 1.2378721040960788e-05, + "loss": 0.1809, + "step": 259 + }, + { + "epoch": 0.44236495108464485, + "grad_norm": 0.5927843753008636, + "learning_rate": 1.232505732563416e-05, + "loss": 0.1229, + "step": 260 + }, + { + "epoch": 0.4440663547426627, + "grad_norm": 0.7667929540659729, + "learning_rate": 1.2271322732908091e-05, + "loss": 0.1315, + "step": 261 + }, + { + "epoch": 0.44576775840068056, + "grad_norm": 0.7013238891555887, + "learning_rate": 1.2217518900836045e-05, + "loss": 0.1712, + "step": 262 + }, + { + "epoch": 0.4474691620586984, + "grad_norm": 0.565163451130154, + "learning_rate": 1.2163647469582181e-05, + "loss": 0.1091, + "step": 263 + }, + { + "epoch": 0.44917056571671626, + "grad_norm": 0.8453672799307343, + "learning_rate": 1.210971008137136e-05, + "loss": 0.1641, + "step": 264 + }, + { + "epoch": 0.45087196937473417, + "grad_norm": 0.6809294524954712, + "learning_rate": 1.2055708380439089e-05, + "loss": 0.1867, + "step": 265 + }, + { + "epoch": 0.452573373032752, + "grad_norm": 0.6062080885284453, + "learning_rate": 1.2001644012981392e-05, + "loss": 0.123, + "step": 266 + }, + { + "epoch": 0.4542747766907699, + "grad_norm": 0.9357436441120437, + "learning_rate": 1.1947518627104637e-05, + "loss": 0.2072, + "step": 267 + }, + { + "epoch": 0.45597618034878773, + "grad_norm": 0.6463142286447096, + "learning_rate": 1.1893333872775275e-05, + "loss": 0.1469, + "step": 268 + }, + { + "epoch": 0.45767758400680564, + "grad_norm": 0.537321278995717, + "learning_rate": 1.1839091401769559e-05, + "loss": 0.095, + "step": 269 + }, + { + "epoch": 0.4593789876648235, + "grad_norm": 0.5961608154638831, + "learning_rate": 1.1784792867623179e-05, + "loss": 0.0951, + "step": 270 + }, + { + "epoch": 0.46108039132284134, + "grad_norm": 0.995593067113998, + "learning_rate": 1.1730439925580876e-05, + "loss": 0.2003, + "step": 271 + }, + { + "epoch": 0.4627817949808592, + "grad_norm": 0.7210208975818151, + "learning_rate": 1.1676034232545963e-05, + "loss": 0.1659, + "step": 272 + }, + { + "epoch": 0.46448319863887705, + "grad_norm": 0.8873340275771568, + "learning_rate": 1.1621577447029816e-05, + "loss": 0.189, + "step": 273 + }, + { + "epoch": 0.46618460229689496, + "grad_norm": 0.6692399384916299, + "learning_rate": 1.1567071229101332e-05, + "loss": 0.1571, + "step": 274 + }, + { + "epoch": 0.4678860059549128, + "grad_norm": 0.6080274223047835, + "learning_rate": 1.1512517240336304e-05, + "loss": 0.1374, + "step": 275 + }, + { + "epoch": 0.46958740961293066, + "grad_norm": 0.7446621035775705, + "learning_rate": 1.1457917143766786e-05, + "loss": 0.1614, + "step": 276 + }, + { + "epoch": 0.4712888132709485, + "grad_norm": 0.5302629870604803, + "learning_rate": 1.1403272603830384e-05, + "loss": 0.1088, + "step": 277 + }, + { + "epoch": 0.4729902169289664, + "grad_norm": 0.7619679072483031, + "learning_rate": 1.1348585286319529e-05, + "loss": 0.1111, + "step": 278 + }, + { + "epoch": 0.4746916205869843, + "grad_norm": 0.8250292407108464, + "learning_rate": 1.1293856858330678e-05, + "loss": 0.1516, + "step": 279 + }, + { + "epoch": 0.47639302424500213, + "grad_norm": 0.8756261050576111, + "learning_rate": 1.1239088988213522e-05, + "loss": 0.1332, + "step": 280 + }, + { + "epoch": 0.47809442790302, + "grad_norm": 0.5581128461771346, + "learning_rate": 1.11842833455201e-05, + "loss": 0.1024, + "step": 281 + }, + { + "epoch": 0.47979583156103783, + "grad_norm": 0.7788067100903946, + "learning_rate": 1.1129441600953916e-05, + "loss": 0.1967, + "step": 282 + }, + { + "epoch": 0.48149723521905574, + "grad_norm": 0.8950159197666822, + "learning_rate": 1.1074565426319014e-05, + "loss": 0.1891, + "step": 283 + }, + { + "epoch": 0.4831986388770736, + "grad_norm": 0.8593797843979919, + "learning_rate": 1.101965649446901e-05, + "loss": 0.15, + "step": 284 + }, + { + "epoch": 0.48490004253509145, + "grad_norm": 0.8277495297474546, + "learning_rate": 1.0964716479256094e-05, + "loss": 0.1712, + "step": 285 + }, + { + "epoch": 0.4866014461931093, + "grad_norm": 0.5151032512501623, + "learning_rate": 1.0909747055480004e-05, + "loss": 0.1222, + "step": 286 + }, + { + "epoch": 0.4883028498511272, + "grad_norm": 0.6433663339360174, + "learning_rate": 1.0854749898836974e-05, + "loss": 0.1135, + "step": 287 + }, + { + "epoch": 0.49000425350914506, + "grad_norm": 0.5490048384447381, + "learning_rate": 1.0799726685868648e-05, + "loss": 0.0826, + "step": 288 + }, + { + "epoch": 0.4917056571671629, + "grad_norm": 0.8816627849427396, + "learning_rate": 1.0744679093910987e-05, + "loss": 0.1479, + "step": 289 + }, + { + "epoch": 0.49340706082518077, + "grad_norm": 0.7832353160782042, + "learning_rate": 1.0689608801043107e-05, + "loss": 0.1534, + "step": 290 + }, + { + "epoch": 0.4951084644831986, + "grad_norm": 0.7440630609208617, + "learning_rate": 1.063451748603616e-05, + "loss": 0.1777, + "step": 291 + }, + { + "epoch": 0.4968098681412165, + "grad_norm": 0.8243370373673341, + "learning_rate": 1.0579406828302124e-05, + "loss": 0.2, + "step": 292 + }, + { + "epoch": 0.4985112717992344, + "grad_norm": 1.1003707769514204, + "learning_rate": 1.0524278507842637e-05, + "loss": 0.2474, + "step": 293 + }, + { + "epoch": 0.5002126754572522, + "grad_norm": 0.8067346058662889, + "learning_rate": 1.0469134205197762e-05, + "loss": 0.1369, + "step": 294 + }, + { + "epoch": 0.5019140791152701, + "grad_norm": 0.6725259371400214, + "learning_rate": 1.0413975601394765e-05, + "loss": 0.1267, + "step": 295 + }, + { + "epoch": 0.5036154827732879, + "grad_norm": 0.6734668147755227, + "learning_rate": 1.0358804377896876e-05, + "loss": 0.1635, + "step": 296 + }, + { + "epoch": 0.5053168864313058, + "grad_norm": 0.6055902136530477, + "learning_rate": 1.0303622216552022e-05, + "loss": 0.1306, + "step": 297 + }, + { + "epoch": 0.5070182900893236, + "grad_norm": 0.5788655807496915, + "learning_rate": 1.0248430799541564e-05, + "loss": 0.1235, + "step": 298 + }, + { + "epoch": 0.5087196937473416, + "grad_norm": 0.842975912773053, + "learning_rate": 1.019323180932901e-05, + "loss": 0.1602, + "step": 299 + }, + { + "epoch": 0.5104210974053595, + "grad_norm": 0.7350179522876372, + "learning_rate": 1.013802692860873e-05, + "loss": 0.174, + "step": 300 + }, + { + "epoch": 0.5121225010633773, + "grad_norm": 0.8054821395127268, + "learning_rate": 1.0082817840254667e-05, + "loss": 0.1751, + "step": 301 + }, + { + "epoch": 0.5138239047213952, + "grad_norm": 0.7473665920319642, + "learning_rate": 1.0027606227269026e-05, + "loss": 0.1545, + "step": 302 + }, + { + "epoch": 0.515525308379413, + "grad_norm": 0.6216088818787452, + "learning_rate": 9.972393772730975e-06, + "loss": 0.1218, + "step": 303 + }, + { + "epoch": 0.5172267120374309, + "grad_norm": 0.6661600562028501, + "learning_rate": 9.917182159745335e-06, + "loss": 0.1525, + "step": 304 + }, + { + "epoch": 0.5189281156954487, + "grad_norm": 0.6046700731402035, + "learning_rate": 9.861973071391272e-06, + "loss": 0.1263, + "step": 305 + }, + { + "epoch": 0.5206295193534666, + "grad_norm": 0.5725231471888524, + "learning_rate": 9.806768190670994e-06, + "loss": 0.1228, + "step": 306 + }, + { + "epoch": 0.5223309230114844, + "grad_norm": 0.9729384702384406, + "learning_rate": 9.751569200458438e-06, + "loss": 0.2509, + "step": 307 + }, + { + "epoch": 0.5240323266695024, + "grad_norm": 0.7322587669530063, + "learning_rate": 9.69637778344798e-06, + "loss": 0.1841, + "step": 308 + }, + { + "epoch": 0.5257337303275202, + "grad_norm": 0.5121046933124581, + "learning_rate": 9.641195622103126e-06, + "loss": 0.1023, + "step": 309 + }, + { + "epoch": 0.5274351339855381, + "grad_norm": 0.6169186855264731, + "learning_rate": 9.586024398605238e-06, + "loss": 0.1433, + "step": 310 + }, + { + "epoch": 0.529136537643556, + "grad_norm": 0.6201008013054504, + "learning_rate": 9.530865794802243e-06, + "loss": 0.1497, + "step": 311 + }, + { + "epoch": 0.5308379413015738, + "grad_norm": 0.571379112436459, + "learning_rate": 9.475721492157365e-06, + "loss": 0.1293, + "step": 312 + }, + { + "epoch": 0.5325393449595917, + "grad_norm": 0.6845721767553542, + "learning_rate": 9.420593171697876e-06, + "loss": 0.1604, + "step": 313 + }, + { + "epoch": 0.5342407486176095, + "grad_norm": 0.7434198371636971, + "learning_rate": 9.365482513963844e-06, + "loss": 0.1466, + "step": 314 + }, + { + "epoch": 0.5359421522756274, + "grad_norm": 0.8587046587534319, + "learning_rate": 9.310391198956896e-06, + "loss": 0.184, + "step": 315 + }, + { + "epoch": 0.5376435559336452, + "grad_norm": 0.3825870957288433, + "learning_rate": 9.255320906089017e-06, + "loss": 0.0669, + "step": 316 + }, + { + "epoch": 0.5393449595916632, + "grad_norm": 0.5784752736569795, + "learning_rate": 9.200273314131356e-06, + "loss": 0.109, + "step": 317 + }, + { + "epoch": 0.541046363249681, + "grad_norm": 0.4933298004593872, + "learning_rate": 9.145250101163032e-06, + "loss": 0.0914, + "step": 318 + }, + { + "epoch": 0.5427477669076989, + "grad_norm": 0.9213220624255627, + "learning_rate": 9.090252944520002e-06, + "loss": 0.2909, + "step": 319 + }, + { + "epoch": 0.5444491705657167, + "grad_norm": 0.5484087181215404, + "learning_rate": 9.035283520743911e-06, + "loss": 0.1162, + "step": 320 + }, + { + "epoch": 0.5461505742237346, + "grad_norm": 0.8248140118322426, + "learning_rate": 8.980343505530988e-06, + "loss": 0.1489, + "step": 321 + }, + { + "epoch": 0.5478519778817524, + "grad_norm": 0.7413474717377897, + "learning_rate": 8.925434573680986e-06, + "loss": 0.1503, + "step": 322 + }, + { + "epoch": 0.5495533815397703, + "grad_norm": 0.8844034973006791, + "learning_rate": 8.870558399046086e-06, + "loss": 0.1774, + "step": 323 + }, + { + "epoch": 0.5512547851977881, + "grad_norm": 0.6202467611602901, + "learning_rate": 8.815716654479903e-06, + "loss": 0.1433, + "step": 324 + }, + { + "epoch": 0.552956188855806, + "grad_norm": 0.7713841251553113, + "learning_rate": 8.76091101178648e-06, + "loss": 0.1751, + "step": 325 + }, + { + "epoch": 0.5546575925138238, + "grad_norm": 0.47984444436305723, + "learning_rate": 8.706143141669324e-06, + "loss": 0.0984, + "step": 326 + }, + { + "epoch": 0.5563589961718418, + "grad_norm": 0.6436950007671324, + "learning_rate": 8.651414713680474e-06, + "loss": 0.1498, + "step": 327 + }, + { + "epoch": 0.5580603998298597, + "grad_norm": 0.6921833463852107, + "learning_rate": 8.59672739616962e-06, + "loss": 0.1655, + "step": 328 + }, + { + "epoch": 0.5597618034878775, + "grad_norm": 0.8395929232422166, + "learning_rate": 8.542082856233216e-06, + "loss": 0.1605, + "step": 329 + }, + { + "epoch": 0.5614632071458954, + "grad_norm": 0.5962143534415955, + "learning_rate": 8.487482759663696e-06, + "loss": 0.1643, + "step": 330 + }, + { + "epoch": 0.5631646108039132, + "grad_norm": 0.6098817436138174, + "learning_rate": 8.43292877089867e-06, + "loss": 0.1288, + "step": 331 + }, + { + "epoch": 0.5648660144619311, + "grad_norm": 0.6410913271405208, + "learning_rate": 8.378422552970185e-06, + "loss": 0.1512, + "step": 332 + }, + { + "epoch": 0.5665674181199489, + "grad_norm": 0.7183672239663341, + "learning_rate": 8.32396576745404e-06, + "loss": 0.1515, + "step": 333 + }, + { + "epoch": 0.5682688217779668, + "grad_norm": 0.62064036861519, + "learning_rate": 8.269560074419126e-06, + "loss": 0.1294, + "step": 334 + }, + { + "epoch": 0.5699702254359846, + "grad_norm": 0.8658941182477415, + "learning_rate": 8.215207132376824e-06, + "loss": 0.2016, + "step": 335 + }, + { + "epoch": 0.5716716290940026, + "grad_norm": 0.6992061659383073, + "learning_rate": 8.160908598230448e-06, + "loss": 0.162, + "step": 336 + }, + { + "epoch": 0.5733730327520205, + "grad_norm": 0.5376328047353198, + "learning_rate": 8.10666612722473e-06, + "loss": 0.1376, + "step": 337 + }, + { + "epoch": 0.5750744364100383, + "grad_norm": 0.6263063891453132, + "learning_rate": 8.052481372895363e-06, + "loss": 0.1608, + "step": 338 + }, + { + "epoch": 0.5767758400680562, + "grad_norm": 0.8771630551993672, + "learning_rate": 7.998355987018606e-06, + "loss": 0.1973, + "step": 339 + }, + { + "epoch": 0.578477243726074, + "grad_norm": 0.390772239553796, + "learning_rate": 7.944291619560914e-06, + "loss": 0.0819, + "step": 340 + }, + { + "epoch": 0.5801786473840919, + "grad_norm": 0.42205631786094433, + "learning_rate": 7.890289918628644e-06, + "loss": 0.0894, + "step": 341 + }, + { + "epoch": 0.5818800510421097, + "grad_norm": 0.7237391846873152, + "learning_rate": 7.836352530417824e-06, + "loss": 0.1458, + "step": 342 + }, + { + "epoch": 0.5835814547001276, + "grad_norm": 0.8092581405655928, + "learning_rate": 7.782481099163958e-06, + "loss": 0.1982, + "step": 343 + }, + { + "epoch": 0.5852828583581454, + "grad_norm": 0.6661580391115077, + "learning_rate": 7.728677267091912e-06, + "loss": 0.1347, + "step": 344 + }, + { + "epoch": 0.5869842620161634, + "grad_norm": 0.5655956063796203, + "learning_rate": 7.674942674365847e-06, + "loss": 0.1371, + "step": 345 + }, + { + "epoch": 0.5886856656741812, + "grad_norm": 0.5204265585690476, + "learning_rate": 7.621278959039217e-06, + "loss": 0.1136, + "step": 346 + }, + { + "epoch": 0.5903870693321991, + "grad_norm": 0.5105862514234877, + "learning_rate": 7.567687757004843e-06, + "loss": 0.0944, + "step": 347 + }, + { + "epoch": 0.5920884729902169, + "grad_norm": 0.7438169939162386, + "learning_rate": 7.514170701945047e-06, + "loss": 0.1538, + "step": 348 + }, + { + "epoch": 0.5937898766482348, + "grad_norm": 0.5783343144822323, + "learning_rate": 7.460729425281831e-06, + "loss": 0.0973, + "step": 349 + }, + { + "epoch": 0.5954912803062526, + "grad_norm": 0.5382724582426407, + "learning_rate": 7.407365556127162e-06, + "loss": 0.0896, + "step": 350 + }, + { + "epoch": 0.5971926839642705, + "grad_norm": 0.5200384657247858, + "learning_rate": 7.354080721233303e-06, + "loss": 0.1007, + "step": 351 + }, + { + "epoch": 0.5988940876222884, + "grad_norm": 0.7143301177690192, + "learning_rate": 7.300876544943227e-06, + "loss": 0.1408, + "step": 352 + }, + { + "epoch": 0.6005954912803062, + "grad_norm": 0.5266211012982571, + "learning_rate": 7.247754649141097e-06, + "loss": 0.1174, + "step": 353 + }, + { + "epoch": 0.6022968949383242, + "grad_norm": 0.45026923816585473, + "learning_rate": 7.194716653202826e-06, + "loss": 0.091, + "step": 354 + }, + { + "epoch": 0.603998298596342, + "grad_norm": 0.6391615929640093, + "learning_rate": 7.1417641739467104e-06, + "loss": 0.126, + "step": 355 + }, + { + "epoch": 0.6056997022543599, + "grad_norm": 0.6502689654725452, + "learning_rate": 7.088898825584139e-06, + "loss": 0.1646, + "step": 356 + }, + { + "epoch": 0.6074011059123777, + "grad_norm": 0.593985585725139, + "learning_rate": 7.036122219670398e-06, + "loss": 0.117, + "step": 357 + }, + { + "epoch": 0.6091025095703956, + "grad_norm": 0.5471649548605663, + "learning_rate": 6.9834359650555305e-06, + "loss": 0.1213, + "step": 358 + }, + { + "epoch": 0.6108039132284134, + "grad_norm": 0.4279611992244824, + "learning_rate": 6.930841667835295e-06, + "loss": 0.0854, + "step": 359 + }, + { + "epoch": 0.6125053168864313, + "grad_norm": 0.5886558291754239, + "learning_rate": 6.878340931302208e-06, + "loss": 0.1366, + "step": 360 + }, + { + "epoch": 0.6142067205444491, + "grad_norm": 0.7575274587038588, + "learning_rate": 6.825935355896669e-06, + "loss": 0.1615, + "step": 361 + }, + { + "epoch": 0.615908124202467, + "grad_norm": 0.7321726214379118, + "learning_rate": 6.773626539158171e-06, + "loss": 0.1584, + "step": 362 + }, + { + "epoch": 0.617609527860485, + "grad_norm": 0.6015824107096192, + "learning_rate": 6.721416075676601e-06, + "loss": 0.1364, + "step": 363 + }, + { + "epoch": 0.6193109315185028, + "grad_norm": 0.49728749645413944, + "learning_rate": 6.669305557043626e-06, + "loss": 0.0812, + "step": 364 + }, + { + "epoch": 0.6210123351765207, + "grad_norm": 0.7307111292734793, + "learning_rate": 6.617296571804191e-06, + "loss": 0.213, + "step": 365 + }, + { + "epoch": 0.6227137388345385, + "grad_norm": 0.5382832275025874, + "learning_rate": 6.565390705408072e-06, + "loss": 0.1294, + "step": 366 + }, + { + "epoch": 0.6244151424925564, + "grad_norm": 0.6087186946730493, + "learning_rate": 6.513589540161556e-06, + "loss": 0.1361, + "step": 367 + }, + { + "epoch": 0.6261165461505742, + "grad_norm": 0.5461211520970832, + "learning_rate": 6.461894655179204e-06, + "loss": 0.1378, + "step": 368 + }, + { + "epoch": 0.6278179498085921, + "grad_norm": 0.7214649153639943, + "learning_rate": 6.410307626335717e-06, + "loss": 0.1786, + "step": 369 + }, + { + "epoch": 0.6295193534666099, + "grad_norm": 0.7389610680284584, + "learning_rate": 6.358830026217887e-06, + "loss": 0.1743, + "step": 370 + }, + { + "epoch": 0.6312207571246278, + "grad_norm": 0.5922656498049635, + "learning_rate": 6.30746342407667e-06, + "loss": 0.1335, + "step": 371 + }, + { + "epoch": 0.6329221607826457, + "grad_norm": 0.5373789691211449, + "learning_rate": 6.256209385779341e-06, + "loss": 0.0871, + "step": 372 + }, + { + "epoch": 0.6346235644406636, + "grad_norm": 0.6315569188084377, + "learning_rate": 6.205069473761756e-06, + "loss": 0.1431, + "step": 373 + }, + { + "epoch": 0.6363249680986814, + "grad_norm": 0.6341185707102986, + "learning_rate": 6.154045246980742e-06, + "loss": 0.1391, + "step": 374 + }, + { + "epoch": 0.6380263717566993, + "grad_norm": 0.5889671691595377, + "learning_rate": 6.1031382608665456e-06, + "loss": 0.1181, + "step": 375 + }, + { + "epoch": 0.6397277754147171, + "grad_norm": 0.4772740823040706, + "learning_rate": 6.052350067275441e-06, + "loss": 0.1129, + "step": 376 + }, + { + "epoch": 0.641429179072735, + "grad_norm": 0.6507678246797608, + "learning_rate": 6.001682214442406e-06, + "loss": 0.153, + "step": 377 + }, + { + "epoch": 0.6431305827307529, + "grad_norm": 0.6431702192602842, + "learning_rate": 5.951136246933933e-06, + "loss": 0.1419, + "step": 378 + }, + { + "epoch": 0.6448319863887707, + "grad_norm": 0.6411432598223046, + "learning_rate": 5.900713705600951e-06, + "loss": 0.1533, + "step": 379 + }, + { + "epoch": 0.6465333900467886, + "grad_norm": 0.6380078466742485, + "learning_rate": 5.850416127531841e-06, + "loss": 0.1313, + "step": 380 + }, + { + "epoch": 0.6482347937048064, + "grad_norm": 0.65349219803706, + "learning_rate": 5.800245046005585e-06, + "loss": 0.1455, + "step": 381 + }, + { + "epoch": 0.6499361973628244, + "grad_norm": 0.6604507918781273, + "learning_rate": 5.750201990445024e-06, + "loss": 0.151, + "step": 382 + }, + { + "epoch": 0.6516376010208422, + "grad_norm": 0.5303752739097137, + "learning_rate": 5.70028848637024e-06, + "loss": 0.1246, + "step": 383 + }, + { + "epoch": 0.6533390046788601, + "grad_norm": 0.4960121715965537, + "learning_rate": 5.650506055352052e-06, + "loss": 0.1144, + "step": 384 + }, + { + "epoch": 0.6550404083368779, + "grad_norm": 0.7363743811479757, + "learning_rate": 5.600856214965613e-06, + "loss": 0.1714, + "step": 385 + }, + { + "epoch": 0.6567418119948958, + "grad_norm": 0.7708482748891713, + "learning_rate": 5.551340478744176e-06, + "loss": 0.2031, + "step": 386 + }, + { + "epoch": 0.6584432156529136, + "grad_norm": 0.5876997390403462, + "learning_rate": 5.501960356132945e-06, + "loss": 0.149, + "step": 387 + }, + { + "epoch": 0.6601446193109315, + "grad_norm": 0.7419217553691858, + "learning_rate": 5.4527173524430395e-06, + "loss": 0.1476, + "step": 388 + }, + { + "epoch": 0.6618460229689493, + "grad_norm": 0.5553741412050056, + "learning_rate": 5.403612968805649e-06, + "loss": 0.1066, + "step": 389 + }, + { + "epoch": 0.6635474266269672, + "grad_norm": 0.5822325374677844, + "learning_rate": 5.354648702126229e-06, + "loss": 0.1272, + "step": 390 + }, + { + "epoch": 0.6652488302849852, + "grad_norm": 0.479370096729397, + "learning_rate": 5.305826045038899e-06, + "loss": 0.1105, + "step": 391 + }, + { + "epoch": 0.666950233943003, + "grad_norm": 0.8210938009467911, + "learning_rate": 5.257146485860927e-06, + "loss": 0.1824, + "step": 392 + }, + { + "epoch": 0.6686516376010209, + "grad_norm": 0.4950772847633868, + "learning_rate": 5.208611508547367e-06, + "loss": 0.1177, + "step": 393 + }, + { + "epoch": 0.6703530412590387, + "grad_norm": 0.6324119841557184, + "learning_rate": 5.160222592645808e-06, + "loss": 0.1508, + "step": 394 + }, + { + "epoch": 0.6720544449170566, + "grad_norm": 0.7140866463435349, + "learning_rate": 5.111981213251293e-06, + "loss": 0.1736, + "step": 395 + }, + { + "epoch": 0.6737558485750744, + "grad_norm": 0.8200921736017754, + "learning_rate": 5.063888840961325e-06, + "loss": 0.1803, + "step": 396 + }, + { + "epoch": 0.6754572522330923, + "grad_norm": 0.60107373708548, + "learning_rate": 5.015946941831064e-06, + "loss": 0.1373, + "step": 397 + }, + { + "epoch": 0.6771586558911101, + "grad_norm": 0.6740157487767822, + "learning_rate": 4.968156977328626e-06, + "loss": 0.1381, + "step": 398 + }, + { + "epoch": 0.678860059549128, + "grad_norm": 0.5504866029437426, + "learning_rate": 4.920520404290512e-06, + "loss": 0.1124, + "step": 399 + }, + { + "epoch": 0.680561463207146, + "grad_norm": 0.8451223023746436, + "learning_rate": 4.87303867487723e-06, + "loss": 0.1075, + "step": 400 + }, + { + "epoch": 0.6822628668651638, + "grad_norm": 0.5312101139216241, + "learning_rate": 4.825713236529005e-06, + "loss": 0.1064, + "step": 401 + }, + { + "epoch": 0.6839642705231816, + "grad_norm": 0.5704890965241033, + "learning_rate": 4.778545531921668e-06, + "loss": 0.082, + "step": 402 + }, + { + "epoch": 0.6856656741811995, + "grad_norm": 0.48848934157835633, + "learning_rate": 4.731536998922657e-06, + "loss": 0.0993, + "step": 403 + }, + { + "epoch": 0.6873670778392174, + "grad_norm": 0.5427862059005317, + "learning_rate": 4.684689070547216e-06, + "loss": 0.1124, + "step": 404 + }, + { + "epoch": 0.6890684814972352, + "grad_norm": 0.4381933589798914, + "learning_rate": 4.638003174914675e-06, + "loss": 0.0916, + "step": 405 + }, + { + "epoch": 0.6907698851552531, + "grad_norm": 0.6972391783553307, + "learning_rate": 4.591480735204953e-06, + "loss": 0.1389, + "step": 406 + }, + { + "epoch": 0.6924712888132709, + "grad_norm": 0.5669972933879025, + "learning_rate": 4.545123169615134e-06, + "loss": 0.1025, + "step": 407 + }, + { + "epoch": 0.6941726924712888, + "grad_norm": 0.592390909667276, + "learning_rate": 4.49893189131627e-06, + "loss": 0.1119, + "step": 408 + }, + { + "epoch": 0.6958740961293067, + "grad_norm": 0.5165646249545874, + "learning_rate": 4.45290830841028e-06, + "loss": 0.0892, + "step": 409 + }, + { + "epoch": 0.6975754997873246, + "grad_norm": 0.7809696427236026, + "learning_rate": 4.407053823887033e-06, + "loss": 0.1935, + "step": 410 + }, + { + "epoch": 0.6992769034453424, + "grad_norm": 0.6705533566926563, + "learning_rate": 4.361369835581569e-06, + "loss": 0.1269, + "step": 411 + }, + { + "epoch": 0.7009783071033603, + "grad_norm": 0.584388099377689, + "learning_rate": 4.315857736131508e-06, + "loss": 0.1505, + "step": 412 + }, + { + "epoch": 0.7026797107613781, + "grad_norm": 0.6462986239110993, + "learning_rate": 4.2705189129345814e-06, + "loss": 0.1562, + "step": 413 + }, + { + "epoch": 0.704381114419396, + "grad_norm": 0.570857299952672, + "learning_rate": 4.225354748106328e-06, + "loss": 0.1519, + "step": 414 + }, + { + "epoch": 0.7060825180774138, + "grad_norm": 0.8613783122294221, + "learning_rate": 4.180366618437996e-06, + "loss": 0.17, + "step": 415 + }, + { + "epoch": 0.7077839217354317, + "grad_norm": 0.5256623238361198, + "learning_rate": 4.13555589535453e-06, + "loss": 0.1176, + "step": 416 + }, + { + "epoch": 0.7094853253934496, + "grad_norm": 0.5329692146177042, + "learning_rate": 4.0909239448727985e-06, + "loss": 0.1134, + "step": 417 + }, + { + "epoch": 0.7111867290514675, + "grad_norm": 0.5412730128143294, + "learning_rate": 4.046472127559937e-06, + "loss": 0.1023, + "step": 418 + }, + { + "epoch": 0.7128881327094854, + "grad_norm": 0.6641877682257822, + "learning_rate": 4.002201798491875e-06, + "loss": 0.1339, + "step": 419 + }, + { + "epoch": 0.7145895363675032, + "grad_norm": 0.660591532391084, + "learning_rate": 3.958114307212018e-06, + "loss": 0.1374, + "step": 420 + }, + { + "epoch": 0.7162909400255211, + "grad_norm": 0.5373852917963093, + "learning_rate": 3.91421099769013e-06, + "loss": 0.1247, + "step": 421 + }, + { + "epoch": 0.7179923436835389, + "grad_norm": 0.6062843312528969, + "learning_rate": 3.870493208281337e-06, + "loss": 0.1345, + "step": 422 + }, + { + "epoch": 0.7196937473415568, + "grad_norm": 0.7327846214621127, + "learning_rate": 3.826962271685351e-06, + "loss": 0.1755, + "step": 423 + }, + { + "epoch": 0.7213951509995746, + "grad_norm": 0.6428495993943769, + "learning_rate": 3.7836195149058386e-06, + "loss": 0.1094, + "step": 424 + }, + { + "epoch": 0.7230965546575925, + "grad_norm": 0.7363139918204018, + "learning_rate": 3.7404662592099483e-06, + "loss": 0.1577, + "step": 425 + }, + { + "epoch": 0.7247979583156103, + "grad_norm": 0.8033787241820598, + "learning_rate": 3.697503820088063e-06, + "loss": 0.171, + "step": 426 + }, + { + "epoch": 0.7264993619736282, + "grad_norm": 0.6357088786408424, + "learning_rate": 3.654733507213678e-06, + "loss": 0.1417, + "step": 427 + }, + { + "epoch": 0.7282007656316462, + "grad_norm": 0.6818943024586197, + "learning_rate": 3.61215662440349e-06, + "loss": 0.1336, + "step": 428 + }, + { + "epoch": 0.729902169289664, + "grad_norm": 0.5663564889890744, + "learning_rate": 3.5697744695776326e-06, + "loss": 0.1057, + "step": 429 + }, + { + "epoch": 0.7316035729476819, + "grad_norm": 0.8157641682258925, + "learning_rate": 3.5275883347201336e-06, + "loss": 0.1611, + "step": 430 + }, + { + "epoch": 0.7333049766056997, + "grad_norm": 0.7843104964796086, + "learning_rate": 3.4855995058395066e-06, + "loss": 0.1812, + "step": 431 + }, + { + "epoch": 0.7350063802637176, + "grad_norm": 0.6620080605838786, + "learning_rate": 3.443809262929575e-06, + "loss": 0.1625, + "step": 432 + }, + { + "epoch": 0.7367077839217354, + "grad_norm": 0.6764654419703444, + "learning_rate": 3.4022188799304214e-06, + "loss": 0.144, + "step": 433 + }, + { + "epoch": 0.7384091875797533, + "grad_norm": 0.5644424740263085, + "learning_rate": 3.36082962468958e-06, + "loss": 0.1263, + "step": 434 + }, + { + "epoch": 0.7401105912377711, + "grad_norm": 0.5799072233692542, + "learning_rate": 3.3196427589233725e-06, + "loss": 0.1003, + "step": 435 + }, + { + "epoch": 0.741811994895789, + "grad_norm": 0.6800149986093089, + "learning_rate": 3.2786595381784512e-06, + "loss": 0.1388, + "step": 436 + }, + { + "epoch": 0.7435133985538069, + "grad_norm": 0.7928341276011606, + "learning_rate": 3.2378812117935154e-06, + "loss": 0.1903, + "step": 437 + }, + { + "epoch": 0.7452148022118248, + "grad_norm": 1.0122782411273037, + "learning_rate": 3.1973090228612404e-06, + "loss": 0.1856, + "step": 438 + }, + { + "epoch": 0.7469162058698426, + "grad_norm": 0.5535225691195111, + "learning_rate": 3.15694420819038e-06, + "loss": 0.1257, + "step": 439 + }, + { + "epoch": 0.7486176095278605, + "grad_norm": 0.4056045118096214, + "learning_rate": 3.116787998268046e-06, + "loss": 0.086, + "step": 440 + }, + { + "epoch": 0.7503190131858783, + "grad_norm": 0.7042100069054709, + "learning_rate": 3.076841617222228e-06, + "loss": 0.1674, + "step": 441 + }, + { + "epoch": 0.7520204168438962, + "grad_norm": 0.5701022869483006, + "learning_rate": 3.0371062827844434e-06, + "loss": 0.1388, + "step": 442 + }, + { + "epoch": 0.753721820501914, + "grad_norm": 0.5411441414159908, + "learning_rate": 2.997583206252647e-06, + "loss": 0.1335, + "step": 443 + }, + { + "epoch": 0.7554232241599319, + "grad_norm": 0.5928787504980437, + "learning_rate": 2.958273592454285e-06, + "loss": 0.1232, + "step": 444 + }, + { + "epoch": 0.7571246278179498, + "grad_norm": 0.7026645814797263, + "learning_rate": 2.9191786397095778e-06, + "loss": 0.1166, + "step": 445 + }, + { + "epoch": 0.7588260314759677, + "grad_norm": 0.6222635021806553, + "learning_rate": 2.880299539794975e-06, + "loss": 0.1016, + "step": 446 + }, + { + "epoch": 0.7605274351339856, + "grad_norm": 0.7249185245501563, + "learning_rate": 2.841637477906851e-06, + "loss": 0.153, + "step": 447 + }, + { + "epoch": 0.7622288387920034, + "grad_norm": 0.5857854559414456, + "learning_rate": 2.803193632625346e-06, + "loss": 0.1502, + "step": 448 + }, + { + "epoch": 0.7639302424500213, + "grad_norm": 0.5179342426075431, + "learning_rate": 2.7649691758784603e-06, + "loss": 0.1218, + "step": 449 + }, + { + "epoch": 0.7656316461080391, + "grad_norm": 0.5639523992123263, + "learning_rate": 2.7269652729063233e-06, + "loss": 0.139, + "step": 450 + }, + { + "epoch": 0.767333049766057, + "grad_norm": 0.6368025678557075, + "learning_rate": 2.689183082225659e-06, + "loss": 0.1488, + "step": 451 + }, + { + "epoch": 0.7690344534240748, + "grad_norm": 0.8300408569608919, + "learning_rate": 2.65162375559449e-06, + "loss": 0.179, + "step": 452 + }, + { + "epoch": 0.7707358570820927, + "grad_norm": 0.5936101243290062, + "learning_rate": 2.614288437977014e-06, + "loss": 0.1429, + "step": 453 + }, + { + "epoch": 0.7724372607401105, + "grad_norm": 0.7212331766925745, + "learning_rate": 2.5771782675087078e-06, + "loss": 0.1874, + "step": 454 + }, + { + "epoch": 0.7741386643981285, + "grad_norm": 0.7060511886948133, + "learning_rate": 2.5402943754616182e-06, + "loss": 0.1843, + "step": 455 + }, + { + "epoch": 0.7758400680561464, + "grad_norm": 0.46692174489621696, + "learning_rate": 2.5036378862099e-06, + "loss": 0.1121, + "step": 456 + }, + { + "epoch": 0.7775414717141642, + "grad_norm": 0.7206993698606143, + "learning_rate": 2.467209917195513e-06, + "loss": 0.1603, + "step": 457 + }, + { + "epoch": 0.7792428753721821, + "grad_norm": 0.5901971981845033, + "learning_rate": 2.4310115788941855e-06, + "loss": 0.1375, + "step": 458 + }, + { + "epoch": 0.7809442790301999, + "grad_norm": 0.6065027291241805, + "learning_rate": 2.3950439747815357e-06, + "loss": 0.1339, + "step": 459 + }, + { + "epoch": 0.7826456826882178, + "grad_norm": 0.5745544040111182, + "learning_rate": 2.359308201299454e-06, + "loss": 0.1089, + "step": 460 + }, + { + "epoch": 0.7843470863462356, + "grad_norm": 0.576203266433072, + "learning_rate": 2.3238053478226665e-06, + "loss": 0.1201, + "step": 461 + }, + { + "epoch": 0.7860484900042535, + "grad_norm": 0.49562353375183704, + "learning_rate": 2.2885364966255372e-06, + "loss": 0.1101, + "step": 462 + }, + { + "epoch": 0.7877498936622713, + "grad_norm": 0.7355734068065339, + "learning_rate": 2.2535027228490582e-06, + "loss": 0.1434, + "step": 463 + }, + { + "epoch": 0.7894512973202893, + "grad_norm": 0.7888019947233804, + "learning_rate": 2.2187050944680942e-06, + "loss": 0.1661, + "step": 464 + }, + { + "epoch": 0.7911527009783071, + "grad_norm": 0.49093455919485884, + "learning_rate": 2.18414467225882e-06, + "loss": 0.1039, + "step": 465 + }, + { + "epoch": 0.792854104636325, + "grad_norm": 0.6593888207635097, + "learning_rate": 2.1498225097663695e-06, + "loss": 0.164, + "step": 466 + }, + { + "epoch": 0.7945555082943428, + "grad_norm": 0.6522964179007439, + "learning_rate": 2.115739653272747e-06, + "loss": 0.1581, + "step": 467 + }, + { + "epoch": 0.7962569119523607, + "grad_norm": 0.6549220673653909, + "learning_rate": 2.0818971417649013e-06, + "loss": 0.145, + "step": 468 + }, + { + "epoch": 0.7979583156103786, + "grad_norm": 0.7374023481623728, + "learning_rate": 2.048296006903081e-06, + "loss": 0.2214, + "step": 469 + }, + { + "epoch": 0.7996597192683964, + "grad_norm": 0.5171075980296638, + "learning_rate": 2.0149372729893646e-06, + "loss": 0.1045, + "step": 470 + }, + { + "epoch": 0.8013611229264143, + "grad_norm": 0.7724031350744703, + "learning_rate": 1.981821956936448e-06, + "loss": 0.1465, + "step": 471 + }, + { + "epoch": 0.8030625265844321, + "grad_norm": 0.6672766903409693, + "learning_rate": 1.9489510682366363e-06, + "loss": 0.1585, + "step": 472 + }, + { + "epoch": 0.8047639302424501, + "grad_norm": 0.537425881406562, + "learning_rate": 1.916325608931079e-06, + "loss": 0.1182, + "step": 473 + }, + { + "epoch": 0.8064653339004679, + "grad_norm": 0.6021955534686722, + "learning_rate": 1.8839465735792095e-06, + "loss": 0.1243, + "step": 474 + }, + { + "epoch": 0.8081667375584858, + "grad_norm": 0.6980741352012216, + "learning_rate": 1.8518149492284477e-06, + "loss": 0.1689, + "step": 475 + }, + { + "epoch": 0.8098681412165036, + "grad_norm": 0.730166246025193, + "learning_rate": 1.8199317153840933e-06, + "loss": 0.1502, + "step": 476 + }, + { + "epoch": 0.8115695448745215, + "grad_norm": 0.7583033038540565, + "learning_rate": 1.7882978439794708e-06, + "loss": 0.1584, + "step": 477 + }, + { + "epoch": 0.8132709485325393, + "grad_norm": 0.6469728530679982, + "learning_rate": 1.756914299346304e-06, + "loss": 0.1813, + "step": 478 + }, + { + "epoch": 0.8149723521905572, + "grad_norm": 0.6962195184590543, + "learning_rate": 1.7257820381853197e-06, + "loss": 0.16, + "step": 479 + }, + { + "epoch": 0.816673755848575, + "grad_norm": 0.6319410863478037, + "learning_rate": 1.6949020095370816e-06, + "loss": 0.1345, + "step": 480 + }, + { + "epoch": 0.8183751595065929, + "grad_norm": 0.5248442238719696, + "learning_rate": 1.6642751547530512e-06, + "loss": 0.1146, + "step": 481 + }, + { + "epoch": 0.8200765631646108, + "grad_norm": 0.6293911735248475, + "learning_rate": 1.6339024074669107e-06, + "loss": 0.1629, + "step": 482 + }, + { + "epoch": 0.8217779668226287, + "grad_norm": 0.6968998179975308, + "learning_rate": 1.6037846935660807e-06, + "loss": 0.1336, + "step": 483 + }, + { + "epoch": 0.8234793704806466, + "grad_norm": 0.6389348451744808, + "learning_rate": 1.5739229311635152e-06, + "loss": 0.1664, + "step": 484 + }, + { + "epoch": 0.8251807741386644, + "grad_norm": 0.49483055937542064, + "learning_rate": 1.5443180305696948e-06, + "loss": 0.1146, + "step": 485 + }, + { + "epoch": 0.8268821777966823, + "grad_norm": 0.6128200727733571, + "learning_rate": 1.5149708942648922e-06, + "loss": 0.1594, + "step": 486 + }, + { + "epoch": 0.8285835814547001, + "grad_norm": 0.7129211349937769, + "learning_rate": 1.4858824168716524e-06, + "loss": 0.137, + "step": 487 + }, + { + "epoch": 0.830284985112718, + "grad_norm": 0.627404565029089, + "learning_rate": 1.4570534851275241e-06, + "loss": 0.1611, + "step": 488 + }, + { + "epoch": 0.8319863887707358, + "grad_norm": 0.5293175863409247, + "learning_rate": 1.4284849778580279e-06, + "loss": 0.1161, + "step": 489 + }, + { + "epoch": 0.8336877924287537, + "grad_norm": 0.43178654470703287, + "learning_rate": 1.4001777659498584e-06, + "loss": 0.0847, + "step": 490 + }, + { + "epoch": 0.8353891960867715, + "grad_norm": 0.6175796401407724, + "learning_rate": 1.3721327123243533e-06, + "loss": 0.1476, + "step": 491 + }, + { + "epoch": 0.8370905997447895, + "grad_norm": 0.5282862250164813, + "learning_rate": 1.3443506719111666e-06, + "loss": 0.0966, + "step": 492 + }, + { + "epoch": 0.8387920034028074, + "grad_norm": 0.844002715317485, + "learning_rate": 1.3168324916222296e-06, + "loss": 0.1762, + "step": 493 + }, + { + "epoch": 0.8404934070608252, + "grad_norm": 0.7243315647873998, + "learning_rate": 1.28957901032591e-06, + "loss": 0.1783, + "step": 494 + }, + { + "epoch": 0.8421948107188431, + "grad_norm": 0.5670895450202166, + "learning_rate": 1.2625910588214608e-06, + "loss": 0.123, + "step": 495 + }, + { + "epoch": 0.8438962143768609, + "grad_norm": 0.45059796707942773, + "learning_rate": 1.2358694598136755e-06, + "loss": 0.0878, + "step": 496 + }, + { + "epoch": 0.8455976180348788, + "grad_norm": 0.531389411405853, + "learning_rate": 1.2094150278878303e-06, + "loss": 0.1121, + "step": 497 + }, + { + "epoch": 0.8472990216928966, + "grad_norm": 0.6609175602313349, + "learning_rate": 1.1832285694848255e-06, + "loss": 0.1613, + "step": 498 + }, + { + "epoch": 0.8490004253509145, + "grad_norm": 0.6085551558377151, + "learning_rate": 1.1573108828766255e-06, + "loss": 0.1248, + "step": 499 + }, + { + "epoch": 0.8507018290089323, + "grad_norm": 0.6542536791921874, + "learning_rate": 1.1316627581419137e-06, + "loss": 0.1225, + "step": 500 + }, + { + "epoch": 0.8524032326669503, + "grad_norm": 0.8832425426832319, + "learning_rate": 1.1062849771420025e-06, + "loss": 0.2388, + "step": 501 + }, + { + "epoch": 0.8541046363249681, + "grad_norm": 0.5988831217488609, + "learning_rate": 1.0811783134970132e-06, + "loss": 0.1588, + "step": 502 + }, + { + "epoch": 0.855806039982986, + "grad_norm": 0.5904041973612292, + "learning_rate": 1.0563435325622762e-06, + "loss": 0.0993, + "step": 503 + }, + { + "epoch": 0.8575074436410038, + "grad_norm": 0.67012750106105, + "learning_rate": 1.0317813914050157e-06, + "loss": 0.1566, + "step": 504 + }, + { + "epoch": 0.8592088472990217, + "grad_norm": 0.811302728335703, + "learning_rate": 1.007492638781259e-06, + "loss": 0.2083, + "step": 505 + }, + { + "epoch": 0.8609102509570395, + "grad_norm": 0.5911216587965762, + "learning_rate": 9.834780151130196e-07, + "loss": 0.1502, + "step": 506 + }, + { + "epoch": 0.8626116546150574, + "grad_norm": 0.5311094246832349, + "learning_rate": 9.597382524657173e-07, + "loss": 0.1321, + "step": 507 + }, + { + "epoch": 0.8643130582730753, + "grad_norm": 0.6008563668665237, + "learning_rate": 9.362740745258736e-07, + "loss": 0.1295, + "step": 508 + }, + { + "epoch": 0.8660144619310931, + "grad_norm": 0.5294822126630425, + "learning_rate": 9.13086196579035e-07, + "loss": 0.1172, + "step": 509 + }, + { + "epoch": 0.8677158655891111, + "grad_norm": 0.497471337202083, + "learning_rate": 8.901753254879885e-07, + "loss": 0.1012, + "step": 510 + }, + { + "epoch": 0.8694172692471289, + "grad_norm": 0.49266935251385, + "learning_rate": 8.67542159671192e-07, + "loss": 0.1165, + "step": 511 + }, + { + "epoch": 0.8711186729051468, + "grad_norm": 0.7512023601884894, + "learning_rate": 8.451873890814988e-07, + "loss": 0.1713, + "step": 512 + }, + { + "epoch": 0.8728200765631646, + "grad_norm": 0.5490103557130668, + "learning_rate": 8.231116951851204e-07, + "loss": 0.101, + "step": 513 + }, + { + "epoch": 0.8745214802211825, + "grad_norm": 0.4969405496876176, + "learning_rate": 8.013157509408509e-07, + "loss": 0.1126, + "step": 514 + }, + { + "epoch": 0.8762228838792003, + "grad_norm": 0.7689540999105399, + "learning_rate": 7.79800220779554e-07, + "loss": 0.1773, + "step": 515 + }, + { + "epoch": 0.8779242875372182, + "grad_norm": 0.6616922275932038, + "learning_rate": 7.585657605839059e-07, + "loss": 0.1755, + "step": 516 + }, + { + "epoch": 0.879625691195236, + "grad_norm": 0.7846338726913418, + "learning_rate": 7.376130176684082e-07, + "loss": 0.2002, + "step": 517 + }, + { + "epoch": 0.8813270948532539, + "grad_norm": 0.6451675112951657, + "learning_rate": 7.169426307596428e-07, + "loss": 0.1471, + "step": 518 + }, + { + "epoch": 0.8830284985112719, + "grad_norm": 0.5202786277343583, + "learning_rate": 6.965552299768186e-07, + "loss": 0.119, + "step": 519 + }, + { + "epoch": 0.8847299021692897, + "grad_norm": 0.5480930128651451, + "learning_rate": 6.764514368125419e-07, + "loss": 0.1182, + "step": 520 + }, + { + "epoch": 0.8864313058273076, + "grad_norm": 0.6283859903702507, + "learning_rate": 6.566318641138902e-07, + "loss": 0.1209, + "step": 521 + }, + { + "epoch": 0.8881327094853254, + "grad_norm": 0.3197167201268544, + "learning_rate": 6.370971160637129e-07, + "loss": 0.0534, + "step": 522 + }, + { + "epoch": 0.8898341131433433, + "grad_norm": 0.7273489086638031, + "learning_rate": 6.178477881622325e-07, + "loss": 0.1465, + "step": 523 + }, + { + "epoch": 0.8915355168013611, + "grad_norm": 0.596106861974183, + "learning_rate": 5.98884467208869e-07, + "loss": 0.1478, + "step": 524 + }, + { + "epoch": 0.893236920459379, + "grad_norm": 0.5635354536633403, + "learning_rate": 5.802077312843723e-07, + "loss": 0.1276, + "step": 525 + }, + { + "epoch": 0.8949383241173968, + "grad_norm": 0.45363727626728617, + "learning_rate": 5.618181497331865e-07, + "loss": 0.0953, + "step": 526 + }, + { + "epoch": 0.8966397277754147, + "grad_norm": 0.48598924628118745, + "learning_rate": 5.437162831460962e-07, + "loss": 0.1165, + "step": 527 + }, + { + "epoch": 0.8983411314334325, + "grad_norm": 0.669586737859719, + "learning_rate": 5.259026833431468e-07, + "loss": 0.1283, + "step": 528 + }, + { + "epoch": 0.9000425350914505, + "grad_norm": 0.6597644601301488, + "learning_rate": 5.083778933568073e-07, + "loss": 0.1405, + "step": 529 + }, + { + "epoch": 0.9017439387494683, + "grad_norm": 0.749796356081093, + "learning_rate": 4.911424474154314e-07, + "loss": 0.1713, + "step": 530 + }, + { + "epoch": 0.9034453424074862, + "grad_norm": 0.733138055149784, + "learning_rate": 4.741968709269573e-07, + "loss": 0.1655, + "step": 531 + }, + { + "epoch": 0.905146746065504, + "grad_norm": 0.5850003782216738, + "learning_rate": 4.575416804629085e-07, + "loss": 0.1015, + "step": 532 + }, + { + "epoch": 0.9068481497235219, + "grad_norm": 0.5385240295697294, + "learning_rate": 4.411773837426303e-07, + "loss": 0.1306, + "step": 533 + }, + { + "epoch": 0.9085495533815398, + "grad_norm": 0.6472304411429782, + "learning_rate": 4.2510447961782055e-07, + "loss": 0.1527, + "step": 534 + }, + { + "epoch": 0.9102509570395576, + "grad_norm": 0.6603702785841319, + "learning_rate": 4.093234580573202e-07, + "loss": 0.1817, + "step": 535 + }, + { + "epoch": 0.9119523606975755, + "grad_norm": 0.5950794081562677, + "learning_rate": 3.938348001321812e-07, + "loss": 0.1396, + "step": 536 + }, + { + "epoch": 0.9136537643555933, + "grad_norm": 0.5590759902772791, + "learning_rate": 3.786389780009958e-07, + "loss": 0.1217, + "step": 537 + }, + { + "epoch": 0.9153551680136113, + "grad_norm": 0.4694970124433548, + "learning_rate": 3.637364548955047e-07, + "loss": 0.114, + "step": 538 + }, + { + "epoch": 0.9170565716716291, + "grad_norm": 0.8837605645726527, + "learning_rate": 3.491276851064784e-07, + "loss": 0.196, + "step": 539 + }, + { + "epoch": 0.918757975329647, + "grad_norm": 0.6350175654205374, + "learning_rate": 3.3481311396986626e-07, + "loss": 0.121, + "step": 540 + }, + { + "epoch": 0.9204593789876648, + "grad_norm": 0.6880582411723488, + "learning_rate": 3.2079317785322363e-07, + "loss": 0.172, + "step": 541 + }, + { + "epoch": 0.9221607826456827, + "grad_norm": 0.7372057906758976, + "learning_rate": 3.0706830414240164e-07, + "loss": 0.1442, + "step": 542 + }, + { + "epoch": 0.9238621863037005, + "grad_norm": 0.46261129624027075, + "learning_rate": 2.9363891122853097e-07, + "loss": 0.1151, + "step": 543 + }, + { + "epoch": 0.9255635899617184, + "grad_norm": 0.6787015843295537, + "learning_rate": 2.805054084952552e-07, + "loss": 0.1771, + "step": 544 + }, + { + "epoch": 0.9272649936197362, + "grad_norm": 0.6079847024018411, + "learning_rate": 2.6766819630626216e-07, + "loss": 0.1286, + "step": 545 + }, + { + "epoch": 0.9289663972777541, + "grad_norm": 0.6257881741973698, + "learning_rate": 2.5512766599306903e-07, + "loss": 0.1418, + "step": 546 + }, + { + "epoch": 0.9306678009357721, + "grad_norm": 0.5712524721144197, + "learning_rate": 2.4288419984310086e-07, + "loss": 0.124, + "step": 547 + }, + { + "epoch": 0.9323692045937899, + "grad_norm": 0.4443292776003244, + "learning_rate": 2.3093817108803318e-07, + "loss": 0.0871, + "step": 548 + }, + { + "epoch": 0.9340706082518078, + "grad_norm": 0.5259830897508192, + "learning_rate": 2.1928994389241454e-07, + "loss": 0.0973, + "step": 549 + }, + { + "epoch": 0.9357720119098256, + "grad_norm": 0.6024120570164246, + "learning_rate": 2.0793987334256637e-07, + "loss": 0.1281, + "step": 550 + }, + { + "epoch": 0.9374734155678435, + "grad_norm": 0.6161313566440694, + "learning_rate": 1.968883054357562e-07, + "loss": 0.0993, + "step": 551 + }, + { + "epoch": 0.9391748192258613, + "grad_norm": 0.6884471232807766, + "learning_rate": 1.861355770696549e-07, + "loss": 0.1419, + "step": 552 + }, + { + "epoch": 0.9408762228838792, + "grad_norm": 0.8490051863716163, + "learning_rate": 1.7568201603205827e-07, + "loss": 0.1642, + "step": 553 + }, + { + "epoch": 0.942577626541897, + "grad_norm": 0.7040499901991882, + "learning_rate": 1.6552794099090718e-07, + "loss": 0.1874, + "step": 554 + }, + { + "epoch": 0.9442790301999149, + "grad_norm": 0.4758175007683842, + "learning_rate": 1.5567366148455887e-07, + "loss": 0.0881, + "step": 555 + }, + { + "epoch": 0.9459804338579328, + "grad_norm": 0.4722309723663649, + "learning_rate": 1.4611947791236314e-07, + "loss": 0.0946, + "step": 556 + }, + { + "epoch": 0.9476818375159507, + "grad_norm": 0.5848989846654977, + "learning_rate": 1.3686568152549539e-07, + "loss": 0.1285, + "step": 557 + }, + { + "epoch": 0.9493832411739686, + "grad_norm": 0.6310014700455454, + "learning_rate": 1.2791255441809037e-07, + "loss": 0.1557, + "step": 558 + }, + { + "epoch": 0.9510846448319864, + "grad_norm": 0.5963374113714427, + "learning_rate": 1.1926036951862563e-07, + "loss": 0.1302, + "step": 559 + }, + { + "epoch": 0.9527860484900043, + "grad_norm": 0.5758036484872573, + "learning_rate": 1.109093905816172e-07, + "loss": 0.1305, + "step": 560 + }, + { + "epoch": 0.9544874521480221, + "grad_norm": 0.6558192274073563, + "learning_rate": 1.0285987217957038e-07, + "loss": 0.1667, + "step": 561 + }, + { + "epoch": 0.95618885580604, + "grad_norm": 0.8974658491015558, + "learning_rate": 9.511205969522263e-08, + "loss": 0.1751, + "step": 562 + }, + { + "epoch": 0.9578902594640578, + "grad_norm": 0.5819184064362098, + "learning_rate": 8.76661893140629e-08, + "loss": 0.1245, + "step": 563 + }, + { + "epoch": 0.9595916631220757, + "grad_norm": 0.6533402391561167, + "learning_rate": 8.052248801712958e-08, + "loss": 0.1485, + "step": 564 + }, + { + "epoch": 0.9612930667800936, + "grad_norm": 0.5478741217833424, + "learning_rate": 7.36811735740961e-08, + "loss": 0.1254, + "step": 565 + }, + { + "epoch": 0.9629944704381115, + "grad_norm": 0.4762227122729945, + "learning_rate": 6.714245453662504e-08, + "loss": 0.1002, + "step": 566 + }, + { + "epoch": 0.9646958740961293, + "grad_norm": 0.6489232881105945, + "learning_rate": 6.090653023201997e-08, + "loss": 0.1678, + "step": 567 + }, + { + "epoch": 0.9663972777541472, + "grad_norm": 0.6820435179693418, + "learning_rate": 5.497359075714026e-08, + "loss": 0.1318, + "step": 568 + }, + { + "epoch": 0.968098681412165, + "grad_norm": 0.5194424310453702, + "learning_rate": 4.934381697261015e-08, + "loss": 0.107, + "step": 569 + }, + { + "epoch": 0.9698000850701829, + "grad_norm": 0.5767259534107748, + "learning_rate": 4.401738049730653e-08, + "loss": 0.131, + "step": 570 + }, + { + "epoch": 0.9715014887282007, + "grad_norm": 0.822836988607629, + "learning_rate": 3.899444370312533e-08, + "loss": 0.2213, + "step": 571 + }, + { + "epoch": 0.9732028923862186, + "grad_norm": 0.6344299417297167, + "learning_rate": 3.4275159710032146e-08, + "loss": 0.1489, + "step": 572 + }, + { + "epoch": 0.9749042960442365, + "grad_norm": 0.580669991996899, + "learning_rate": 2.9859672381392644e-08, + "loss": 0.1247, + "step": 573 + }, + { + "epoch": 0.9766056997022544, + "grad_norm": 0.703203890284106, + "learning_rate": 2.574811631959273e-08, + "loss": 0.1736, + "step": 574 + }, + { + "epoch": 0.9783071033602723, + "grad_norm": 0.7897089666047964, + "learning_rate": 2.1940616861929608e-08, + "loss": 0.163, + "step": 575 + }, + { + "epoch": 0.9800085070182901, + "grad_norm": 0.6803631170844151, + "learning_rate": 1.8437290076792624e-08, + "loss": 0.153, + "step": 576 + }, + { + "epoch": 0.981709910676308, + "grad_norm": 0.5545468829005407, + "learning_rate": 1.5238242760126088e-08, + "loss": 0.1313, + "step": 577 + }, + { + "epoch": 0.9834113143343258, + "grad_norm": 0.8164414099561185, + "learning_rate": 1.234357243217188e-08, + "loss": 0.2041, + "step": 578 + }, + { + "epoch": 0.9851127179923437, + "grad_norm": 0.6514161235456599, + "learning_rate": 9.753367334499608e-09, + "loss": 0.1344, + "step": 579 + }, + { + "epoch": 0.9868141216503615, + "grad_norm": 0.5985215035231154, + "learning_rate": 7.467706427312093e-09, + "loss": 0.1188, + "step": 580 + }, + { + "epoch": 0.9885155253083794, + "grad_norm": 0.5063760313265934, + "learning_rate": 5.486659387043958e-09, + "loss": 0.1067, + "step": 581 + }, + { + "epoch": 0.9902169289663972, + "grad_norm": 0.5186821958679388, + "learning_rate": 3.810286604232216e-09, + "loss": 0.1034, + "step": 582 + }, + { + "epoch": 0.9919183326244151, + "grad_norm": 0.6651559646645941, + "learning_rate": 2.4386391816777488e-09, + "loss": 0.1629, + "step": 583 + }, + { + "epoch": 0.993619736282433, + "grad_norm": 0.4905063686357112, + "learning_rate": 1.3717589328898773e-09, + "loss": 0.0984, + "step": 584 + }, + { + "epoch": 0.9953211399404509, + "grad_norm": 0.7608623641135226, + "learning_rate": 6.096783808062778e-10, + "loss": 0.1734, + "step": 585 + }, + { + "epoch": 0.9970225435984688, + "grad_norm": 0.739767025879318, + "learning_rate": 1.524207568059932e-10, + "loss": 0.1384, + "step": 586 + }, + { + "epoch": 0.9987239472564866, + "grad_norm": 0.5060518028838383, + "learning_rate": 0.0, + "loss": 0.1231, + "step": 587 + }, + { + "epoch": 0.9987239472564866, + "step": 587, + "total_flos": 310771556384768.0, + "train_loss": 0.25847848631708137, + "train_runtime": 2597.8519, + "train_samples_per_second": 28.957, + "train_steps_per_second": 0.226 + } + ], + "logging_steps": 1.0, + "max_steps": 587, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 50000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 310771556384768.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}