diff --git "a/ajaymin28/vl-sg/checkpoint-2000/trainer_state.json" "b/ajaymin28/vl-sg/checkpoint-2000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/ajaymin28/vl-sg/checkpoint-2000/trainer_state.json" @@ -0,0 +1,14021 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9762845849802373, + "eval_steps": 500, + "global_step": 2000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 13.631305851388058, + "learning_rate": 1.639344262295082e-07, + "loss": 0.7943, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 13.346641941057879, + "learning_rate": 3.278688524590164e-07, + "loss": 0.8013, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 12.925720270742593, + "learning_rate": 4.918032786885246e-07, + "loss": 0.8416, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 15.580924168656784, + "learning_rate": 6.557377049180328e-07, + "loss": 0.8441, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 13.551913967770986, + "learning_rate": 8.196721311475409e-07, + "loss": 0.735, + "step": 5 + }, + { + "epoch": 0.01, + "grad_norm": 16.620840204280576, + "learning_rate": 9.836065573770493e-07, + "loss": 0.8406, + "step": 6 + }, + { + "epoch": 0.01, + "grad_norm": 11.245707797327382, + "learning_rate": 1.1475409836065575e-06, + "loss": 0.6569, + "step": 7 + }, + { + "epoch": 0.01, + "grad_norm": 11.19931522169994, + "learning_rate": 1.3114754098360657e-06, + "loss": 0.6519, + "step": 8 + }, + { + "epoch": 0.01, + "grad_norm": 27.80206123230587, + "learning_rate": 1.4754098360655739e-06, + "loss": 0.652, + "step": 9 + }, + { + "epoch": 0.01, + "grad_norm": 9.075520262991187, + "learning_rate": 1.6393442622950819e-06, + "loss": 0.5741, + "step": 10 + }, + { + "epoch": 0.01, + "grad_norm": 13.673104969696313, + "learning_rate": 1.8032786885245903e-06, + "loss": 0.7034, + "step": 11 + }, + { + "epoch": 0.01, + "grad_norm": 11.465018340927124, + "learning_rate": 1.9672131147540985e-06, + "loss": 0.5484, + "step": 12 + }, + { + "epoch": 0.01, + "grad_norm": 9.595314473550937, + "learning_rate": 2.1311475409836067e-06, + "loss": 0.4478, + "step": 13 + }, + { + "epoch": 0.01, + "grad_norm": 7.3040612954225494, + "learning_rate": 2.295081967213115e-06, + "loss": 0.3268, + "step": 14 + }, + { + "epoch": 0.01, + "grad_norm": 5.722360257746658, + "learning_rate": 2.459016393442623e-06, + "loss": 0.3072, + "step": 15 + }, + { + "epoch": 0.02, + "grad_norm": 7.0490329481332985, + "learning_rate": 2.6229508196721314e-06, + "loss": 0.2465, + "step": 16 + }, + { + "epoch": 0.02, + "grad_norm": 8.131500605280447, + "learning_rate": 2.786885245901639e-06, + "loss": 0.2873, + "step": 17 + }, + { + "epoch": 0.02, + "grad_norm": 6.0745203091956474, + "learning_rate": 2.9508196721311478e-06, + "loss": 0.2323, + "step": 18 + }, + { + "epoch": 0.02, + "grad_norm": 5.161054943272229, + "learning_rate": 3.114754098360656e-06, + "loss": 0.2304, + "step": 19 + }, + { + "epoch": 0.02, + "grad_norm": 5.211697637623634, + "learning_rate": 3.2786885245901638e-06, + "loss": 0.2446, + "step": 20 + }, + { + "epoch": 0.02, + "grad_norm": 4.081749210877428, + "learning_rate": 3.4426229508196724e-06, + "loss": 0.1992, + "step": 21 + }, + { + "epoch": 0.02, + "grad_norm": 3.9576919724324435, + "learning_rate": 3.6065573770491806e-06, + "loss": 0.1976, + "step": 22 + }, + { + "epoch": 0.02, + "grad_norm": 3.919484126413122, + "learning_rate": 3.7704918032786884e-06, + "loss": 0.1694, + "step": 23 + }, + { + "epoch": 0.02, + "grad_norm": 3.990436329009411, + "learning_rate": 3.934426229508197e-06, + "loss": 0.1704, + "step": 24 + }, + { + "epoch": 0.02, + "grad_norm": 5.904975657739626, + "learning_rate": 4.098360655737705e-06, + "loss": 0.2249, + "step": 25 + }, + { + "epoch": 0.03, + "grad_norm": 3.400257571284763, + "learning_rate": 4.2622950819672135e-06, + "loss": 0.1817, + "step": 26 + }, + { + "epoch": 0.03, + "grad_norm": 3.6630000285111053, + "learning_rate": 4.426229508196722e-06, + "loss": 0.1724, + "step": 27 + }, + { + "epoch": 0.03, + "grad_norm": 4.8660635864126, + "learning_rate": 4.59016393442623e-06, + "loss": 0.2231, + "step": 28 + }, + { + "epoch": 0.03, + "grad_norm": 6.041616731907119, + "learning_rate": 4.754098360655738e-06, + "loss": 0.2331, + "step": 29 + }, + { + "epoch": 0.03, + "grad_norm": 4.699798847116446, + "learning_rate": 4.918032786885246e-06, + "loss": 0.1944, + "step": 30 + }, + { + "epoch": 0.03, + "grad_norm": 5.051559842542771, + "learning_rate": 5.0819672131147545e-06, + "loss": 0.1742, + "step": 31 + }, + { + "epoch": 0.03, + "grad_norm": 4.0126589941382855, + "learning_rate": 5.245901639344263e-06, + "loss": 0.1737, + "step": 32 + }, + { + "epoch": 0.03, + "grad_norm": 4.962155734254877, + "learning_rate": 5.409836065573772e-06, + "loss": 0.1951, + "step": 33 + }, + { + "epoch": 0.03, + "grad_norm": 6.367924200321454, + "learning_rate": 5.573770491803278e-06, + "loss": 0.2053, + "step": 34 + }, + { + "epoch": 0.03, + "grad_norm": 3.581007083931671, + "learning_rate": 5.737704918032787e-06, + "loss": 0.1344, + "step": 35 + }, + { + "epoch": 0.04, + "grad_norm": 4.814175913297936, + "learning_rate": 5.9016393442622956e-06, + "loss": 0.185, + "step": 36 + }, + { + "epoch": 0.04, + "grad_norm": 3.923834811559088, + "learning_rate": 6.065573770491804e-06, + "loss": 0.1955, + "step": 37 + }, + { + "epoch": 0.04, + "grad_norm": 2.8227363541701314, + "learning_rate": 6.229508196721312e-06, + "loss": 0.1639, + "step": 38 + }, + { + "epoch": 0.04, + "grad_norm": 3.6972292584990414, + "learning_rate": 6.393442622950821e-06, + "loss": 0.1703, + "step": 39 + }, + { + "epoch": 0.04, + "grad_norm": 2.6507311826944577, + "learning_rate": 6.5573770491803276e-06, + "loss": 0.178, + "step": 40 + }, + { + "epoch": 0.04, + "grad_norm": 3.9993573884654845, + "learning_rate": 6.721311475409837e-06, + "loss": 0.1552, + "step": 41 + }, + { + "epoch": 0.04, + "grad_norm": 5.629217820811896, + "learning_rate": 6.885245901639345e-06, + "loss": 0.1642, + "step": 42 + }, + { + "epoch": 0.04, + "grad_norm": 2.2303769162789946, + "learning_rate": 7.049180327868853e-06, + "loss": 0.1433, + "step": 43 + }, + { + "epoch": 0.04, + "grad_norm": 2.9022939442067437, + "learning_rate": 7.213114754098361e-06, + "loss": 0.1671, + "step": 44 + }, + { + "epoch": 0.04, + "grad_norm": 3.518448605554198, + "learning_rate": 7.3770491803278695e-06, + "loss": 0.1336, + "step": 45 + }, + { + "epoch": 0.05, + "grad_norm": 4.034618971631891, + "learning_rate": 7.540983606557377e-06, + "loss": 0.1501, + "step": 46 + }, + { + "epoch": 0.05, + "grad_norm": 3.26390119636968, + "learning_rate": 7.704918032786886e-06, + "loss": 0.1275, + "step": 47 + }, + { + "epoch": 0.05, + "grad_norm": 3.5565200003009316, + "learning_rate": 7.868852459016394e-06, + "loss": 0.1572, + "step": 48 + }, + { + "epoch": 0.05, + "grad_norm": 3.4801405609392058, + "learning_rate": 8.032786885245902e-06, + "loss": 0.1354, + "step": 49 + }, + { + "epoch": 0.05, + "grad_norm": 4.152281452474444, + "learning_rate": 8.19672131147541e-06, + "loss": 0.1358, + "step": 50 + }, + { + "epoch": 0.05, + "grad_norm": 2.8697633273885104, + "learning_rate": 8.360655737704919e-06, + "loss": 0.1512, + "step": 51 + }, + { + "epoch": 0.05, + "grad_norm": 4.5821217782901185, + "learning_rate": 8.524590163934427e-06, + "loss": 0.1327, + "step": 52 + }, + { + "epoch": 0.05, + "grad_norm": 1.915857828990885, + "learning_rate": 8.688524590163935e-06, + "loss": 0.1177, + "step": 53 + }, + { + "epoch": 0.05, + "grad_norm": 2.643567686766952, + "learning_rate": 8.852459016393443e-06, + "loss": 0.1116, + "step": 54 + }, + { + "epoch": 0.05, + "grad_norm": 4.827720168787582, + "learning_rate": 9.016393442622952e-06, + "loss": 0.1399, + "step": 55 + }, + { + "epoch": 0.06, + "grad_norm": 2.3325522278488298, + "learning_rate": 9.18032786885246e-06, + "loss": 0.1297, + "step": 56 + }, + { + "epoch": 0.06, + "grad_norm": 4.214346732745731, + "learning_rate": 9.344262295081968e-06, + "loss": 0.1792, + "step": 57 + }, + { + "epoch": 0.06, + "grad_norm": 2.828284793640583, + "learning_rate": 9.508196721311476e-06, + "loss": 0.1461, + "step": 58 + }, + { + "epoch": 0.06, + "grad_norm": 3.9596409765258485, + "learning_rate": 9.672131147540984e-06, + "loss": 0.1513, + "step": 59 + }, + { + "epoch": 0.06, + "grad_norm": 3.2036564881308625, + "learning_rate": 9.836065573770493e-06, + "loss": 0.1485, + "step": 60 + }, + { + "epoch": 0.06, + "grad_norm": 4.456609953730327, + "learning_rate": 1e-05, + "loss": 0.1056, + "step": 61 + }, + { + "epoch": 0.06, + "grad_norm": 7.707216229085872, + "learning_rate": 9.999993596770583e-06, + "loss": 0.1925, + "step": 62 + }, + { + "epoch": 0.06, + "grad_norm": 2.7136348282084795, + "learning_rate": 9.999974387098732e-06, + "loss": 0.1534, + "step": 63 + }, + { + "epoch": 0.06, + "grad_norm": 2.8902574550035913, + "learning_rate": 9.999942371033649e-06, + "loss": 0.1443, + "step": 64 + }, + { + "epoch": 0.06, + "grad_norm": 3.4920991371319463, + "learning_rate": 9.999897548657335e-06, + "loss": 0.1736, + "step": 65 + }, + { + "epoch": 0.07, + "grad_norm": 2.910381719634011, + "learning_rate": 9.999839920084596e-06, + "loss": 0.1346, + "step": 66 + }, + { + "epoch": 0.07, + "grad_norm": 3.8276514936410475, + "learning_rate": 9.999769485463033e-06, + "loss": 0.1439, + "step": 67 + }, + { + "epoch": 0.07, + "grad_norm": 2.8112591500388016, + "learning_rate": 9.999686244973048e-06, + "loss": 0.1058, + "step": 68 + }, + { + "epoch": 0.07, + "grad_norm": 2.3201219851502324, + "learning_rate": 9.999590198827851e-06, + "loss": 0.1096, + "step": 69 + }, + { + "epoch": 0.07, + "grad_norm": 3.675174239613632, + "learning_rate": 9.999481347273437e-06, + "loss": 0.1616, + "step": 70 + }, + { + "epoch": 0.07, + "grad_norm": 2.4734469072988774, + "learning_rate": 9.99935969058861e-06, + "loss": 0.1475, + "step": 71 + }, + { + "epoch": 0.07, + "grad_norm": 3.029244154732283, + "learning_rate": 9.999225229084967e-06, + "loss": 0.1441, + "step": 72 + }, + { + "epoch": 0.07, + "grad_norm": 2.74263319793326, + "learning_rate": 9.999077963106906e-06, + "loss": 0.1166, + "step": 73 + }, + { + "epoch": 0.07, + "grad_norm": 2.1161867356606834, + "learning_rate": 9.998917893031615e-06, + "loss": 0.1099, + "step": 74 + }, + { + "epoch": 0.07, + "grad_norm": 2.184783373365182, + "learning_rate": 9.99874501926908e-06, + "loss": 0.1147, + "step": 75 + }, + { + "epoch": 0.08, + "grad_norm": 2.477787258132454, + "learning_rate": 9.998559342262086e-06, + "loss": 0.1247, + "step": 76 + }, + { + "epoch": 0.08, + "grad_norm": 4.334018069534599, + "learning_rate": 9.998360862486199e-06, + "loss": 0.1204, + "step": 77 + }, + { + "epoch": 0.08, + "grad_norm": 2.8005941630072586, + "learning_rate": 9.998149580449787e-06, + "loss": 0.142, + "step": 78 + }, + { + "epoch": 0.08, + "grad_norm": 2.5403091682911296, + "learning_rate": 9.997925496694006e-06, + "loss": 0.0946, + "step": 79 + }, + { + "epoch": 0.08, + "grad_norm": 1.9804191935413014, + "learning_rate": 9.9976886117928e-06, + "loss": 0.1286, + "step": 80 + }, + { + "epoch": 0.08, + "grad_norm": 3.923557713497968, + "learning_rate": 9.997438926352896e-06, + "loss": 0.1837, + "step": 81 + }, + { + "epoch": 0.08, + "grad_norm": 2.670219851094306, + "learning_rate": 9.997176441013817e-06, + "loss": 0.1241, + "step": 82 + }, + { + "epoch": 0.08, + "grad_norm": 1.898925596705727, + "learning_rate": 9.99690115644786e-06, + "loss": 0.103, + "step": 83 + }, + { + "epoch": 0.08, + "grad_norm": 3.3909556938573298, + "learning_rate": 9.996613073360111e-06, + "loss": 0.1261, + "step": 84 + }, + { + "epoch": 0.08, + "grad_norm": 3.56655736296895, + "learning_rate": 9.996312192488436e-06, + "loss": 0.1625, + "step": 85 + }, + { + "epoch": 0.08, + "grad_norm": 3.7522296492231564, + "learning_rate": 9.995998514603478e-06, + "loss": 0.126, + "step": 86 + }, + { + "epoch": 0.09, + "grad_norm": 2.0743647967016217, + "learning_rate": 9.995672040508656e-06, + "loss": 0.127, + "step": 87 + }, + { + "epoch": 0.09, + "grad_norm": 2.3293214870202488, + "learning_rate": 9.995332771040167e-06, + "loss": 0.1302, + "step": 88 + }, + { + "epoch": 0.09, + "grad_norm": 3.5399329041251377, + "learning_rate": 9.994980707066979e-06, + "loss": 0.1361, + "step": 89 + }, + { + "epoch": 0.09, + "grad_norm": 3.628114945814919, + "learning_rate": 9.99461584949083e-06, + "loss": 0.1685, + "step": 90 + }, + { + "epoch": 0.09, + "grad_norm": 3.105678014096815, + "learning_rate": 9.994238199246226e-06, + "loss": 0.1358, + "step": 91 + }, + { + "epoch": 0.09, + "grad_norm": 2.807630136340985, + "learning_rate": 9.993847757300441e-06, + "loss": 0.1251, + "step": 92 + }, + { + "epoch": 0.09, + "grad_norm": 3.0778212654654142, + "learning_rate": 9.993444524653509e-06, + "loss": 0.1315, + "step": 93 + }, + { + "epoch": 0.09, + "grad_norm": 2.7611031344727106, + "learning_rate": 9.99302850233823e-06, + "loss": 0.1507, + "step": 94 + }, + { + "epoch": 0.09, + "grad_norm": 3.3385040063959, + "learning_rate": 9.992599691420154e-06, + "loss": 0.1321, + "step": 95 + }, + { + "epoch": 0.09, + "grad_norm": 3.032463630698966, + "learning_rate": 9.992158092997593e-06, + "loss": 0.1338, + "step": 96 + }, + { + "epoch": 0.1, + "grad_norm": 2.121891250986305, + "learning_rate": 9.99170370820161e-06, + "loss": 0.1069, + "step": 97 + }, + { + "epoch": 0.1, + "grad_norm": 1.7807561340523663, + "learning_rate": 9.991236538196016e-06, + "loss": 0.0973, + "step": 98 + }, + { + "epoch": 0.1, + "grad_norm": 3.2095866667891007, + "learning_rate": 9.99075658417737e-06, + "loss": 0.1206, + "step": 99 + }, + { + "epoch": 0.1, + "grad_norm": 4.005155338304001, + "learning_rate": 9.990263847374976e-06, + "loss": 0.1251, + "step": 100 + }, + { + "epoch": 0.1, + "grad_norm": 2.7454326241934206, + "learning_rate": 9.989758329050874e-06, + "loss": 0.1259, + "step": 101 + }, + { + "epoch": 0.1, + "grad_norm": 2.4997644178125036, + "learning_rate": 9.989240030499846e-06, + "loss": 0.1137, + "step": 102 + }, + { + "epoch": 0.1, + "grad_norm": 1.7860502564975003, + "learning_rate": 9.988708953049402e-06, + "loss": 0.0806, + "step": 103 + }, + { + "epoch": 0.1, + "grad_norm": 3.709783249623465, + "learning_rate": 9.988165098059791e-06, + "loss": 0.0849, + "step": 104 + }, + { + "epoch": 0.1, + "grad_norm": 4.329249225305103, + "learning_rate": 9.987608466923982e-06, + "loss": 0.1124, + "step": 105 + }, + { + "epoch": 0.1, + "grad_norm": 2.7750026523349485, + "learning_rate": 9.987039061067672e-06, + "loss": 0.1364, + "step": 106 + }, + { + "epoch": 0.11, + "grad_norm": 2.747140189403959, + "learning_rate": 9.986456881949271e-06, + "loss": 0.1191, + "step": 107 + }, + { + "epoch": 0.11, + "grad_norm": 2.165112231084418, + "learning_rate": 9.985861931059914e-06, + "loss": 0.1024, + "step": 108 + }, + { + "epoch": 0.11, + "grad_norm": 1.6540364928011821, + "learning_rate": 9.985254209923441e-06, + "loss": 0.0785, + "step": 109 + }, + { + "epoch": 0.11, + "grad_norm": 2.6773329820403036, + "learning_rate": 9.984633720096405e-06, + "loss": 0.1381, + "step": 110 + }, + { + "epoch": 0.11, + "grad_norm": 2.342199690015637, + "learning_rate": 9.984000463168059e-06, + "loss": 0.1246, + "step": 111 + }, + { + "epoch": 0.11, + "grad_norm": 1.5558883663248728, + "learning_rate": 9.983354440760363e-06, + "loss": 0.0819, + "step": 112 + }, + { + "epoch": 0.11, + "grad_norm": 2.170180142906283, + "learning_rate": 9.982695654527966e-06, + "loss": 0.1205, + "step": 113 + }, + { + "epoch": 0.11, + "grad_norm": 2.0678134875414313, + "learning_rate": 9.98202410615821e-06, + "loss": 0.1162, + "step": 114 + }, + { + "epoch": 0.11, + "grad_norm": 2.364069460723156, + "learning_rate": 9.98133979737113e-06, + "loss": 0.1225, + "step": 115 + }, + { + "epoch": 0.11, + "grad_norm": 1.3581386805595628, + "learning_rate": 9.98064272991944e-06, + "loss": 0.0847, + "step": 116 + }, + { + "epoch": 0.12, + "grad_norm": 2.200954920692463, + "learning_rate": 9.97993290558853e-06, + "loss": 0.1115, + "step": 117 + }, + { + "epoch": 0.12, + "grad_norm": 2.471145135562813, + "learning_rate": 9.97921032619647e-06, + "loss": 0.1026, + "step": 118 + }, + { + "epoch": 0.12, + "grad_norm": 2.810870720141375, + "learning_rate": 9.978474993593996e-06, + "loss": 0.0932, + "step": 119 + }, + { + "epoch": 0.12, + "grad_norm": 2.1161776267633927, + "learning_rate": 9.977726909664509e-06, + "loss": 0.0858, + "step": 120 + }, + { + "epoch": 0.12, + "grad_norm": 2.3126812719954857, + "learning_rate": 9.97696607632407e-06, + "loss": 0.1193, + "step": 121 + }, + { + "epoch": 0.12, + "grad_norm": 4.38494203181323, + "learning_rate": 9.976192495521397e-06, + "loss": 0.1434, + "step": 122 + }, + { + "epoch": 0.12, + "grad_norm": 2.8894160681506684, + "learning_rate": 9.975406169237853e-06, + "loss": 0.119, + "step": 123 + }, + { + "epoch": 0.12, + "grad_norm": 2.5708693258091966, + "learning_rate": 9.974607099487452e-06, + "loss": 0.1178, + "step": 124 + }, + { + "epoch": 0.12, + "grad_norm": 3.3884674963007506, + "learning_rate": 9.973795288316843e-06, + "loss": 0.1452, + "step": 125 + }, + { + "epoch": 0.12, + "grad_norm": 2.1641350610587784, + "learning_rate": 9.972970737805312e-06, + "loss": 0.1058, + "step": 126 + }, + { + "epoch": 0.13, + "grad_norm": 2.4935395649774468, + "learning_rate": 9.972133450064774e-06, + "loss": 0.1261, + "step": 127 + }, + { + "epoch": 0.13, + "grad_norm": 2.529944326619432, + "learning_rate": 9.971283427239764e-06, + "loss": 0.1574, + "step": 128 + }, + { + "epoch": 0.13, + "grad_norm": 3.152117071068294, + "learning_rate": 9.970420671507444e-06, + "loss": 0.1287, + "step": 129 + }, + { + "epoch": 0.13, + "grad_norm": 1.7931505522057154, + "learning_rate": 9.969545185077578e-06, + "loss": 0.0834, + "step": 130 + }, + { + "epoch": 0.13, + "grad_norm": 2.5714230930757864, + "learning_rate": 9.968656970192545e-06, + "loss": 0.1086, + "step": 131 + }, + { + "epoch": 0.13, + "grad_norm": 1.4226134781975723, + "learning_rate": 9.967756029127321e-06, + "loss": 0.0746, + "step": 132 + }, + { + "epoch": 0.13, + "grad_norm": 2.1304563124483606, + "learning_rate": 9.96684236418948e-06, + "loss": 0.081, + "step": 133 + }, + { + "epoch": 0.13, + "grad_norm": 1.997037756466334, + "learning_rate": 9.965915977719186e-06, + "loss": 0.0828, + "step": 134 + }, + { + "epoch": 0.13, + "grad_norm": 1.8748016550446636, + "learning_rate": 9.964976872089181e-06, + "loss": 0.1029, + "step": 135 + }, + { + "epoch": 0.13, + "grad_norm": 2.1324337837984615, + "learning_rate": 9.964025049704791e-06, + "loss": 0.0912, + "step": 136 + }, + { + "epoch": 0.14, + "grad_norm": 2.729065553381391, + "learning_rate": 9.96306051300391e-06, + "loss": 0.1081, + "step": 137 + }, + { + "epoch": 0.14, + "grad_norm": 2.0390277690579706, + "learning_rate": 9.962083264456999e-06, + "loss": 0.0983, + "step": 138 + }, + { + "epoch": 0.14, + "grad_norm": 2.865830909717086, + "learning_rate": 9.961093306567076e-06, + "loss": 0.114, + "step": 139 + }, + { + "epoch": 0.14, + "grad_norm": 2.3533721656364404, + "learning_rate": 9.960090641869713e-06, + "loss": 0.1084, + "step": 140 + }, + { + "epoch": 0.14, + "grad_norm": 1.5551902725664026, + "learning_rate": 9.959075272933026e-06, + "loss": 0.1113, + "step": 141 + }, + { + "epoch": 0.14, + "grad_norm": 2.071888359696449, + "learning_rate": 9.958047202357669e-06, + "loss": 0.1174, + "step": 142 + }, + { + "epoch": 0.14, + "grad_norm": 2.8366168158210243, + "learning_rate": 9.957006432776835e-06, + "loss": 0.1237, + "step": 143 + }, + { + "epoch": 0.14, + "grad_norm": 2.2814774928629244, + "learning_rate": 9.955952966856235e-06, + "loss": 0.0816, + "step": 144 + }, + { + "epoch": 0.14, + "grad_norm": 3.6787467519373642, + "learning_rate": 9.954886807294104e-06, + "loss": 0.1389, + "step": 145 + }, + { + "epoch": 0.14, + "grad_norm": 1.776350872381563, + "learning_rate": 9.953807956821186e-06, + "loss": 0.0973, + "step": 146 + }, + { + "epoch": 0.15, + "grad_norm": 2.743539597608596, + "learning_rate": 9.952716418200736e-06, + "loss": 0.0925, + "step": 147 + }, + { + "epoch": 0.15, + "grad_norm": 1.4252300401619833, + "learning_rate": 9.9516121942285e-06, + "loss": 0.0931, + "step": 148 + }, + { + "epoch": 0.15, + "grad_norm": 2.263822153457368, + "learning_rate": 9.950495287732718e-06, + "loss": 0.0909, + "step": 149 + }, + { + "epoch": 0.15, + "grad_norm": 2.140538985744693, + "learning_rate": 9.949365701574111e-06, + "loss": 0.0912, + "step": 150 + }, + { + "epoch": 0.15, + "grad_norm": 2.138218262804208, + "learning_rate": 9.948223438645885e-06, + "loss": 0.1138, + "step": 151 + }, + { + "epoch": 0.15, + "grad_norm": 1.8680837459099324, + "learning_rate": 9.947068501873702e-06, + "loss": 0.1038, + "step": 152 + }, + { + "epoch": 0.15, + "grad_norm": 1.4482792126490553, + "learning_rate": 9.945900894215697e-06, + "loss": 0.1034, + "step": 153 + }, + { + "epoch": 0.15, + "grad_norm": 2.585416283038379, + "learning_rate": 9.94472061866245e-06, + "loss": 0.0943, + "step": 154 + }, + { + "epoch": 0.15, + "grad_norm": 2.242948767982364, + "learning_rate": 9.943527678236993e-06, + "loss": 0.1127, + "step": 155 + }, + { + "epoch": 0.15, + "grad_norm": 1.7899257398914066, + "learning_rate": 9.942322075994795e-06, + "loss": 0.105, + "step": 156 + }, + { + "epoch": 0.16, + "grad_norm": 1.796907562819366, + "learning_rate": 9.941103815023754e-06, + "loss": 0.0979, + "step": 157 + }, + { + "epoch": 0.16, + "grad_norm": 1.992648855587167, + "learning_rate": 9.939872898444193e-06, + "loss": 0.0841, + "step": 158 + }, + { + "epoch": 0.16, + "grad_norm": 1.4007767172605858, + "learning_rate": 9.938629329408846e-06, + "loss": 0.0873, + "step": 159 + }, + { + "epoch": 0.16, + "grad_norm": 1.5723243881307907, + "learning_rate": 9.937373111102859e-06, + "loss": 0.0946, + "step": 160 + }, + { + "epoch": 0.16, + "grad_norm": 1.816342977635474, + "learning_rate": 9.936104246743773e-06, + "loss": 0.0912, + "step": 161 + }, + { + "epoch": 0.16, + "grad_norm": 2.2290152164656765, + "learning_rate": 9.934822739581518e-06, + "loss": 0.1103, + "step": 162 + }, + { + "epoch": 0.16, + "grad_norm": 2.487123775788363, + "learning_rate": 9.93352859289841e-06, + "loss": 0.1247, + "step": 163 + }, + { + "epoch": 0.16, + "grad_norm": 1.3289229263800526, + "learning_rate": 9.932221810009136e-06, + "loss": 0.0765, + "step": 164 + }, + { + "epoch": 0.16, + "grad_norm": 1.441273715142216, + "learning_rate": 9.930902394260746e-06, + "loss": 0.0795, + "step": 165 + }, + { + "epoch": 0.16, + "grad_norm": 1.9404987464382477, + "learning_rate": 9.92957034903265e-06, + "loss": 0.1015, + "step": 166 + }, + { + "epoch": 0.17, + "grad_norm": 1.3410400098803896, + "learning_rate": 9.928225677736605e-06, + "loss": 0.073, + "step": 167 + }, + { + "epoch": 0.17, + "grad_norm": 2.0502444988691617, + "learning_rate": 9.926868383816707e-06, + "loss": 0.0969, + "step": 168 + }, + { + "epoch": 0.17, + "grad_norm": 2.0877152198641795, + "learning_rate": 9.92549847074938e-06, + "loss": 0.1356, + "step": 169 + }, + { + "epoch": 0.17, + "grad_norm": 2.9152870160820563, + "learning_rate": 9.924115942043372e-06, + "loss": 0.1136, + "step": 170 + }, + { + "epoch": 0.17, + "grad_norm": 2.0743638774511117, + "learning_rate": 9.922720801239744e-06, + "loss": 0.1299, + "step": 171 + }, + { + "epoch": 0.17, + "grad_norm": 2.7953929763715504, + "learning_rate": 9.921313051911856e-06, + "loss": 0.0908, + "step": 172 + }, + { + "epoch": 0.17, + "grad_norm": 1.9157646131826696, + "learning_rate": 9.919892697665366e-06, + "loss": 0.0934, + "step": 173 + }, + { + "epoch": 0.17, + "grad_norm": 1.0178472764702429, + "learning_rate": 9.918459742138216e-06, + "loss": 0.0663, + "step": 174 + }, + { + "epoch": 0.17, + "grad_norm": 4.943463027055093, + "learning_rate": 9.917014189000622e-06, + "loss": 0.1212, + "step": 175 + }, + { + "epoch": 0.17, + "grad_norm": 2.0231520392386524, + "learning_rate": 9.915556041955067e-06, + "loss": 0.1125, + "step": 176 + }, + { + "epoch": 0.17, + "grad_norm": 2.0028604714913203, + "learning_rate": 9.914085304736292e-06, + "loss": 0.0851, + "step": 177 + }, + { + "epoch": 0.18, + "grad_norm": 1.3013421574166353, + "learning_rate": 9.912601981111287e-06, + "loss": 0.0693, + "step": 178 + }, + { + "epoch": 0.18, + "grad_norm": 5.039237793918344, + "learning_rate": 9.911106074879272e-06, + "loss": 0.1213, + "step": 179 + }, + { + "epoch": 0.18, + "grad_norm": 1.4853225489196868, + "learning_rate": 9.9095975898717e-06, + "loss": 0.0717, + "step": 180 + }, + { + "epoch": 0.18, + "grad_norm": 1.5225237739191446, + "learning_rate": 9.908076529952244e-06, + "loss": 0.0734, + "step": 181 + }, + { + "epoch": 0.18, + "grad_norm": 1.8510887662937323, + "learning_rate": 9.90654289901678e-06, + "loss": 0.0916, + "step": 182 + }, + { + "epoch": 0.18, + "grad_norm": 1.9812919920187775, + "learning_rate": 9.904996700993383e-06, + "loss": 0.1082, + "step": 183 + }, + { + "epoch": 0.18, + "grad_norm": 1.6600233528402641, + "learning_rate": 9.90343793984232e-06, + "loss": 0.1138, + "step": 184 + }, + { + "epoch": 0.18, + "grad_norm": 2.383212353529967, + "learning_rate": 9.901866619556033e-06, + "loss": 0.0827, + "step": 185 + }, + { + "epoch": 0.18, + "grad_norm": 2.925331137805128, + "learning_rate": 9.90028274415913e-06, + "loss": 0.1037, + "step": 186 + }, + { + "epoch": 0.18, + "grad_norm": 2.37916222656882, + "learning_rate": 9.898686317708381e-06, + "loss": 0.0784, + "step": 187 + }, + { + "epoch": 0.19, + "grad_norm": 2.2716641949041647, + "learning_rate": 9.897077344292695e-06, + "loss": 0.0951, + "step": 188 + }, + { + "epoch": 0.19, + "grad_norm": 2.9242092279254224, + "learning_rate": 9.89545582803313e-06, + "loss": 0.0993, + "step": 189 + }, + { + "epoch": 0.19, + "grad_norm": 4.12901206008378, + "learning_rate": 9.893821773082852e-06, + "loss": 0.1273, + "step": 190 + }, + { + "epoch": 0.19, + "grad_norm": 2.105626108417437, + "learning_rate": 9.892175183627161e-06, + "loss": 0.1047, + "step": 191 + }, + { + "epoch": 0.19, + "grad_norm": 4.3441589498848545, + "learning_rate": 9.890516063883449e-06, + "loss": 0.0932, + "step": 192 + }, + { + "epoch": 0.19, + "grad_norm": 1.8098066422942596, + "learning_rate": 9.888844418101207e-06, + "loss": 0.1016, + "step": 193 + }, + { + "epoch": 0.19, + "grad_norm": 2.0529364761864364, + "learning_rate": 9.887160250562007e-06, + "loss": 0.1207, + "step": 194 + }, + { + "epoch": 0.19, + "grad_norm": 1.9596501504369412, + "learning_rate": 9.885463565579494e-06, + "loss": 0.0956, + "step": 195 + }, + { + "epoch": 0.19, + "grad_norm": 1.1140435352484783, + "learning_rate": 9.883754367499372e-06, + "loss": 0.0734, + "step": 196 + }, + { + "epoch": 0.19, + "grad_norm": 2.7698226079660913, + "learning_rate": 9.8820326606994e-06, + "loss": 0.1388, + "step": 197 + }, + { + "epoch": 0.2, + "grad_norm": 1.6701850513217271, + "learning_rate": 9.880298449589365e-06, + "loss": 0.0958, + "step": 198 + }, + { + "epoch": 0.2, + "grad_norm": 2.2916206595956448, + "learning_rate": 9.878551738611092e-06, + "loss": 0.1151, + "step": 199 + }, + { + "epoch": 0.2, + "grad_norm": 1.6787179895305941, + "learning_rate": 9.876792532238417e-06, + "loss": 0.0862, + "step": 200 + }, + { + "epoch": 0.2, + "grad_norm": 2.4125068562773224, + "learning_rate": 9.87502083497718e-06, + "loss": 0.1087, + "step": 201 + }, + { + "epoch": 0.2, + "grad_norm": 1.3258740208971436, + "learning_rate": 9.873236651365215e-06, + "loss": 0.0681, + "step": 202 + }, + { + "epoch": 0.2, + "grad_norm": 2.1361575934052066, + "learning_rate": 9.871439985972338e-06, + "loss": 0.0855, + "step": 203 + }, + { + "epoch": 0.2, + "grad_norm": 1.417534590578083, + "learning_rate": 9.869630843400331e-06, + "loss": 0.064, + "step": 204 + }, + { + "epoch": 0.2, + "grad_norm": 2.0072193710609736, + "learning_rate": 9.867809228282937e-06, + "loss": 0.0842, + "step": 205 + }, + { + "epoch": 0.2, + "grad_norm": 5.084886745710113, + "learning_rate": 9.865975145285842e-06, + "loss": 0.0837, + "step": 206 + }, + { + "epoch": 0.2, + "grad_norm": 4.2565455054149535, + "learning_rate": 9.864128599106672e-06, + "loss": 0.093, + "step": 207 + }, + { + "epoch": 0.21, + "grad_norm": 3.4752672341475757, + "learning_rate": 9.862269594474967e-06, + "loss": 0.1319, + "step": 208 + }, + { + "epoch": 0.21, + "grad_norm": 2.2901130251571824, + "learning_rate": 9.86039813615218e-06, + "loss": 0.0852, + "step": 209 + }, + { + "epoch": 0.21, + "grad_norm": 1.4976044292493496, + "learning_rate": 9.858514228931664e-06, + "loss": 0.0856, + "step": 210 + }, + { + "epoch": 0.21, + "grad_norm": 1.8407522923248512, + "learning_rate": 9.856617877638656e-06, + "loss": 0.0928, + "step": 211 + }, + { + "epoch": 0.21, + "grad_norm": 2.476856421690967, + "learning_rate": 9.854709087130261e-06, + "loss": 0.0785, + "step": 212 + }, + { + "epoch": 0.21, + "grad_norm": 1.6556235800226011, + "learning_rate": 9.85278786229545e-06, + "loss": 0.076, + "step": 213 + }, + { + "epoch": 0.21, + "grad_norm": 2.534209397246005, + "learning_rate": 9.850854208055043e-06, + "loss": 0.0913, + "step": 214 + }, + { + "epoch": 0.21, + "grad_norm": 1.409274827048923, + "learning_rate": 9.848908129361689e-06, + "loss": 0.0856, + "step": 215 + }, + { + "epoch": 0.21, + "grad_norm": 1.6845946861454264, + "learning_rate": 9.846949631199865e-06, + "loss": 0.0765, + "step": 216 + }, + { + "epoch": 0.21, + "grad_norm": 2.0629627391913066, + "learning_rate": 9.844978718585855e-06, + "loss": 0.0907, + "step": 217 + }, + { + "epoch": 0.22, + "grad_norm": 3.0848081390621207, + "learning_rate": 9.842995396567744e-06, + "loss": 0.0755, + "step": 218 + }, + { + "epoch": 0.22, + "grad_norm": 2.937175272950418, + "learning_rate": 9.840999670225396e-06, + "loss": 0.0825, + "step": 219 + }, + { + "epoch": 0.22, + "grad_norm": 2.3382983974983316, + "learning_rate": 9.83899154467045e-06, + "loss": 0.1139, + "step": 220 + }, + { + "epoch": 0.22, + "grad_norm": 1.8713367237587217, + "learning_rate": 9.8369710250463e-06, + "loss": 0.0955, + "step": 221 + }, + { + "epoch": 0.22, + "grad_norm": 2.04053344806577, + "learning_rate": 9.834938116528086e-06, + "loss": 0.0851, + "step": 222 + }, + { + "epoch": 0.22, + "grad_norm": 2.8318815354282614, + "learning_rate": 9.832892824322681e-06, + "loss": 0.1112, + "step": 223 + }, + { + "epoch": 0.22, + "grad_norm": 5.8606361693646125, + "learning_rate": 9.830835153668675e-06, + "loss": 0.1507, + "step": 224 + }, + { + "epoch": 0.22, + "grad_norm": 2.7538317703216597, + "learning_rate": 9.828765109836362e-06, + "loss": 0.0914, + "step": 225 + }, + { + "epoch": 0.22, + "grad_norm": 1.9703680557018663, + "learning_rate": 9.826682698127731e-06, + "loss": 0.1028, + "step": 226 + }, + { + "epoch": 0.22, + "grad_norm": 3.1041065295008856, + "learning_rate": 9.824587923876442e-06, + "loss": 0.1184, + "step": 227 + }, + { + "epoch": 0.23, + "grad_norm": 2.39277051465413, + "learning_rate": 9.822480792447826e-06, + "loss": 0.1179, + "step": 228 + }, + { + "epoch": 0.23, + "grad_norm": 1.7491779318374767, + "learning_rate": 9.82036130923886e-06, + "loss": 0.0969, + "step": 229 + }, + { + "epoch": 0.23, + "grad_norm": 1.726332596955443, + "learning_rate": 9.81822947967816e-06, + "loss": 0.0698, + "step": 230 + }, + { + "epoch": 0.23, + "grad_norm": 1.0927820863569315, + "learning_rate": 9.81608530922596e-06, + "loss": 0.0586, + "step": 231 + }, + { + "epoch": 0.23, + "grad_norm": 2.3063159183295077, + "learning_rate": 9.81392880337411e-06, + "loss": 0.0859, + "step": 232 + }, + { + "epoch": 0.23, + "grad_norm": 1.9256645726866466, + "learning_rate": 9.81175996764605e-06, + "loss": 0.1002, + "step": 233 + }, + { + "epoch": 0.23, + "grad_norm": 1.631213101565748, + "learning_rate": 9.8095788075968e-06, + "loss": 0.0905, + "step": 234 + }, + { + "epoch": 0.23, + "grad_norm": 1.758854636637926, + "learning_rate": 9.807385328812948e-06, + "loss": 0.076, + "step": 235 + }, + { + "epoch": 0.23, + "grad_norm": 2.051432984550155, + "learning_rate": 9.805179536912635e-06, + "loss": 0.0792, + "step": 236 + }, + { + "epoch": 0.23, + "grad_norm": 2.1924640722920685, + "learning_rate": 9.802961437545532e-06, + "loss": 0.0997, + "step": 237 + }, + { + "epoch": 0.24, + "grad_norm": 4.414190947211591, + "learning_rate": 9.800731036392846e-06, + "loss": 0.0715, + "step": 238 + }, + { + "epoch": 0.24, + "grad_norm": 6.039154923532322, + "learning_rate": 9.79848833916728e-06, + "loss": 0.112, + "step": 239 + }, + { + "epoch": 0.24, + "grad_norm": 3.277383152775661, + "learning_rate": 9.796233351613036e-06, + "loss": 0.1544, + "step": 240 + }, + { + "epoch": 0.24, + "grad_norm": 1.823021976103906, + "learning_rate": 9.793966079505797e-06, + "loss": 0.0842, + "step": 241 + }, + { + "epoch": 0.24, + "grad_norm": 3.912512017914575, + "learning_rate": 9.791686528652708e-06, + "loss": 0.0974, + "step": 242 + }, + { + "epoch": 0.24, + "grad_norm": 1.9153410476183683, + "learning_rate": 9.789394704892364e-06, + "loss": 0.1026, + "step": 243 + }, + { + "epoch": 0.24, + "grad_norm": 2.807843325639007, + "learning_rate": 9.787090614094795e-06, + "loss": 0.0807, + "step": 244 + }, + { + "epoch": 0.24, + "grad_norm": 2.4132064235794966, + "learning_rate": 9.784774262161448e-06, + "loss": 0.0944, + "step": 245 + }, + { + "epoch": 0.24, + "grad_norm": 2.5293791979763536, + "learning_rate": 9.782445655025175e-06, + "loss": 0.115, + "step": 246 + }, + { + "epoch": 0.24, + "grad_norm": 1.428020426458093, + "learning_rate": 9.780104798650221e-06, + "loss": 0.0797, + "step": 247 + }, + { + "epoch": 0.25, + "grad_norm": 2.016570138879204, + "learning_rate": 9.777751699032203e-06, + "loss": 0.0894, + "step": 248 + }, + { + "epoch": 0.25, + "grad_norm": 2.0703814315935216, + "learning_rate": 9.775386362198093e-06, + "loss": 0.0686, + "step": 249 + }, + { + "epoch": 0.25, + "grad_norm": 2.246554230901814, + "learning_rate": 9.773008794206209e-06, + "loss": 0.0863, + "step": 250 + }, + { + "epoch": 0.25, + "grad_norm": 2.2872072039177085, + "learning_rate": 9.770619001146198e-06, + "loss": 0.0961, + "step": 251 + }, + { + "epoch": 0.25, + "grad_norm": 1.4319299004238022, + "learning_rate": 9.768216989139017e-06, + "loss": 0.0741, + "step": 252 + }, + { + "epoch": 0.25, + "grad_norm": 2.665388204235485, + "learning_rate": 9.765802764336917e-06, + "loss": 0.0858, + "step": 253 + }, + { + "epoch": 0.25, + "grad_norm": 2.083220714768147, + "learning_rate": 9.763376332923435e-06, + "loss": 0.1004, + "step": 254 + }, + { + "epoch": 0.25, + "grad_norm": 1.8288065714795982, + "learning_rate": 9.760937701113368e-06, + "loss": 0.082, + "step": 255 + }, + { + "epoch": 0.25, + "grad_norm": 1.42391892235683, + "learning_rate": 9.758486875152766e-06, + "loss": 0.064, + "step": 256 + }, + { + "epoch": 0.25, + "grad_norm": 3.2151765663560514, + "learning_rate": 9.756023861318907e-06, + "loss": 0.1338, + "step": 257 + }, + { + "epoch": 0.25, + "grad_norm": 2.149962952777695, + "learning_rate": 9.75354866592029e-06, + "loss": 0.0832, + "step": 258 + }, + { + "epoch": 0.26, + "grad_norm": 2.7962412801494425, + "learning_rate": 9.75106129529661e-06, + "loss": 0.0816, + "step": 259 + }, + { + "epoch": 0.26, + "grad_norm": 1.3866114809470351, + "learning_rate": 9.748561755818751e-06, + "loss": 0.0848, + "step": 260 + }, + { + "epoch": 0.26, + "grad_norm": 2.2395857876895726, + "learning_rate": 9.746050053888761e-06, + "loss": 0.1089, + "step": 261 + }, + { + "epoch": 0.26, + "grad_norm": 2.1697229926909465, + "learning_rate": 9.743526195939844e-06, + "loss": 0.1046, + "step": 262 + }, + { + "epoch": 0.26, + "grad_norm": 1.8188765207908812, + "learning_rate": 9.740990188436336e-06, + "loss": 0.0854, + "step": 263 + }, + { + "epoch": 0.26, + "grad_norm": 1.6284142753747066, + "learning_rate": 9.73844203787369e-06, + "loss": 0.0848, + "step": 264 + }, + { + "epoch": 0.26, + "grad_norm": 1.5105954281748584, + "learning_rate": 9.735881750778463e-06, + "loss": 0.0816, + "step": 265 + }, + { + "epoch": 0.26, + "grad_norm": 2.4226387847216695, + "learning_rate": 9.733309333708303e-06, + "loss": 0.1207, + "step": 266 + }, + { + "epoch": 0.26, + "grad_norm": 1.2394786486778886, + "learning_rate": 9.730724793251914e-06, + "loss": 0.0801, + "step": 267 + }, + { + "epoch": 0.26, + "grad_norm": 2.1909334570172754, + "learning_rate": 9.72812813602906e-06, + "loss": 0.1096, + "step": 268 + }, + { + "epoch": 0.27, + "grad_norm": 1.8150706378609962, + "learning_rate": 9.725519368690539e-06, + "loss": 0.0795, + "step": 269 + }, + { + "epoch": 0.27, + "grad_norm": 1.8365063330132647, + "learning_rate": 9.722898497918165e-06, + "loss": 0.0919, + "step": 270 + }, + { + "epoch": 0.27, + "grad_norm": 1.664954614981809, + "learning_rate": 9.720265530424752e-06, + "loss": 0.1014, + "step": 271 + }, + { + "epoch": 0.27, + "grad_norm": 1.7255068806392266, + "learning_rate": 9.717620472954098e-06, + "loss": 0.0803, + "step": 272 + }, + { + "epoch": 0.27, + "grad_norm": 1.77913692453882, + "learning_rate": 9.714963332280967e-06, + "loss": 0.0962, + "step": 273 + }, + { + "epoch": 0.27, + "grad_norm": 1.4814102439238948, + "learning_rate": 9.712294115211074e-06, + "loss": 0.0913, + "step": 274 + }, + { + "epoch": 0.27, + "grad_norm": 2.253001053387503, + "learning_rate": 9.709612828581058e-06, + "loss": 0.0763, + "step": 275 + }, + { + "epoch": 0.27, + "grad_norm": 1.7833683606000394, + "learning_rate": 9.70691947925848e-06, + "loss": 0.0775, + "step": 276 + }, + { + "epoch": 0.27, + "grad_norm": 1.4058700337148404, + "learning_rate": 9.704214074141793e-06, + "loss": 0.0874, + "step": 277 + }, + { + "epoch": 0.27, + "grad_norm": 1.3466667907354781, + "learning_rate": 9.701496620160327e-06, + "loss": 0.0704, + "step": 278 + }, + { + "epoch": 0.28, + "grad_norm": 2.1309554442052114, + "learning_rate": 9.698767124274276e-06, + "loss": 0.1082, + "step": 279 + }, + { + "epoch": 0.28, + "grad_norm": 1.5829938262087564, + "learning_rate": 9.696025593474676e-06, + "loss": 0.0826, + "step": 280 + }, + { + "epoch": 0.28, + "grad_norm": 2.5487626505131384, + "learning_rate": 9.693272034783385e-06, + "loss": 0.0889, + "step": 281 + }, + { + "epoch": 0.28, + "grad_norm": 1.6596439121539222, + "learning_rate": 9.690506455253073e-06, + "loss": 0.099, + "step": 282 + }, + { + "epoch": 0.28, + "grad_norm": 1.69312107368622, + "learning_rate": 9.687728861967192e-06, + "loss": 0.0737, + "step": 283 + }, + { + "epoch": 0.28, + "grad_norm": 1.355971373182455, + "learning_rate": 9.684939262039974e-06, + "loss": 0.0688, + "step": 284 + }, + { + "epoch": 0.28, + "grad_norm": 1.2553069216995882, + "learning_rate": 9.682137662616393e-06, + "loss": 0.0604, + "step": 285 + }, + { + "epoch": 0.28, + "grad_norm": 1.4072716167061843, + "learning_rate": 9.679324070872168e-06, + "loss": 0.061, + "step": 286 + }, + { + "epoch": 0.28, + "grad_norm": 1.3801539197582835, + "learning_rate": 9.676498494013724e-06, + "loss": 0.068, + "step": 287 + }, + { + "epoch": 0.28, + "grad_norm": 2.0356080364751667, + "learning_rate": 9.67366093927819e-06, + "loss": 0.0797, + "step": 288 + }, + { + "epoch": 0.29, + "grad_norm": 1.5713160560356783, + "learning_rate": 9.670811413933372e-06, + "loss": 0.0581, + "step": 289 + }, + { + "epoch": 0.29, + "grad_norm": 2.1345366422930265, + "learning_rate": 9.667949925277733e-06, + "loss": 0.0583, + "step": 290 + }, + { + "epoch": 0.29, + "grad_norm": 1.6352674907213263, + "learning_rate": 9.665076480640383e-06, + "loss": 0.074, + "step": 291 + }, + { + "epoch": 0.29, + "grad_norm": 1.4038554071150915, + "learning_rate": 9.662191087381051e-06, + "loss": 0.0543, + "step": 292 + }, + { + "epoch": 0.29, + "grad_norm": 2.1170162395093253, + "learning_rate": 9.659293752890072e-06, + "loss": 0.1129, + "step": 293 + }, + { + "epoch": 0.29, + "grad_norm": 1.708030080537653, + "learning_rate": 9.656384484588363e-06, + "loss": 0.0907, + "step": 294 + }, + { + "epoch": 0.29, + "grad_norm": 1.7196125677896754, + "learning_rate": 9.65346328992741e-06, + "loss": 0.0874, + "step": 295 + }, + { + "epoch": 0.29, + "grad_norm": 2.160266089759155, + "learning_rate": 9.650530176389245e-06, + "loss": 0.0771, + "step": 296 + }, + { + "epoch": 0.29, + "grad_norm": 1.2038797082738868, + "learning_rate": 9.64758515148643e-06, + "loss": 0.0809, + "step": 297 + }, + { + "epoch": 0.29, + "grad_norm": 1.7119010674264274, + "learning_rate": 9.644628222762027e-06, + "loss": 0.0707, + "step": 298 + }, + { + "epoch": 0.3, + "grad_norm": 1.087952105670146, + "learning_rate": 9.641659397789599e-06, + "loss": 0.0469, + "step": 299 + }, + { + "epoch": 0.3, + "grad_norm": 2.956435229689038, + "learning_rate": 9.63867868417317e-06, + "loss": 0.1137, + "step": 300 + }, + { + "epoch": 0.3, + "grad_norm": 1.9933370820366156, + "learning_rate": 9.635686089547219e-06, + "loss": 0.069, + "step": 301 + }, + { + "epoch": 0.3, + "grad_norm": 2.081689772329396, + "learning_rate": 9.632681621576651e-06, + "loss": 0.1044, + "step": 302 + }, + { + "epoch": 0.3, + "grad_norm": 2.2096559086670005, + "learning_rate": 9.629665287956788e-06, + "loss": 0.0821, + "step": 303 + }, + { + "epoch": 0.3, + "grad_norm": 1.484080717855697, + "learning_rate": 9.626637096413339e-06, + "loss": 0.0693, + "step": 304 + }, + { + "epoch": 0.3, + "grad_norm": 1.7412247084697006, + "learning_rate": 9.623597054702388e-06, + "loss": 0.0699, + "step": 305 + }, + { + "epoch": 0.3, + "grad_norm": 1.9095850402493102, + "learning_rate": 9.620545170610364e-06, + "loss": 0.096, + "step": 306 + }, + { + "epoch": 0.3, + "grad_norm": 3.9340127820809814, + "learning_rate": 9.61748145195404e-06, + "loss": 0.0881, + "step": 307 + }, + { + "epoch": 0.3, + "grad_norm": 2.063160163996744, + "learning_rate": 9.614405906580486e-06, + "loss": 0.0947, + "step": 308 + }, + { + "epoch": 0.31, + "grad_norm": 1.4516427837059722, + "learning_rate": 9.611318542367076e-06, + "loss": 0.0718, + "step": 309 + }, + { + "epoch": 0.31, + "grad_norm": 1.6438745401278574, + "learning_rate": 9.60821936722145e-06, + "loss": 0.0708, + "step": 310 + }, + { + "epoch": 0.31, + "grad_norm": 2.7053858895529537, + "learning_rate": 9.605108389081497e-06, + "loss": 0.0815, + "step": 311 + }, + { + "epoch": 0.31, + "grad_norm": 2.495838987672316, + "learning_rate": 9.601985615915342e-06, + "loss": 0.088, + "step": 312 + }, + { + "epoch": 0.31, + "grad_norm": 2.182730205772672, + "learning_rate": 9.598851055721317e-06, + "loss": 0.0956, + "step": 313 + }, + { + "epoch": 0.31, + "grad_norm": 1.355337420631783, + "learning_rate": 9.595704716527946e-06, + "loss": 0.0618, + "step": 314 + }, + { + "epoch": 0.31, + "grad_norm": 1.9182810450895473, + "learning_rate": 9.592546606393922e-06, + "loss": 0.1141, + "step": 315 + }, + { + "epoch": 0.31, + "grad_norm": 1.3996443318689997, + "learning_rate": 9.589376733408086e-06, + "loss": 0.068, + "step": 316 + }, + { + "epoch": 0.31, + "grad_norm": 1.0459197241210223, + "learning_rate": 9.586195105689406e-06, + "loss": 0.0631, + "step": 317 + }, + { + "epoch": 0.31, + "grad_norm": 2.0378716872013047, + "learning_rate": 9.583001731386962e-06, + "loss": 0.1295, + "step": 318 + }, + { + "epoch": 0.32, + "grad_norm": 1.6754804382652202, + "learning_rate": 9.579796618679917e-06, + "loss": 0.0889, + "step": 319 + }, + { + "epoch": 0.32, + "grad_norm": 2.216230794120504, + "learning_rate": 9.576579775777497e-06, + "loss": 0.095, + "step": 320 + }, + { + "epoch": 0.32, + "grad_norm": 1.4318931227506908, + "learning_rate": 9.573351210918976e-06, + "loss": 0.0766, + "step": 321 + }, + { + "epoch": 0.32, + "grad_norm": 1.6404177698368994, + "learning_rate": 9.570110932373652e-06, + "loss": 0.0901, + "step": 322 + }, + { + "epoch": 0.32, + "grad_norm": 1.3109869919883583, + "learning_rate": 9.566858948440823e-06, + "loss": 0.063, + "step": 323 + }, + { + "epoch": 0.32, + "grad_norm": 1.8968886949685448, + "learning_rate": 9.56359526744977e-06, + "loss": 0.0965, + "step": 324 + }, + { + "epoch": 0.32, + "grad_norm": 1.5650335354640215, + "learning_rate": 9.560319897759729e-06, + "loss": 0.099, + "step": 325 + }, + { + "epoch": 0.32, + "grad_norm": 1.435820283477607, + "learning_rate": 9.557032847759879e-06, + "loss": 0.075, + "step": 326 + }, + { + "epoch": 0.32, + "grad_norm": 1.3933170068610747, + "learning_rate": 9.553734125869316e-06, + "loss": 0.0721, + "step": 327 + }, + { + "epoch": 0.32, + "grad_norm": 2.0619677922888786, + "learning_rate": 9.550423740537028e-06, + "loss": 0.0868, + "step": 328 + }, + { + "epoch": 0.33, + "grad_norm": 3.2967042911447932, + "learning_rate": 9.547101700241878e-06, + "loss": 0.1052, + "step": 329 + }, + { + "epoch": 0.33, + "grad_norm": 2.131619635803148, + "learning_rate": 9.543768013492578e-06, + "loss": 0.1245, + "step": 330 + }, + { + "epoch": 0.33, + "grad_norm": 1.5230807628829646, + "learning_rate": 9.540422688827677e-06, + "loss": 0.0829, + "step": 331 + }, + { + "epoch": 0.33, + "grad_norm": 0.9367377983487091, + "learning_rate": 9.537065734815523e-06, + "loss": 0.0662, + "step": 332 + }, + { + "epoch": 0.33, + "grad_norm": 2.066131384955803, + "learning_rate": 9.533697160054256e-06, + "loss": 0.0801, + "step": 333 + }, + { + "epoch": 0.33, + "grad_norm": 1.597529173903688, + "learning_rate": 9.53031697317178e-06, + "loss": 0.0916, + "step": 334 + }, + { + "epoch": 0.33, + "grad_norm": 1.2965629730886212, + "learning_rate": 9.52692518282574e-06, + "loss": 0.0799, + "step": 335 + }, + { + "epoch": 0.33, + "grad_norm": 1.2843598864453172, + "learning_rate": 9.5235217977035e-06, + "loss": 0.0658, + "step": 336 + }, + { + "epoch": 0.33, + "grad_norm": 2.624715892770218, + "learning_rate": 9.520106826522123e-06, + "loss": 0.0933, + "step": 337 + }, + { + "epoch": 0.33, + "grad_norm": 1.1720007241534482, + "learning_rate": 9.516680278028343e-06, + "loss": 0.0527, + "step": 338 + }, + { + "epoch": 0.33, + "grad_norm": 1.3607490046691666, + "learning_rate": 9.513242160998555e-06, + "loss": 0.0742, + "step": 339 + }, + { + "epoch": 0.34, + "grad_norm": 1.3952909438425838, + "learning_rate": 9.509792484238776e-06, + "loss": 0.0715, + "step": 340 + }, + { + "epoch": 0.34, + "grad_norm": 1.7424553551612125, + "learning_rate": 9.506331256584637e-06, + "loss": 0.0963, + "step": 341 + }, + { + "epoch": 0.34, + "grad_norm": 2.578495355923729, + "learning_rate": 9.502858486901354e-06, + "loss": 0.113, + "step": 342 + }, + { + "epoch": 0.34, + "grad_norm": 3.2366093718995437, + "learning_rate": 9.4993741840837e-06, + "loss": 0.0901, + "step": 343 + }, + { + "epoch": 0.34, + "grad_norm": 1.983175168502808, + "learning_rate": 9.495878357055992e-06, + "loss": 0.1064, + "step": 344 + }, + { + "epoch": 0.34, + "grad_norm": 0.9500124640313773, + "learning_rate": 9.492371014772063e-06, + "loss": 0.0427, + "step": 345 + }, + { + "epoch": 0.34, + "grad_norm": 0.8851187458756449, + "learning_rate": 9.48885216621524e-06, + "loss": 0.0579, + "step": 346 + }, + { + "epoch": 0.34, + "grad_norm": 1.622075510857947, + "learning_rate": 9.485321820398321e-06, + "loss": 0.0668, + "step": 347 + }, + { + "epoch": 0.34, + "grad_norm": 1.6580856557518695, + "learning_rate": 9.481779986363552e-06, + "loss": 0.0789, + "step": 348 + }, + { + "epoch": 0.34, + "grad_norm": 1.512095570610664, + "learning_rate": 9.478226673182602e-06, + "loss": 0.069, + "step": 349 + }, + { + "epoch": 0.35, + "grad_norm": 1.6702707971480697, + "learning_rate": 9.474661889956544e-06, + "loss": 0.0994, + "step": 350 + }, + { + "epoch": 0.35, + "grad_norm": 2.1155812559129124, + "learning_rate": 9.471085645815828e-06, + "loss": 0.0884, + "step": 351 + }, + { + "epoch": 0.35, + "grad_norm": 2.0091628967886317, + "learning_rate": 9.467497949920256e-06, + "loss": 0.0753, + "step": 352 + }, + { + "epoch": 0.35, + "grad_norm": 0.9773778602091453, + "learning_rate": 9.463898811458968e-06, + "loss": 0.0535, + "step": 353 + }, + { + "epoch": 0.35, + "grad_norm": 2.3011646246801316, + "learning_rate": 9.460288239650409e-06, + "loss": 0.096, + "step": 354 + }, + { + "epoch": 0.35, + "grad_norm": 1.6098500437903527, + "learning_rate": 9.4566662437423e-06, + "loss": 0.0645, + "step": 355 + }, + { + "epoch": 0.35, + "grad_norm": 0.8545640076569774, + "learning_rate": 9.453032833011633e-06, + "loss": 0.0535, + "step": 356 + }, + { + "epoch": 0.35, + "grad_norm": 1.13656418875037, + "learning_rate": 9.449388016764634e-06, + "loss": 0.0536, + "step": 357 + }, + { + "epoch": 0.35, + "grad_norm": 2.1924165470684622, + "learning_rate": 9.44573180433674e-06, + "loss": 0.11, + "step": 358 + }, + { + "epoch": 0.35, + "grad_norm": 1.8340819553384473, + "learning_rate": 9.442064205092578e-06, + "loss": 0.0812, + "step": 359 + }, + { + "epoch": 0.36, + "grad_norm": 1.5107648883572784, + "learning_rate": 9.43838522842594e-06, + "loss": 0.0834, + "step": 360 + }, + { + "epoch": 0.36, + "grad_norm": 0.9803211912182362, + "learning_rate": 9.434694883759757e-06, + "loss": 0.0527, + "step": 361 + }, + { + "epoch": 0.36, + "grad_norm": 2.113097249223458, + "learning_rate": 9.43099318054608e-06, + "loss": 0.1023, + "step": 362 + }, + { + "epoch": 0.36, + "grad_norm": 1.7775339990135852, + "learning_rate": 9.427280128266049e-06, + "loss": 0.0714, + "step": 363 + }, + { + "epoch": 0.36, + "grad_norm": 1.6899887297282574, + "learning_rate": 9.42355573642988e-06, + "loss": 0.0728, + "step": 364 + }, + { + "epoch": 0.36, + "grad_norm": 2.7074309015837548, + "learning_rate": 9.419820014576818e-06, + "loss": 0.0687, + "step": 365 + }, + { + "epoch": 0.36, + "grad_norm": 1.8657976560603164, + "learning_rate": 9.416072972275144e-06, + "loss": 0.1106, + "step": 366 + }, + { + "epoch": 0.36, + "grad_norm": 1.2617207451485761, + "learning_rate": 9.412314619122124e-06, + "loss": 0.0524, + "step": 367 + }, + { + "epoch": 0.36, + "grad_norm": 1.2819250350418456, + "learning_rate": 9.408544964743996e-06, + "loss": 0.0807, + "step": 368 + }, + { + "epoch": 0.36, + "grad_norm": 1.1937701721663347, + "learning_rate": 9.404764018795947e-06, + "loss": 0.0518, + "step": 369 + }, + { + "epoch": 0.37, + "grad_norm": 1.2777976949870833, + "learning_rate": 9.400971790962081e-06, + "loss": 0.0563, + "step": 370 + }, + { + "epoch": 0.37, + "grad_norm": 1.4766786972271955, + "learning_rate": 9.3971682909554e-06, + "loss": 0.0751, + "step": 371 + }, + { + "epoch": 0.37, + "grad_norm": 1.5716995031811691, + "learning_rate": 9.393353528517778e-06, + "loss": 0.0532, + "step": 372 + }, + { + "epoch": 0.37, + "grad_norm": 0.8565525773799597, + "learning_rate": 9.389527513419935e-06, + "loss": 0.0546, + "step": 373 + }, + { + "epoch": 0.37, + "grad_norm": 1.2523781687323514, + "learning_rate": 9.38569025546141e-06, + "loss": 0.0663, + "step": 374 + }, + { + "epoch": 0.37, + "grad_norm": 1.5941548292464838, + "learning_rate": 9.381841764470543e-06, + "loss": 0.0849, + "step": 375 + }, + { + "epoch": 0.37, + "grad_norm": 0.9991597120815358, + "learning_rate": 9.377982050304441e-06, + "loss": 0.0468, + "step": 376 + }, + { + "epoch": 0.37, + "grad_norm": 3.2320914466684454, + "learning_rate": 9.374111122848957e-06, + "loss": 0.0932, + "step": 377 + }, + { + "epoch": 0.37, + "grad_norm": 1.5056596004131972, + "learning_rate": 9.370228992018666e-06, + "loss": 0.0737, + "step": 378 + }, + { + "epoch": 0.37, + "grad_norm": 1.229304673858704, + "learning_rate": 9.36633566775684e-06, + "loss": 0.0602, + "step": 379 + }, + { + "epoch": 0.38, + "grad_norm": 1.5220192209727423, + "learning_rate": 9.362431160035415e-06, + "loss": 0.0698, + "step": 380 + }, + { + "epoch": 0.38, + "grad_norm": 1.7867448206398417, + "learning_rate": 9.358515478854977e-06, + "loss": 0.0849, + "step": 381 + }, + { + "epoch": 0.38, + "grad_norm": 1.0935869033882721, + "learning_rate": 9.354588634244728e-06, + "loss": 0.0748, + "step": 382 + }, + { + "epoch": 0.38, + "grad_norm": 1.532490450509265, + "learning_rate": 9.35065063626246e-06, + "loss": 0.0703, + "step": 383 + }, + { + "epoch": 0.38, + "grad_norm": 1.4938766268199541, + "learning_rate": 9.346701494994537e-06, + "loss": 0.0704, + "step": 384 + }, + { + "epoch": 0.38, + "grad_norm": 1.3175682655861047, + "learning_rate": 9.342741220555862e-06, + "loss": 0.0658, + "step": 385 + }, + { + "epoch": 0.38, + "grad_norm": 1.8799920557327616, + "learning_rate": 9.338769823089853e-06, + "loss": 0.072, + "step": 386 + }, + { + "epoch": 0.38, + "grad_norm": 2.091485141339395, + "learning_rate": 9.334787312768419e-06, + "loss": 0.1154, + "step": 387 + }, + { + "epoch": 0.38, + "grad_norm": 1.1795003198750242, + "learning_rate": 9.330793699791929e-06, + "loss": 0.0644, + "step": 388 + }, + { + "epoch": 0.38, + "grad_norm": 3.29652939495859, + "learning_rate": 9.326788994389191e-06, + "loss": 0.1329, + "step": 389 + }, + { + "epoch": 0.39, + "grad_norm": 2.755267217555506, + "learning_rate": 9.322773206817425e-06, + "loss": 0.0632, + "step": 390 + }, + { + "epoch": 0.39, + "grad_norm": 1.5860689233573197, + "learning_rate": 9.318746347362234e-06, + "loss": 0.0723, + "step": 391 + }, + { + "epoch": 0.39, + "grad_norm": 1.9815352814144065, + "learning_rate": 9.31470842633758e-06, + "loss": 0.0838, + "step": 392 + }, + { + "epoch": 0.39, + "grad_norm": 1.4741872607662527, + "learning_rate": 9.310659454085757e-06, + "loss": 0.0663, + "step": 393 + }, + { + "epoch": 0.39, + "grad_norm": 1.330523711040219, + "learning_rate": 9.306599440977364e-06, + "loss": 0.0671, + "step": 394 + }, + { + "epoch": 0.39, + "grad_norm": 2.538380618597983, + "learning_rate": 9.302528397411282e-06, + "loss": 0.0898, + "step": 395 + }, + { + "epoch": 0.39, + "grad_norm": 1.4259119324898288, + "learning_rate": 9.298446333814634e-06, + "loss": 0.077, + "step": 396 + }, + { + "epoch": 0.39, + "grad_norm": 1.4798744530874832, + "learning_rate": 9.294353260642785e-06, + "loss": 0.0697, + "step": 397 + }, + { + "epoch": 0.39, + "grad_norm": 1.546206268272879, + "learning_rate": 9.290249188379284e-06, + "loss": 0.0627, + "step": 398 + }, + { + "epoch": 0.39, + "grad_norm": 2.9537128285464975, + "learning_rate": 9.286134127535859e-06, + "loss": 0.0867, + "step": 399 + }, + { + "epoch": 0.4, + "grad_norm": 2.3995970063745107, + "learning_rate": 9.282008088652381e-06, + "loss": 0.0795, + "step": 400 + }, + { + "epoch": 0.4, + "grad_norm": 1.5437480493944655, + "learning_rate": 9.27787108229684e-06, + "loss": 0.0605, + "step": 401 + }, + { + "epoch": 0.4, + "grad_norm": 1.3231852490977054, + "learning_rate": 9.273723119065316e-06, + "loss": 0.0642, + "step": 402 + }, + { + "epoch": 0.4, + "grad_norm": 1.0983511223510702, + "learning_rate": 9.269564209581953e-06, + "loss": 0.0698, + "step": 403 + }, + { + "epoch": 0.4, + "grad_norm": 0.9638250329004523, + "learning_rate": 9.265394364498933e-06, + "loss": 0.0613, + "step": 404 + }, + { + "epoch": 0.4, + "grad_norm": 1.1237185184057257, + "learning_rate": 9.261213594496443e-06, + "loss": 0.0605, + "step": 405 + }, + { + "epoch": 0.4, + "grad_norm": 1.111228491358394, + "learning_rate": 9.257021910282658e-06, + "loss": 0.0594, + "step": 406 + }, + { + "epoch": 0.4, + "grad_norm": 1.412987219933755, + "learning_rate": 9.252819322593702e-06, + "loss": 0.0632, + "step": 407 + }, + { + "epoch": 0.4, + "grad_norm": 2.238843236884171, + "learning_rate": 9.248605842193628e-06, + "loss": 0.0562, + "step": 408 + }, + { + "epoch": 0.4, + "grad_norm": 1.5281808809312805, + "learning_rate": 9.24438147987439e-06, + "loss": 0.0645, + "step": 409 + }, + { + "epoch": 0.41, + "grad_norm": 1.360991380105523, + "learning_rate": 9.240146246455814e-06, + "loss": 0.0753, + "step": 410 + }, + { + "epoch": 0.41, + "grad_norm": 1.2922523007020124, + "learning_rate": 9.235900152785564e-06, + "loss": 0.0434, + "step": 411 + }, + { + "epoch": 0.41, + "grad_norm": 1.4373970506401919, + "learning_rate": 9.231643209739128e-06, + "loss": 0.0927, + "step": 412 + }, + { + "epoch": 0.41, + "grad_norm": 6.034620905490409, + "learning_rate": 9.227375428219777e-06, + "loss": 0.0797, + "step": 413 + }, + { + "epoch": 0.41, + "grad_norm": 1.4667858443208852, + "learning_rate": 9.22309681915855e-06, + "loss": 0.0542, + "step": 414 + }, + { + "epoch": 0.41, + "grad_norm": 2.1848260113622535, + "learning_rate": 9.218807393514207e-06, + "loss": 0.0788, + "step": 415 + }, + { + "epoch": 0.41, + "grad_norm": 0.9929252392688195, + "learning_rate": 9.21450716227322e-06, + "loss": 0.0633, + "step": 416 + }, + { + "epoch": 0.41, + "grad_norm": 1.4775624542456316, + "learning_rate": 9.210196136449738e-06, + "loss": 0.0664, + "step": 417 + }, + { + "epoch": 0.41, + "grad_norm": 1.235653428937132, + "learning_rate": 9.205874327085554e-06, + "loss": 0.0664, + "step": 418 + }, + { + "epoch": 0.41, + "grad_norm": 1.4916279720222827, + "learning_rate": 9.201541745250084e-06, + "loss": 0.0651, + "step": 419 + }, + { + "epoch": 0.42, + "grad_norm": 1.346805901014103, + "learning_rate": 9.197198402040335e-06, + "loss": 0.0668, + "step": 420 + }, + { + "epoch": 0.42, + "grad_norm": 1.7380719887472837, + "learning_rate": 9.192844308580872e-06, + "loss": 0.1061, + "step": 421 + }, + { + "epoch": 0.42, + "grad_norm": 1.1977746654700658, + "learning_rate": 9.188479476023803e-06, + "loss": 0.0513, + "step": 422 + }, + { + "epoch": 0.42, + "grad_norm": 1.4928135949497092, + "learning_rate": 9.184103915548737e-06, + "loss": 0.0527, + "step": 423 + }, + { + "epoch": 0.42, + "grad_norm": 1.0948400388091197, + "learning_rate": 9.179717638362756e-06, + "loss": 0.0609, + "step": 424 + }, + { + "epoch": 0.42, + "grad_norm": 1.367623112561712, + "learning_rate": 9.175320655700407e-06, + "loss": 0.0636, + "step": 425 + }, + { + "epoch": 0.42, + "grad_norm": 1.9122244810169786, + "learning_rate": 9.170912978823634e-06, + "loss": 0.0924, + "step": 426 + }, + { + "epoch": 0.42, + "grad_norm": 2.7194683882049424, + "learning_rate": 9.166494619021788e-06, + "loss": 0.0869, + "step": 427 + }, + { + "epoch": 0.42, + "grad_norm": 0.8058781739102134, + "learning_rate": 9.162065587611578e-06, + "loss": 0.0498, + "step": 428 + }, + { + "epoch": 0.42, + "grad_norm": 0.9174740077703543, + "learning_rate": 9.157625895937045e-06, + "loss": 0.049, + "step": 429 + }, + { + "epoch": 0.42, + "grad_norm": 1.538856642201719, + "learning_rate": 9.153175555369536e-06, + "loss": 0.0762, + "step": 430 + }, + { + "epoch": 0.43, + "grad_norm": 1.1280379258945326, + "learning_rate": 9.14871457730767e-06, + "loss": 0.0493, + "step": 431 + }, + { + "epoch": 0.43, + "grad_norm": 1.546891200724216, + "learning_rate": 9.144242973177311e-06, + "loss": 0.0579, + "step": 432 + }, + { + "epoch": 0.43, + "grad_norm": 1.1259850013209407, + "learning_rate": 9.139760754431549e-06, + "loss": 0.0561, + "step": 433 + }, + { + "epoch": 0.43, + "grad_norm": 1.390978156286863, + "learning_rate": 9.135267932550647e-06, + "loss": 0.0758, + "step": 434 + }, + { + "epoch": 0.43, + "grad_norm": 2.314675628407316, + "learning_rate": 9.130764519042036e-06, + "loss": 0.0511, + "step": 435 + }, + { + "epoch": 0.43, + "grad_norm": 1.0433867643994965, + "learning_rate": 9.126250525440273e-06, + "loss": 0.0421, + "step": 436 + }, + { + "epoch": 0.43, + "grad_norm": 1.955888645256769, + "learning_rate": 9.12172596330701e-06, + "loss": 0.0874, + "step": 437 + }, + { + "epoch": 0.43, + "grad_norm": 1.5882258901883926, + "learning_rate": 9.117190844230971e-06, + "loss": 0.0507, + "step": 438 + }, + { + "epoch": 0.43, + "grad_norm": 1.900326904241668, + "learning_rate": 9.112645179827922e-06, + "loss": 0.0686, + "step": 439 + }, + { + "epoch": 0.43, + "grad_norm": 3.011111441922898, + "learning_rate": 9.108088981740633e-06, + "loss": 0.0862, + "step": 440 + }, + { + "epoch": 0.44, + "grad_norm": 1.4966326512252073, + "learning_rate": 9.103522261638857e-06, + "loss": 0.074, + "step": 441 + }, + { + "epoch": 0.44, + "grad_norm": 1.4385678568699105, + "learning_rate": 9.098945031219297e-06, + "loss": 0.0734, + "step": 442 + }, + { + "epoch": 0.44, + "grad_norm": 1.5204707684735033, + "learning_rate": 9.094357302205575e-06, + "loss": 0.0841, + "step": 443 + }, + { + "epoch": 0.44, + "grad_norm": 1.2937317248210154, + "learning_rate": 9.089759086348204e-06, + "loss": 0.0587, + "step": 444 + }, + { + "epoch": 0.44, + "grad_norm": 1.3861131844899162, + "learning_rate": 9.085150395424557e-06, + "loss": 0.0526, + "step": 445 + }, + { + "epoch": 0.44, + "grad_norm": 4.749413906607524, + "learning_rate": 9.080531241238836e-06, + "loss": 0.0681, + "step": 446 + }, + { + "epoch": 0.44, + "grad_norm": 1.1655901911911675, + "learning_rate": 9.075901635622041e-06, + "loss": 0.0651, + "step": 447 + }, + { + "epoch": 0.44, + "grad_norm": 3.563849142939697, + "learning_rate": 9.071261590431945e-06, + "loss": 0.0615, + "step": 448 + }, + { + "epoch": 0.44, + "grad_norm": 0.9666664856605189, + "learning_rate": 9.066611117553058e-06, + "loss": 0.0547, + "step": 449 + }, + { + "epoch": 0.44, + "grad_norm": 1.6304500518473257, + "learning_rate": 9.061950228896594e-06, + "loss": 0.0719, + "step": 450 + }, + { + "epoch": 0.45, + "grad_norm": 2.208068296719404, + "learning_rate": 9.057278936400453e-06, + "loss": 0.0499, + "step": 451 + }, + { + "epoch": 0.45, + "grad_norm": 1.2910850367163424, + "learning_rate": 9.052597252029175e-06, + "loss": 0.0506, + "step": 452 + }, + { + "epoch": 0.45, + "grad_norm": 1.0096660024374238, + "learning_rate": 9.047905187773922e-06, + "loss": 0.0598, + "step": 453 + }, + { + "epoch": 0.45, + "grad_norm": 2.3017894716148746, + "learning_rate": 9.043202755652436e-06, + "loss": 0.093, + "step": 454 + }, + { + "epoch": 0.45, + "grad_norm": 1.1809136704456458, + "learning_rate": 9.038489967709023e-06, + "loss": 0.0717, + "step": 455 + }, + { + "epoch": 0.45, + "grad_norm": 1.4118385240601037, + "learning_rate": 9.033766836014504e-06, + "loss": 0.0607, + "step": 456 + }, + { + "epoch": 0.45, + "grad_norm": 1.190671617119357, + "learning_rate": 9.029033372666199e-06, + "loss": 0.0676, + "step": 457 + }, + { + "epoch": 0.45, + "grad_norm": 1.4608113102052052, + "learning_rate": 9.024289589787885e-06, + "loss": 0.0912, + "step": 458 + }, + { + "epoch": 0.45, + "grad_norm": 1.0716327370456122, + "learning_rate": 9.019535499529781e-06, + "loss": 0.0704, + "step": 459 + }, + { + "epoch": 0.45, + "grad_norm": 1.9547421435045815, + "learning_rate": 9.014771114068492e-06, + "loss": 0.0786, + "step": 460 + }, + { + "epoch": 0.46, + "grad_norm": 1.5264171632599215, + "learning_rate": 9.009996445607004e-06, + "loss": 0.063, + "step": 461 + }, + { + "epoch": 0.46, + "grad_norm": 1.3271932708271126, + "learning_rate": 9.005211506374636e-06, + "loss": 0.045, + "step": 462 + }, + { + "epoch": 0.46, + "grad_norm": 2.455795720502849, + "learning_rate": 9.00041630862701e-06, + "loss": 0.0755, + "step": 463 + }, + { + "epoch": 0.46, + "grad_norm": 1.5728504984245728, + "learning_rate": 8.99561086464603e-06, + "loss": 0.0689, + "step": 464 + }, + { + "epoch": 0.46, + "grad_norm": 1.0184694441905213, + "learning_rate": 8.990795186739838e-06, + "loss": 0.0465, + "step": 465 + }, + { + "epoch": 0.46, + "grad_norm": 1.711545145839596, + "learning_rate": 8.985969287242791e-06, + "loss": 0.0713, + "step": 466 + }, + { + "epoch": 0.46, + "grad_norm": 1.6749239768622135, + "learning_rate": 8.981133178515424e-06, + "loss": 0.0877, + "step": 467 + }, + { + "epoch": 0.46, + "grad_norm": 1.4534332873265494, + "learning_rate": 8.976286872944425e-06, + "loss": 0.0534, + "step": 468 + }, + { + "epoch": 0.46, + "grad_norm": 1.7939842165258273, + "learning_rate": 8.971430382942597e-06, + "loss": 0.0655, + "step": 469 + }, + { + "epoch": 0.46, + "grad_norm": 1.45853725629627, + "learning_rate": 8.966563720948824e-06, + "loss": 0.1031, + "step": 470 + }, + { + "epoch": 0.47, + "grad_norm": 1.177368183337995, + "learning_rate": 8.96168689942805e-06, + "loss": 0.0606, + "step": 471 + }, + { + "epoch": 0.47, + "grad_norm": 1.235444390625768, + "learning_rate": 8.956799930871238e-06, + "loss": 0.0688, + "step": 472 + }, + { + "epoch": 0.47, + "grad_norm": 1.452038429310052, + "learning_rate": 8.95190282779534e-06, + "loss": 0.0655, + "step": 473 + }, + { + "epoch": 0.47, + "grad_norm": 1.4724243348741484, + "learning_rate": 8.946995602743267e-06, + "loss": 0.0603, + "step": 474 + }, + { + "epoch": 0.47, + "grad_norm": 0.8899006566216181, + "learning_rate": 8.94207826828385e-06, + "loss": 0.0533, + "step": 475 + }, + { + "epoch": 0.47, + "grad_norm": 1.4619154759372903, + "learning_rate": 8.937150837011822e-06, + "loss": 0.0754, + "step": 476 + }, + { + "epoch": 0.47, + "grad_norm": 0.9738974265046594, + "learning_rate": 8.932213321547769e-06, + "loss": 0.0507, + "step": 477 + }, + { + "epoch": 0.47, + "grad_norm": 1.5501236401082463, + "learning_rate": 8.927265734538111e-06, + "loss": 0.0565, + "step": 478 + }, + { + "epoch": 0.47, + "grad_norm": 1.1332840368532724, + "learning_rate": 8.92230808865506e-06, + "loss": 0.051, + "step": 479 + }, + { + "epoch": 0.47, + "grad_norm": 2.2256642469778365, + "learning_rate": 8.917340396596594e-06, + "loss": 0.0835, + "step": 480 + }, + { + "epoch": 0.48, + "grad_norm": 1.7683836409006217, + "learning_rate": 8.912362671086424e-06, + "loss": 0.0805, + "step": 481 + }, + { + "epoch": 0.48, + "grad_norm": 1.1807223874785262, + "learning_rate": 8.907374924873953e-06, + "loss": 0.0606, + "step": 482 + }, + { + "epoch": 0.48, + "grad_norm": 1.3547721846092768, + "learning_rate": 8.902377170734258e-06, + "loss": 0.0615, + "step": 483 + }, + { + "epoch": 0.48, + "grad_norm": 1.5945685777575744, + "learning_rate": 8.897369421468042e-06, + "loss": 0.0934, + "step": 484 + }, + { + "epoch": 0.48, + "grad_norm": 0.9304262314397095, + "learning_rate": 8.892351689901618e-06, + "loss": 0.0498, + "step": 485 + }, + { + "epoch": 0.48, + "grad_norm": 1.5883917689539437, + "learning_rate": 8.887323988886853e-06, + "loss": 0.063, + "step": 486 + }, + { + "epoch": 0.48, + "grad_norm": 2.322868869946993, + "learning_rate": 8.882286331301162e-06, + "loss": 0.0786, + "step": 487 + }, + { + "epoch": 0.48, + "grad_norm": 2.0519135254924814, + "learning_rate": 8.877238730047454e-06, + "loss": 0.0596, + "step": 488 + }, + { + "epoch": 0.48, + "grad_norm": 1.624611435100832, + "learning_rate": 8.872181198054109e-06, + "loss": 0.0605, + "step": 489 + }, + { + "epoch": 0.48, + "grad_norm": 1.7411620989235403, + "learning_rate": 8.86711374827494e-06, + "loss": 0.0644, + "step": 490 + }, + { + "epoch": 0.49, + "grad_norm": 1.2561485952196334, + "learning_rate": 8.862036393689167e-06, + "loss": 0.0609, + "step": 491 + }, + { + "epoch": 0.49, + "grad_norm": 2.565086561625876, + "learning_rate": 8.856949147301375e-06, + "loss": 0.0802, + "step": 492 + }, + { + "epoch": 0.49, + "grad_norm": 2.118910169047855, + "learning_rate": 8.851852022141486e-06, + "loss": 0.0738, + "step": 493 + }, + { + "epoch": 0.49, + "grad_norm": 1.8276607532145808, + "learning_rate": 8.846745031264727e-06, + "loss": 0.0836, + "step": 494 + }, + { + "epoch": 0.49, + "grad_norm": 2.573960942810051, + "learning_rate": 8.84162818775159e-06, + "loss": 0.1106, + "step": 495 + }, + { + "epoch": 0.49, + "grad_norm": 1.338345683797441, + "learning_rate": 8.836501504707802e-06, + "loss": 0.0633, + "step": 496 + }, + { + "epoch": 0.49, + "grad_norm": 1.357804125150053, + "learning_rate": 8.831364995264297e-06, + "loss": 0.0683, + "step": 497 + }, + { + "epoch": 0.49, + "grad_norm": 1.1612434605900008, + "learning_rate": 8.826218672577175e-06, + "loss": 0.056, + "step": 498 + }, + { + "epoch": 0.49, + "grad_norm": 1.7127111262285413, + "learning_rate": 8.821062549827669e-06, + "loss": 0.047, + "step": 499 + }, + { + "epoch": 0.49, + "grad_norm": 1.413765018338368, + "learning_rate": 8.815896640222112e-06, + "loss": 0.0639, + "step": 500 + }, + { + "epoch": 0.5, + "grad_norm": 1.7809132641091092, + "learning_rate": 8.810720956991906e-06, + "loss": 0.0761, + "step": 501 + }, + { + "epoch": 0.5, + "grad_norm": 4.7508580300066034, + "learning_rate": 8.805535513393488e-06, + "loss": 0.1202, + "step": 502 + }, + { + "epoch": 0.5, + "grad_norm": 1.1850119164950286, + "learning_rate": 8.800340322708291e-06, + "loss": 0.0711, + "step": 503 + }, + { + "epoch": 0.5, + "grad_norm": 0.9803941992462488, + "learning_rate": 8.795135398242715e-06, + "loss": 0.0494, + "step": 504 + }, + { + "epoch": 0.5, + "grad_norm": 1.4729217571131208, + "learning_rate": 8.78992075332809e-06, + "loss": 0.0698, + "step": 505 + }, + { + "epoch": 0.5, + "grad_norm": 1.2395336577118885, + "learning_rate": 8.78469640132064e-06, + "loss": 0.0525, + "step": 506 + }, + { + "epoch": 0.5, + "grad_norm": 1.1422345460944723, + "learning_rate": 8.779462355601457e-06, + "loss": 0.0527, + "step": 507 + }, + { + "epoch": 0.5, + "grad_norm": 1.3434980922499284, + "learning_rate": 8.774218629576461e-06, + "loss": 0.0568, + "step": 508 + }, + { + "epoch": 0.5, + "grad_norm": 1.902815923319694, + "learning_rate": 8.768965236676362e-06, + "loss": 0.0619, + "step": 509 + }, + { + "epoch": 0.5, + "grad_norm": 1.5712968770389895, + "learning_rate": 8.763702190356632e-06, + "loss": 0.0977, + "step": 510 + }, + { + "epoch": 0.5, + "grad_norm": 1.6657851322761215, + "learning_rate": 8.758429504097469e-06, + "loss": 0.0716, + "step": 511 + }, + { + "epoch": 0.51, + "grad_norm": 1.3756418955263106, + "learning_rate": 8.753147191403762e-06, + "loss": 0.0573, + "step": 512 + }, + { + "epoch": 0.51, + "grad_norm": 1.066335150336722, + "learning_rate": 8.747855265805053e-06, + "loss": 0.0571, + "step": 513 + }, + { + "epoch": 0.51, + "grad_norm": 3.0604816767014786, + "learning_rate": 8.742553740855507e-06, + "loss": 0.0745, + "step": 514 + }, + { + "epoch": 0.51, + "grad_norm": 1.5747177810123547, + "learning_rate": 8.737242630133878e-06, + "loss": 0.0585, + "step": 515 + }, + { + "epoch": 0.51, + "grad_norm": 1.265484983287345, + "learning_rate": 8.73192194724347e-06, + "loss": 0.0841, + "step": 516 + }, + { + "epoch": 0.51, + "grad_norm": 1.5874226214432121, + "learning_rate": 8.726591705812102e-06, + "loss": 0.0676, + "step": 517 + }, + { + "epoch": 0.51, + "grad_norm": 4.428005055418884, + "learning_rate": 8.721251919492083e-06, + "loss": 0.099, + "step": 518 + }, + { + "epoch": 0.51, + "grad_norm": 1.6128730700351466, + "learning_rate": 8.715902601960157e-06, + "loss": 0.0909, + "step": 519 + }, + { + "epoch": 0.51, + "grad_norm": 1.2540385526514841, + "learning_rate": 8.71054376691749e-06, + "loss": 0.0598, + "step": 520 + }, + { + "epoch": 0.51, + "grad_norm": 1.8172924798142867, + "learning_rate": 8.705175428089622e-06, + "loss": 0.085, + "step": 521 + }, + { + "epoch": 0.52, + "grad_norm": 2.3520076068544826, + "learning_rate": 8.699797599226438e-06, + "loss": 0.0716, + "step": 522 + }, + { + "epoch": 0.52, + "grad_norm": 1.3037215232354775, + "learning_rate": 8.694410294102121e-06, + "loss": 0.0749, + "step": 523 + }, + { + "epoch": 0.52, + "grad_norm": 0.947508559116718, + "learning_rate": 8.689013526515136e-06, + "loss": 0.0517, + "step": 524 + }, + { + "epoch": 0.52, + "grad_norm": 1.504232673619683, + "learning_rate": 8.683607310288176e-06, + "loss": 0.0827, + "step": 525 + }, + { + "epoch": 0.52, + "grad_norm": 1.1415599175520932, + "learning_rate": 8.67819165926814e-06, + "loss": 0.0673, + "step": 526 + }, + { + "epoch": 0.52, + "grad_norm": 2.134782275091129, + "learning_rate": 8.672766587326091e-06, + "loss": 0.074, + "step": 527 + }, + { + "epoch": 0.52, + "grad_norm": 1.2286611260487121, + "learning_rate": 8.66733210835722e-06, + "loss": 0.0701, + "step": 528 + }, + { + "epoch": 0.52, + "grad_norm": 1.7430230560995867, + "learning_rate": 8.661888236280813e-06, + "loss": 0.084, + "step": 529 + }, + { + "epoch": 0.52, + "grad_norm": 1.1726076433801744, + "learning_rate": 8.656434985040215e-06, + "loss": 0.0446, + "step": 530 + }, + { + "epoch": 0.52, + "grad_norm": 1.416331100895462, + "learning_rate": 8.650972368602793e-06, + "loss": 0.0641, + "step": 531 + }, + { + "epoch": 0.53, + "grad_norm": 0.943146778515395, + "learning_rate": 8.645500400959904e-06, + "loss": 0.0417, + "step": 532 + }, + { + "epoch": 0.53, + "grad_norm": 2.0610796045934787, + "learning_rate": 8.640019096126851e-06, + "loss": 0.0586, + "step": 533 + }, + { + "epoch": 0.53, + "grad_norm": 1.1955873477768537, + "learning_rate": 8.634528468142855e-06, + "loss": 0.082, + "step": 534 + }, + { + "epoch": 0.53, + "grad_norm": 1.3787963183214098, + "learning_rate": 8.629028531071018e-06, + "loss": 0.0577, + "step": 535 + }, + { + "epoch": 0.53, + "grad_norm": 1.2226100702680849, + "learning_rate": 8.623519298998282e-06, + "loss": 0.0488, + "step": 536 + }, + { + "epoch": 0.53, + "grad_norm": 1.4620543475274264, + "learning_rate": 8.618000786035399e-06, + "loss": 0.0608, + "step": 537 + }, + { + "epoch": 0.53, + "grad_norm": 1.6117757963010761, + "learning_rate": 8.612473006316888e-06, + "loss": 0.0669, + "step": 538 + }, + { + "epoch": 0.53, + "grad_norm": 1.4385278132607686, + "learning_rate": 8.606935974001009e-06, + "loss": 0.0532, + "step": 539 + }, + { + "epoch": 0.53, + "grad_norm": 3.9070996845688652, + "learning_rate": 8.601389703269716e-06, + "loss": 0.0749, + "step": 540 + }, + { + "epoch": 0.53, + "grad_norm": 1.8584314450629578, + "learning_rate": 8.595834208328629e-06, + "loss": 0.0693, + "step": 541 + }, + { + "epoch": 0.54, + "grad_norm": 0.8975535225824105, + "learning_rate": 8.590269503406986e-06, + "loss": 0.0447, + "step": 542 + }, + { + "epoch": 0.54, + "grad_norm": 1.6838852869871483, + "learning_rate": 8.584695602757624e-06, + "loss": 0.0722, + "step": 543 + }, + { + "epoch": 0.54, + "grad_norm": 1.4605905386561235, + "learning_rate": 8.579112520656928e-06, + "loss": 0.072, + "step": 544 + }, + { + "epoch": 0.54, + "grad_norm": 1.1773973628095062, + "learning_rate": 8.5735202714048e-06, + "loss": 0.0511, + "step": 545 + }, + { + "epoch": 0.54, + "grad_norm": 1.5662400268745837, + "learning_rate": 8.56791886932462e-06, + "loss": 0.0646, + "step": 546 + }, + { + "epoch": 0.54, + "grad_norm": 1.1584626449271131, + "learning_rate": 8.562308328763217e-06, + "loss": 0.0663, + "step": 547 + }, + { + "epoch": 0.54, + "grad_norm": 1.2096222560437233, + "learning_rate": 8.556688664090822e-06, + "loss": 0.0541, + "step": 548 + }, + { + "epoch": 0.54, + "grad_norm": 1.1044021637797712, + "learning_rate": 8.551059889701032e-06, + "loss": 0.0462, + "step": 549 + }, + { + "epoch": 0.54, + "grad_norm": 1.275190179335817, + "learning_rate": 8.545422020010782e-06, + "loss": 0.0436, + "step": 550 + }, + { + "epoch": 0.54, + "grad_norm": 1.758686455350391, + "learning_rate": 8.539775069460306e-06, + "loss": 0.0646, + "step": 551 + }, + { + "epoch": 0.55, + "grad_norm": 1.5896830704748524, + "learning_rate": 8.534119052513084e-06, + "loss": 0.0652, + "step": 552 + }, + { + "epoch": 0.55, + "grad_norm": 2.94046553261661, + "learning_rate": 8.52845398365583e-06, + "loss": 0.0767, + "step": 553 + }, + { + "epoch": 0.55, + "grad_norm": 2.538412265816462, + "learning_rate": 8.52277987739844e-06, + "loss": 0.0815, + "step": 554 + }, + { + "epoch": 0.55, + "grad_norm": 1.1662847269102934, + "learning_rate": 8.517096748273951e-06, + "loss": 0.0665, + "step": 555 + }, + { + "epoch": 0.55, + "grad_norm": 1.6022494338744144, + "learning_rate": 8.511404610838519e-06, + "loss": 0.0516, + "step": 556 + }, + { + "epoch": 0.55, + "grad_norm": 0.939781419965344, + "learning_rate": 8.505703479671365e-06, + "loss": 0.0448, + "step": 557 + }, + { + "epoch": 0.55, + "grad_norm": 1.1018914419618033, + "learning_rate": 8.499993369374754e-06, + "loss": 0.0521, + "step": 558 + }, + { + "epoch": 0.55, + "grad_norm": 1.0558610033685443, + "learning_rate": 8.49427429457394e-06, + "loss": 0.0513, + "step": 559 + }, + { + "epoch": 0.55, + "grad_norm": 1.2787207221715446, + "learning_rate": 8.488546269917145e-06, + "loss": 0.0554, + "step": 560 + }, + { + "epoch": 0.55, + "grad_norm": 1.2983076996891234, + "learning_rate": 8.48280931007551e-06, + "loss": 0.0526, + "step": 561 + }, + { + "epoch": 0.56, + "grad_norm": 1.5470353895885407, + "learning_rate": 8.477063429743061e-06, + "loss": 0.0652, + "step": 562 + }, + { + "epoch": 0.56, + "grad_norm": 1.643121959436738, + "learning_rate": 8.471308643636682e-06, + "loss": 0.0878, + "step": 563 + }, + { + "epoch": 0.56, + "grad_norm": 1.2378141085176515, + "learning_rate": 8.465544966496049e-06, + "loss": 0.069, + "step": 564 + }, + { + "epoch": 0.56, + "grad_norm": 1.3515805346107896, + "learning_rate": 8.459772413083628e-06, + "loss": 0.0583, + "step": 565 + }, + { + "epoch": 0.56, + "grad_norm": 1.6101441673355676, + "learning_rate": 8.45399099818461e-06, + "loss": 0.0592, + "step": 566 + }, + { + "epoch": 0.56, + "grad_norm": 2.2878452013970008, + "learning_rate": 8.448200736606883e-06, + "loss": 0.0674, + "step": 567 + }, + { + "epoch": 0.56, + "grad_norm": 8.07020821775226, + "learning_rate": 8.442401643181e-06, + "loss": 0.0557, + "step": 568 + }, + { + "epoch": 0.56, + "grad_norm": 1.522275745146963, + "learning_rate": 8.43659373276013e-06, + "loss": 0.0647, + "step": 569 + }, + { + "epoch": 0.56, + "grad_norm": 0.9372445462464527, + "learning_rate": 8.430777020220026e-06, + "loss": 0.0505, + "step": 570 + }, + { + "epoch": 0.56, + "grad_norm": 1.0766527001955493, + "learning_rate": 8.424951520458987e-06, + "loss": 0.0497, + "step": 571 + }, + { + "epoch": 0.57, + "grad_norm": 1.3515114497395717, + "learning_rate": 8.419117248397816e-06, + "loss": 0.0592, + "step": 572 + }, + { + "epoch": 0.57, + "grad_norm": 1.2860874473607398, + "learning_rate": 8.413274218979787e-06, + "loss": 0.0602, + "step": 573 + }, + { + "epoch": 0.57, + "grad_norm": 2.060996016732545, + "learning_rate": 8.407422447170603e-06, + "loss": 0.0731, + "step": 574 + }, + { + "epoch": 0.57, + "grad_norm": 1.142925453362851, + "learning_rate": 8.401561947958357e-06, + "loss": 0.0392, + "step": 575 + }, + { + "epoch": 0.57, + "grad_norm": 1.7919299238414284, + "learning_rate": 8.3956927363535e-06, + "loss": 0.0728, + "step": 576 + }, + { + "epoch": 0.57, + "grad_norm": 1.6619277286864411, + "learning_rate": 8.389814827388795e-06, + "loss": 0.0583, + "step": 577 + }, + { + "epoch": 0.57, + "grad_norm": 1.1741375011364992, + "learning_rate": 8.383928236119282e-06, + "loss": 0.0442, + "step": 578 + }, + { + "epoch": 0.57, + "grad_norm": 2.019745040704028, + "learning_rate": 8.378032977622235e-06, + "loss": 0.0633, + "step": 579 + }, + { + "epoch": 0.57, + "grad_norm": 1.5237063624620941, + "learning_rate": 8.372129066997137e-06, + "loss": 0.0689, + "step": 580 + }, + { + "epoch": 0.57, + "grad_norm": 1.3155095691531686, + "learning_rate": 8.366216519365623e-06, + "loss": 0.0484, + "step": 581 + }, + { + "epoch": 0.58, + "grad_norm": 1.0155526815115254, + "learning_rate": 8.36029534987145e-06, + "loss": 0.0625, + "step": 582 + }, + { + "epoch": 0.58, + "grad_norm": 1.0948218257424078, + "learning_rate": 8.354365573680465e-06, + "loss": 0.0538, + "step": 583 + }, + { + "epoch": 0.58, + "grad_norm": 1.5326474826791807, + "learning_rate": 8.348427205980552e-06, + "loss": 0.0663, + "step": 584 + }, + { + "epoch": 0.58, + "grad_norm": 1.4121187656089296, + "learning_rate": 8.342480261981604e-06, + "loss": 0.066, + "step": 585 + }, + { + "epoch": 0.58, + "grad_norm": 0.961029784661072, + "learning_rate": 8.336524756915482e-06, + "loss": 0.0456, + "step": 586 + }, + { + "epoch": 0.58, + "grad_norm": 1.7902506613018505, + "learning_rate": 8.330560706035968e-06, + "loss": 0.0755, + "step": 587 + }, + { + "epoch": 0.58, + "grad_norm": 1.8123904283951255, + "learning_rate": 8.32458812461874e-06, + "loss": 0.0576, + "step": 588 + }, + { + "epoch": 0.58, + "grad_norm": 1.1387807867413784, + "learning_rate": 8.318607027961318e-06, + "loss": 0.0769, + "step": 589 + }, + { + "epoch": 0.58, + "grad_norm": 1.5212598171396146, + "learning_rate": 8.312617431383039e-06, + "loss": 0.0678, + "step": 590 + }, + { + "epoch": 0.58, + "grad_norm": 1.1337647098881944, + "learning_rate": 8.306619350225006e-06, + "loss": 0.0475, + "step": 591 + }, + { + "epoch": 0.58, + "grad_norm": 0.6095612646846799, + "learning_rate": 8.300612799850055e-06, + "loss": 0.0288, + "step": 592 + }, + { + "epoch": 0.59, + "grad_norm": 1.2650131261757638, + "learning_rate": 8.294597795642715e-06, + "loss": 0.0547, + "step": 593 + }, + { + "epoch": 0.59, + "grad_norm": 1.7591819855712723, + "learning_rate": 8.288574353009164e-06, + "loss": 0.0939, + "step": 594 + }, + { + "epoch": 0.59, + "grad_norm": 1.3474754038703902, + "learning_rate": 8.282542487377199e-06, + "loss": 0.0664, + "step": 595 + }, + { + "epoch": 0.59, + "grad_norm": 0.9668815112287411, + "learning_rate": 8.276502214196187e-06, + "loss": 0.0472, + "step": 596 + }, + { + "epoch": 0.59, + "grad_norm": 3.637641184362882, + "learning_rate": 8.270453548937028e-06, + "loss": 0.0716, + "step": 597 + }, + { + "epoch": 0.59, + "grad_norm": 0.8896896880386068, + "learning_rate": 8.264396507092121e-06, + "loss": 0.0494, + "step": 598 + }, + { + "epoch": 0.59, + "grad_norm": 1.8835890673936688, + "learning_rate": 8.258331104175317e-06, + "loss": 0.0623, + "step": 599 + }, + { + "epoch": 0.59, + "grad_norm": 0.9496660157175251, + "learning_rate": 8.25225735572188e-06, + "loss": 0.0524, + "step": 600 + }, + { + "epoch": 0.59, + "grad_norm": 1.5322847657196466, + "learning_rate": 8.246175277288457e-06, + "loss": 0.0498, + "step": 601 + }, + { + "epoch": 0.59, + "grad_norm": 1.0174572189523983, + "learning_rate": 8.240084884453022e-06, + "loss": 0.0614, + "step": 602 + }, + { + "epoch": 0.6, + "grad_norm": 0.91746310590934, + "learning_rate": 8.233986192814846e-06, + "loss": 0.0513, + "step": 603 + }, + { + "epoch": 0.6, + "grad_norm": 1.7684397279630049, + "learning_rate": 8.227879217994461e-06, + "loss": 0.0748, + "step": 604 + }, + { + "epoch": 0.6, + "grad_norm": 1.9785240601515683, + "learning_rate": 8.221763975633611e-06, + "loss": 0.0733, + "step": 605 + }, + { + "epoch": 0.6, + "grad_norm": 1.1291467469735303, + "learning_rate": 8.215640481395215e-06, + "loss": 0.06, + "step": 606 + }, + { + "epoch": 0.6, + "grad_norm": 1.4847914237132795, + "learning_rate": 8.20950875096333e-06, + "loss": 0.0557, + "step": 607 + }, + { + "epoch": 0.6, + "grad_norm": 0.9325607449405862, + "learning_rate": 8.203368800043103e-06, + "loss": 0.0436, + "step": 608 + }, + { + "epoch": 0.6, + "grad_norm": 1.7643923412720106, + "learning_rate": 8.197220644360745e-06, + "loss": 0.0643, + "step": 609 + }, + { + "epoch": 0.6, + "grad_norm": 1.3674698089019297, + "learning_rate": 8.19106429966347e-06, + "loss": 0.0569, + "step": 610 + }, + { + "epoch": 0.6, + "grad_norm": 1.0667370558475722, + "learning_rate": 8.18489978171948e-06, + "loss": 0.0592, + "step": 611 + }, + { + "epoch": 0.6, + "grad_norm": 1.7980358087158415, + "learning_rate": 8.178727106317899e-06, + "loss": 0.08, + "step": 612 + }, + { + "epoch": 0.61, + "grad_norm": 1.5273761560502266, + "learning_rate": 8.17254628926875e-06, + "loss": 0.0546, + "step": 613 + }, + { + "epoch": 0.61, + "grad_norm": 4.116897956255585, + "learning_rate": 8.166357346402914e-06, + "loss": 0.0628, + "step": 614 + }, + { + "epoch": 0.61, + "grad_norm": 1.7103264990604705, + "learning_rate": 8.160160293572073e-06, + "loss": 0.0488, + "step": 615 + }, + { + "epoch": 0.61, + "grad_norm": 1.1284051204409844, + "learning_rate": 8.15395514664869e-06, + "loss": 0.048, + "step": 616 + }, + { + "epoch": 0.61, + "grad_norm": 1.2783445700303377, + "learning_rate": 8.147741921525956e-06, + "loss": 0.0666, + "step": 617 + }, + { + "epoch": 0.61, + "grad_norm": 1.4795816320080954, + "learning_rate": 8.141520634117756e-06, + "loss": 0.0564, + "step": 618 + }, + { + "epoch": 0.61, + "grad_norm": 4.181586224479883, + "learning_rate": 8.13529130035862e-06, + "loss": 0.0677, + "step": 619 + }, + { + "epoch": 0.61, + "grad_norm": 1.1124677628660073, + "learning_rate": 8.129053936203688e-06, + "loss": 0.0522, + "step": 620 + }, + { + "epoch": 0.61, + "grad_norm": 1.7114645633433125, + "learning_rate": 8.122808557628674e-06, + "loss": 0.0646, + "step": 621 + }, + { + "epoch": 0.61, + "grad_norm": 1.531141332248485, + "learning_rate": 8.116555180629808e-06, + "loss": 0.0524, + "step": 622 + }, + { + "epoch": 0.62, + "grad_norm": 2.0519226390878296, + "learning_rate": 8.11029382122382e-06, + "loss": 0.0904, + "step": 623 + }, + { + "epoch": 0.62, + "grad_norm": 1.411218515136369, + "learning_rate": 8.104024495447873e-06, + "loss": 0.0436, + "step": 624 + }, + { + "epoch": 0.62, + "grad_norm": 1.7136185797858745, + "learning_rate": 8.097747219359543e-06, + "loss": 0.0513, + "step": 625 + }, + { + "epoch": 0.62, + "grad_norm": 1.1230390583540037, + "learning_rate": 8.091462009036764e-06, + "loss": 0.0358, + "step": 626 + }, + { + "epoch": 0.62, + "grad_norm": 1.3037179652892361, + "learning_rate": 8.085168880577792e-06, + "loss": 0.0471, + "step": 627 + }, + { + "epoch": 0.62, + "grad_norm": 0.9244134810312545, + "learning_rate": 8.078867850101168e-06, + "loss": 0.0395, + "step": 628 + }, + { + "epoch": 0.62, + "grad_norm": 1.219173315497727, + "learning_rate": 8.072558933745668e-06, + "loss": 0.0499, + "step": 629 + }, + { + "epoch": 0.62, + "grad_norm": 1.4030210979804876, + "learning_rate": 8.066242147670268e-06, + "loss": 0.0418, + "step": 630 + }, + { + "epoch": 0.62, + "grad_norm": 1.3156097733110124, + "learning_rate": 8.059917508054101e-06, + "loss": 0.0488, + "step": 631 + }, + { + "epoch": 0.62, + "grad_norm": 1.6817087679339362, + "learning_rate": 8.05358503109641e-06, + "loss": 0.0656, + "step": 632 + }, + { + "epoch": 0.63, + "grad_norm": 0.7541039302247473, + "learning_rate": 8.04724473301652e-06, + "loss": 0.0353, + "step": 633 + }, + { + "epoch": 0.63, + "grad_norm": 1.7192355457284463, + "learning_rate": 8.040896630053786e-06, + "loss": 0.0645, + "step": 634 + }, + { + "epoch": 0.63, + "grad_norm": 1.5761084488991353, + "learning_rate": 8.034540738467549e-06, + "loss": 0.0644, + "step": 635 + }, + { + "epoch": 0.63, + "grad_norm": 1.0289767918987414, + "learning_rate": 8.0281770745371e-06, + "loss": 0.0504, + "step": 636 + }, + { + "epoch": 0.63, + "grad_norm": 1.845359052407356, + "learning_rate": 8.021805654561643e-06, + "loss": 0.051, + "step": 637 + }, + { + "epoch": 0.63, + "grad_norm": 1.3253111334584065, + "learning_rate": 8.015426494860242e-06, + "loss": 0.0589, + "step": 638 + }, + { + "epoch": 0.63, + "grad_norm": 1.0579407657699587, + "learning_rate": 8.009039611771785e-06, + "loss": 0.0632, + "step": 639 + }, + { + "epoch": 0.63, + "grad_norm": 2.1426778255786547, + "learning_rate": 8.002645021654944e-06, + "loss": 0.0625, + "step": 640 + }, + { + "epoch": 0.63, + "grad_norm": 2.4755077125900704, + "learning_rate": 7.996242740888131e-06, + "loss": 0.0867, + "step": 641 + }, + { + "epoch": 0.63, + "grad_norm": 2.469653970091983, + "learning_rate": 7.989832785869454e-06, + "loss": 0.1006, + "step": 642 + }, + { + "epoch": 0.64, + "grad_norm": 3.418240485230458, + "learning_rate": 7.98341517301668e-06, + "loss": 0.0957, + "step": 643 + }, + { + "epoch": 0.64, + "grad_norm": 1.1863463247272923, + "learning_rate": 7.976989918767183e-06, + "loss": 0.0476, + "step": 644 + }, + { + "epoch": 0.64, + "grad_norm": 1.9320047106615985, + "learning_rate": 7.970557039577918e-06, + "loss": 0.1027, + "step": 645 + }, + { + "epoch": 0.64, + "grad_norm": 0.8756315880718288, + "learning_rate": 7.964116551925365e-06, + "loss": 0.0512, + "step": 646 + }, + { + "epoch": 0.64, + "grad_norm": 1.2079349714663918, + "learning_rate": 7.957668472305491e-06, + "loss": 0.0608, + "step": 647 + }, + { + "epoch": 0.64, + "grad_norm": 1.067062794690031, + "learning_rate": 7.951212817233709e-06, + "loss": 0.0498, + "step": 648 + }, + { + "epoch": 0.64, + "grad_norm": 1.4353105762665406, + "learning_rate": 7.944749603244837e-06, + "loss": 0.0693, + "step": 649 + }, + { + "epoch": 0.64, + "grad_norm": 1.120606097195914, + "learning_rate": 7.938278846893051e-06, + "loss": 0.0471, + "step": 650 + }, + { + "epoch": 0.64, + "grad_norm": 0.669764308329259, + "learning_rate": 7.931800564751844e-06, + "loss": 0.0426, + "step": 651 + }, + { + "epoch": 0.64, + "grad_norm": 1.4490252401737391, + "learning_rate": 7.925314773413988e-06, + "loss": 0.0769, + "step": 652 + }, + { + "epoch": 0.65, + "grad_norm": 1.0443343523013158, + "learning_rate": 7.918821489491489e-06, + "loss": 0.0449, + "step": 653 + }, + { + "epoch": 0.65, + "grad_norm": 0.9141229109009441, + "learning_rate": 7.91232072961554e-06, + "loss": 0.0572, + "step": 654 + }, + { + "epoch": 0.65, + "grad_norm": 3.972278870552116, + "learning_rate": 7.905812510436483e-06, + "loss": 0.0588, + "step": 655 + }, + { + "epoch": 0.65, + "grad_norm": 1.0982712477608652, + "learning_rate": 7.899296848623766e-06, + "loss": 0.0727, + "step": 656 + }, + { + "epoch": 0.65, + "grad_norm": 1.3069436042751426, + "learning_rate": 7.892773760865901e-06, + "loss": 0.0616, + "step": 657 + }, + { + "epoch": 0.65, + "grad_norm": 1.036432390647132, + "learning_rate": 7.886243263870421e-06, + "loss": 0.0398, + "step": 658 + }, + { + "epoch": 0.65, + "grad_norm": 1.3003358641954792, + "learning_rate": 7.879705374363831e-06, + "loss": 0.0517, + "step": 659 + }, + { + "epoch": 0.65, + "grad_norm": 0.9881779327346152, + "learning_rate": 7.873160109091574e-06, + "loss": 0.0466, + "step": 660 + }, + { + "epoch": 0.65, + "grad_norm": 1.2844285680285321, + "learning_rate": 7.866607484817984e-06, + "loss": 0.048, + "step": 661 + }, + { + "epoch": 0.65, + "grad_norm": 1.1217634195547732, + "learning_rate": 7.860047518326243e-06, + "loss": 0.0469, + "step": 662 + }, + { + "epoch": 0.66, + "grad_norm": 1.7765193310118434, + "learning_rate": 7.853480226418345e-06, + "loss": 0.0727, + "step": 663 + }, + { + "epoch": 0.66, + "grad_norm": 2.351110060170768, + "learning_rate": 7.846905625915032e-06, + "loss": 0.0602, + "step": 664 + }, + { + "epoch": 0.66, + "grad_norm": 2.2480431610205707, + "learning_rate": 7.84032373365578e-06, + "loss": 0.0668, + "step": 665 + }, + { + "epoch": 0.66, + "grad_norm": 1.279228731526593, + "learning_rate": 7.833734566498732e-06, + "loss": 0.0619, + "step": 666 + }, + { + "epoch": 0.66, + "grad_norm": 1.3069130651640115, + "learning_rate": 7.827138141320672e-06, + "loss": 0.0487, + "step": 667 + }, + { + "epoch": 0.66, + "grad_norm": 0.6649550741739837, + "learning_rate": 7.820534475016965e-06, + "loss": 0.0357, + "step": 668 + }, + { + "epoch": 0.66, + "grad_norm": 1.1780747788613053, + "learning_rate": 7.813923584501529e-06, + "loss": 0.0507, + "step": 669 + }, + { + "epoch": 0.66, + "grad_norm": 1.3976292130290968, + "learning_rate": 7.807305486706782e-06, + "loss": 0.0455, + "step": 670 + }, + { + "epoch": 0.66, + "grad_norm": 1.0135391390071877, + "learning_rate": 7.800680198583608e-06, + "loss": 0.0352, + "step": 671 + }, + { + "epoch": 0.66, + "grad_norm": 1.8518127433708236, + "learning_rate": 7.794047737101298e-06, + "loss": 0.0768, + "step": 672 + }, + { + "epoch": 0.67, + "grad_norm": 1.1646871241715808, + "learning_rate": 7.787408119247522e-06, + "loss": 0.0368, + "step": 673 + }, + { + "epoch": 0.67, + "grad_norm": 1.0054665399271567, + "learning_rate": 7.78076136202828e-06, + "loss": 0.0444, + "step": 674 + }, + { + "epoch": 0.67, + "grad_norm": 1.0060984550389096, + "learning_rate": 7.774107482467856e-06, + "loss": 0.058, + "step": 675 + }, + { + "epoch": 0.67, + "grad_norm": 0.8843245325848659, + "learning_rate": 7.767446497608775e-06, + "loss": 0.0485, + "step": 676 + }, + { + "epoch": 0.67, + "grad_norm": 2.007540144322301, + "learning_rate": 7.760778424511766e-06, + "loss": 0.0589, + "step": 677 + }, + { + "epoch": 0.67, + "grad_norm": 0.8352098796640809, + "learning_rate": 7.754103280255707e-06, + "loss": 0.0405, + "step": 678 + }, + { + "epoch": 0.67, + "grad_norm": 0.47213607467956525, + "learning_rate": 7.747421081937592e-06, + "loss": 0.026, + "step": 679 + }, + { + "epoch": 0.67, + "grad_norm": 1.2589822578996794, + "learning_rate": 7.740731846672479e-06, + "loss": 0.0397, + "step": 680 + }, + { + "epoch": 0.67, + "grad_norm": 1.2489792445554098, + "learning_rate": 7.734035591593452e-06, + "loss": 0.0629, + "step": 681 + }, + { + "epoch": 0.67, + "grad_norm": 1.2130156604486495, + "learning_rate": 7.727332333851574e-06, + "loss": 0.0623, + "step": 682 + }, + { + "epoch": 0.67, + "grad_norm": 1.0670881828943366, + "learning_rate": 7.720622090615844e-06, + "loss": 0.0444, + "step": 683 + }, + { + "epoch": 0.68, + "grad_norm": 1.5910472088638483, + "learning_rate": 7.713904879073153e-06, + "loss": 0.063, + "step": 684 + }, + { + "epoch": 0.68, + "grad_norm": 1.0945352572471834, + "learning_rate": 7.707180716428237e-06, + "loss": 0.0543, + "step": 685 + }, + { + "epoch": 0.68, + "grad_norm": 1.1888354792478368, + "learning_rate": 7.700449619903643e-06, + "loss": 0.0494, + "step": 686 + }, + { + "epoch": 0.68, + "grad_norm": 2.234737397503048, + "learning_rate": 7.693711606739667e-06, + "loss": 0.0654, + "step": 687 + }, + { + "epoch": 0.68, + "grad_norm": 1.5921467818053312, + "learning_rate": 7.686966694194334e-06, + "loss": 0.0473, + "step": 688 + }, + { + "epoch": 0.68, + "grad_norm": 1.068287322661853, + "learning_rate": 7.680214899543328e-06, + "loss": 0.0498, + "step": 689 + }, + { + "epoch": 0.68, + "grad_norm": 1.090304711427149, + "learning_rate": 7.673456240079966e-06, + "loss": 0.0537, + "step": 690 + }, + { + "epoch": 0.68, + "grad_norm": 1.2623514077463773, + "learning_rate": 7.666690733115147e-06, + "loss": 0.0323, + "step": 691 + }, + { + "epoch": 0.68, + "grad_norm": 1.0821798809113374, + "learning_rate": 7.65991839597731e-06, + "loss": 0.0541, + "step": 692 + }, + { + "epoch": 0.68, + "grad_norm": 2.3982738877088425, + "learning_rate": 7.653139246012383e-06, + "loss": 0.0995, + "step": 693 + }, + { + "epoch": 0.69, + "grad_norm": 1.0333414715948794, + "learning_rate": 7.64635330058375e-06, + "loss": 0.0534, + "step": 694 + }, + { + "epoch": 0.69, + "grad_norm": 1.5282306413091515, + "learning_rate": 7.639560577072194e-06, + "loss": 0.0507, + "step": 695 + }, + { + "epoch": 0.69, + "grad_norm": 1.4155931930782537, + "learning_rate": 7.632761092875864e-06, + "loss": 0.0767, + "step": 696 + }, + { + "epoch": 0.69, + "grad_norm": 1.421296001948551, + "learning_rate": 7.625954865410224e-06, + "loss": 0.0688, + "step": 697 + }, + { + "epoch": 0.69, + "grad_norm": 0.5818062433164756, + "learning_rate": 7.619141912108008e-06, + "loss": 0.0368, + "step": 698 + }, + { + "epoch": 0.69, + "grad_norm": 0.8119759623455076, + "learning_rate": 7.612322250419173e-06, + "loss": 0.0357, + "step": 699 + }, + { + "epoch": 0.69, + "grad_norm": 1.4030844108026284, + "learning_rate": 7.605495897810868e-06, + "loss": 0.0652, + "step": 700 + }, + { + "epoch": 0.69, + "grad_norm": 2.7435621952165166, + "learning_rate": 7.598662871767371e-06, + "loss": 0.0595, + "step": 701 + }, + { + "epoch": 0.69, + "grad_norm": 1.0199936017098497, + "learning_rate": 7.591823189790054e-06, + "loss": 0.0564, + "step": 702 + }, + { + "epoch": 0.69, + "grad_norm": 5.013801793151265, + "learning_rate": 7.5849768693973415e-06, + "loss": 0.0506, + "step": 703 + }, + { + "epoch": 0.7, + "grad_norm": 1.1994253630174772, + "learning_rate": 7.578123928124654e-06, + "loss": 0.0514, + "step": 704 + }, + { + "epoch": 0.7, + "grad_norm": 1.695659698644204, + "learning_rate": 7.571264383524377e-06, + "loss": 0.0868, + "step": 705 + }, + { + "epoch": 0.7, + "grad_norm": 1.080323902841376, + "learning_rate": 7.564398253165804e-06, + "loss": 0.0635, + "step": 706 + }, + { + "epoch": 0.7, + "grad_norm": 1.0873572448586641, + "learning_rate": 7.557525554635097e-06, + "loss": 0.0488, + "step": 707 + }, + { + "epoch": 0.7, + "grad_norm": 1.1921038081951376, + "learning_rate": 7.550646305535243e-06, + "loss": 0.0537, + "step": 708 + }, + { + "epoch": 0.7, + "grad_norm": 2.3417929684907905, + "learning_rate": 7.543760523486008e-06, + "loss": 0.0705, + "step": 709 + }, + { + "epoch": 0.7, + "grad_norm": 0.770501025309428, + "learning_rate": 7.5368682261238865e-06, + "loss": 0.0348, + "step": 710 + }, + { + "epoch": 0.7, + "grad_norm": 1.7913968514488297, + "learning_rate": 7.529969431102063e-06, + "loss": 0.0469, + "step": 711 + }, + { + "epoch": 0.7, + "grad_norm": 1.372638464172713, + "learning_rate": 7.523064156090367e-06, + "loss": 0.0647, + "step": 712 + }, + { + "epoch": 0.7, + "grad_norm": 1.4168525758105495, + "learning_rate": 7.51615241877522e-06, + "loss": 0.0684, + "step": 713 + }, + { + "epoch": 0.71, + "grad_norm": 0.5952202718121256, + "learning_rate": 7.509234236859597e-06, + "loss": 0.0285, + "step": 714 + }, + { + "epoch": 0.71, + "grad_norm": 1.1132503184302827, + "learning_rate": 7.502309628062984e-06, + "loss": 0.055, + "step": 715 + }, + { + "epoch": 0.71, + "grad_norm": 1.2940171549098398, + "learning_rate": 7.49537861012132e-06, + "loss": 0.0568, + "step": 716 + }, + { + "epoch": 0.71, + "grad_norm": 1.432400009253595, + "learning_rate": 7.488441200786969e-06, + "loss": 0.0568, + "step": 717 + }, + { + "epoch": 0.71, + "grad_norm": 1.7357982264667429, + "learning_rate": 7.481497417828657e-06, + "loss": 0.0584, + "step": 718 + }, + { + "epoch": 0.71, + "grad_norm": 1.152811480884973, + "learning_rate": 7.474547279031439e-06, + "loss": 0.0356, + "step": 719 + }, + { + "epoch": 0.71, + "grad_norm": 0.9457070217244251, + "learning_rate": 7.467590802196648e-06, + "loss": 0.0421, + "step": 720 + }, + { + "epoch": 0.71, + "grad_norm": 1.3703852382166088, + "learning_rate": 7.460628005141853e-06, + "loss": 0.0603, + "step": 721 + }, + { + "epoch": 0.71, + "grad_norm": 0.8000228910289057, + "learning_rate": 7.453658905700804e-06, + "loss": 0.0428, + "step": 722 + }, + { + "epoch": 0.71, + "grad_norm": 0.6863165441999962, + "learning_rate": 7.4466835217234034e-06, + "loss": 0.0393, + "step": 723 + }, + { + "epoch": 0.72, + "grad_norm": 1.4274426214090807, + "learning_rate": 7.4397018710756415e-06, + "loss": 0.0666, + "step": 724 + }, + { + "epoch": 0.72, + "grad_norm": 0.615013887200379, + "learning_rate": 7.432713971639565e-06, + "loss": 0.0351, + "step": 725 + }, + { + "epoch": 0.72, + "grad_norm": 2.3432841177404575, + "learning_rate": 7.42571984131322e-06, + "loss": 0.0576, + "step": 726 + }, + { + "epoch": 0.72, + "grad_norm": 1.1694400922031816, + "learning_rate": 7.418719498010618e-06, + "loss": 0.0495, + "step": 727 + }, + { + "epoch": 0.72, + "grad_norm": 1.7107862218528511, + "learning_rate": 7.411712959661677e-06, + "loss": 0.065, + "step": 728 + }, + { + "epoch": 0.72, + "grad_norm": 1.114796883559968, + "learning_rate": 7.40470024421219e-06, + "loss": 0.0451, + "step": 729 + }, + { + "epoch": 0.72, + "grad_norm": 2.510226449257016, + "learning_rate": 7.397681369623766e-06, + "loss": 0.049, + "step": 730 + }, + { + "epoch": 0.72, + "grad_norm": 0.9849390108162892, + "learning_rate": 7.39065635387379e-06, + "loss": 0.0487, + "step": 731 + }, + { + "epoch": 0.72, + "grad_norm": 0.938148015960924, + "learning_rate": 7.383625214955377e-06, + "loss": 0.0524, + "step": 732 + }, + { + "epoch": 0.72, + "grad_norm": 3.344406418148889, + "learning_rate": 7.376587970877327e-06, + "loss": 0.0574, + "step": 733 + }, + { + "epoch": 0.73, + "grad_norm": 1.43575449802591, + "learning_rate": 7.369544639664072e-06, + "loss": 0.0559, + "step": 734 + }, + { + "epoch": 0.73, + "grad_norm": 0.800158124092139, + "learning_rate": 7.362495239355642e-06, + "loss": 0.0402, + "step": 735 + }, + { + "epoch": 0.73, + "grad_norm": 1.162183628676278, + "learning_rate": 7.355439788007604e-06, + "loss": 0.0504, + "step": 736 + }, + { + "epoch": 0.73, + "grad_norm": 0.7051824963315141, + "learning_rate": 7.34837830369103e-06, + "loss": 0.04, + "step": 737 + }, + { + "epoch": 0.73, + "grad_norm": 0.9660305349076502, + "learning_rate": 7.341310804492443e-06, + "loss": 0.041, + "step": 738 + }, + { + "epoch": 0.73, + "grad_norm": 1.2103272636528846, + "learning_rate": 7.334237308513766e-06, + "loss": 0.0588, + "step": 739 + }, + { + "epoch": 0.73, + "grad_norm": 0.6247813317536415, + "learning_rate": 7.327157833872291e-06, + "loss": 0.0388, + "step": 740 + }, + { + "epoch": 0.73, + "grad_norm": 0.950242229839862, + "learning_rate": 7.320072398700615e-06, + "loss": 0.0415, + "step": 741 + }, + { + "epoch": 0.73, + "grad_norm": 1.5224829709523, + "learning_rate": 7.312981021146606e-06, + "loss": 0.0654, + "step": 742 + }, + { + "epoch": 0.73, + "grad_norm": 1.029686304470058, + "learning_rate": 7.30588371937335e-06, + "loss": 0.0532, + "step": 743 + }, + { + "epoch": 0.74, + "grad_norm": 1.1599326200613038, + "learning_rate": 7.298780511559108e-06, + "loss": 0.0569, + "step": 744 + }, + { + "epoch": 0.74, + "grad_norm": 3.997871435156531, + "learning_rate": 7.291671415897268e-06, + "loss": 0.0597, + "step": 745 + }, + { + "epoch": 0.74, + "grad_norm": 2.276453119868504, + "learning_rate": 7.284556450596298e-06, + "loss": 0.0502, + "step": 746 + }, + { + "epoch": 0.74, + "grad_norm": 1.3737928110013409, + "learning_rate": 7.2774356338797004e-06, + "loss": 0.0417, + "step": 747 + }, + { + "epoch": 0.74, + "grad_norm": 0.9285847428586894, + "learning_rate": 7.270308983985963e-06, + "loss": 0.0469, + "step": 748 + }, + { + "epoch": 0.74, + "grad_norm": 0.8504652175843128, + "learning_rate": 7.2631765191685174e-06, + "loss": 0.0373, + "step": 749 + }, + { + "epoch": 0.74, + "grad_norm": 0.8153321056774139, + "learning_rate": 7.2560382576956875e-06, + "loss": 0.0408, + "step": 750 + }, + { + "epoch": 0.74, + "grad_norm": 1.1454444295449129, + "learning_rate": 7.24889421785064e-06, + "loss": 0.0512, + "step": 751 + }, + { + "epoch": 0.74, + "grad_norm": 1.8552009648324275, + "learning_rate": 7.24174441793135e-06, + "loss": 0.0566, + "step": 752 + }, + { + "epoch": 0.74, + "grad_norm": 0.8568618511392941, + "learning_rate": 7.234588876250538e-06, + "loss": 0.0408, + "step": 753 + }, + { + "epoch": 0.75, + "grad_norm": 1.395789889097727, + "learning_rate": 7.227427611135634e-06, + "loss": 0.0487, + "step": 754 + }, + { + "epoch": 0.75, + "grad_norm": 1.1473068375872773, + "learning_rate": 7.22026064092873e-06, + "loss": 0.0453, + "step": 755 + }, + { + "epoch": 0.75, + "grad_norm": 0.9107552740533715, + "learning_rate": 7.2130879839865255e-06, + "loss": 0.0484, + "step": 756 + }, + { + "epoch": 0.75, + "grad_norm": 1.479569569975316, + "learning_rate": 7.205909658680287e-06, + "loss": 0.0704, + "step": 757 + }, + { + "epoch": 0.75, + "grad_norm": 1.614669912893105, + "learning_rate": 7.198725683395802e-06, + "loss": 0.0476, + "step": 758 + }, + { + "epoch": 0.75, + "grad_norm": 0.8291391376525924, + "learning_rate": 7.191536076533326e-06, + "loss": 0.0449, + "step": 759 + }, + { + "epoch": 0.75, + "grad_norm": 3.6819517582924615, + "learning_rate": 7.184340856507541e-06, + "loss": 0.0439, + "step": 760 + }, + { + "epoch": 0.75, + "grad_norm": 0.7986954604883367, + "learning_rate": 7.177140041747503e-06, + "loss": 0.0522, + "step": 761 + }, + { + "epoch": 0.75, + "grad_norm": 1.0191355693655744, + "learning_rate": 7.169933650696602e-06, + "loss": 0.0493, + "step": 762 + }, + { + "epoch": 0.75, + "grad_norm": 0.8276853219246487, + "learning_rate": 7.162721701812506e-06, + "loss": 0.04, + "step": 763 + }, + { + "epoch": 0.75, + "grad_norm": 1.273271861109226, + "learning_rate": 7.155504213567122e-06, + "loss": 0.064, + "step": 764 + }, + { + "epoch": 0.76, + "grad_norm": 0.8211027414183176, + "learning_rate": 7.148281204446541e-06, + "loss": 0.0375, + "step": 765 + }, + { + "epoch": 0.76, + "grad_norm": 1.324686842180071, + "learning_rate": 7.141052692950999e-06, + "loss": 0.0617, + "step": 766 + }, + { + "epoch": 0.76, + "grad_norm": 1.3516945141218741, + "learning_rate": 7.133818697594821e-06, + "loss": 0.0495, + "step": 767 + }, + { + "epoch": 0.76, + "grad_norm": 1.546142032224951, + "learning_rate": 7.126579236906382e-06, + "loss": 0.0664, + "step": 768 + }, + { + "epoch": 0.76, + "grad_norm": 0.6871920840680835, + "learning_rate": 7.1193343294280516e-06, + "loss": 0.0413, + "step": 769 + }, + { + "epoch": 0.76, + "grad_norm": 1.6318268345167921, + "learning_rate": 7.112083993716151e-06, + "loss": 0.0638, + "step": 770 + }, + { + "epoch": 0.76, + "grad_norm": 0.9005360464139678, + "learning_rate": 7.104828248340907e-06, + "loss": 0.042, + "step": 771 + }, + { + "epoch": 0.76, + "grad_norm": 1.7820265958769346, + "learning_rate": 7.0975671118864e-06, + "loss": 0.0428, + "step": 772 + }, + { + "epoch": 0.76, + "grad_norm": 0.8437539128584735, + "learning_rate": 7.090300602950518e-06, + "loss": 0.0471, + "step": 773 + }, + { + "epoch": 0.76, + "grad_norm": 1.8279688633282087, + "learning_rate": 7.083028740144912e-06, + "loss": 0.0299, + "step": 774 + }, + { + "epoch": 0.77, + "grad_norm": 1.302893161500799, + "learning_rate": 7.075751542094944e-06, + "loss": 0.0499, + "step": 775 + }, + { + "epoch": 0.77, + "grad_norm": 1.0146660381609616, + "learning_rate": 7.068469027439642e-06, + "loss": 0.0466, + "step": 776 + }, + { + "epoch": 0.77, + "grad_norm": 0.9711265895426145, + "learning_rate": 7.0611812148316475e-06, + "loss": 0.0426, + "step": 777 + }, + { + "epoch": 0.77, + "grad_norm": 0.43048903542425127, + "learning_rate": 7.0538881229371795e-06, + "loss": 0.0262, + "step": 778 + }, + { + "epoch": 0.77, + "grad_norm": 1.1457379520309836, + "learning_rate": 7.046589770435971e-06, + "loss": 0.0457, + "step": 779 + }, + { + "epoch": 0.77, + "grad_norm": 1.2453652954639685, + "learning_rate": 7.039286176021233e-06, + "loss": 0.0538, + "step": 780 + }, + { + "epoch": 0.77, + "grad_norm": 0.9963928129972024, + "learning_rate": 7.031977358399602e-06, + "loss": 0.0382, + "step": 781 + }, + { + "epoch": 0.77, + "grad_norm": 1.2900988865570673, + "learning_rate": 7.024663336291093e-06, + "loss": 0.0509, + "step": 782 + }, + { + "epoch": 0.77, + "grad_norm": 1.4027443850614405, + "learning_rate": 7.017344128429049e-06, + "loss": 0.07, + "step": 783 + }, + { + "epoch": 0.77, + "grad_norm": 1.0371750020391997, + "learning_rate": 7.0100197535601e-06, + "loss": 0.052, + "step": 784 + }, + { + "epoch": 0.78, + "grad_norm": 1.7843862300889355, + "learning_rate": 7.002690230444104e-06, + "loss": 0.0543, + "step": 785 + }, + { + "epoch": 0.78, + "grad_norm": 0.8603855097473864, + "learning_rate": 6.9953555778541074e-06, + "loss": 0.0342, + "step": 786 + }, + { + "epoch": 0.78, + "grad_norm": 1.0511727640363182, + "learning_rate": 6.988015814576299e-06, + "loss": 0.0574, + "step": 787 + }, + { + "epoch": 0.78, + "grad_norm": 4.439368303862243, + "learning_rate": 6.980670959409952e-06, + "loss": 0.0504, + "step": 788 + }, + { + "epoch": 0.78, + "grad_norm": 1.2912175634676242, + "learning_rate": 6.9733210311673826e-06, + "loss": 0.0512, + "step": 789 + }, + { + "epoch": 0.78, + "grad_norm": 0.7768806755603833, + "learning_rate": 6.965966048673905e-06, + "loss": 0.0344, + "step": 790 + }, + { + "epoch": 0.78, + "grad_norm": 0.9256238998707683, + "learning_rate": 6.958606030767771e-06, + "loss": 0.0298, + "step": 791 + }, + { + "epoch": 0.78, + "grad_norm": 1.87432810891287, + "learning_rate": 6.951240996300135e-06, + "loss": 0.0548, + "step": 792 + }, + { + "epoch": 0.78, + "grad_norm": 1.2413364634857271, + "learning_rate": 6.9438709641350025e-06, + "loss": 0.0632, + "step": 793 + }, + { + "epoch": 0.78, + "grad_norm": 0.5436192413434815, + "learning_rate": 6.9364959531491714e-06, + "loss": 0.0276, + "step": 794 + }, + { + "epoch": 0.79, + "grad_norm": 1.1211151078772463, + "learning_rate": 6.9291159822322e-06, + "loss": 0.0431, + "step": 795 + }, + { + "epoch": 0.79, + "grad_norm": 0.9611971763181043, + "learning_rate": 6.921731070286346e-06, + "loss": 0.0405, + "step": 796 + }, + { + "epoch": 0.79, + "grad_norm": 1.0122635098927624, + "learning_rate": 6.914341236226524e-06, + "loss": 0.0402, + "step": 797 + }, + { + "epoch": 0.79, + "grad_norm": 1.136260979127934, + "learning_rate": 6.906946498980254e-06, + "loss": 0.0461, + "step": 798 + }, + { + "epoch": 0.79, + "grad_norm": 1.5458396979143085, + "learning_rate": 6.899546877487616e-06, + "loss": 0.0456, + "step": 799 + }, + { + "epoch": 0.79, + "grad_norm": 0.913407988966103, + "learning_rate": 6.8921423907012e-06, + "loss": 0.0355, + "step": 800 + }, + { + "epoch": 0.79, + "grad_norm": 0.8881887819273989, + "learning_rate": 6.884733057586057e-06, + "loss": 0.0333, + "step": 801 + }, + { + "epoch": 0.79, + "grad_norm": 2.0538766237007566, + "learning_rate": 6.8773188971196515e-06, + "loss": 0.0637, + "step": 802 + }, + { + "epoch": 0.79, + "grad_norm": 0.8453030323277484, + "learning_rate": 6.869899928291809e-06, + "loss": 0.0356, + "step": 803 + }, + { + "epoch": 0.79, + "grad_norm": 1.4774705218128774, + "learning_rate": 6.862476170104677e-06, + "loss": 0.047, + "step": 804 + }, + { + "epoch": 0.8, + "grad_norm": 2.3921989694543493, + "learning_rate": 6.8550476415726645e-06, + "loss": 0.0651, + "step": 805 + }, + { + "epoch": 0.8, + "grad_norm": 1.1488627023685396, + "learning_rate": 6.847614361722399e-06, + "loss": 0.0478, + "step": 806 + }, + { + "epoch": 0.8, + "grad_norm": 0.6811740696636209, + "learning_rate": 6.840176349592681e-06, + "loss": 0.0296, + "step": 807 + }, + { + "epoch": 0.8, + "grad_norm": 1.4898953199819551, + "learning_rate": 6.83273362423443e-06, + "loss": 0.0609, + "step": 808 + }, + { + "epoch": 0.8, + "grad_norm": 1.3565601020815854, + "learning_rate": 6.825286204710635e-06, + "loss": 0.0597, + "step": 809 + }, + { + "epoch": 0.8, + "grad_norm": 1.1415886329244214, + "learning_rate": 6.817834110096313e-06, + "loss": 0.0368, + "step": 810 + }, + { + "epoch": 0.8, + "grad_norm": 1.77611927624329, + "learning_rate": 6.8103773594784506e-06, + "loss": 0.0449, + "step": 811 + }, + { + "epoch": 0.8, + "grad_norm": 1.4555453825233526, + "learning_rate": 6.8029159719559615e-06, + "loss": 0.0394, + "step": 812 + }, + { + "epoch": 0.8, + "grad_norm": 0.8409635536876771, + "learning_rate": 6.795449966639638e-06, + "loss": 0.0367, + "step": 813 + }, + { + "epoch": 0.8, + "grad_norm": 1.2610559094175553, + "learning_rate": 6.787979362652097e-06, + "loss": 0.0379, + "step": 814 + }, + { + "epoch": 0.81, + "grad_norm": 1.6529011944025385, + "learning_rate": 6.780504179127735e-06, + "loss": 0.0672, + "step": 815 + }, + { + "epoch": 0.81, + "grad_norm": 0.8307567736703948, + "learning_rate": 6.773024435212678e-06, + "loss": 0.024, + "step": 816 + }, + { + "epoch": 0.81, + "grad_norm": 0.9432272440319054, + "learning_rate": 6.765540150064734e-06, + "loss": 0.0345, + "step": 817 + }, + { + "epoch": 0.81, + "grad_norm": 1.7703612656478014, + "learning_rate": 6.758051342853337e-06, + "loss": 0.0462, + "step": 818 + }, + { + "epoch": 0.81, + "grad_norm": 1.314910616474518, + "learning_rate": 6.750558032759512e-06, + "loss": 0.0548, + "step": 819 + }, + { + "epoch": 0.81, + "grad_norm": 1.4226314354369014, + "learning_rate": 6.743060238975811e-06, + "loss": 0.0417, + "step": 820 + }, + { + "epoch": 0.81, + "grad_norm": 1.919616358303647, + "learning_rate": 6.735557980706268e-06, + "loss": 0.0464, + "step": 821 + }, + { + "epoch": 0.81, + "grad_norm": 1.2640004664395663, + "learning_rate": 6.728051277166358e-06, + "loss": 0.0485, + "step": 822 + }, + { + "epoch": 0.81, + "grad_norm": 1.1906980417732456, + "learning_rate": 6.720540147582941e-06, + "loss": 0.0487, + "step": 823 + }, + { + "epoch": 0.81, + "grad_norm": 1.241031549174688, + "learning_rate": 6.713024611194208e-06, + "loss": 0.0509, + "step": 824 + }, + { + "epoch": 0.82, + "grad_norm": 0.988899982913446, + "learning_rate": 6.705504687249642e-06, + "loss": 0.0452, + "step": 825 + }, + { + "epoch": 0.82, + "grad_norm": 0.969809384808336, + "learning_rate": 6.697980395009963e-06, + "loss": 0.0388, + "step": 826 + }, + { + "epoch": 0.82, + "grad_norm": 1.2267200517217354, + "learning_rate": 6.690451753747076e-06, + "loss": 0.0332, + "step": 827 + }, + { + "epoch": 0.82, + "grad_norm": 1.3868305312881741, + "learning_rate": 6.682918782744033e-06, + "loss": 0.0405, + "step": 828 + }, + { + "epoch": 0.82, + "grad_norm": 1.7644839921848126, + "learning_rate": 6.675381501294965e-06, + "loss": 0.0396, + "step": 829 + }, + { + "epoch": 0.82, + "grad_norm": 1.221670982443101, + "learning_rate": 6.667839928705051e-06, + "loss": 0.0503, + "step": 830 + }, + { + "epoch": 0.82, + "grad_norm": 1.824025036996117, + "learning_rate": 6.660294084290461e-06, + "loss": 0.0703, + "step": 831 + }, + { + "epoch": 0.82, + "grad_norm": 1.299267850755918, + "learning_rate": 6.6527439873783004e-06, + "loss": 0.0507, + "step": 832 + }, + { + "epoch": 0.82, + "grad_norm": 0.9081222896678484, + "learning_rate": 6.645189657306572e-06, + "loss": 0.0422, + "step": 833 + }, + { + "epoch": 0.82, + "grad_norm": 0.8290197497835838, + "learning_rate": 6.6376311134241216e-06, + "loss": 0.039, + "step": 834 + }, + { + "epoch": 0.83, + "grad_norm": 1.289255419574439, + "learning_rate": 6.630068375090581e-06, + "loss": 0.0462, + "step": 835 + }, + { + "epoch": 0.83, + "grad_norm": 0.9270669463180224, + "learning_rate": 6.622501461676333e-06, + "loss": 0.033, + "step": 836 + }, + { + "epoch": 0.83, + "grad_norm": 1.1795154469114397, + "learning_rate": 6.6149303925624495e-06, + "loss": 0.0391, + "step": 837 + }, + { + "epoch": 0.83, + "grad_norm": 1.250248808799025, + "learning_rate": 6.607355187140647e-06, + "loss": 0.0436, + "step": 838 + }, + { + "epoch": 0.83, + "grad_norm": 0.8718051208128388, + "learning_rate": 6.599775864813237e-06, + "loss": 0.0347, + "step": 839 + }, + { + "epoch": 0.83, + "grad_norm": 1.0027766395054658, + "learning_rate": 6.592192444993078e-06, + "loss": 0.0429, + "step": 840 + }, + { + "epoch": 0.83, + "grad_norm": 1.3182757491833073, + "learning_rate": 6.584604947103515e-06, + "loss": 0.0701, + "step": 841 + }, + { + "epoch": 0.83, + "grad_norm": 0.812740598782862, + "learning_rate": 6.5770133905783485e-06, + "loss": 0.0387, + "step": 842 + }, + { + "epoch": 0.83, + "grad_norm": 1.082109636231357, + "learning_rate": 6.569417794861768e-06, + "loss": 0.0457, + "step": 843 + }, + { + "epoch": 0.83, + "grad_norm": 0.6750257856342989, + "learning_rate": 6.56181817940831e-06, + "loss": 0.0386, + "step": 844 + }, + { + "epoch": 0.83, + "grad_norm": 2.037029132615747, + "learning_rate": 6.554214563682809e-06, + "loss": 0.0589, + "step": 845 + }, + { + "epoch": 0.84, + "grad_norm": 1.2770572481644458, + "learning_rate": 6.546606967160343e-06, + "loss": 0.0449, + "step": 846 + }, + { + "epoch": 0.84, + "grad_norm": 1.0837849954557062, + "learning_rate": 6.538995409326183e-06, + "loss": 0.0417, + "step": 847 + }, + { + "epoch": 0.84, + "grad_norm": 2.036021742726003, + "learning_rate": 6.531379909675753e-06, + "loss": 0.038, + "step": 848 + }, + { + "epoch": 0.84, + "grad_norm": 1.403469464122417, + "learning_rate": 6.5237604877145685e-06, + "loss": 0.0495, + "step": 849 + }, + { + "epoch": 0.84, + "grad_norm": 0.8107811814670984, + "learning_rate": 6.516137162958192e-06, + "loss": 0.0323, + "step": 850 + }, + { + "epoch": 0.84, + "grad_norm": 1.174091980809628, + "learning_rate": 6.5085099549321826e-06, + "loss": 0.0532, + "step": 851 + }, + { + "epoch": 0.84, + "grad_norm": 1.5788132792682261, + "learning_rate": 6.500878883172045e-06, + "loss": 0.0348, + "step": 852 + }, + { + "epoch": 0.84, + "grad_norm": 0.9128356981769381, + "learning_rate": 6.49324396722318e-06, + "loss": 0.0408, + "step": 853 + }, + { + "epoch": 0.84, + "grad_norm": 1.251124315603189, + "learning_rate": 6.4856052266408375e-06, + "loss": 0.0506, + "step": 854 + }, + { + "epoch": 0.84, + "grad_norm": 1.0456200167180727, + "learning_rate": 6.4779626809900585e-06, + "loss": 0.0267, + "step": 855 + }, + { + "epoch": 0.85, + "grad_norm": 1.6946818933509344, + "learning_rate": 6.470316349845632e-06, + "loss": 0.0451, + "step": 856 + }, + { + "epoch": 0.85, + "grad_norm": 1.807980051547909, + "learning_rate": 6.462666252792044e-06, + "loss": 0.0557, + "step": 857 + }, + { + "epoch": 0.85, + "grad_norm": 3.4075680826831167, + "learning_rate": 6.455012409423427e-06, + "loss": 0.0549, + "step": 858 + }, + { + "epoch": 0.85, + "grad_norm": 1.370907384848696, + "learning_rate": 6.447354839343504e-06, + "loss": 0.0547, + "step": 859 + }, + { + "epoch": 0.85, + "grad_norm": 1.5469420948622832, + "learning_rate": 6.439693562165546e-06, + "loss": 0.0617, + "step": 860 + }, + { + "epoch": 0.85, + "grad_norm": 1.6537681598841625, + "learning_rate": 6.432028597512322e-06, + "loss": 0.04, + "step": 861 + }, + { + "epoch": 0.85, + "grad_norm": 1.3509749104300668, + "learning_rate": 6.424359965016041e-06, + "loss": 0.0471, + "step": 862 + }, + { + "epoch": 0.85, + "grad_norm": 1.9152872083214765, + "learning_rate": 6.416687684318309e-06, + "loss": 0.0447, + "step": 863 + }, + { + "epoch": 0.85, + "grad_norm": 1.4295335018939017, + "learning_rate": 6.409011775070073e-06, + "loss": 0.0585, + "step": 864 + }, + { + "epoch": 0.85, + "grad_norm": 0.7808982276220853, + "learning_rate": 6.40133225693158e-06, + "loss": 0.0393, + "step": 865 + }, + { + "epoch": 0.86, + "grad_norm": 1.8901219228184742, + "learning_rate": 6.393649149572313e-06, + "loss": 0.0394, + "step": 866 + }, + { + "epoch": 0.86, + "grad_norm": 1.4096812791604068, + "learning_rate": 6.385962472670953e-06, + "loss": 0.0492, + "step": 867 + }, + { + "epoch": 0.86, + "grad_norm": 1.6751166911982542, + "learning_rate": 6.3782722459153246e-06, + "loss": 0.0469, + "step": 868 + }, + { + "epoch": 0.86, + "grad_norm": 1.1596847414679285, + "learning_rate": 6.370578489002339e-06, + "loss": 0.042, + "step": 869 + }, + { + "epoch": 0.86, + "grad_norm": 1.0083992591231499, + "learning_rate": 6.362881221637953e-06, + "loss": 0.0428, + "step": 870 + }, + { + "epoch": 0.86, + "grad_norm": 0.8285729394403849, + "learning_rate": 6.355180463537116e-06, + "loss": 0.0412, + "step": 871 + }, + { + "epoch": 0.86, + "grad_norm": 2.4255200540690596, + "learning_rate": 6.347476234423715e-06, + "loss": 0.0441, + "step": 872 + }, + { + "epoch": 0.86, + "grad_norm": 1.0790904056026727, + "learning_rate": 6.339768554030528e-06, + "loss": 0.0322, + "step": 873 + }, + { + "epoch": 0.86, + "grad_norm": 1.0712417336999227, + "learning_rate": 6.3320574420991735e-06, + "loss": 0.0397, + "step": 874 + }, + { + "epoch": 0.86, + "grad_norm": 2.0713829508974753, + "learning_rate": 6.324342918380061e-06, + "loss": 0.0395, + "step": 875 + }, + { + "epoch": 0.87, + "grad_norm": 0.8121714693017822, + "learning_rate": 6.316625002632333e-06, + "loss": 0.0295, + "step": 876 + }, + { + "epoch": 0.87, + "grad_norm": 1.4494665701652905, + "learning_rate": 6.3089037146238275e-06, + "loss": 0.0508, + "step": 877 + }, + { + "epoch": 0.87, + "grad_norm": 0.753263308113668, + "learning_rate": 6.301179074131014e-06, + "loss": 0.0366, + "step": 878 + }, + { + "epoch": 0.87, + "grad_norm": 1.4386877756964491, + "learning_rate": 6.29345110093895e-06, + "loss": 0.053, + "step": 879 + }, + { + "epoch": 0.87, + "grad_norm": 1.0394724656689378, + "learning_rate": 6.28571981484123e-06, + "loss": 0.0389, + "step": 880 + }, + { + "epoch": 0.87, + "grad_norm": 0.9400400857708836, + "learning_rate": 6.277985235639935e-06, + "loss": 0.0367, + "step": 881 + }, + { + "epoch": 0.87, + "grad_norm": 0.9516717348503178, + "learning_rate": 6.2702473831455755e-06, + "loss": 0.0374, + "step": 882 + }, + { + "epoch": 0.87, + "grad_norm": 0.8182276197656334, + "learning_rate": 6.262506277177055e-06, + "loss": 0.0468, + "step": 883 + }, + { + "epoch": 0.87, + "grad_norm": 1.35296221468063, + "learning_rate": 6.254761937561599e-06, + "loss": 0.0449, + "step": 884 + }, + { + "epoch": 0.87, + "grad_norm": 0.9797346235633131, + "learning_rate": 6.247014384134723e-06, + "loss": 0.0328, + "step": 885 + }, + { + "epoch": 0.88, + "grad_norm": 1.0396717421424964, + "learning_rate": 6.2392636367401725e-06, + "loss": 0.0452, + "step": 886 + }, + { + "epoch": 0.88, + "grad_norm": 1.515308122456777, + "learning_rate": 6.2315097152298705e-06, + "loss": 0.0348, + "step": 887 + }, + { + "epoch": 0.88, + "grad_norm": 1.6496972916065884, + "learning_rate": 6.223752639463876e-06, + "loss": 0.0692, + "step": 888 + }, + { + "epoch": 0.88, + "grad_norm": 0.8274307431826953, + "learning_rate": 6.2159924293103205e-06, + "loss": 0.027, + "step": 889 + }, + { + "epoch": 0.88, + "grad_norm": 0.6250106328575086, + "learning_rate": 6.208229104645366e-06, + "loss": 0.0302, + "step": 890 + }, + { + "epoch": 0.88, + "grad_norm": 0.9417114215197753, + "learning_rate": 6.2004626853531545e-06, + "loss": 0.0454, + "step": 891 + }, + { + "epoch": 0.88, + "grad_norm": 1.2897997496298013, + "learning_rate": 6.1926931913257515e-06, + "loss": 0.0305, + "step": 892 + }, + { + "epoch": 0.88, + "grad_norm": 1.1784748626118067, + "learning_rate": 6.184920642463095e-06, + "loss": 0.0327, + "step": 893 + }, + { + "epoch": 0.88, + "grad_norm": 0.8166216570833613, + "learning_rate": 6.177145058672954e-06, + "loss": 0.0315, + "step": 894 + }, + { + "epoch": 0.88, + "grad_norm": 0.798575884334304, + "learning_rate": 6.169366459870866e-06, + "loss": 0.0323, + "step": 895 + }, + { + "epoch": 0.89, + "grad_norm": 1.2946830918663645, + "learning_rate": 6.16158486598009e-06, + "loss": 0.044, + "step": 896 + }, + { + "epoch": 0.89, + "grad_norm": 4.0979367059633836, + "learning_rate": 6.153800296931561e-06, + "loss": 0.0538, + "step": 897 + }, + { + "epoch": 0.89, + "grad_norm": 1.005181417036001, + "learning_rate": 6.146012772663832e-06, + "loss": 0.04, + "step": 898 + }, + { + "epoch": 0.89, + "grad_norm": 1.2335761673269008, + "learning_rate": 6.138222313123022e-06, + "loss": 0.046, + "step": 899 + }, + { + "epoch": 0.89, + "grad_norm": 4.8252291418130575, + "learning_rate": 6.130428938262774e-06, + "loss": 0.0411, + "step": 900 + }, + { + "epoch": 0.89, + "grad_norm": 1.2349381048462125, + "learning_rate": 6.122632668044193e-06, + "loss": 0.0365, + "step": 901 + }, + { + "epoch": 0.89, + "grad_norm": 0.7469611077478491, + "learning_rate": 6.1148335224358016e-06, + "loss": 0.0312, + "step": 902 + }, + { + "epoch": 0.89, + "grad_norm": 0.6468983595440548, + "learning_rate": 6.107031521413488e-06, + "loss": 0.0289, + "step": 903 + }, + { + "epoch": 0.89, + "grad_norm": 1.2638411098954696, + "learning_rate": 6.099226684960454e-06, + "loss": 0.0537, + "step": 904 + }, + { + "epoch": 0.89, + "grad_norm": 1.6411174182139878, + "learning_rate": 6.091419033067159e-06, + "loss": 0.0473, + "step": 905 + }, + { + "epoch": 0.9, + "grad_norm": 1.1672958610885626, + "learning_rate": 6.083608585731283e-06, + "loss": 0.0348, + "step": 906 + }, + { + "epoch": 0.9, + "grad_norm": 1.3478950565393775, + "learning_rate": 6.075795362957657e-06, + "loss": 0.0422, + "step": 907 + }, + { + "epoch": 0.9, + "grad_norm": 1.1232483592786926, + "learning_rate": 6.067979384758225e-06, + "loss": 0.0441, + "step": 908 + }, + { + "epoch": 0.9, + "grad_norm": 0.7482395181364767, + "learning_rate": 6.0601606711519875e-06, + "loss": 0.0333, + "step": 909 + }, + { + "epoch": 0.9, + "grad_norm": 1.1293888728654633, + "learning_rate": 6.0523392421649515e-06, + "loss": 0.0606, + "step": 910 + }, + { + "epoch": 0.9, + "grad_norm": 0.8673764096236666, + "learning_rate": 6.044515117830078e-06, + "loss": 0.0323, + "step": 911 + }, + { + "epoch": 0.9, + "grad_norm": 1.3722075174223107, + "learning_rate": 6.036688318187234e-06, + "loss": 0.0357, + "step": 912 + }, + { + "epoch": 0.9, + "grad_norm": 1.4110866208689847, + "learning_rate": 6.028858863283135e-06, + "loss": 0.0369, + "step": 913 + }, + { + "epoch": 0.9, + "grad_norm": 1.067372279350801, + "learning_rate": 6.021026773171299e-06, + "loss": 0.0346, + "step": 914 + }, + { + "epoch": 0.9, + "grad_norm": 6.430328387623402, + "learning_rate": 6.013192067911997e-06, + "loss": 0.0516, + "step": 915 + }, + { + "epoch": 0.91, + "grad_norm": 1.0292400791593719, + "learning_rate": 6.005354767572194e-06, + "loss": 0.0401, + "step": 916 + }, + { + "epoch": 0.91, + "grad_norm": 1.1345574370447715, + "learning_rate": 5.997514892225499e-06, + "loss": 0.0345, + "step": 917 + }, + { + "epoch": 0.91, + "grad_norm": 0.9843319026076962, + "learning_rate": 5.9896724619521256e-06, + "loss": 0.0416, + "step": 918 + }, + { + "epoch": 0.91, + "grad_norm": 1.022037359513478, + "learning_rate": 5.9818274968388225e-06, + "loss": 0.0422, + "step": 919 + }, + { + "epoch": 0.91, + "grad_norm": 1.0909811445781186, + "learning_rate": 5.973980016978834e-06, + "loss": 0.0375, + "step": 920 + }, + { + "epoch": 0.91, + "grad_norm": 1.5586504018752385, + "learning_rate": 5.966130042471848e-06, + "loss": 0.0493, + "step": 921 + }, + { + "epoch": 0.91, + "grad_norm": 2.0696048332400143, + "learning_rate": 5.95827759342394e-06, + "loss": 0.0463, + "step": 922 + }, + { + "epoch": 0.91, + "grad_norm": 0.7850891902664793, + "learning_rate": 5.950422689947519e-06, + "loss": 0.0368, + "step": 923 + }, + { + "epoch": 0.91, + "grad_norm": 1.6268144662037067, + "learning_rate": 5.942565352161289e-06, + "loss": 0.0685, + "step": 924 + }, + { + "epoch": 0.91, + "grad_norm": 1.6995902180149687, + "learning_rate": 5.934705600190183e-06, + "loss": 0.0283, + "step": 925 + }, + { + "epoch": 0.92, + "grad_norm": 1.8735482858574848, + "learning_rate": 5.926843454165317e-06, + "loss": 0.0388, + "step": 926 + }, + { + "epoch": 0.92, + "grad_norm": 0.9215046522914323, + "learning_rate": 5.918978934223945e-06, + "loss": 0.0451, + "step": 927 + }, + { + "epoch": 0.92, + "grad_norm": 0.8746979507415403, + "learning_rate": 5.911112060509392e-06, + "loss": 0.0362, + "step": 928 + }, + { + "epoch": 0.92, + "grad_norm": 0.8621716940708184, + "learning_rate": 5.903242853171022e-06, + "loss": 0.0363, + "step": 929 + }, + { + "epoch": 0.92, + "grad_norm": 3.136759596361992, + "learning_rate": 5.895371332364167e-06, + "loss": 0.0479, + "step": 930 + }, + { + "epoch": 0.92, + "grad_norm": 1.2815977401133114, + "learning_rate": 5.88749751825009e-06, + "loss": 0.0435, + "step": 931 + }, + { + "epoch": 0.92, + "grad_norm": 1.0950033959553411, + "learning_rate": 5.879621430995927e-06, + "loss": 0.038, + "step": 932 + }, + { + "epoch": 0.92, + "grad_norm": 1.2293706227587744, + "learning_rate": 5.8717430907746365e-06, + "loss": 0.0497, + "step": 933 + }, + { + "epoch": 0.92, + "grad_norm": 0.9493611725216664, + "learning_rate": 5.863862517764942e-06, + "loss": 0.0407, + "step": 934 + }, + { + "epoch": 0.92, + "grad_norm": 1.6413160972295684, + "learning_rate": 5.8559797321512946e-06, + "loss": 0.0482, + "step": 935 + }, + { + "epoch": 0.92, + "grad_norm": 0.7828006455912568, + "learning_rate": 5.848094754123808e-06, + "loss": 0.0265, + "step": 936 + }, + { + "epoch": 0.93, + "grad_norm": 1.0709946055942174, + "learning_rate": 5.8402076038782065e-06, + "loss": 0.0466, + "step": 937 + }, + { + "epoch": 0.93, + "grad_norm": 1.1460909174115927, + "learning_rate": 5.832318301615789e-06, + "loss": 0.0456, + "step": 938 + }, + { + "epoch": 0.93, + "grad_norm": 1.3080987894141531, + "learning_rate": 5.824426867543358e-06, + "loss": 0.0563, + "step": 939 + }, + { + "epoch": 0.93, + "grad_norm": 0.8941656475776596, + "learning_rate": 5.816533321873178e-06, + "loss": 0.0394, + "step": 940 + }, + { + "epoch": 0.93, + "grad_norm": 0.39318900492243813, + "learning_rate": 5.808637684822924e-06, + "loss": 0.0259, + "step": 941 + }, + { + "epoch": 0.93, + "grad_norm": 1.34674884621565, + "learning_rate": 5.800739976615626e-06, + "loss": 0.0431, + "step": 942 + }, + { + "epoch": 0.93, + "grad_norm": 0.7805739408684246, + "learning_rate": 5.792840217479616e-06, + "loss": 0.0341, + "step": 943 + }, + { + "epoch": 0.93, + "grad_norm": 0.7497753410737589, + "learning_rate": 5.784938427648488e-06, + "loss": 0.031, + "step": 944 + }, + { + "epoch": 0.93, + "grad_norm": 1.0072527798290345, + "learning_rate": 5.777034627361025e-06, + "loss": 0.038, + "step": 945 + }, + { + "epoch": 0.93, + "grad_norm": 0.8811001342598082, + "learning_rate": 5.769128836861169e-06, + "loss": 0.0463, + "step": 946 + }, + { + "epoch": 0.94, + "grad_norm": 0.6974121452637448, + "learning_rate": 5.761221076397957e-06, + "loss": 0.0255, + "step": 947 + }, + { + "epoch": 0.94, + "grad_norm": 6.476097592972458, + "learning_rate": 5.753311366225467e-06, + "loss": 0.0501, + "step": 948 + }, + { + "epoch": 0.94, + "grad_norm": 0.8008420028609479, + "learning_rate": 5.7453997266027784e-06, + "loss": 0.0331, + "step": 949 + }, + { + "epoch": 0.94, + "grad_norm": 0.5582987049019141, + "learning_rate": 5.737486177793907e-06, + "loss": 0.0239, + "step": 950 + }, + { + "epoch": 0.94, + "grad_norm": 1.2081504866571806, + "learning_rate": 5.729570740067758e-06, + "loss": 0.053, + "step": 951 + }, + { + "epoch": 0.94, + "grad_norm": 0.6927334913056521, + "learning_rate": 5.721653433698082e-06, + "loss": 0.0322, + "step": 952 + }, + { + "epoch": 0.94, + "grad_norm": 1.0192490892327637, + "learning_rate": 5.713734278963407e-06, + "loss": 0.0478, + "step": 953 + }, + { + "epoch": 0.94, + "grad_norm": 2.4021546573898167, + "learning_rate": 5.705813296146998e-06, + "loss": 0.0281, + "step": 954 + }, + { + "epoch": 0.94, + "grad_norm": 1.283203718705625, + "learning_rate": 5.697890505536805e-06, + "loss": 0.0494, + "step": 955 + }, + { + "epoch": 0.94, + "grad_norm": 0.8723678181245383, + "learning_rate": 5.689965927425407e-06, + "loss": 0.039, + "step": 956 + }, + { + "epoch": 0.95, + "grad_norm": 0.9154126878492291, + "learning_rate": 5.682039582109959e-06, + "loss": 0.039, + "step": 957 + }, + { + "epoch": 0.95, + "grad_norm": 1.0000734940623763, + "learning_rate": 5.674111489892144e-06, + "loss": 0.0477, + "step": 958 + }, + { + "epoch": 0.95, + "grad_norm": 0.797034006036536, + "learning_rate": 5.666181671078123e-06, + "loss": 0.043, + "step": 959 + }, + { + "epoch": 0.95, + "grad_norm": 1.0236055666082888, + "learning_rate": 5.658250145978469e-06, + "loss": 0.0273, + "step": 960 + }, + { + "epoch": 0.95, + "grad_norm": 0.9664406384627875, + "learning_rate": 5.650316934908138e-06, + "loss": 0.0336, + "step": 961 + }, + { + "epoch": 0.95, + "grad_norm": 1.487854141369925, + "learning_rate": 5.642382058186394e-06, + "loss": 0.0265, + "step": 962 + }, + { + "epoch": 0.95, + "grad_norm": 0.9545374665607708, + "learning_rate": 5.634445536136774e-06, + "loss": 0.0446, + "step": 963 + }, + { + "epoch": 0.95, + "grad_norm": 0.8919658607697852, + "learning_rate": 5.626507389087026e-06, + "loss": 0.0366, + "step": 964 + }, + { + "epoch": 0.95, + "grad_norm": 0.7391178186132781, + "learning_rate": 5.61856763736906e-06, + "loss": 0.0311, + "step": 965 + }, + { + "epoch": 0.95, + "grad_norm": 1.9268738865398216, + "learning_rate": 5.610626301318897e-06, + "loss": 0.0498, + "step": 966 + }, + { + "epoch": 0.96, + "grad_norm": 0.9120637509821803, + "learning_rate": 5.6026834012766155e-06, + "loss": 0.0359, + "step": 967 + }, + { + "epoch": 0.96, + "grad_norm": 1.0265774489726658, + "learning_rate": 5.594738957586302e-06, + "loss": 0.025, + "step": 968 + }, + { + "epoch": 0.96, + "grad_norm": 0.5550363052752717, + "learning_rate": 5.586792990595992e-06, + "loss": 0.0299, + "step": 969 + }, + { + "epoch": 0.96, + "grad_norm": 0.6178986655486775, + "learning_rate": 5.578845520657625e-06, + "loss": 0.0249, + "step": 970 + }, + { + "epoch": 0.96, + "grad_norm": 0.6321963136394885, + "learning_rate": 5.570896568126994e-06, + "loss": 0.027, + "step": 971 + }, + { + "epoch": 0.96, + "grad_norm": 1.8370481748494285, + "learning_rate": 5.562946153363681e-06, + "loss": 0.0403, + "step": 972 + }, + { + "epoch": 0.96, + "grad_norm": 1.0627524764235494, + "learning_rate": 5.5549942967310214e-06, + "loss": 0.0336, + "step": 973 + }, + { + "epoch": 0.96, + "grad_norm": 0.3932288711882305, + "learning_rate": 5.547041018596039e-06, + "loss": 0.0198, + "step": 974 + }, + { + "epoch": 0.96, + "grad_norm": 0.8964752757758289, + "learning_rate": 5.539086339329398e-06, + "loss": 0.0444, + "step": 975 + }, + { + "epoch": 0.96, + "grad_norm": 0.49690998582614554, + "learning_rate": 5.531130279305357e-06, + "loss": 0.0273, + "step": 976 + }, + { + "epoch": 0.97, + "grad_norm": 0.9808838263146912, + "learning_rate": 5.523172858901703e-06, + "loss": 0.0477, + "step": 977 + }, + { + "epoch": 0.97, + "grad_norm": 0.8666016629499839, + "learning_rate": 5.515214098499712e-06, + "loss": 0.0361, + "step": 978 + }, + { + "epoch": 0.97, + "grad_norm": 1.028073378457674, + "learning_rate": 5.507254018484094e-06, + "loss": 0.0392, + "step": 979 + }, + { + "epoch": 0.97, + "grad_norm": 1.2707931952783673, + "learning_rate": 5.499292639242933e-06, + "loss": 0.0294, + "step": 980 + }, + { + "epoch": 0.97, + "grad_norm": 1.0376209731413644, + "learning_rate": 5.491329981167647e-06, + "loss": 0.0436, + "step": 981 + }, + { + "epoch": 0.97, + "grad_norm": 1.8080034651931363, + "learning_rate": 5.483366064652925e-06, + "loss": 0.035, + "step": 982 + }, + { + "epoch": 0.97, + "grad_norm": 1.5472502566509414, + "learning_rate": 5.475400910096682e-06, + "loss": 0.0371, + "step": 983 + }, + { + "epoch": 0.97, + "grad_norm": 0.7416379930088657, + "learning_rate": 5.4674345379e-06, + "loss": 0.0339, + "step": 984 + }, + { + "epoch": 0.97, + "grad_norm": 1.3090824691105016, + "learning_rate": 5.4594669684670874e-06, + "loss": 0.0523, + "step": 985 + }, + { + "epoch": 0.97, + "grad_norm": 1.3517769265087425, + "learning_rate": 5.451498222205211e-06, + "loss": 0.0509, + "step": 986 + }, + { + "epoch": 0.98, + "grad_norm": 0.5490271439552697, + "learning_rate": 5.443528319524653e-06, + "loss": 0.0248, + "step": 987 + }, + { + "epoch": 0.98, + "grad_norm": 0.8278875192150178, + "learning_rate": 5.435557280838663e-06, + "loss": 0.0359, + "step": 988 + }, + { + "epoch": 0.98, + "grad_norm": 0.4484530755745902, + "learning_rate": 5.427585126563395e-06, + "loss": 0.0236, + "step": 989 + }, + { + "epoch": 0.98, + "grad_norm": 2.370692615590824, + "learning_rate": 5.419611877117864e-06, + "loss": 0.0392, + "step": 990 + }, + { + "epoch": 0.98, + "grad_norm": 1.0708257474470337, + "learning_rate": 5.411637552923887e-06, + "loss": 0.0496, + "step": 991 + }, + { + "epoch": 0.98, + "grad_norm": 0.5272878667231355, + "learning_rate": 5.403662174406033e-06, + "loss": 0.027, + "step": 992 + }, + { + "epoch": 0.98, + "grad_norm": 0.799116176048434, + "learning_rate": 5.395685761991576e-06, + "loss": 0.0306, + "step": 993 + }, + { + "epoch": 0.98, + "grad_norm": 1.5795420159041744, + "learning_rate": 5.387708336110434e-06, + "loss": 0.0514, + "step": 994 + }, + { + "epoch": 0.98, + "grad_norm": 1.3530404342180486, + "learning_rate": 5.379729917195124e-06, + "loss": 0.0495, + "step": 995 + }, + { + "epoch": 0.98, + "grad_norm": 1.1542699573921147, + "learning_rate": 5.371750525680703e-06, + "loss": 0.0414, + "step": 996 + }, + { + "epoch": 0.99, + "grad_norm": 0.6465003437919752, + "learning_rate": 5.36377018200472e-06, + "loss": 0.0344, + "step": 997 + }, + { + "epoch": 0.99, + "grad_norm": 1.0695478253107236, + "learning_rate": 5.355788906607164e-06, + "loss": 0.024, + "step": 998 + }, + { + "epoch": 0.99, + "grad_norm": 0.819386420050746, + "learning_rate": 5.347806719930414e-06, + "loss": 0.0206, + "step": 999 + }, + { + "epoch": 0.99, + "grad_norm": 1.6586229658287968, + "learning_rate": 5.339823642419174e-06, + "loss": 0.0542, + "step": 1000 + }, + { + "epoch": 0.99, + "grad_norm": 0.8943229287072868, + "learning_rate": 5.331839694520436e-06, + "loss": 0.0306, + "step": 1001 + }, + { + "epoch": 0.99, + "grad_norm": 1.0061767635442918, + "learning_rate": 5.323854896683422e-06, + "loss": 0.0305, + "step": 1002 + }, + { + "epoch": 0.99, + "grad_norm": 1.3117131384625127, + "learning_rate": 5.315869269359527e-06, + "loss": 0.0483, + "step": 1003 + }, + { + "epoch": 0.99, + "grad_norm": 1.483692159272879, + "learning_rate": 5.307882833002271e-06, + "loss": 0.0306, + "step": 1004 + }, + { + "epoch": 0.99, + "grad_norm": 2.4928013172540946, + "learning_rate": 5.299895608067253e-06, + "loss": 0.0272, + "step": 1005 + }, + { + "epoch": 0.99, + "grad_norm": 0.9350981952793932, + "learning_rate": 5.291907615012081e-06, + "loss": 0.0257, + "step": 1006 + }, + { + "epoch": 1.0, + "grad_norm": 1.1880304932342811, + "learning_rate": 5.283918874296338e-06, + "loss": 0.0478, + "step": 1007 + }, + { + "epoch": 1.0, + "grad_norm": 0.567658791017144, + "learning_rate": 5.2759294063815205e-06, + "loss": 0.0207, + "step": 1008 + }, + { + "epoch": 1.0, + "grad_norm": 0.5878617256713423, + "learning_rate": 5.267939231730985e-06, + "loss": 0.0259, + "step": 1009 + }, + { + "epoch": 1.0, + "grad_norm": 0.6296036095600541, + "learning_rate": 5.259948370809902e-06, + "loss": 0.03, + "step": 1010 + }, + { + "epoch": 1.0, + "grad_norm": 1.086582723137742, + "learning_rate": 5.251956844085198e-06, + "loss": 0.0315, + "step": 1011 + }, + { + "epoch": 1.0, + "grad_norm": 0.7493884416834921, + "learning_rate": 5.243964672025502e-06, + "loss": 0.027, + "step": 1012 + }, + { + "epoch": 1.0, + "grad_norm": 0.5755763721876781, + "learning_rate": 5.235971875101101e-06, + "loss": 0.024, + "step": 1013 + }, + { + "epoch": 1.0, + "grad_norm": 1.0336876559829822, + "learning_rate": 5.22797847378388e-06, + "loss": 0.0324, + "step": 1014 + }, + { + "epoch": 1.0, + "grad_norm": 1.7680003907100028, + "learning_rate": 5.219984488547269e-06, + "loss": 0.0341, + "step": 1015 + }, + { + "epoch": 1.0, + "grad_norm": 0.8226456680704546, + "learning_rate": 5.2119899398662e-06, + "loss": 0.0324, + "step": 1016 + }, + { + "epoch": 1.0, + "grad_norm": 0.6982266202238494, + "learning_rate": 5.203994848217043e-06, + "loss": 0.0286, + "step": 1017 + }, + { + "epoch": 1.01, + "grad_norm": 1.7139453038989596, + "learning_rate": 5.195999234077561e-06, + "loss": 0.0263, + "step": 1018 + }, + { + "epoch": 1.01, + "grad_norm": 0.7446444911541764, + "learning_rate": 5.188003117926854e-06, + "loss": 0.0177, + "step": 1019 + }, + { + "epoch": 1.01, + "grad_norm": 1.3514532648365165, + "learning_rate": 5.1800065202453095e-06, + "loss": 0.0361, + "step": 1020 + }, + { + "epoch": 1.01, + "grad_norm": 0.9721865980606375, + "learning_rate": 5.1720094615145455e-06, + "loss": 0.0389, + "step": 1021 + }, + { + "epoch": 1.01, + "grad_norm": 0.3703407080177544, + "learning_rate": 5.164011962217366e-06, + "loss": 0.019, + "step": 1022 + }, + { + "epoch": 1.01, + "grad_norm": 0.9599534838359112, + "learning_rate": 5.156014042837696e-06, + "loss": 0.0285, + "step": 1023 + }, + { + "epoch": 1.01, + "grad_norm": 0.6398162968662446, + "learning_rate": 5.148015723860543e-06, + "loss": 0.0251, + "step": 1024 + }, + { + "epoch": 1.01, + "grad_norm": 0.6225810158973047, + "learning_rate": 5.140017025771936e-06, + "loss": 0.0332, + "step": 1025 + }, + { + "epoch": 1.01, + "grad_norm": 0.7141483456674433, + "learning_rate": 5.1320179690588735e-06, + "loss": 0.0286, + "step": 1026 + }, + { + "epoch": 1.01, + "grad_norm": 0.9907387551938683, + "learning_rate": 5.124018574209272e-06, + "loss": 0.0432, + "step": 1027 + }, + { + "epoch": 1.02, + "grad_norm": 0.7904414734004622, + "learning_rate": 5.116018861711919e-06, + "loss": 0.0287, + "step": 1028 + }, + { + "epoch": 1.02, + "grad_norm": 1.3076855298975811, + "learning_rate": 5.108018852056411e-06, + "loss": 0.0428, + "step": 1029 + }, + { + "epoch": 1.02, + "grad_norm": 0.47849407068874555, + "learning_rate": 5.100018565733106e-06, + "loss": 0.0245, + "step": 1030 + }, + { + "epoch": 1.02, + "grad_norm": 1.2447035809415234, + "learning_rate": 5.092018023233072e-06, + "loss": 0.0332, + "step": 1031 + }, + { + "epoch": 1.02, + "grad_norm": 0.8381997558957843, + "learning_rate": 5.084017245048034e-06, + "loss": 0.0321, + "step": 1032 + }, + { + "epoch": 1.02, + "grad_norm": 0.7945475267321247, + "learning_rate": 5.0760162516703156e-06, + "loss": 0.0382, + "step": 1033 + }, + { + "epoch": 1.02, + "grad_norm": 0.8671733382239873, + "learning_rate": 5.068015063592799e-06, + "loss": 0.0372, + "step": 1034 + }, + { + "epoch": 1.02, + "grad_norm": 0.8604675588518298, + "learning_rate": 5.06001370130886e-06, + "loss": 0.0178, + "step": 1035 + }, + { + "epoch": 1.02, + "grad_norm": 1.0987612417558899, + "learning_rate": 5.052012185312322e-06, + "loss": 0.0402, + "step": 1036 + }, + { + "epoch": 1.02, + "grad_norm": 1.107296934615035, + "learning_rate": 5.044010536097402e-06, + "loss": 0.03, + "step": 1037 + }, + { + "epoch": 1.03, + "grad_norm": 0.8614135705236582, + "learning_rate": 5.036008774158658e-06, + "loss": 0.034, + "step": 1038 + }, + { + "epoch": 1.03, + "grad_norm": 0.7233307057992016, + "learning_rate": 5.028006919990936e-06, + "loss": 0.027, + "step": 1039 + }, + { + "epoch": 1.03, + "grad_norm": 0.6366263075262523, + "learning_rate": 5.0200049940893225e-06, + "loss": 0.03, + "step": 1040 + }, + { + "epoch": 1.03, + "grad_norm": 0.5525916393083419, + "learning_rate": 5.012003016949082e-06, + "loss": 0.0271, + "step": 1041 + }, + { + "epoch": 1.03, + "grad_norm": 0.8220356610097129, + "learning_rate": 5.004001009065611e-06, + "loss": 0.031, + "step": 1042 + }, + { + "epoch": 1.03, + "grad_norm": 0.8725303219826496, + "learning_rate": 4.99599899093439e-06, + "loss": 0.0317, + "step": 1043 + }, + { + "epoch": 1.03, + "grad_norm": 0.969863129435251, + "learning_rate": 4.98799698305092e-06, + "loss": 0.0257, + "step": 1044 + }, + { + "epoch": 1.03, + "grad_norm": 0.7438173977745536, + "learning_rate": 4.979995005910679e-06, + "loss": 0.0189, + "step": 1045 + }, + { + "epoch": 1.03, + "grad_norm": 1.3051006180712026, + "learning_rate": 4.971993080009065e-06, + "loss": 0.0304, + "step": 1046 + }, + { + "epoch": 1.03, + "grad_norm": 0.5562679937676483, + "learning_rate": 4.9639912258413435e-06, + "loss": 0.0237, + "step": 1047 + }, + { + "epoch": 1.04, + "grad_norm": 0.8810611430495988, + "learning_rate": 4.955989463902599e-06, + "loss": 0.0297, + "step": 1048 + }, + { + "epoch": 1.04, + "grad_norm": 1.7041819608996103, + "learning_rate": 4.94798781468768e-06, + "loss": 0.0306, + "step": 1049 + }, + { + "epoch": 1.04, + "grad_norm": 0.850027647311031, + "learning_rate": 4.939986298691141e-06, + "loss": 0.0218, + "step": 1050 + }, + { + "epoch": 1.04, + "grad_norm": 2.8282525428235044, + "learning_rate": 4.931984936407202e-06, + "loss": 0.0393, + "step": 1051 + }, + { + "epoch": 1.04, + "grad_norm": 2.018508686889688, + "learning_rate": 4.923983748329685e-06, + "loss": 0.0312, + "step": 1052 + }, + { + "epoch": 1.04, + "grad_norm": 2.6184868153222927, + "learning_rate": 4.9159827549519676e-06, + "loss": 0.0345, + "step": 1053 + }, + { + "epoch": 1.04, + "grad_norm": 1.964887841469877, + "learning_rate": 4.907981976766928e-06, + "loss": 0.0469, + "step": 1054 + }, + { + "epoch": 1.04, + "grad_norm": 0.7869754204093123, + "learning_rate": 4.899981434266895e-06, + "loss": 0.0314, + "step": 1055 + }, + { + "epoch": 1.04, + "grad_norm": 0.514845941566159, + "learning_rate": 4.891981147943589e-06, + "loss": 0.0255, + "step": 1056 + }, + { + "epoch": 1.04, + "grad_norm": 0.8788715161807736, + "learning_rate": 4.883981138288081e-06, + "loss": 0.0244, + "step": 1057 + }, + { + "epoch": 1.05, + "grad_norm": 1.046570067083794, + "learning_rate": 4.8759814257907275e-06, + "loss": 0.0316, + "step": 1058 + }, + { + "epoch": 1.05, + "grad_norm": 1.0154582399674494, + "learning_rate": 4.867982030941127e-06, + "loss": 0.0345, + "step": 1059 + }, + { + "epoch": 1.05, + "grad_norm": 1.3103756035719953, + "learning_rate": 4.859982974228065e-06, + "loss": 0.0248, + "step": 1060 + }, + { + "epoch": 1.05, + "grad_norm": 2.23368046234692, + "learning_rate": 4.851984276139458e-06, + "loss": 0.0395, + "step": 1061 + }, + { + "epoch": 1.05, + "grad_norm": 2.1098419076483146, + "learning_rate": 4.843985957162304e-06, + "loss": 0.0276, + "step": 1062 + }, + { + "epoch": 1.05, + "grad_norm": 1.942607118035288, + "learning_rate": 4.835988037782635e-06, + "loss": 0.0533, + "step": 1063 + }, + { + "epoch": 1.05, + "grad_norm": 0.6421131635296107, + "learning_rate": 4.827990538485456e-06, + "loss": 0.0239, + "step": 1064 + }, + { + "epoch": 1.05, + "grad_norm": 1.3053339694382833, + "learning_rate": 4.819993479754693e-06, + "loss": 0.0353, + "step": 1065 + }, + { + "epoch": 1.05, + "grad_norm": 0.9986906730806447, + "learning_rate": 4.811996882073148e-06, + "loss": 0.0265, + "step": 1066 + }, + { + "epoch": 1.05, + "grad_norm": 0.8181143387578583, + "learning_rate": 4.804000765922441e-06, + "loss": 0.0216, + "step": 1067 + }, + { + "epoch": 1.06, + "grad_norm": 1.502546600151344, + "learning_rate": 4.796005151782958e-06, + "loss": 0.0445, + "step": 1068 + }, + { + "epoch": 1.06, + "grad_norm": 0.7044947654375584, + "learning_rate": 4.788010060133802e-06, + "loss": 0.0251, + "step": 1069 + }, + { + "epoch": 1.06, + "grad_norm": 0.6013429646716092, + "learning_rate": 4.780015511452732e-06, + "loss": 0.0256, + "step": 1070 + }, + { + "epoch": 1.06, + "grad_norm": 1.5427439673220715, + "learning_rate": 4.772021526216123e-06, + "loss": 0.0263, + "step": 1071 + }, + { + "epoch": 1.06, + "grad_norm": 0.769644548462023, + "learning_rate": 4.764028124898901e-06, + "loss": 0.0335, + "step": 1072 + }, + { + "epoch": 1.06, + "grad_norm": 0.7447494663693941, + "learning_rate": 4.756035327974499e-06, + "loss": 0.0223, + "step": 1073 + }, + { + "epoch": 1.06, + "grad_norm": 0.7204796166780826, + "learning_rate": 4.748043155914804e-06, + "loss": 0.03, + "step": 1074 + }, + { + "epoch": 1.06, + "grad_norm": 1.0177867742746178, + "learning_rate": 4.740051629190099e-06, + "loss": 0.0341, + "step": 1075 + }, + { + "epoch": 1.06, + "grad_norm": 0.7421615617814211, + "learning_rate": 4.732060768269016e-06, + "loss": 0.0273, + "step": 1076 + }, + { + "epoch": 1.06, + "grad_norm": 3.8620381131051364, + "learning_rate": 4.724070593618482e-06, + "loss": 0.0263, + "step": 1077 + }, + { + "epoch": 1.07, + "grad_norm": 1.20839082094382, + "learning_rate": 4.716081125703665e-06, + "loss": 0.0272, + "step": 1078 + }, + { + "epoch": 1.07, + "grad_norm": 1.2494484460524444, + "learning_rate": 4.708092384987921e-06, + "loss": 0.0414, + "step": 1079 + }, + { + "epoch": 1.07, + "grad_norm": 0.9688768305749882, + "learning_rate": 4.70010439193275e-06, + "loss": 0.0307, + "step": 1080 + }, + { + "epoch": 1.07, + "grad_norm": 0.9176018744878512, + "learning_rate": 4.6921171669977304e-06, + "loss": 0.0333, + "step": 1081 + }, + { + "epoch": 1.07, + "grad_norm": 1.4328823254667193, + "learning_rate": 4.684130730640475e-06, + "loss": 0.037, + "step": 1082 + }, + { + "epoch": 1.07, + "grad_norm": 1.0825269206875192, + "learning_rate": 4.676145103316579e-06, + "loss": 0.0403, + "step": 1083 + }, + { + "epoch": 1.07, + "grad_norm": 1.159048100788253, + "learning_rate": 4.6681603054795654e-06, + "loss": 0.0327, + "step": 1084 + }, + { + "epoch": 1.07, + "grad_norm": 0.8399124550508421, + "learning_rate": 4.660176357580827e-06, + "loss": 0.0295, + "step": 1085 + }, + { + "epoch": 1.07, + "grad_norm": 0.42909450654776243, + "learning_rate": 4.652193280069588e-06, + "loss": 0.0223, + "step": 1086 + }, + { + "epoch": 1.07, + "grad_norm": 0.5824237081174857, + "learning_rate": 4.644211093392837e-06, + "loss": 0.0277, + "step": 1087 + }, + { + "epoch": 1.08, + "grad_norm": 0.7172546484982484, + "learning_rate": 4.636229817995281e-06, + "loss": 0.027, + "step": 1088 + }, + { + "epoch": 1.08, + "grad_norm": 0.4487630330036912, + "learning_rate": 4.6282494743193e-06, + "loss": 0.0156, + "step": 1089 + }, + { + "epoch": 1.08, + "grad_norm": 0.9578673232337767, + "learning_rate": 4.620270082804879e-06, + "loss": 0.035, + "step": 1090 + }, + { + "epoch": 1.08, + "grad_norm": 0.8541729174877927, + "learning_rate": 4.612291663889567e-06, + "loss": 0.0333, + "step": 1091 + }, + { + "epoch": 1.08, + "grad_norm": 1.413518111201729, + "learning_rate": 4.604314238008426e-06, + "loss": 0.0293, + "step": 1092 + }, + { + "epoch": 1.08, + "grad_norm": 1.013397355900455, + "learning_rate": 4.596337825593969e-06, + "loss": 0.0198, + "step": 1093 + }, + { + "epoch": 1.08, + "grad_norm": 0.8488149353686695, + "learning_rate": 4.588362447076115e-06, + "loss": 0.0308, + "step": 1094 + }, + { + "epoch": 1.08, + "grad_norm": 0.808337943132434, + "learning_rate": 4.5803881228821375e-06, + "loss": 0.0277, + "step": 1095 + }, + { + "epoch": 1.08, + "grad_norm": 0.9732316718127773, + "learning_rate": 4.572414873436606e-06, + "loss": 0.0268, + "step": 1096 + }, + { + "epoch": 1.08, + "grad_norm": 0.6459252848175363, + "learning_rate": 4.564442719161338e-06, + "loss": 0.0279, + "step": 1097 + }, + { + "epoch": 1.08, + "grad_norm": 0.6935117069628923, + "learning_rate": 4.556471680475348e-06, + "loss": 0.0239, + "step": 1098 + }, + { + "epoch": 1.09, + "grad_norm": 0.607783773980419, + "learning_rate": 4.548501777794792e-06, + "loss": 0.0186, + "step": 1099 + }, + { + "epoch": 1.09, + "grad_norm": 1.3877976209094554, + "learning_rate": 4.540533031532913e-06, + "loss": 0.0331, + "step": 1100 + }, + { + "epoch": 1.09, + "grad_norm": 1.6712739416461726, + "learning_rate": 4.532565462099999e-06, + "loss": 0.0432, + "step": 1101 + }, + { + "epoch": 1.09, + "grad_norm": 0.7879664908382655, + "learning_rate": 4.524599089903319e-06, + "loss": 0.028, + "step": 1102 + }, + { + "epoch": 1.09, + "grad_norm": 0.7875123164738963, + "learning_rate": 4.516633935347075e-06, + "loss": 0.0301, + "step": 1103 + }, + { + "epoch": 1.09, + "grad_norm": 0.9561176864842041, + "learning_rate": 4.508670018832353e-06, + "loss": 0.0316, + "step": 1104 + }, + { + "epoch": 1.09, + "grad_norm": 0.5519096705716723, + "learning_rate": 4.5007073607570674e-06, + "loss": 0.024, + "step": 1105 + }, + { + "epoch": 1.09, + "grad_norm": 0.958646382749322, + "learning_rate": 4.492745981515907e-06, + "loss": 0.036, + "step": 1106 + }, + { + "epoch": 1.09, + "grad_norm": 0.924537960903203, + "learning_rate": 4.48478590150029e-06, + "loss": 0.0306, + "step": 1107 + }, + { + "epoch": 1.09, + "grad_norm": 0.852576984550701, + "learning_rate": 4.4768271410983e-06, + "loss": 0.0354, + "step": 1108 + }, + { + "epoch": 1.1, + "grad_norm": 0.8059026291970833, + "learning_rate": 4.468869720694647e-06, + "loss": 0.022, + "step": 1109 + }, + { + "epoch": 1.1, + "grad_norm": 0.8622128063886197, + "learning_rate": 4.460913660670604e-06, + "loss": 0.0302, + "step": 1110 + }, + { + "epoch": 1.1, + "grad_norm": 1.0005211058131525, + "learning_rate": 4.452958981403963e-06, + "loss": 0.0244, + "step": 1111 + }, + { + "epoch": 1.1, + "grad_norm": 0.49953654433337896, + "learning_rate": 4.445005703268981e-06, + "loss": 0.0246, + "step": 1112 + }, + { + "epoch": 1.1, + "grad_norm": 2.5211757187596917, + "learning_rate": 4.4370538466363216e-06, + "loss": 0.0282, + "step": 1113 + }, + { + "epoch": 1.1, + "grad_norm": 0.7049283173553994, + "learning_rate": 4.429103431873009e-06, + "loss": 0.0275, + "step": 1114 + }, + { + "epoch": 1.1, + "grad_norm": 0.8760836408069194, + "learning_rate": 4.421154479342377e-06, + "loss": 0.0258, + "step": 1115 + }, + { + "epoch": 1.1, + "grad_norm": 0.5497290975156301, + "learning_rate": 4.413207009404012e-06, + "loss": 0.0234, + "step": 1116 + }, + { + "epoch": 1.1, + "grad_norm": 0.43160638727821743, + "learning_rate": 4.4052610424137e-06, + "loss": 0.0204, + "step": 1117 + }, + { + "epoch": 1.1, + "grad_norm": 0.5232450640935504, + "learning_rate": 4.397316598723385e-06, + "loss": 0.0224, + "step": 1118 + }, + { + "epoch": 1.11, + "grad_norm": 0.47825044777409836, + "learning_rate": 4.389373698681105e-06, + "loss": 0.0206, + "step": 1119 + }, + { + "epoch": 1.11, + "grad_norm": 0.991085904207292, + "learning_rate": 4.381432362630942e-06, + "loss": 0.0321, + "step": 1120 + }, + { + "epoch": 1.11, + "grad_norm": 0.987193721368169, + "learning_rate": 4.373492610912976e-06, + "loss": 0.0312, + "step": 1121 + }, + { + "epoch": 1.11, + "grad_norm": 0.6093671176495833, + "learning_rate": 4.365554463863228e-06, + "loss": 0.0219, + "step": 1122 + }, + { + "epoch": 1.11, + "grad_norm": 0.496517302197376, + "learning_rate": 4.3576179418136075e-06, + "loss": 0.0203, + "step": 1123 + }, + { + "epoch": 1.11, + "grad_norm": 0.6130892595308474, + "learning_rate": 4.349683065091864e-06, + "loss": 0.0306, + "step": 1124 + }, + { + "epoch": 1.11, + "grad_norm": 0.7776082447658934, + "learning_rate": 4.3417498540215325e-06, + "loss": 0.0398, + "step": 1125 + }, + { + "epoch": 1.11, + "grad_norm": 1.3171830830330598, + "learning_rate": 4.33381832892188e-06, + "loss": 0.038, + "step": 1126 + }, + { + "epoch": 1.11, + "grad_norm": 0.6685509894900467, + "learning_rate": 4.3258885101078565e-06, + "loss": 0.0302, + "step": 1127 + }, + { + "epoch": 1.11, + "grad_norm": 1.1144124292702757, + "learning_rate": 4.317960417890043e-06, + "loss": 0.0275, + "step": 1128 + }, + { + "epoch": 1.12, + "grad_norm": 1.501368764729424, + "learning_rate": 4.3100340725745934e-06, + "loss": 0.0374, + "step": 1129 + }, + { + "epoch": 1.12, + "grad_norm": 1.2663880435731152, + "learning_rate": 4.3021094944631955e-06, + "loss": 0.0341, + "step": 1130 + }, + { + "epoch": 1.12, + "grad_norm": 0.9252347942434317, + "learning_rate": 4.294186703853004e-06, + "loss": 0.0277, + "step": 1131 + }, + { + "epoch": 1.12, + "grad_norm": 0.6717103063934182, + "learning_rate": 4.286265721036595e-06, + "loss": 0.027, + "step": 1132 + }, + { + "epoch": 1.12, + "grad_norm": 1.1171647120173334, + "learning_rate": 4.27834656630192e-06, + "loss": 0.0428, + "step": 1133 + }, + { + "epoch": 1.12, + "grad_norm": 0.7077621527686202, + "learning_rate": 4.270429259932243e-06, + "loss": 0.0268, + "step": 1134 + }, + { + "epoch": 1.12, + "grad_norm": 1.1261178104190204, + "learning_rate": 4.262513822206095e-06, + "loss": 0.0346, + "step": 1135 + }, + { + "epoch": 1.12, + "grad_norm": 0.5321911431043505, + "learning_rate": 4.254600273397223e-06, + "loss": 0.0205, + "step": 1136 + }, + { + "epoch": 1.12, + "grad_norm": 0.6395519526652998, + "learning_rate": 4.246688633774534e-06, + "loss": 0.0236, + "step": 1137 + }, + { + "epoch": 1.12, + "grad_norm": 0.5594885034804874, + "learning_rate": 4.238778923602045e-06, + "loss": 0.014, + "step": 1138 + }, + { + "epoch": 1.13, + "grad_norm": 0.8895582256492076, + "learning_rate": 4.230871163138831e-06, + "loss": 0.0257, + "step": 1139 + }, + { + "epoch": 1.13, + "grad_norm": 0.8120185835677661, + "learning_rate": 4.2229653726389765e-06, + "loss": 0.0261, + "step": 1140 + }, + { + "epoch": 1.13, + "grad_norm": 0.7699512160395745, + "learning_rate": 4.215061572351513e-06, + "loss": 0.0371, + "step": 1141 + }, + { + "epoch": 1.13, + "grad_norm": 0.7904391026632073, + "learning_rate": 4.207159782520383e-06, + "loss": 0.0143, + "step": 1142 + }, + { + "epoch": 1.13, + "grad_norm": 1.04855099373956, + "learning_rate": 4.199260023384376e-06, + "loss": 0.0431, + "step": 1143 + }, + { + "epoch": 1.13, + "grad_norm": 1.2533318783486824, + "learning_rate": 4.1913623151770765e-06, + "loss": 0.0318, + "step": 1144 + }, + { + "epoch": 1.13, + "grad_norm": 0.9220419806025165, + "learning_rate": 4.183466678126822e-06, + "loss": 0.0317, + "step": 1145 + }, + { + "epoch": 1.13, + "grad_norm": 0.647797635933002, + "learning_rate": 4.175573132456644e-06, + "loss": 0.0221, + "step": 1146 + }, + { + "epoch": 1.13, + "grad_norm": 0.5143353606003064, + "learning_rate": 4.167681698384211e-06, + "loss": 0.026, + "step": 1147 + }, + { + "epoch": 1.13, + "grad_norm": 0.9659467748981236, + "learning_rate": 4.1597923961217935e-06, + "loss": 0.028, + "step": 1148 + }, + { + "epoch": 1.14, + "grad_norm": 0.8694130191332563, + "learning_rate": 4.151905245876194e-06, + "loss": 0.0302, + "step": 1149 + }, + { + "epoch": 1.14, + "grad_norm": 0.587958711576844, + "learning_rate": 4.144020267848707e-06, + "loss": 0.02, + "step": 1150 + }, + { + "epoch": 1.14, + "grad_norm": 0.7156546748797903, + "learning_rate": 4.13613748223506e-06, + "loss": 0.0158, + "step": 1151 + }, + { + "epoch": 1.14, + "grad_norm": 1.0851432279340858, + "learning_rate": 4.128256909225366e-06, + "loss": 0.0286, + "step": 1152 + }, + { + "epoch": 1.14, + "grad_norm": 0.7096178232213832, + "learning_rate": 4.120378569004074e-06, + "loss": 0.0316, + "step": 1153 + }, + { + "epoch": 1.14, + "grad_norm": 0.5717606844829977, + "learning_rate": 4.112502481749911e-06, + "loss": 0.0194, + "step": 1154 + }, + { + "epoch": 1.14, + "grad_norm": 1.0662768483868281, + "learning_rate": 4.104628667635835e-06, + "loss": 0.0272, + "step": 1155 + }, + { + "epoch": 1.14, + "grad_norm": 1.0802239084531717, + "learning_rate": 4.0967571468289815e-06, + "loss": 0.0419, + "step": 1156 + }, + { + "epoch": 1.14, + "grad_norm": 4.075912303145742, + "learning_rate": 4.0888879394906094e-06, + "loss": 0.0311, + "step": 1157 + }, + { + "epoch": 1.14, + "grad_norm": 1.2670503288679043, + "learning_rate": 4.081021065776058e-06, + "loss": 0.0345, + "step": 1158 + }, + { + "epoch": 1.15, + "grad_norm": 0.5912311253811383, + "learning_rate": 4.073156545834685e-06, + "loss": 0.0306, + "step": 1159 + }, + { + "epoch": 1.15, + "grad_norm": 0.7740612434514832, + "learning_rate": 4.065294399809819e-06, + "loss": 0.0248, + "step": 1160 + }, + { + "epoch": 1.15, + "grad_norm": 0.6660766846869574, + "learning_rate": 4.057434647838713e-06, + "loss": 0.0272, + "step": 1161 + }, + { + "epoch": 1.15, + "grad_norm": 0.4464175518732789, + "learning_rate": 4.049577310052482e-06, + "loss": 0.0187, + "step": 1162 + }, + { + "epoch": 1.15, + "grad_norm": 0.6420961971799071, + "learning_rate": 4.041722406576062e-06, + "loss": 0.0178, + "step": 1163 + }, + { + "epoch": 1.15, + "grad_norm": 0.5933792075717889, + "learning_rate": 4.033869957528153e-06, + "loss": 0.023, + "step": 1164 + }, + { + "epoch": 1.15, + "grad_norm": 0.7545983123520259, + "learning_rate": 4.026019983021168e-06, + "loss": 0.0279, + "step": 1165 + }, + { + "epoch": 1.15, + "grad_norm": 1.0413887638237633, + "learning_rate": 4.018172503161179e-06, + "loss": 0.0371, + "step": 1166 + }, + { + "epoch": 1.15, + "grad_norm": 1.0934504551905797, + "learning_rate": 4.010327538047877e-06, + "loss": 0.0328, + "step": 1167 + }, + { + "epoch": 1.15, + "grad_norm": 0.36078229020799923, + "learning_rate": 4.002485107774503e-06, + "loss": 0.016, + "step": 1168 + }, + { + "epoch": 1.16, + "grad_norm": 0.6444553615010122, + "learning_rate": 3.994645232427809e-06, + "loss": 0.0291, + "step": 1169 + }, + { + "epoch": 1.16, + "grad_norm": 0.6594877072308999, + "learning_rate": 3.986807932088004e-06, + "loss": 0.0232, + "step": 1170 + }, + { + "epoch": 1.16, + "grad_norm": 0.9328266110281278, + "learning_rate": 3.978973226828702e-06, + "loss": 0.0312, + "step": 1171 + }, + { + "epoch": 1.16, + "grad_norm": 0.5447087731555137, + "learning_rate": 3.971141136716866e-06, + "loss": 0.0189, + "step": 1172 + }, + { + "epoch": 1.16, + "grad_norm": 0.9056993750867337, + "learning_rate": 3.963311681812768e-06, + "loss": 0.0228, + "step": 1173 + }, + { + "epoch": 1.16, + "grad_norm": 0.7617244908266496, + "learning_rate": 3.955484882169923e-06, + "loss": 0.0207, + "step": 1174 + }, + { + "epoch": 1.16, + "grad_norm": 2.053093794917535, + "learning_rate": 3.947660757835049e-06, + "loss": 0.0368, + "step": 1175 + }, + { + "epoch": 1.16, + "grad_norm": 0.7773155500120877, + "learning_rate": 3.939839328848014e-06, + "loss": 0.0275, + "step": 1176 + }, + { + "epoch": 1.16, + "grad_norm": 1.1309111614765357, + "learning_rate": 3.932020615241777e-06, + "loss": 0.0391, + "step": 1177 + }, + { + "epoch": 1.16, + "grad_norm": 0.7772381331788334, + "learning_rate": 3.9242046370423434e-06, + "loss": 0.0228, + "step": 1178 + }, + { + "epoch": 1.17, + "grad_norm": 1.0497661692021192, + "learning_rate": 3.9163914142687185e-06, + "loss": 0.0197, + "step": 1179 + }, + { + "epoch": 1.17, + "grad_norm": 0.802443273282112, + "learning_rate": 3.9085809669328415e-06, + "loss": 0.0287, + "step": 1180 + }, + { + "epoch": 1.17, + "grad_norm": 1.6422605212693915, + "learning_rate": 3.900773315039548e-06, + "loss": 0.0263, + "step": 1181 + }, + { + "epoch": 1.17, + "grad_norm": 1.4827986025988822, + "learning_rate": 3.892968478586513e-06, + "loss": 0.0385, + "step": 1182 + }, + { + "epoch": 1.17, + "grad_norm": 1.0726420346174323, + "learning_rate": 3.885166477564199e-06, + "loss": 0.0315, + "step": 1183 + }, + { + "epoch": 1.17, + "grad_norm": 1.4868137126849361, + "learning_rate": 3.877367331955808e-06, + "loss": 0.0301, + "step": 1184 + }, + { + "epoch": 1.17, + "grad_norm": 0.7793921575940534, + "learning_rate": 3.869571061737226e-06, + "loss": 0.0278, + "step": 1185 + }, + { + "epoch": 1.17, + "grad_norm": 0.7423673491553581, + "learning_rate": 3.861777686876978e-06, + "loss": 0.0172, + "step": 1186 + }, + { + "epoch": 1.17, + "grad_norm": 0.8445082531782259, + "learning_rate": 3.853987227336168e-06, + "loss": 0.0234, + "step": 1187 + }, + { + "epoch": 1.17, + "grad_norm": 1.4866791378338005, + "learning_rate": 3.8461997030684386e-06, + "loss": 0.0413, + "step": 1188 + }, + { + "epoch": 1.17, + "grad_norm": 0.696171863459194, + "learning_rate": 3.838415134019911e-06, + "loss": 0.0278, + "step": 1189 + }, + { + "epoch": 1.18, + "grad_norm": 0.9645765538518006, + "learning_rate": 3.830633540129135e-06, + "loss": 0.0333, + "step": 1190 + }, + { + "epoch": 1.18, + "grad_norm": 1.1588316179391511, + "learning_rate": 3.822854941327046e-06, + "loss": 0.0247, + "step": 1191 + }, + { + "epoch": 1.18, + "grad_norm": 0.8964545735956162, + "learning_rate": 3.815079357536907e-06, + "loss": 0.0244, + "step": 1192 + }, + { + "epoch": 1.18, + "grad_norm": 0.9118777787298978, + "learning_rate": 3.8073068086742514e-06, + "loss": 0.0275, + "step": 1193 + }, + { + "epoch": 1.18, + "grad_norm": 0.777626923225908, + "learning_rate": 3.799537314646848e-06, + "loss": 0.0205, + "step": 1194 + }, + { + "epoch": 1.18, + "grad_norm": 0.6685174415093665, + "learning_rate": 3.791770895354635e-06, + "loss": 0.0206, + "step": 1195 + }, + { + "epoch": 1.18, + "grad_norm": 0.48232058435281616, + "learning_rate": 3.7840075706896824e-06, + "loss": 0.0207, + "step": 1196 + }, + { + "epoch": 1.18, + "grad_norm": 1.0217841454148604, + "learning_rate": 3.776247360536127e-06, + "loss": 0.0393, + "step": 1197 + }, + { + "epoch": 1.18, + "grad_norm": 0.6119942879107466, + "learning_rate": 3.768490284770131e-06, + "loss": 0.0203, + "step": 1198 + }, + { + "epoch": 1.18, + "grad_norm": 0.5206581259536639, + "learning_rate": 3.7607363632598305e-06, + "loss": 0.0196, + "step": 1199 + }, + { + "epoch": 1.19, + "grad_norm": 0.9881570998597756, + "learning_rate": 3.7529856158652792e-06, + "loss": 0.0163, + "step": 1200 + }, + { + "epoch": 1.19, + "grad_norm": 0.6382121125866328, + "learning_rate": 3.7452380624384026e-06, + "loss": 0.0231, + "step": 1201 + }, + { + "epoch": 1.19, + "grad_norm": 0.4224758933081467, + "learning_rate": 3.7374937228229472e-06, + "loss": 0.0139, + "step": 1202 + }, + { + "epoch": 1.19, + "grad_norm": 0.38564278891273984, + "learning_rate": 3.7297526168544253e-06, + "loss": 0.0173, + "step": 1203 + }, + { + "epoch": 1.19, + "grad_norm": 1.0962566818321713, + "learning_rate": 3.722014764360067e-06, + "loss": 0.025, + "step": 1204 + }, + { + "epoch": 1.19, + "grad_norm": 1.2246173963024596, + "learning_rate": 3.714280185158771e-06, + "loss": 0.0406, + "step": 1205 + }, + { + "epoch": 1.19, + "grad_norm": 0.8603176514449102, + "learning_rate": 3.706548899061052e-06, + "loss": 0.026, + "step": 1206 + }, + { + "epoch": 1.19, + "grad_norm": 1.2822870337099297, + "learning_rate": 3.6988209258689877e-06, + "loss": 0.0313, + "step": 1207 + }, + { + "epoch": 1.19, + "grad_norm": 1.2715509865915087, + "learning_rate": 3.6910962853761738e-06, + "loss": 0.0355, + "step": 1208 + }, + { + "epoch": 1.19, + "grad_norm": 1.1853601452520386, + "learning_rate": 3.683374997367668e-06, + "loss": 0.0321, + "step": 1209 + }, + { + "epoch": 1.2, + "grad_norm": 0.902921247787625, + "learning_rate": 3.675657081619941e-06, + "loss": 0.0191, + "step": 1210 + }, + { + "epoch": 1.2, + "grad_norm": 0.7821409311223014, + "learning_rate": 3.6679425579008278e-06, + "loss": 0.0326, + "step": 1211 + }, + { + "epoch": 1.2, + "grad_norm": 0.7903619927425193, + "learning_rate": 3.6602314459694743e-06, + "loss": 0.0301, + "step": 1212 + }, + { + "epoch": 1.2, + "grad_norm": 0.46174044405418857, + "learning_rate": 3.652523765576287e-06, + "loss": 0.022, + "step": 1213 + }, + { + "epoch": 1.2, + "grad_norm": 0.6323491372648098, + "learning_rate": 3.6448195364628857e-06, + "loss": 0.0191, + "step": 1214 + }, + { + "epoch": 1.2, + "grad_norm": 0.9099434352944868, + "learning_rate": 3.6371187783620486e-06, + "loss": 0.0241, + "step": 1215 + }, + { + "epoch": 1.2, + "grad_norm": 0.9207530906993158, + "learning_rate": 3.6294215109976628e-06, + "loss": 0.0222, + "step": 1216 + }, + { + "epoch": 1.2, + "grad_norm": 0.5852047866123646, + "learning_rate": 3.6217277540846775e-06, + "loss": 0.0268, + "step": 1217 + }, + { + "epoch": 1.2, + "grad_norm": 0.7561125846165206, + "learning_rate": 3.614037527329048e-06, + "loss": 0.0286, + "step": 1218 + }, + { + "epoch": 1.2, + "grad_norm": 1.7062712342887891, + "learning_rate": 3.606350850427688e-06, + "loss": 0.0356, + "step": 1219 + }, + { + "epoch": 1.21, + "grad_norm": 0.681472660161719, + "learning_rate": 3.5986677430684224e-06, + "loss": 0.0225, + "step": 1220 + }, + { + "epoch": 1.21, + "grad_norm": 0.8267818175730947, + "learning_rate": 3.5909882249299287e-06, + "loss": 0.0318, + "step": 1221 + }, + { + "epoch": 1.21, + "grad_norm": 0.7355695025915546, + "learning_rate": 3.583312315681693e-06, + "loss": 0.0223, + "step": 1222 + }, + { + "epoch": 1.21, + "grad_norm": 2.833987497480372, + "learning_rate": 3.5756400349839603e-06, + "loss": 0.0337, + "step": 1223 + }, + { + "epoch": 1.21, + "grad_norm": 0.9794172004233616, + "learning_rate": 3.567971402487679e-06, + "loss": 0.0218, + "step": 1224 + }, + { + "epoch": 1.21, + "grad_norm": 3.6501686738425505, + "learning_rate": 3.5603064378344536e-06, + "loss": 0.0262, + "step": 1225 + }, + { + "epoch": 1.21, + "grad_norm": 0.5674790693617395, + "learning_rate": 3.552645160656497e-06, + "loss": 0.0265, + "step": 1226 + }, + { + "epoch": 1.21, + "grad_norm": 1.0102812039517735, + "learning_rate": 3.544987590576574e-06, + "loss": 0.0226, + "step": 1227 + }, + { + "epoch": 1.21, + "grad_norm": 0.5226619099252662, + "learning_rate": 3.537333747207955e-06, + "loss": 0.0183, + "step": 1228 + }, + { + "epoch": 1.21, + "grad_norm": 1.0690310624262982, + "learning_rate": 3.529683650154368e-06, + "loss": 0.0367, + "step": 1229 + }, + { + "epoch": 1.22, + "grad_norm": 2.019580208227227, + "learning_rate": 3.5220373190099428e-06, + "loss": 0.0379, + "step": 1230 + }, + { + "epoch": 1.22, + "grad_norm": 0.6233956991684912, + "learning_rate": 3.5143947733591633e-06, + "loss": 0.0231, + "step": 1231 + }, + { + "epoch": 1.22, + "grad_norm": 0.6281650494544383, + "learning_rate": 3.50675603277682e-06, + "loss": 0.024, + "step": 1232 + }, + { + "epoch": 1.22, + "grad_norm": 0.5383027091207303, + "learning_rate": 3.499121116827956e-06, + "loss": 0.0167, + "step": 1233 + }, + { + "epoch": 1.22, + "grad_norm": 0.7215097341859587, + "learning_rate": 3.491490045067818e-06, + "loss": 0.0258, + "step": 1234 + }, + { + "epoch": 1.22, + "grad_norm": 0.5910503130056679, + "learning_rate": 3.48386283704181e-06, + "loss": 0.0231, + "step": 1235 + }, + { + "epoch": 1.22, + "grad_norm": 2.4693134066224136, + "learning_rate": 3.4762395122854336e-06, + "loss": 0.0363, + "step": 1236 + }, + { + "epoch": 1.22, + "grad_norm": 1.166411864946178, + "learning_rate": 3.4686200903242497e-06, + "loss": 0.0259, + "step": 1237 + }, + { + "epoch": 1.22, + "grad_norm": 1.4817375402839268, + "learning_rate": 3.461004590673819e-06, + "loss": 0.0344, + "step": 1238 + }, + { + "epoch": 1.22, + "grad_norm": 1.046182158547702, + "learning_rate": 3.45339303283966e-06, + "loss": 0.0242, + "step": 1239 + }, + { + "epoch": 1.23, + "grad_norm": 1.292224774416532, + "learning_rate": 3.445785436317193e-06, + "loss": 0.0375, + "step": 1240 + }, + { + "epoch": 1.23, + "grad_norm": 0.861521851112122, + "learning_rate": 3.4381818205916907e-06, + "loss": 0.0218, + "step": 1241 + }, + { + "epoch": 1.23, + "grad_norm": 0.5716383717347113, + "learning_rate": 3.4305822051382345e-06, + "loss": 0.0254, + "step": 1242 + }, + { + "epoch": 1.23, + "grad_norm": 0.8052011902200021, + "learning_rate": 3.422986609421655e-06, + "loss": 0.0266, + "step": 1243 + }, + { + "epoch": 1.23, + "grad_norm": 0.9384565798258717, + "learning_rate": 3.4153950528964867e-06, + "loss": 0.0399, + "step": 1244 + }, + { + "epoch": 1.23, + "grad_norm": 0.9711098718625382, + "learning_rate": 3.4078075550069255e-06, + "loss": 0.0241, + "step": 1245 + }, + { + "epoch": 1.23, + "grad_norm": 0.6655061872931324, + "learning_rate": 3.400224135186765e-06, + "loss": 0.0364, + "step": 1246 + }, + { + "epoch": 1.23, + "grad_norm": 1.5502248667043002, + "learning_rate": 3.392644812859354e-06, + "loss": 0.0416, + "step": 1247 + }, + { + "epoch": 1.23, + "grad_norm": 1.844181133418411, + "learning_rate": 3.385069607437552e-06, + "loss": 0.0354, + "step": 1248 + }, + { + "epoch": 1.23, + "grad_norm": 1.1257355441450307, + "learning_rate": 3.3774985383236685e-06, + "loss": 0.0232, + "step": 1249 + }, + { + "epoch": 1.24, + "grad_norm": 0.9739750889298195, + "learning_rate": 3.3699316249094195e-06, + "loss": 0.0395, + "step": 1250 + }, + { + "epoch": 1.24, + "grad_norm": 0.7690071951479069, + "learning_rate": 3.36236888657588e-06, + "loss": 0.0344, + "step": 1251 + }, + { + "epoch": 1.24, + "grad_norm": 2.6660183823158623, + "learning_rate": 3.3548103426934287e-06, + "loss": 0.0213, + "step": 1252 + }, + { + "epoch": 1.24, + "grad_norm": 0.6154954723041898, + "learning_rate": 3.3472560126217004e-06, + "loss": 0.0192, + "step": 1253 + }, + { + "epoch": 1.24, + "grad_norm": 0.6999616838858199, + "learning_rate": 3.3397059157095412e-06, + "loss": 0.0222, + "step": 1254 + }, + { + "epoch": 1.24, + "grad_norm": 1.625168930364848, + "learning_rate": 3.33216007129495e-06, + "loss": 0.0281, + "step": 1255 + }, + { + "epoch": 1.24, + "grad_norm": 0.7423624969087769, + "learning_rate": 3.3246184987050366e-06, + "loss": 0.0285, + "step": 1256 + }, + { + "epoch": 1.24, + "grad_norm": 0.5785260967877592, + "learning_rate": 3.3170812172559695e-06, + "loss": 0.0258, + "step": 1257 + }, + { + "epoch": 1.24, + "grad_norm": 1.053003162298107, + "learning_rate": 3.309548246252925e-06, + "loss": 0.0259, + "step": 1258 + }, + { + "epoch": 1.24, + "grad_norm": 0.9626579997361799, + "learning_rate": 3.3020196049900386e-06, + "loss": 0.0341, + "step": 1259 + }, + { + "epoch": 1.25, + "grad_norm": 1.5378411567759094, + "learning_rate": 3.2944953127503593e-06, + "loss": 0.0229, + "step": 1260 + }, + { + "epoch": 1.25, + "grad_norm": 0.40918902129839757, + "learning_rate": 3.2869753888057936e-06, + "loss": 0.0174, + "step": 1261 + }, + { + "epoch": 1.25, + "grad_norm": 0.948390041474773, + "learning_rate": 3.2794598524170606e-06, + "loss": 0.021, + "step": 1262 + }, + { + "epoch": 1.25, + "grad_norm": 0.9761585653303926, + "learning_rate": 3.271948722833643e-06, + "loss": 0.0245, + "step": 1263 + }, + { + "epoch": 1.25, + "grad_norm": 1.1402260326532885, + "learning_rate": 3.264442019293734e-06, + "loss": 0.0289, + "step": 1264 + }, + { + "epoch": 1.25, + "grad_norm": 0.8272407776944543, + "learning_rate": 3.2569397610241915e-06, + "loss": 0.022, + "step": 1265 + }, + { + "epoch": 1.25, + "grad_norm": 1.1065240923106912, + "learning_rate": 3.249441967240489e-06, + "loss": 0.0387, + "step": 1266 + }, + { + "epoch": 1.25, + "grad_norm": 0.7964507556545418, + "learning_rate": 3.241948657146663e-06, + "loss": 0.0258, + "step": 1267 + }, + { + "epoch": 1.25, + "grad_norm": 0.9722127655862852, + "learning_rate": 3.2344598499352663e-06, + "loss": 0.0233, + "step": 1268 + }, + { + "epoch": 1.25, + "grad_norm": 0.6702923565192391, + "learning_rate": 3.226975564787322e-06, + "loss": 0.02, + "step": 1269 + }, + { + "epoch": 1.25, + "grad_norm": 0.6781473563830941, + "learning_rate": 3.2194958208722656e-06, + "loss": 0.027, + "step": 1270 + }, + { + "epoch": 1.26, + "grad_norm": 0.8764587765192495, + "learning_rate": 3.2120206373479024e-06, + "loss": 0.0259, + "step": 1271 + }, + { + "epoch": 1.26, + "grad_norm": 1.204359727983454, + "learning_rate": 3.204550033360362e-06, + "loss": 0.0261, + "step": 1272 + }, + { + "epoch": 1.26, + "grad_norm": 4.3643673401914205, + "learning_rate": 3.1970840280440384e-06, + "loss": 0.0189, + "step": 1273 + }, + { + "epoch": 1.26, + "grad_norm": 0.7324647980496716, + "learning_rate": 3.1896226405215503e-06, + "loss": 0.0221, + "step": 1274 + }, + { + "epoch": 1.26, + "grad_norm": 0.8575165229725605, + "learning_rate": 3.1821658899036876e-06, + "loss": 0.0276, + "step": 1275 + }, + { + "epoch": 1.26, + "grad_norm": 0.7541136711033276, + "learning_rate": 3.174713795289366e-06, + "loss": 0.0322, + "step": 1276 + }, + { + "epoch": 1.26, + "grad_norm": 0.9834424977627805, + "learning_rate": 3.1672663757655707e-06, + "loss": 0.0441, + "step": 1277 + }, + { + "epoch": 1.26, + "grad_norm": 1.3702494488264347, + "learning_rate": 3.1598236504073215e-06, + "loss": 0.0329, + "step": 1278 + }, + { + "epoch": 1.26, + "grad_norm": 1.02349427189603, + "learning_rate": 3.152385638277603e-06, + "loss": 0.0442, + "step": 1279 + }, + { + "epoch": 1.26, + "grad_norm": 1.0267724041865374, + "learning_rate": 3.144952358427339e-06, + "loss": 0.0265, + "step": 1280 + }, + { + "epoch": 1.27, + "grad_norm": 0.4656151906951841, + "learning_rate": 3.137523829895326e-06, + "loss": 0.0178, + "step": 1281 + }, + { + "epoch": 1.27, + "grad_norm": 0.9467462721261859, + "learning_rate": 3.1301000717081926e-06, + "loss": 0.0325, + "step": 1282 + }, + { + "epoch": 1.27, + "grad_norm": 1.1709202769320373, + "learning_rate": 3.1226811028803514e-06, + "loss": 0.0333, + "step": 1283 + }, + { + "epoch": 1.27, + "grad_norm": 1.3576000887150794, + "learning_rate": 3.115266942413946e-06, + "loss": 0.0301, + "step": 1284 + }, + { + "epoch": 1.27, + "grad_norm": 0.7941095758553105, + "learning_rate": 3.107857609298802e-06, + "loss": 0.021, + "step": 1285 + }, + { + "epoch": 1.27, + "grad_norm": 0.7929011356573532, + "learning_rate": 3.1004531225123857e-06, + "loss": 0.0256, + "step": 1286 + }, + { + "epoch": 1.27, + "grad_norm": 0.8786043128992731, + "learning_rate": 3.093053501019748e-06, + "loss": 0.0258, + "step": 1287 + }, + { + "epoch": 1.27, + "grad_norm": 1.260719000880376, + "learning_rate": 3.0856587637734783e-06, + "loss": 0.0282, + "step": 1288 + }, + { + "epoch": 1.27, + "grad_norm": 1.0540374948932114, + "learning_rate": 3.0782689297136554e-06, + "loss": 0.0309, + "step": 1289 + }, + { + "epoch": 1.27, + "grad_norm": 0.5725262960615425, + "learning_rate": 3.070884017767801e-06, + "loss": 0.022, + "step": 1290 + }, + { + "epoch": 1.28, + "grad_norm": 0.959101589090888, + "learning_rate": 3.0635040468508294e-06, + "loss": 0.028, + "step": 1291 + }, + { + "epoch": 1.28, + "grad_norm": 0.5386872857085101, + "learning_rate": 3.0561290358649996e-06, + "loss": 0.0151, + "step": 1292 + }, + { + "epoch": 1.28, + "grad_norm": 0.569838897864246, + "learning_rate": 3.048759003699866e-06, + "loss": 0.0207, + "step": 1293 + }, + { + "epoch": 1.28, + "grad_norm": 0.581532645283473, + "learning_rate": 3.0413939692322304e-06, + "loss": 0.0225, + "step": 1294 + }, + { + "epoch": 1.28, + "grad_norm": 0.5028362927727279, + "learning_rate": 3.0340339513260976e-06, + "loss": 0.0235, + "step": 1295 + }, + { + "epoch": 1.28, + "grad_norm": 0.8007831617356519, + "learning_rate": 3.0266789688326187e-06, + "loss": 0.0227, + "step": 1296 + }, + { + "epoch": 1.28, + "grad_norm": 0.7695969527513412, + "learning_rate": 3.0193290405900494e-06, + "loss": 0.0196, + "step": 1297 + }, + { + "epoch": 1.28, + "grad_norm": 1.0289670306604963, + "learning_rate": 3.0119841854237027e-06, + "loss": 0.0307, + "step": 1298 + }, + { + "epoch": 1.28, + "grad_norm": 0.6835324484005986, + "learning_rate": 3.004644422145894e-06, + "loss": 0.0292, + "step": 1299 + }, + { + "epoch": 1.28, + "grad_norm": 0.581528963254537, + "learning_rate": 2.9973097695558982e-06, + "loss": 0.0259, + "step": 1300 + }, + { + "epoch": 1.29, + "grad_norm": 1.8906463659166122, + "learning_rate": 2.9899802464399018e-06, + "loss": 0.0441, + "step": 1301 + }, + { + "epoch": 1.29, + "grad_norm": 0.8464019379319843, + "learning_rate": 2.982655871570952e-06, + "loss": 0.0156, + "step": 1302 + }, + { + "epoch": 1.29, + "grad_norm": 1.3655786669874588, + "learning_rate": 2.9753366637089075e-06, + "loss": 0.0263, + "step": 1303 + }, + { + "epoch": 1.29, + "grad_norm": 1.4302998086198895, + "learning_rate": 2.9680226416003986e-06, + "loss": 0.0331, + "step": 1304 + }, + { + "epoch": 1.29, + "grad_norm": 0.6007442648446595, + "learning_rate": 2.960713823978769e-06, + "loss": 0.0188, + "step": 1305 + }, + { + "epoch": 1.29, + "grad_norm": 0.5396731652610417, + "learning_rate": 2.9534102295640305e-06, + "loss": 0.0216, + "step": 1306 + }, + { + "epoch": 1.29, + "grad_norm": 1.2810585899025055, + "learning_rate": 2.9461118770628226e-06, + "loss": 0.0248, + "step": 1307 + }, + { + "epoch": 1.29, + "grad_norm": 0.8106101777855437, + "learning_rate": 2.9388187851683537e-06, + "loss": 0.0223, + "step": 1308 + }, + { + "epoch": 1.29, + "grad_norm": 0.6780089328890525, + "learning_rate": 2.9315309725603596e-06, + "loss": 0.0251, + "step": 1309 + }, + { + "epoch": 1.29, + "grad_norm": 0.5254825720095616, + "learning_rate": 2.9242484579050566e-06, + "loss": 0.0233, + "step": 1310 + }, + { + "epoch": 1.3, + "grad_norm": 0.5856391286139735, + "learning_rate": 2.9169712598550885e-06, + "loss": 0.0183, + "step": 1311 + }, + { + "epoch": 1.3, + "grad_norm": 0.6144133333430777, + "learning_rate": 2.9096993970494825e-06, + "loss": 0.0207, + "step": 1312 + }, + { + "epoch": 1.3, + "grad_norm": 0.7793656594295922, + "learning_rate": 2.9024328881136e-06, + "loss": 0.0275, + "step": 1313 + }, + { + "epoch": 1.3, + "grad_norm": 0.8547383245915084, + "learning_rate": 2.895171751659093e-06, + "loss": 0.024, + "step": 1314 + }, + { + "epoch": 1.3, + "grad_norm": 1.136725575523068, + "learning_rate": 2.887916006283849e-06, + "loss": 0.0296, + "step": 1315 + }, + { + "epoch": 1.3, + "grad_norm": 1.000339457408409, + "learning_rate": 2.8806656705719492e-06, + "loss": 0.0216, + "step": 1316 + }, + { + "epoch": 1.3, + "grad_norm": 0.5570093686866413, + "learning_rate": 2.8734207630936195e-06, + "loss": 0.0209, + "step": 1317 + }, + { + "epoch": 1.3, + "grad_norm": 0.4454859134820076, + "learning_rate": 2.86618130240518e-06, + "loss": 0.0147, + "step": 1318 + }, + { + "epoch": 1.3, + "grad_norm": 0.563113334058305, + "learning_rate": 2.858947307049001e-06, + "loss": 0.0187, + "step": 1319 + }, + { + "epoch": 1.3, + "grad_norm": 0.8038186022459026, + "learning_rate": 2.851718795553461e-06, + "loss": 0.0214, + "step": 1320 + }, + { + "epoch": 1.31, + "grad_norm": 1.8962721319951323, + "learning_rate": 2.84449578643288e-06, + "loss": 0.0332, + "step": 1321 + }, + { + "epoch": 1.31, + "grad_norm": 1.0943982653356987, + "learning_rate": 2.8372782981874964e-06, + "loss": 0.0203, + "step": 1322 + }, + { + "epoch": 1.31, + "grad_norm": 0.6210789822485363, + "learning_rate": 2.830066349303401e-06, + "loss": 0.0207, + "step": 1323 + }, + { + "epoch": 1.31, + "grad_norm": 0.7558588675119113, + "learning_rate": 2.8228599582524975e-06, + "loss": 0.0279, + "step": 1324 + }, + { + "epoch": 1.31, + "grad_norm": 1.2117255873190556, + "learning_rate": 2.815659143492461e-06, + "loss": 0.0197, + "step": 1325 + }, + { + "epoch": 1.31, + "grad_norm": 0.8905208919281724, + "learning_rate": 2.8084639234666753e-06, + "loss": 0.0254, + "step": 1326 + }, + { + "epoch": 1.31, + "grad_norm": 0.5673813607386939, + "learning_rate": 2.8012743166042002e-06, + "loss": 0.0195, + "step": 1327 + }, + { + "epoch": 1.31, + "grad_norm": 0.7629642074483579, + "learning_rate": 2.794090341319715e-06, + "loss": 0.0234, + "step": 1328 + }, + { + "epoch": 1.31, + "grad_norm": 1.0756254635065226, + "learning_rate": 2.786912016013478e-06, + "loss": 0.0197, + "step": 1329 + }, + { + "epoch": 1.31, + "grad_norm": 0.7397629259597135, + "learning_rate": 2.7797393590712706e-06, + "loss": 0.0286, + "step": 1330 + }, + { + "epoch": 1.32, + "grad_norm": 1.3606802714238877, + "learning_rate": 2.7725723888643663e-06, + "loss": 0.0195, + "step": 1331 + }, + { + "epoch": 1.32, + "grad_norm": 1.1951944803879382, + "learning_rate": 2.765411123749463e-06, + "loss": 0.0227, + "step": 1332 + }, + { + "epoch": 1.32, + "grad_norm": 0.572616845327841, + "learning_rate": 2.758255582068651e-06, + "loss": 0.0214, + "step": 1333 + }, + { + "epoch": 1.32, + "grad_norm": 0.6254891645360283, + "learning_rate": 2.75110578214936e-06, + "loss": 0.0226, + "step": 1334 + }, + { + "epoch": 1.32, + "grad_norm": 0.8258197500603863, + "learning_rate": 2.7439617423043146e-06, + "loss": 0.0253, + "step": 1335 + }, + { + "epoch": 1.32, + "grad_norm": 0.5567925238200621, + "learning_rate": 2.736823480831482e-06, + "loss": 0.0103, + "step": 1336 + }, + { + "epoch": 1.32, + "grad_norm": 1.9486102645398342, + "learning_rate": 2.7296910160140365e-06, + "loss": 0.036, + "step": 1337 + }, + { + "epoch": 1.32, + "grad_norm": 5.9259932643996045, + "learning_rate": 2.7225643661203004e-06, + "loss": 0.0322, + "step": 1338 + }, + { + "epoch": 1.32, + "grad_norm": 0.691222736049963, + "learning_rate": 2.7154435494037033e-06, + "loss": 0.0148, + "step": 1339 + }, + { + "epoch": 1.32, + "grad_norm": 0.8575355131535669, + "learning_rate": 2.708328584102734e-06, + "loss": 0.0181, + "step": 1340 + }, + { + "epoch": 1.33, + "grad_norm": 1.1963430964304536, + "learning_rate": 2.7012194884408942e-06, + "loss": 0.0282, + "step": 1341 + }, + { + "epoch": 1.33, + "grad_norm": 0.9228574782155844, + "learning_rate": 2.6941162806266506e-06, + "loss": 0.0259, + "step": 1342 + }, + { + "epoch": 1.33, + "grad_norm": 0.6683360136989352, + "learning_rate": 2.6870189788533953e-06, + "loss": 0.0233, + "step": 1343 + }, + { + "epoch": 1.33, + "grad_norm": 0.6158335243969865, + "learning_rate": 2.679927601299386e-06, + "loss": 0.0224, + "step": 1344 + }, + { + "epoch": 1.33, + "grad_norm": 0.4342317868225625, + "learning_rate": 2.6728421661277105e-06, + "loss": 0.0187, + "step": 1345 + }, + { + "epoch": 1.33, + "grad_norm": 0.8702462153937798, + "learning_rate": 2.665762691486235e-06, + "loss": 0.0279, + "step": 1346 + }, + { + "epoch": 1.33, + "grad_norm": 0.9076288865381044, + "learning_rate": 2.65868919550756e-06, + "loss": 0.0256, + "step": 1347 + }, + { + "epoch": 1.33, + "grad_norm": 1.7546622654755015, + "learning_rate": 2.6516216963089698e-06, + "loss": 0.0239, + "step": 1348 + }, + { + "epoch": 1.33, + "grad_norm": 0.5596958413844281, + "learning_rate": 2.6445602119923963e-06, + "loss": 0.0159, + "step": 1349 + }, + { + "epoch": 1.33, + "grad_norm": 0.817812674260711, + "learning_rate": 2.637504760644359e-06, + "loss": 0.0173, + "step": 1350 + }, + { + "epoch": 1.33, + "grad_norm": 0.9082917695452071, + "learning_rate": 2.630455360335929e-06, + "loss": 0.0284, + "step": 1351 + }, + { + "epoch": 1.34, + "grad_norm": 1.0181782420057977, + "learning_rate": 2.623412029122675e-06, + "loss": 0.0235, + "step": 1352 + }, + { + "epoch": 1.34, + "grad_norm": 0.4308595058667782, + "learning_rate": 2.616374785044624e-06, + "loss": 0.0195, + "step": 1353 + }, + { + "epoch": 1.34, + "grad_norm": 1.0010421051313705, + "learning_rate": 2.60934364612621e-06, + "loss": 0.0405, + "step": 1354 + }, + { + "epoch": 1.34, + "grad_norm": 1.1072856388623673, + "learning_rate": 2.602318630376234e-06, + "loss": 0.0346, + "step": 1355 + }, + { + "epoch": 1.34, + "grad_norm": 0.570554903732537, + "learning_rate": 2.59529975578781e-06, + "loss": 0.02, + "step": 1356 + }, + { + "epoch": 1.34, + "grad_norm": 1.763034168480365, + "learning_rate": 2.588287040338323e-06, + "loss": 0.028, + "step": 1357 + }, + { + "epoch": 1.34, + "grad_norm": 0.9818915105741374, + "learning_rate": 2.5812805019893837e-06, + "loss": 0.0255, + "step": 1358 + }, + { + "epoch": 1.34, + "grad_norm": 0.37647009133737225, + "learning_rate": 2.574280158686782e-06, + "loss": 0.0128, + "step": 1359 + }, + { + "epoch": 1.34, + "grad_norm": 0.8141530396717688, + "learning_rate": 2.567286028360435e-06, + "loss": 0.0279, + "step": 1360 + }, + { + "epoch": 1.34, + "grad_norm": 0.7619954885706852, + "learning_rate": 2.560298128924358e-06, + "loss": 0.0212, + "step": 1361 + }, + { + "epoch": 1.35, + "grad_norm": 0.7786618256826614, + "learning_rate": 2.5533164782765974e-06, + "loss": 0.0244, + "step": 1362 + }, + { + "epoch": 1.35, + "grad_norm": 0.8083725721324221, + "learning_rate": 2.5463410942991986e-06, + "loss": 0.0298, + "step": 1363 + }, + { + "epoch": 1.35, + "grad_norm": 0.8305456421254139, + "learning_rate": 2.5393719948581507e-06, + "loss": 0.029, + "step": 1364 + }, + { + "epoch": 1.35, + "grad_norm": 0.4458021272805112, + "learning_rate": 2.532409197803353e-06, + "loss": 0.0178, + "step": 1365 + }, + { + "epoch": 1.35, + "grad_norm": 0.6645665194259693, + "learning_rate": 2.525452720968563e-06, + "loss": 0.0207, + "step": 1366 + }, + { + "epoch": 1.35, + "grad_norm": 0.6310746208910438, + "learning_rate": 2.5185025821713454e-06, + "loss": 0.0175, + "step": 1367 + }, + { + "epoch": 1.35, + "grad_norm": 1.0364074720417116, + "learning_rate": 2.5115587992130342e-06, + "loss": 0.0275, + "step": 1368 + }, + { + "epoch": 1.35, + "grad_norm": 0.6205551171830646, + "learning_rate": 2.504621389878682e-06, + "loss": 0.0301, + "step": 1369 + }, + { + "epoch": 1.35, + "grad_norm": 0.49694932253221913, + "learning_rate": 2.4976903719370193e-06, + "loss": 0.0142, + "step": 1370 + }, + { + "epoch": 1.35, + "grad_norm": 0.5103829516719648, + "learning_rate": 2.4907657631404037e-06, + "loss": 0.023, + "step": 1371 + }, + { + "epoch": 1.36, + "grad_norm": 0.6578952733958501, + "learning_rate": 2.483847581224782e-06, + "loss": 0.0266, + "step": 1372 + }, + { + "epoch": 1.36, + "grad_norm": 1.124638769126836, + "learning_rate": 2.4769358439096347e-06, + "loss": 0.0318, + "step": 1373 + }, + { + "epoch": 1.36, + "grad_norm": 0.8613357003477377, + "learning_rate": 2.470030568897938e-06, + "loss": 0.0329, + "step": 1374 + }, + { + "epoch": 1.36, + "grad_norm": 0.7481385202102532, + "learning_rate": 2.4631317738761155e-06, + "loss": 0.0261, + "step": 1375 + }, + { + "epoch": 1.36, + "grad_norm": 0.5082467310959725, + "learning_rate": 2.456239476513994e-06, + "loss": 0.0202, + "step": 1376 + }, + { + "epoch": 1.36, + "grad_norm": 0.34702336652624877, + "learning_rate": 2.4493536944647566e-06, + "loss": 0.0202, + "step": 1377 + }, + { + "epoch": 1.36, + "grad_norm": 0.6745601361393228, + "learning_rate": 2.442474445364904e-06, + "loss": 0.0223, + "step": 1378 + }, + { + "epoch": 1.36, + "grad_norm": 1.5011297804780708, + "learning_rate": 2.4356017468341977e-06, + "loss": 0.0338, + "step": 1379 + }, + { + "epoch": 1.36, + "grad_norm": 0.934845789299032, + "learning_rate": 2.4287356164756244e-06, + "loss": 0.0204, + "step": 1380 + }, + { + "epoch": 1.36, + "grad_norm": 0.5768390021313147, + "learning_rate": 2.421876071875347e-06, + "loss": 0.0189, + "step": 1381 + }, + { + "epoch": 1.37, + "grad_norm": 0.4666835633560529, + "learning_rate": 2.415023130602661e-06, + "loss": 0.0217, + "step": 1382 + }, + { + "epoch": 1.37, + "grad_norm": 0.7251505144838705, + "learning_rate": 2.408176810209946e-06, + "loss": 0.0245, + "step": 1383 + }, + { + "epoch": 1.37, + "grad_norm": 0.7030905046001982, + "learning_rate": 2.401337128232631e-06, + "loss": 0.0238, + "step": 1384 + }, + { + "epoch": 1.37, + "grad_norm": 0.7104189569845816, + "learning_rate": 2.3945041021891335e-06, + "loss": 0.0212, + "step": 1385 + }, + { + "epoch": 1.37, + "grad_norm": 1.2295751661808245, + "learning_rate": 2.387677749580828e-06, + "loss": 0.0355, + "step": 1386 + }, + { + "epoch": 1.37, + "grad_norm": 0.6330065194593735, + "learning_rate": 2.3808580878919948e-06, + "loss": 0.0348, + "step": 1387 + }, + { + "epoch": 1.37, + "grad_norm": 0.918874073383736, + "learning_rate": 2.3740451345897773e-06, + "loss": 0.0164, + "step": 1388 + }, + { + "epoch": 1.37, + "grad_norm": 0.582892515629951, + "learning_rate": 2.3672389071241354e-06, + "loss": 0.0195, + "step": 1389 + }, + { + "epoch": 1.37, + "grad_norm": 1.0182000574709098, + "learning_rate": 2.3604394229278064e-06, + "loss": 0.0366, + "step": 1390 + }, + { + "epoch": 1.37, + "grad_norm": 0.9912460164229965, + "learning_rate": 2.3536466994162522e-06, + "loss": 0.0272, + "step": 1391 + }, + { + "epoch": 1.38, + "grad_norm": 0.9809821583130328, + "learning_rate": 2.3468607539876186e-06, + "loss": 0.0209, + "step": 1392 + }, + { + "epoch": 1.38, + "grad_norm": 0.6201084241577994, + "learning_rate": 2.3400816040226925e-06, + "loss": 0.0199, + "step": 1393 + }, + { + "epoch": 1.38, + "grad_norm": 0.8996803265244651, + "learning_rate": 2.3333092668848544e-06, + "loss": 0.0333, + "step": 1394 + }, + { + "epoch": 1.38, + "grad_norm": 0.4792146683688987, + "learning_rate": 2.326543759920034e-06, + "loss": 0.0203, + "step": 1395 + }, + { + "epoch": 1.38, + "grad_norm": 0.45049080136019715, + "learning_rate": 2.3197851004566723e-06, + "loss": 0.0215, + "step": 1396 + }, + { + "epoch": 1.38, + "grad_norm": 1.2387095657348046, + "learning_rate": 2.313033305805667e-06, + "loss": 0.0269, + "step": 1397 + }, + { + "epoch": 1.38, + "grad_norm": 0.5598441840881433, + "learning_rate": 2.3062883932603326e-06, + "loss": 0.0252, + "step": 1398 + }, + { + "epoch": 1.38, + "grad_norm": 0.7873601630358676, + "learning_rate": 2.2995503800963593e-06, + "loss": 0.0215, + "step": 1399 + }, + { + "epoch": 1.38, + "grad_norm": 0.907763784643883, + "learning_rate": 2.2928192835717642e-06, + "loss": 0.0264, + "step": 1400 + }, + { + "epoch": 1.38, + "grad_norm": 0.8360874450536064, + "learning_rate": 2.2860951209268485e-06, + "loss": 0.0254, + "step": 1401 + }, + { + "epoch": 1.39, + "grad_norm": 1.1729734710689084, + "learning_rate": 2.2793779093841565e-06, + "loss": 0.0172, + "step": 1402 + }, + { + "epoch": 1.39, + "grad_norm": 0.894340993070462, + "learning_rate": 2.2726676661484265e-06, + "loss": 0.0138, + "step": 1403 + }, + { + "epoch": 1.39, + "grad_norm": 0.609965868879223, + "learning_rate": 2.2659644084065483e-06, + "loss": 0.0214, + "step": 1404 + }, + { + "epoch": 1.39, + "grad_norm": 0.4737416535129027, + "learning_rate": 2.2592681533275223e-06, + "loss": 0.0187, + "step": 1405 + }, + { + "epoch": 1.39, + "grad_norm": 1.0187050959181063, + "learning_rate": 2.2525789180624096e-06, + "loss": 0.0157, + "step": 1406 + }, + { + "epoch": 1.39, + "grad_norm": 0.9494581574836511, + "learning_rate": 2.2458967197442942e-06, + "loss": 0.0264, + "step": 1407 + }, + { + "epoch": 1.39, + "grad_norm": 0.7222904042380601, + "learning_rate": 2.2392215754882363e-06, + "loss": 0.0316, + "step": 1408 + }, + { + "epoch": 1.39, + "grad_norm": 0.5612517935965755, + "learning_rate": 2.232553502391227e-06, + "loss": 0.0208, + "step": 1409 + }, + { + "epoch": 1.39, + "grad_norm": 2.564448457114273, + "learning_rate": 2.2258925175321476e-06, + "loss": 0.0358, + "step": 1410 + }, + { + "epoch": 1.39, + "grad_norm": 0.9895281557974522, + "learning_rate": 2.2192386379717214e-06, + "loss": 0.0322, + "step": 1411 + }, + { + "epoch": 1.4, + "grad_norm": 0.5522292721758334, + "learning_rate": 2.2125918807524795e-06, + "loss": 0.0238, + "step": 1412 + }, + { + "epoch": 1.4, + "grad_norm": 1.0731253951864272, + "learning_rate": 2.205952262898704e-06, + "loss": 0.0333, + "step": 1413 + }, + { + "epoch": 1.4, + "grad_norm": 0.48693201879768505, + "learning_rate": 2.199319801416394e-06, + "loss": 0.0185, + "step": 1414 + }, + { + "epoch": 1.4, + "grad_norm": 0.8312161420257296, + "learning_rate": 2.1926945132932188e-06, + "loss": 0.0296, + "step": 1415 + }, + { + "epoch": 1.4, + "grad_norm": 0.6626417816503829, + "learning_rate": 2.1860764154984736e-06, + "loss": 0.0207, + "step": 1416 + }, + { + "epoch": 1.4, + "grad_norm": 0.6049530155376529, + "learning_rate": 2.179465524983036e-06, + "loss": 0.0172, + "step": 1417 + }, + { + "epoch": 1.4, + "grad_norm": 0.8572149751190988, + "learning_rate": 2.1728618586793292e-06, + "loss": 0.0263, + "step": 1418 + }, + { + "epoch": 1.4, + "grad_norm": 0.553519448666031, + "learning_rate": 2.1662654335012684e-06, + "loss": 0.0133, + "step": 1419 + }, + { + "epoch": 1.4, + "grad_norm": 0.7689498757925647, + "learning_rate": 2.159676266344222e-06, + "loss": 0.0232, + "step": 1420 + }, + { + "epoch": 1.4, + "grad_norm": 0.5344206051884095, + "learning_rate": 2.1530943740849696e-06, + "loss": 0.0204, + "step": 1421 + }, + { + "epoch": 1.41, + "grad_norm": 0.7641828859197944, + "learning_rate": 2.1465197735816585e-06, + "loss": 0.0246, + "step": 1422 + }, + { + "epoch": 1.41, + "grad_norm": 0.5707472485620008, + "learning_rate": 2.139952481673756e-06, + "loss": 0.0209, + "step": 1423 + }, + { + "epoch": 1.41, + "grad_norm": 0.5335231672331546, + "learning_rate": 2.1333925151820175e-06, + "loss": 0.0184, + "step": 1424 + }, + { + "epoch": 1.41, + "grad_norm": 0.37509628954758883, + "learning_rate": 2.126839890908428e-06, + "loss": 0.0132, + "step": 1425 + }, + { + "epoch": 1.41, + "grad_norm": 0.4228335969477914, + "learning_rate": 2.120294625636171e-06, + "loss": 0.0198, + "step": 1426 + }, + { + "epoch": 1.41, + "grad_norm": 0.5467505342742434, + "learning_rate": 2.113756736129581e-06, + "loss": 0.0232, + "step": 1427 + }, + { + "epoch": 1.41, + "grad_norm": 0.6241443244997975, + "learning_rate": 2.1072262391340996e-06, + "loss": 0.0226, + "step": 1428 + }, + { + "epoch": 1.41, + "grad_norm": 0.914095950785102, + "learning_rate": 2.100703151376234e-06, + "loss": 0.037, + "step": 1429 + }, + { + "epoch": 1.41, + "grad_norm": 0.5764144696089817, + "learning_rate": 2.0941874895635184e-06, + "loss": 0.0216, + "step": 1430 + }, + { + "epoch": 1.41, + "grad_norm": 0.5566137561193788, + "learning_rate": 2.087679270384461e-06, + "loss": 0.0139, + "step": 1431 + }, + { + "epoch": 1.42, + "grad_norm": 0.9897236771294801, + "learning_rate": 2.081178510508512e-06, + "loss": 0.0279, + "step": 1432 + }, + { + "epoch": 1.42, + "grad_norm": 0.9771936407490407, + "learning_rate": 2.0746852265860123e-06, + "loss": 0.0223, + "step": 1433 + }, + { + "epoch": 1.42, + "grad_norm": 2.291363849454442, + "learning_rate": 2.0681994352481575e-06, + "loss": 0.015, + "step": 1434 + }, + { + "epoch": 1.42, + "grad_norm": 1.3268806800493107, + "learning_rate": 2.0617211531069503e-06, + "loss": 0.0294, + "step": 1435 + }, + { + "epoch": 1.42, + "grad_norm": 0.7891869090943812, + "learning_rate": 2.0552503967551634e-06, + "loss": 0.0247, + "step": 1436 + }, + { + "epoch": 1.42, + "grad_norm": 1.0008723260609602, + "learning_rate": 2.0487871827662913e-06, + "loss": 0.0326, + "step": 1437 + }, + { + "epoch": 1.42, + "grad_norm": 0.43601598237058325, + "learning_rate": 2.0423315276945103e-06, + "loss": 0.02, + "step": 1438 + }, + { + "epoch": 1.42, + "grad_norm": 0.6944295941030163, + "learning_rate": 2.0358834480746363e-06, + "loss": 0.0269, + "step": 1439 + }, + { + "epoch": 1.42, + "grad_norm": 0.6707427363028934, + "learning_rate": 2.0294429604220833e-06, + "loss": 0.0203, + "step": 1440 + }, + { + "epoch": 1.42, + "grad_norm": 0.685925821741964, + "learning_rate": 2.0230100812328167e-06, + "loss": 0.023, + "step": 1441 + }, + { + "epoch": 1.42, + "grad_norm": 0.5147866054111775, + "learning_rate": 2.0165848269833215e-06, + "loss": 0.0203, + "step": 1442 + }, + { + "epoch": 1.43, + "grad_norm": 1.776632642956283, + "learning_rate": 2.010167214130546e-06, + "loss": 0.0246, + "step": 1443 + }, + { + "epoch": 1.43, + "grad_norm": 1.0411426767089536, + "learning_rate": 2.003757259111869e-06, + "loss": 0.0282, + "step": 1444 + }, + { + "epoch": 1.43, + "grad_norm": 0.957824081037812, + "learning_rate": 1.9973549783450563e-06, + "loss": 0.0188, + "step": 1445 + }, + { + "epoch": 1.43, + "grad_norm": 1.134533925904978, + "learning_rate": 1.9909603882282156e-06, + "loss": 0.023, + "step": 1446 + }, + { + "epoch": 1.43, + "grad_norm": 0.7392234352149661, + "learning_rate": 1.984573505139758e-06, + "loss": 0.0185, + "step": 1447 + }, + { + "epoch": 1.43, + "grad_norm": 0.8419016776814038, + "learning_rate": 1.9781943454383583e-06, + "loss": 0.0292, + "step": 1448 + }, + { + "epoch": 1.43, + "grad_norm": 0.5482575841766507, + "learning_rate": 1.9718229254629018e-06, + "loss": 0.0201, + "step": 1449 + }, + { + "epoch": 1.43, + "grad_norm": 0.7895825808166198, + "learning_rate": 1.9654592615324542e-06, + "loss": 0.0261, + "step": 1450 + }, + { + "epoch": 1.43, + "grad_norm": 0.5391985624677404, + "learning_rate": 1.9591033699462165e-06, + "loss": 0.0166, + "step": 1451 + }, + { + "epoch": 1.43, + "grad_norm": 1.0509339173777723, + "learning_rate": 1.9527552669834797e-06, + "loss": 0.0317, + "step": 1452 + }, + { + "epoch": 1.44, + "grad_norm": 0.6975292110442908, + "learning_rate": 1.9464149689035912e-06, + "loss": 0.0259, + "step": 1453 + }, + { + "epoch": 1.44, + "grad_norm": 0.6687890100923896, + "learning_rate": 1.940082491945902e-06, + "loss": 0.0177, + "step": 1454 + }, + { + "epoch": 1.44, + "grad_norm": 1.086040887866882, + "learning_rate": 1.933757852329734e-06, + "loss": 0.0285, + "step": 1455 + }, + { + "epoch": 1.44, + "grad_norm": 0.6219525438540332, + "learning_rate": 1.927441066254334e-06, + "loss": 0.0237, + "step": 1456 + }, + { + "epoch": 1.44, + "grad_norm": 0.7987299511140524, + "learning_rate": 1.921132149898834e-06, + "loss": 0.0152, + "step": 1457 + }, + { + "epoch": 1.44, + "grad_norm": 0.8747595635173462, + "learning_rate": 1.9148311194222084e-06, + "loss": 0.0267, + "step": 1458 + }, + { + "epoch": 1.44, + "grad_norm": 1.1033645254183688, + "learning_rate": 1.908537990963238e-06, + "loss": 0.0252, + "step": 1459 + }, + { + "epoch": 1.44, + "grad_norm": 1.1394276850699414, + "learning_rate": 1.9022527806404583e-06, + "loss": 0.0399, + "step": 1460 + }, + { + "epoch": 1.44, + "grad_norm": 0.443076629359654, + "learning_rate": 1.8959755045521283e-06, + "loss": 0.0169, + "step": 1461 + }, + { + "epoch": 1.44, + "grad_norm": 0.6779496548512952, + "learning_rate": 1.8897061787761823e-06, + "loss": 0.0272, + "step": 1462 + }, + { + "epoch": 1.45, + "grad_norm": 0.9373551720667195, + "learning_rate": 1.883444819370193e-06, + "loss": 0.0384, + "step": 1463 + }, + { + "epoch": 1.45, + "grad_norm": 0.645241163161573, + "learning_rate": 1.8771914423713277e-06, + "loss": 0.0186, + "step": 1464 + }, + { + "epoch": 1.45, + "grad_norm": 0.4385818493699251, + "learning_rate": 1.8709460637963123e-06, + "loss": 0.0137, + "step": 1465 + }, + { + "epoch": 1.45, + "grad_norm": 1.0003881903051037, + "learning_rate": 1.8647086996413816e-06, + "loss": 0.0257, + "step": 1466 + }, + { + "epoch": 1.45, + "grad_norm": 0.5380060560641039, + "learning_rate": 1.8584793658822452e-06, + "loss": 0.0245, + "step": 1467 + }, + { + "epoch": 1.45, + "grad_norm": 0.5642531276125784, + "learning_rate": 1.8522580784740452e-06, + "loss": 0.0202, + "step": 1468 + }, + { + "epoch": 1.45, + "grad_norm": 0.5922481529196789, + "learning_rate": 1.8460448533513126e-06, + "loss": 0.0236, + "step": 1469 + }, + { + "epoch": 1.45, + "grad_norm": 0.3489071504947676, + "learning_rate": 1.8398397064279282e-06, + "loss": 0.0146, + "step": 1470 + }, + { + "epoch": 1.45, + "grad_norm": 0.46384690248928895, + "learning_rate": 1.833642653597088e-06, + "loss": 0.0188, + "step": 1471 + }, + { + "epoch": 1.45, + "grad_norm": 0.9116631997505819, + "learning_rate": 1.8274537107312502e-06, + "loss": 0.0182, + "step": 1472 + }, + { + "epoch": 1.46, + "grad_norm": 0.40347487957056644, + "learning_rate": 1.8212728936821034e-06, + "loss": 0.0099, + "step": 1473 + }, + { + "epoch": 1.46, + "grad_norm": 0.9256491401658001, + "learning_rate": 1.8151002182805226e-06, + "loss": 0.0196, + "step": 1474 + }, + { + "epoch": 1.46, + "grad_norm": 2.378903999596482, + "learning_rate": 1.8089357003365316e-06, + "loss": 0.0243, + "step": 1475 + }, + { + "epoch": 1.46, + "grad_norm": 5.026642187164877, + "learning_rate": 1.802779355639257e-06, + "loss": 0.0085, + "step": 1476 + }, + { + "epoch": 1.46, + "grad_norm": 0.8339934893740829, + "learning_rate": 1.7966311999568974e-06, + "loss": 0.0338, + "step": 1477 + }, + { + "epoch": 1.46, + "grad_norm": 1.1253234493067137, + "learning_rate": 1.7904912490366723e-06, + "loss": 0.0295, + "step": 1478 + }, + { + "epoch": 1.46, + "grad_norm": 1.4327414591473575, + "learning_rate": 1.7843595186047863e-06, + "loss": 0.0271, + "step": 1479 + }, + { + "epoch": 1.46, + "grad_norm": 0.671546053350665, + "learning_rate": 1.7782360243663905e-06, + "loss": 0.0206, + "step": 1480 + }, + { + "epoch": 1.46, + "grad_norm": 0.7710182655596616, + "learning_rate": 1.7721207820055402e-06, + "loss": 0.0232, + "step": 1481 + }, + { + "epoch": 1.46, + "grad_norm": 25.15180406399679, + "learning_rate": 1.7660138071851545e-06, + "loss": 0.0232, + "step": 1482 + }, + { + "epoch": 1.47, + "grad_norm": 0.8164567044306239, + "learning_rate": 1.7599151155469802e-06, + "loss": 0.0243, + "step": 1483 + }, + { + "epoch": 1.47, + "grad_norm": 0.6379600019853553, + "learning_rate": 1.753824722711544e-06, + "loss": 0.02, + "step": 1484 + }, + { + "epoch": 1.47, + "grad_norm": 0.8499432372174188, + "learning_rate": 1.7477426442781198e-06, + "loss": 0.0261, + "step": 1485 + }, + { + "epoch": 1.47, + "grad_norm": 0.6261917549682076, + "learning_rate": 1.7416688958246847e-06, + "loss": 0.0193, + "step": 1486 + }, + { + "epoch": 1.47, + "grad_norm": 0.5082059473411364, + "learning_rate": 1.7356034929078803e-06, + "loss": 0.0111, + "step": 1487 + }, + { + "epoch": 1.47, + "grad_norm": 0.4353187166263084, + "learning_rate": 1.7295464510629718e-06, + "loss": 0.0193, + "step": 1488 + }, + { + "epoch": 1.47, + "grad_norm": 1.5191009047621435, + "learning_rate": 1.7234977858038137e-06, + "loss": 0.0315, + "step": 1489 + }, + { + "epoch": 1.47, + "grad_norm": 0.6488737261899189, + "learning_rate": 1.7174575126228016e-06, + "loss": 0.0227, + "step": 1490 + }, + { + "epoch": 1.47, + "grad_norm": 0.9264160824660079, + "learning_rate": 1.711425646990838e-06, + "loss": 0.0254, + "step": 1491 + }, + { + "epoch": 1.47, + "grad_norm": 0.4176930071798314, + "learning_rate": 1.7054022043572882e-06, + "loss": 0.0153, + "step": 1492 + }, + { + "epoch": 1.48, + "grad_norm": 0.3590710184553117, + "learning_rate": 1.6993872001499461e-06, + "loss": 0.0193, + "step": 1493 + }, + { + "epoch": 1.48, + "grad_norm": 0.9707860340678295, + "learning_rate": 1.6933806497749955e-06, + "loss": 0.023, + "step": 1494 + }, + { + "epoch": 1.48, + "grad_norm": 0.4370541782227388, + "learning_rate": 1.6873825686169625e-06, + "loss": 0.0171, + "step": 1495 + }, + { + "epoch": 1.48, + "grad_norm": 0.6244044034847467, + "learning_rate": 1.681392972038684e-06, + "loss": 0.0192, + "step": 1496 + }, + { + "epoch": 1.48, + "grad_norm": 0.8926815180721247, + "learning_rate": 1.6754118753812632e-06, + "loss": 0.0242, + "step": 1497 + }, + { + "epoch": 1.48, + "grad_norm": 0.7593507350335365, + "learning_rate": 1.669439293964033e-06, + "loss": 0.0262, + "step": 1498 + }, + { + "epoch": 1.48, + "grad_norm": 0.5578039243484335, + "learning_rate": 1.6634752430845196e-06, + "loss": 0.0297, + "step": 1499 + }, + { + "epoch": 1.48, + "grad_norm": 0.584385071528929, + "learning_rate": 1.6575197380183965e-06, + "loss": 0.0143, + "step": 1500 + }, + { + "epoch": 1.48, + "grad_norm": 0.56402011021067, + "learning_rate": 1.6515727940194493e-06, + "loss": 0.0195, + "step": 1501 + }, + { + "epoch": 1.48, + "grad_norm": 0.44268730746617535, + "learning_rate": 1.6456344263195374e-06, + "loss": 0.013, + "step": 1502 + }, + { + "epoch": 1.49, + "grad_norm": 1.4961967681210577, + "learning_rate": 1.639704650128552e-06, + "loss": 0.0193, + "step": 1503 + }, + { + "epoch": 1.49, + "grad_norm": 0.970299813916526, + "learning_rate": 1.6337834806343783e-06, + "loss": 0.019, + "step": 1504 + }, + { + "epoch": 1.49, + "grad_norm": 0.6257331643439866, + "learning_rate": 1.6278709330028636e-06, + "loss": 0.0206, + "step": 1505 + }, + { + "epoch": 1.49, + "grad_norm": 1.9686357872123381, + "learning_rate": 1.621967022377765e-06, + "loss": 0.0284, + "step": 1506 + }, + { + "epoch": 1.49, + "grad_norm": 2.274058130322033, + "learning_rate": 1.6160717638807205e-06, + "loss": 0.0226, + "step": 1507 + }, + { + "epoch": 1.49, + "grad_norm": 0.7747602086226845, + "learning_rate": 1.6101851726112067e-06, + "loss": 0.0116, + "step": 1508 + }, + { + "epoch": 1.49, + "grad_norm": 0.8105945738690109, + "learning_rate": 1.6043072636465017e-06, + "loss": 0.0217, + "step": 1509 + }, + { + "epoch": 1.49, + "grad_norm": 2.6793023565390253, + "learning_rate": 1.598438052041643e-06, + "loss": 0.0285, + "step": 1510 + }, + { + "epoch": 1.49, + "grad_norm": 0.5613810363225675, + "learning_rate": 1.5925775528293985e-06, + "loss": 0.025, + "step": 1511 + }, + { + "epoch": 1.49, + "grad_norm": 0.688507515991181, + "learning_rate": 1.586725781020214e-06, + "loss": 0.0186, + "step": 1512 + }, + { + "epoch": 1.5, + "grad_norm": 1.3351929076892877, + "learning_rate": 1.5808827516021851e-06, + "loss": 0.0199, + "step": 1513 + }, + { + "epoch": 1.5, + "grad_norm": 2.1889487189572696, + "learning_rate": 1.5750484795410143e-06, + "loss": 0.0307, + "step": 1514 + }, + { + "epoch": 1.5, + "grad_norm": 0.48865042364173655, + "learning_rate": 1.5692229797799752e-06, + "loss": 0.0215, + "step": 1515 + }, + { + "epoch": 1.5, + "grad_norm": 0.8347701023901559, + "learning_rate": 1.5634062672398697e-06, + "loss": 0.0225, + "step": 1516 + }, + { + "epoch": 1.5, + "grad_norm": 0.800077144695363, + "learning_rate": 1.557598356819e-06, + "loss": 0.0303, + "step": 1517 + }, + { + "epoch": 1.5, + "grad_norm": 0.3919408000387406, + "learning_rate": 1.5517992633931178e-06, + "loss": 0.014, + "step": 1518 + }, + { + "epoch": 1.5, + "grad_norm": 0.6177121499869433, + "learning_rate": 1.5460090018153923e-06, + "loss": 0.0227, + "step": 1519 + }, + { + "epoch": 1.5, + "grad_norm": 0.6349338552380158, + "learning_rate": 1.5402275869163736e-06, + "loss": 0.0283, + "step": 1520 + }, + { + "epoch": 1.5, + "grad_norm": 0.5126033869999117, + "learning_rate": 1.5344550335039521e-06, + "loss": 0.0155, + "step": 1521 + }, + { + "epoch": 1.5, + "grad_norm": 0.856223292939775, + "learning_rate": 1.5286913563633194e-06, + "loss": 0.017, + "step": 1522 + }, + { + "epoch": 1.5, + "grad_norm": 1.0083250205891625, + "learning_rate": 1.522936570256937e-06, + "loss": 0.0217, + "step": 1523 + }, + { + "epoch": 1.51, + "grad_norm": 0.4994108438051356, + "learning_rate": 1.517190689924491e-06, + "loss": 0.0184, + "step": 1524 + }, + { + "epoch": 1.51, + "grad_norm": 0.37321497836596046, + "learning_rate": 1.5114537300828558e-06, + "loss": 0.0173, + "step": 1525 + }, + { + "epoch": 1.51, + "grad_norm": 0.7701752941065383, + "learning_rate": 1.505725705426061e-06, + "loss": 0.0244, + "step": 1526 + }, + { + "epoch": 1.51, + "grad_norm": 1.1281368029828174, + "learning_rate": 1.5000066306252475e-06, + "loss": 0.0166, + "step": 1527 + }, + { + "epoch": 1.51, + "grad_norm": 0.7338424153324811, + "learning_rate": 1.4942965203286337e-06, + "loss": 0.0177, + "step": 1528 + }, + { + "epoch": 1.51, + "grad_norm": 0.41491260429600013, + "learning_rate": 1.4885953891614813e-06, + "loss": 0.0153, + "step": 1529 + }, + { + "epoch": 1.51, + "grad_norm": 0.7844248268278589, + "learning_rate": 1.482903251726049e-06, + "loss": 0.0305, + "step": 1530 + }, + { + "epoch": 1.51, + "grad_norm": 0.8885898820999001, + "learning_rate": 1.4772201226015615e-06, + "loss": 0.0309, + "step": 1531 + }, + { + "epoch": 1.51, + "grad_norm": 2.0705376041540378, + "learning_rate": 1.4715460163441703e-06, + "loss": 0.0492, + "step": 1532 + }, + { + "epoch": 1.51, + "grad_norm": 0.60630026470556, + "learning_rate": 1.4658809474869174e-06, + "loss": 0.0163, + "step": 1533 + }, + { + "epoch": 1.52, + "grad_norm": 0.7257005504788903, + "learning_rate": 1.4602249305396966e-06, + "loss": 0.0146, + "step": 1534 + }, + { + "epoch": 1.52, + "grad_norm": 1.2429270738071347, + "learning_rate": 1.4545779799892179e-06, + "loss": 0.0298, + "step": 1535 + }, + { + "epoch": 1.52, + "grad_norm": 0.5803068356342167, + "learning_rate": 1.4489401102989698e-06, + "loss": 0.0143, + "step": 1536 + }, + { + "epoch": 1.52, + "grad_norm": 0.9334893520475549, + "learning_rate": 1.4433113359091805e-06, + "loss": 0.029, + "step": 1537 + }, + { + "epoch": 1.52, + "grad_norm": 0.8061341751112961, + "learning_rate": 1.4376916712367838e-06, + "loss": 0.0186, + "step": 1538 + }, + { + "epoch": 1.52, + "grad_norm": 0.9039562950084497, + "learning_rate": 1.4320811306753796e-06, + "loss": 0.0212, + "step": 1539 + }, + { + "epoch": 1.52, + "grad_norm": 0.4714931876115984, + "learning_rate": 1.426479728595202e-06, + "loss": 0.02, + "step": 1540 + }, + { + "epoch": 1.52, + "grad_norm": 0.7906593826204635, + "learning_rate": 1.4208874793430738e-06, + "loss": 0.0221, + "step": 1541 + }, + { + "epoch": 1.52, + "grad_norm": 0.7280596395913382, + "learning_rate": 1.415304397242378e-06, + "loss": 0.0207, + "step": 1542 + }, + { + "epoch": 1.52, + "grad_norm": 0.8117288716060429, + "learning_rate": 1.409730496593016e-06, + "loss": 0.0312, + "step": 1543 + }, + { + "epoch": 1.53, + "grad_norm": 0.7925630934679432, + "learning_rate": 1.4041657916713741e-06, + "loss": 0.0181, + "step": 1544 + }, + { + "epoch": 1.53, + "grad_norm": 1.3841331652094588, + "learning_rate": 1.398610296730284e-06, + "loss": 0.0262, + "step": 1545 + }, + { + "epoch": 1.53, + "grad_norm": 0.620345714196815, + "learning_rate": 1.3930640259989914e-06, + "loss": 0.0215, + "step": 1546 + }, + { + "epoch": 1.53, + "grad_norm": 1.72655687412436, + "learning_rate": 1.3875269936831133e-06, + "loss": 0.0246, + "step": 1547 + }, + { + "epoch": 1.53, + "grad_norm": 1.6574460872584007, + "learning_rate": 1.3819992139646037e-06, + "loss": 0.0222, + "step": 1548 + }, + { + "epoch": 1.53, + "grad_norm": 0.7069930906531742, + "learning_rate": 1.37648070100172e-06, + "loss": 0.0266, + "step": 1549 + }, + { + "epoch": 1.53, + "grad_norm": 0.6845802946247612, + "learning_rate": 1.3709714689289844e-06, + "loss": 0.0196, + "step": 1550 + }, + { + "epoch": 1.53, + "grad_norm": 0.6832692791063089, + "learning_rate": 1.3654715318571455e-06, + "loss": 0.0161, + "step": 1551 + }, + { + "epoch": 1.53, + "grad_norm": 0.8309536486895222, + "learning_rate": 1.3599809038731503e-06, + "loss": 0.026, + "step": 1552 + }, + { + "epoch": 1.53, + "grad_norm": 1.861610800620917, + "learning_rate": 1.354499599040097e-06, + "loss": 0.0218, + "step": 1553 + }, + { + "epoch": 1.54, + "grad_norm": 0.793696990792352, + "learning_rate": 1.3490276313972073e-06, + "loss": 0.0232, + "step": 1554 + }, + { + "epoch": 1.54, + "grad_norm": 0.6407327606080412, + "learning_rate": 1.3435650149597862e-06, + "loss": 0.0265, + "step": 1555 + }, + { + "epoch": 1.54, + "grad_norm": 0.8759165356024202, + "learning_rate": 1.3381117637191887e-06, + "loss": 0.0225, + "step": 1556 + }, + { + "epoch": 1.54, + "grad_norm": 1.0577346876127711, + "learning_rate": 1.3326678916427803e-06, + "loss": 0.0298, + "step": 1557 + }, + { + "epoch": 1.54, + "grad_norm": 0.4542805258478709, + "learning_rate": 1.3272334126739094e-06, + "loss": 0.0174, + "step": 1558 + }, + { + "epoch": 1.54, + "grad_norm": 0.5182354308636529, + "learning_rate": 1.3218083407318606e-06, + "loss": 0.0179, + "step": 1559 + }, + { + "epoch": 1.54, + "grad_norm": 1.3954805565498662, + "learning_rate": 1.3163926897118252e-06, + "loss": 0.0192, + "step": 1560 + }, + { + "epoch": 1.54, + "grad_norm": 0.811580137556446, + "learning_rate": 1.310986473484866e-06, + "loss": 0.0182, + "step": 1561 + }, + { + "epoch": 1.54, + "grad_norm": 1.1921238444026916, + "learning_rate": 1.3055897058978806e-06, + "loss": 0.0358, + "step": 1562 + }, + { + "epoch": 1.54, + "grad_norm": 2.0037836419531394, + "learning_rate": 1.300202400773563e-06, + "loss": 0.0114, + "step": 1563 + }, + { + "epoch": 1.55, + "grad_norm": 0.9513953738579717, + "learning_rate": 1.2948245719103774e-06, + "loss": 0.0268, + "step": 1564 + }, + { + "epoch": 1.55, + "grad_norm": 1.2881756130443442, + "learning_rate": 1.2894562330825106e-06, + "loss": 0.022, + "step": 1565 + }, + { + "epoch": 1.55, + "grad_norm": 0.9456261923512207, + "learning_rate": 1.2840973980398446e-06, + "loss": 0.0251, + "step": 1566 + }, + { + "epoch": 1.55, + "grad_norm": 0.7087238217511729, + "learning_rate": 1.27874808050792e-06, + "loss": 0.0205, + "step": 1567 + }, + { + "epoch": 1.55, + "grad_norm": 1.8087893817399272, + "learning_rate": 1.273408294187899e-06, + "loss": 0.0249, + "step": 1568 + }, + { + "epoch": 1.55, + "grad_norm": 1.2547387247611936, + "learning_rate": 1.2680780527565313e-06, + "loss": 0.024, + "step": 1569 + }, + { + "epoch": 1.55, + "grad_norm": 1.0368957302101323, + "learning_rate": 1.2627573698661228e-06, + "loss": 0.0374, + "step": 1570 + }, + { + "epoch": 1.55, + "grad_norm": 1.5301292447138974, + "learning_rate": 1.257446259144494e-06, + "loss": 0.0243, + "step": 1571 + }, + { + "epoch": 1.55, + "grad_norm": 0.46242128257381343, + "learning_rate": 1.2521447341949494e-06, + "loss": 0.0174, + "step": 1572 + }, + { + "epoch": 1.55, + "grad_norm": 1.0290998698478622, + "learning_rate": 1.24685280859624e-06, + "loss": 0.0329, + "step": 1573 + }, + { + "epoch": 1.56, + "grad_norm": 0.7094916079841542, + "learning_rate": 1.2415704959025321e-06, + "loss": 0.0197, + "step": 1574 + }, + { + "epoch": 1.56, + "grad_norm": 1.0464668946048912, + "learning_rate": 1.236297809643368e-06, + "loss": 0.0259, + "step": 1575 + }, + { + "epoch": 1.56, + "grad_norm": 2.2412781545389158, + "learning_rate": 1.2310347633236402e-06, + "loss": 0.0296, + "step": 1576 + }, + { + "epoch": 1.56, + "grad_norm": 1.3720843586226656, + "learning_rate": 1.225781370423541e-06, + "loss": 0.0398, + "step": 1577 + }, + { + "epoch": 1.56, + "grad_norm": 1.011296703595368, + "learning_rate": 1.2205376443985444e-06, + "loss": 0.0193, + "step": 1578 + }, + { + "epoch": 1.56, + "grad_norm": 1.5876467993872176, + "learning_rate": 1.2153035986793627e-06, + "loss": 0.0331, + "step": 1579 + }, + { + "epoch": 1.56, + "grad_norm": 0.5603223195065772, + "learning_rate": 1.2100792466719118e-06, + "loss": 0.0208, + "step": 1580 + }, + { + "epoch": 1.56, + "grad_norm": 0.6448719403345857, + "learning_rate": 1.2048646017572857e-06, + "loss": 0.0222, + "step": 1581 + }, + { + "epoch": 1.56, + "grad_norm": 0.28342050570494437, + "learning_rate": 1.1996596772917091e-06, + "loss": 0.0092, + "step": 1582 + }, + { + "epoch": 1.56, + "grad_norm": 0.39879018942234634, + "learning_rate": 1.1944644866065125e-06, + "loss": 0.0176, + "step": 1583 + }, + { + "epoch": 1.57, + "grad_norm": 0.42845470718418127, + "learning_rate": 1.1892790430080952e-06, + "loss": 0.0163, + "step": 1584 + }, + { + "epoch": 1.57, + "grad_norm": 0.504022504721399, + "learning_rate": 1.1841033597778905e-06, + "loss": 0.0155, + "step": 1585 + }, + { + "epoch": 1.57, + "grad_norm": 0.8398692539657486, + "learning_rate": 1.1789374501723328e-06, + "loss": 0.0227, + "step": 1586 + }, + { + "epoch": 1.57, + "grad_norm": 1.0945442042275597, + "learning_rate": 1.173781327422826e-06, + "loss": 0.0203, + "step": 1587 + }, + { + "epoch": 1.57, + "grad_norm": 0.7836709704029713, + "learning_rate": 1.1686350047357036e-06, + "loss": 0.0323, + "step": 1588 + }, + { + "epoch": 1.57, + "grad_norm": 0.6375367175543545, + "learning_rate": 1.1634984952921996e-06, + "loss": 0.0196, + "step": 1589 + }, + { + "epoch": 1.57, + "grad_norm": 0.7287547471933316, + "learning_rate": 1.1583718122484128e-06, + "loss": 0.0208, + "step": 1590 + }, + { + "epoch": 1.57, + "grad_norm": 0.7912568103894504, + "learning_rate": 1.1532549687352752e-06, + "loss": 0.021, + "step": 1591 + }, + { + "epoch": 1.57, + "grad_norm": 0.5528882367856399, + "learning_rate": 1.1481479778585142e-06, + "loss": 0.0193, + "step": 1592 + }, + { + "epoch": 1.57, + "grad_norm": 0.8246650683849523, + "learning_rate": 1.1430508526986261e-06, + "loss": 0.0216, + "step": 1593 + }, + { + "epoch": 1.58, + "grad_norm": 1.3077247280046171, + "learning_rate": 1.137963606310834e-06, + "loss": 0.0232, + "step": 1594 + }, + { + "epoch": 1.58, + "grad_norm": 0.4102947057515644, + "learning_rate": 1.132886251725061e-06, + "loss": 0.0227, + "step": 1595 + }, + { + "epoch": 1.58, + "grad_norm": 1.6409286483577383, + "learning_rate": 1.127818801945893e-06, + "loss": 0.0209, + "step": 1596 + }, + { + "epoch": 1.58, + "grad_norm": 0.6933263716604913, + "learning_rate": 1.1227612699525475e-06, + "loss": 0.0255, + "step": 1597 + }, + { + "epoch": 1.58, + "grad_norm": 0.6940363977524191, + "learning_rate": 1.1177136686988383e-06, + "loss": 0.0205, + "step": 1598 + }, + { + "epoch": 1.58, + "grad_norm": 0.7791018362615008, + "learning_rate": 1.1126760111131474e-06, + "loss": 0.0186, + "step": 1599 + }, + { + "epoch": 1.58, + "grad_norm": 1.473217425311028, + "learning_rate": 1.1076483100983843e-06, + "loss": 0.0193, + "step": 1600 + }, + { + "epoch": 1.58, + "grad_norm": 0.859716563176724, + "learning_rate": 1.1026305785319585e-06, + "loss": 0.0276, + "step": 1601 + }, + { + "epoch": 1.58, + "grad_norm": 0.547744977616838, + "learning_rate": 1.0976228292657447e-06, + "loss": 0.0231, + "step": 1602 + }, + { + "epoch": 1.58, + "grad_norm": 0.7192227614672417, + "learning_rate": 1.0926250751260492e-06, + "loss": 0.0217, + "step": 1603 + }, + { + "epoch": 1.58, + "grad_norm": 0.4437076760782891, + "learning_rate": 1.0876373289135778e-06, + "loss": 0.0214, + "step": 1604 + }, + { + "epoch": 1.59, + "grad_norm": 0.6198481679854476, + "learning_rate": 1.0826596034034066e-06, + "loss": 0.0274, + "step": 1605 + }, + { + "epoch": 1.59, + "grad_norm": 0.6905795095781041, + "learning_rate": 1.0776919113449407e-06, + "loss": 0.0253, + "step": 1606 + }, + { + "epoch": 1.59, + "grad_norm": 0.9346647468986687, + "learning_rate": 1.0727342654618905e-06, + "loss": 0.0297, + "step": 1607 + }, + { + "epoch": 1.59, + "grad_norm": 0.6448428595540737, + "learning_rate": 1.0677866784522317e-06, + "loss": 0.0211, + "step": 1608 + }, + { + "epoch": 1.59, + "grad_norm": 0.7632216839916288, + "learning_rate": 1.0628491629881794e-06, + "loss": 0.0285, + "step": 1609 + }, + { + "epoch": 1.59, + "grad_norm": 1.043944368613153, + "learning_rate": 1.0579217317161494e-06, + "loss": 0.0236, + "step": 1610 + }, + { + "epoch": 1.59, + "grad_norm": 0.6420776815388155, + "learning_rate": 1.0530043972567339e-06, + "loss": 0.0286, + "step": 1611 + }, + { + "epoch": 1.59, + "grad_norm": 0.36874244760640196, + "learning_rate": 1.0480971722046602e-06, + "loss": 0.0119, + "step": 1612 + }, + { + "epoch": 1.59, + "grad_norm": 1.698553180562803, + "learning_rate": 1.0432000691287619e-06, + "loss": 0.0232, + "step": 1613 + }, + { + "epoch": 1.59, + "grad_norm": 0.4035736417059915, + "learning_rate": 1.0383131005719505e-06, + "loss": 0.0126, + "step": 1614 + }, + { + "epoch": 1.6, + "grad_norm": 0.516220342713733, + "learning_rate": 1.0334362790511776e-06, + "loss": 0.0182, + "step": 1615 + }, + { + "epoch": 1.6, + "grad_norm": 0.39365931838262347, + "learning_rate": 1.0285696170574045e-06, + "loss": 0.0126, + "step": 1616 + }, + { + "epoch": 1.6, + "grad_norm": 0.5480992689342562, + "learning_rate": 1.023713127055575e-06, + "loss": 0.0179, + "step": 1617 + }, + { + "epoch": 1.6, + "grad_norm": 0.5196346398987512, + "learning_rate": 1.0188668214845765e-06, + "loss": 0.0155, + "step": 1618 + }, + { + "epoch": 1.6, + "grad_norm": 0.4489352579236505, + "learning_rate": 1.0140307127572125e-06, + "loss": 0.0184, + "step": 1619 + }, + { + "epoch": 1.6, + "grad_norm": 0.690103014170062, + "learning_rate": 1.009204813260164e-06, + "loss": 0.0202, + "step": 1620 + }, + { + "epoch": 1.6, + "grad_norm": 1.3570709210532113, + "learning_rate": 1.004389135353972e-06, + "loss": 0.0365, + "step": 1621 + }, + { + "epoch": 1.6, + "grad_norm": 1.0257681719369882, + "learning_rate": 9.995836913729918e-07, + "loss": 0.0237, + "step": 1622 + }, + { + "epoch": 1.6, + "grad_norm": 0.6168406563119275, + "learning_rate": 9.947884936253666e-07, + "loss": 0.0196, + "step": 1623 + }, + { + "epoch": 1.6, + "grad_norm": 0.7692543213723014, + "learning_rate": 9.90003554392997e-07, + "loss": 0.0232, + "step": 1624 + }, + { + "epoch": 1.61, + "grad_norm": 1.2328193733615247, + "learning_rate": 9.852288859315096e-07, + "loss": 0.0431, + "step": 1625 + }, + { + "epoch": 1.61, + "grad_norm": 1.4613027425912049, + "learning_rate": 9.804645004702208e-07, + "loss": 0.0248, + "step": 1626 + }, + { + "epoch": 1.61, + "grad_norm": 0.7094673929023848, + "learning_rate": 9.757104102121152e-07, + "loss": 0.0189, + "step": 1627 + }, + { + "epoch": 1.61, + "grad_norm": 0.7652661275955153, + "learning_rate": 9.709666273338037e-07, + "loss": 0.0227, + "step": 1628 + }, + { + "epoch": 1.61, + "grad_norm": 0.6515555057917235, + "learning_rate": 9.662331639854977e-07, + "loss": 0.0214, + "step": 1629 + }, + { + "epoch": 1.61, + "grad_norm": 0.4895831342399959, + "learning_rate": 9.615100322909787e-07, + "loss": 0.018, + "step": 1630 + }, + { + "epoch": 1.61, + "grad_norm": 0.609954132677896, + "learning_rate": 9.567972443475648e-07, + "loss": 0.0203, + "step": 1631 + }, + { + "epoch": 1.61, + "grad_norm": 0.5067204919211488, + "learning_rate": 9.520948122260792e-07, + "loss": 0.0214, + "step": 1632 + }, + { + "epoch": 1.61, + "grad_norm": 1.2597206502873675, + "learning_rate": 9.474027479708254e-07, + "loss": 0.0206, + "step": 1633 + }, + { + "epoch": 1.61, + "grad_norm": 0.3841045305323135, + "learning_rate": 9.427210635995482e-07, + "loss": 0.0152, + "step": 1634 + }, + { + "epoch": 1.62, + "grad_norm": 0.5821817358914164, + "learning_rate": 9.380497711034065e-07, + "loss": 0.0203, + "step": 1635 + }, + { + "epoch": 1.62, + "grad_norm": 0.6100463174092755, + "learning_rate": 9.33388882446944e-07, + "loss": 0.0184, + "step": 1636 + }, + { + "epoch": 1.62, + "grad_norm": 0.9490671043170982, + "learning_rate": 9.287384095680558e-07, + "loss": 0.0257, + "step": 1637 + }, + { + "epoch": 1.62, + "grad_norm": 0.41866063189682606, + "learning_rate": 9.240983643779589e-07, + "loss": 0.0099, + "step": 1638 + }, + { + "epoch": 1.62, + "grad_norm": 0.916192123980324, + "learning_rate": 9.19468758761165e-07, + "loss": 0.0196, + "step": 1639 + }, + { + "epoch": 1.62, + "grad_norm": 0.6291804775621946, + "learning_rate": 9.148496045754441e-07, + "loss": 0.0219, + "step": 1640 + }, + { + "epoch": 1.62, + "grad_norm": 0.46506938594026603, + "learning_rate": 9.10240913651797e-07, + "loss": 0.0209, + "step": 1641 + }, + { + "epoch": 1.62, + "grad_norm": 0.3763574887343969, + "learning_rate": 9.056426977944272e-07, + "loss": 0.0175, + "step": 1642 + }, + { + "epoch": 1.62, + "grad_norm": 0.6263963200007582, + "learning_rate": 9.010549687807058e-07, + "loss": 0.026, + "step": 1643 + }, + { + "epoch": 1.62, + "grad_norm": 0.8782893052468342, + "learning_rate": 8.964777383611445e-07, + "loss": 0.0147, + "step": 1644 + }, + { + "epoch": 1.63, + "grad_norm": 0.8499726766140078, + "learning_rate": 8.919110182593682e-07, + "loss": 0.0307, + "step": 1645 + }, + { + "epoch": 1.63, + "grad_norm": 0.870144300950776, + "learning_rate": 8.873548201720788e-07, + "loss": 0.0259, + "step": 1646 + }, + { + "epoch": 1.63, + "grad_norm": 0.7272528604695497, + "learning_rate": 8.828091557690288e-07, + "loss": 0.0296, + "step": 1647 + }, + { + "epoch": 1.63, + "grad_norm": 0.676027817512814, + "learning_rate": 8.78274036692991e-07, + "loss": 0.0284, + "step": 1648 + }, + { + "epoch": 1.63, + "grad_norm": 0.856038483122214, + "learning_rate": 8.73749474559728e-07, + "loss": 0.0297, + "step": 1649 + }, + { + "epoch": 1.63, + "grad_norm": 2.399332352098762, + "learning_rate": 8.692354809579634e-07, + "loss": 0.0443, + "step": 1650 + }, + { + "epoch": 1.63, + "grad_norm": 0.8513838144339223, + "learning_rate": 8.647320674493531e-07, + "loss": 0.0233, + "step": 1651 + }, + { + "epoch": 1.63, + "grad_norm": 2.2022612364336753, + "learning_rate": 8.602392455684522e-07, + "loss": 0.0288, + "step": 1652 + }, + { + "epoch": 1.63, + "grad_norm": 0.6318734527708079, + "learning_rate": 8.557570268226889e-07, + "loss": 0.0227, + "step": 1653 + }, + { + "epoch": 1.63, + "grad_norm": 0.4986624707680987, + "learning_rate": 8.512854226923328e-07, + "loss": 0.012, + "step": 1654 + }, + { + "epoch": 1.64, + "grad_norm": 0.9490939870975297, + "learning_rate": 8.468244446304664e-07, + "loss": 0.0232, + "step": 1655 + }, + { + "epoch": 1.64, + "grad_norm": 0.5687247694401628, + "learning_rate": 8.423741040629557e-07, + "loss": 0.0171, + "step": 1656 + }, + { + "epoch": 1.64, + "grad_norm": 0.9342752487631834, + "learning_rate": 8.379344123884231e-07, + "loss": 0.0256, + "step": 1657 + }, + { + "epoch": 1.64, + "grad_norm": 1.4259907942214256, + "learning_rate": 8.335053809782129e-07, + "loss": 0.0208, + "step": 1658 + }, + { + "epoch": 1.64, + "grad_norm": 0.9361781880015428, + "learning_rate": 8.29087021176368e-07, + "loss": 0.0143, + "step": 1659 + }, + { + "epoch": 1.64, + "grad_norm": 0.8441919046857431, + "learning_rate": 8.246793442995954e-07, + "loss": 0.0167, + "step": 1660 + }, + { + "epoch": 1.64, + "grad_norm": 0.6969608754714764, + "learning_rate": 8.20282361637243e-07, + "loss": 0.0171, + "step": 1661 + }, + { + "epoch": 1.64, + "grad_norm": 0.8202955595934445, + "learning_rate": 8.158960844512654e-07, + "loss": 0.0195, + "step": 1662 + }, + { + "epoch": 1.64, + "grad_norm": 1.3779435824059567, + "learning_rate": 8.115205239761986e-07, + "loss": 0.029, + "step": 1663 + }, + { + "epoch": 1.64, + "grad_norm": 2.1595061577911956, + "learning_rate": 8.071556914191298e-07, + "loss": 0.0151, + "step": 1664 + }, + { + "epoch": 1.65, + "grad_norm": 0.3886150974844245, + "learning_rate": 8.028015979596681e-07, + "loss": 0.0128, + "step": 1665 + }, + { + "epoch": 1.65, + "grad_norm": 0.715937707945813, + "learning_rate": 7.984582547499176e-07, + "loss": 0.0219, + "step": 1666 + }, + { + "epoch": 1.65, + "grad_norm": 0.8299951514667157, + "learning_rate": 7.941256729144464e-07, + "loss": 0.0256, + "step": 1667 + }, + { + "epoch": 1.65, + "grad_norm": 1.101475048961094, + "learning_rate": 7.898038635502631e-07, + "loss": 0.0303, + "step": 1668 + }, + { + "epoch": 1.65, + "grad_norm": 0.6129852664782461, + "learning_rate": 7.854928377267812e-07, + "loss": 0.023, + "step": 1669 + }, + { + "epoch": 1.65, + "grad_norm": 1.654917809207751, + "learning_rate": 7.81192606485795e-07, + "loss": 0.0233, + "step": 1670 + }, + { + "epoch": 1.65, + "grad_norm": 0.7485343464875281, + "learning_rate": 7.769031808414523e-07, + "loss": 0.0197, + "step": 1671 + }, + { + "epoch": 1.65, + "grad_norm": 1.4487323641792147, + "learning_rate": 7.726245717802233e-07, + "loss": 0.0375, + "step": 1672 + }, + { + "epoch": 1.65, + "grad_norm": 0.3217437634939568, + "learning_rate": 7.68356790260873e-07, + "loss": 0.0122, + "step": 1673 + }, + { + "epoch": 1.65, + "grad_norm": 0.4868540639820885, + "learning_rate": 7.640998472144373e-07, + "loss": 0.0196, + "step": 1674 + }, + { + "epoch": 1.66, + "grad_norm": 0.8908143071660273, + "learning_rate": 7.59853753544188e-07, + "loss": 0.0222, + "step": 1675 + }, + { + "epoch": 1.66, + "grad_norm": 0.9843729139133108, + "learning_rate": 7.556185201256105e-07, + "loss": 0.0158, + "step": 1676 + }, + { + "epoch": 1.66, + "grad_norm": 0.8812079648965719, + "learning_rate": 7.513941578063732e-07, + "loss": 0.023, + "step": 1677 + }, + { + "epoch": 1.66, + "grad_norm": 2.041052471945282, + "learning_rate": 7.471806774063e-07, + "loss": 0.0214, + "step": 1678 + }, + { + "epoch": 1.66, + "grad_norm": 0.36991466567386605, + "learning_rate": 7.429780897173427e-07, + "loss": 0.015, + "step": 1679 + }, + { + "epoch": 1.66, + "grad_norm": 0.3146801134651237, + "learning_rate": 7.387864055035571e-07, + "loss": 0.013, + "step": 1680 + }, + { + "epoch": 1.66, + "grad_norm": 0.5791780869947971, + "learning_rate": 7.346056355010683e-07, + "loss": 0.0205, + "step": 1681 + }, + { + "epoch": 1.66, + "grad_norm": 0.4843910836561715, + "learning_rate": 7.304357904180475e-07, + "loss": 0.0183, + "step": 1682 + }, + { + "epoch": 1.66, + "grad_norm": 0.9352327802951779, + "learning_rate": 7.262768809346849e-07, + "loss": 0.0207, + "step": 1683 + }, + { + "epoch": 1.66, + "grad_norm": 0.6355901282179665, + "learning_rate": 7.221289177031609e-07, + "loss": 0.0215, + "step": 1684 + }, + { + "epoch": 1.67, + "grad_norm": 2.827734270635811, + "learning_rate": 7.179919113476192e-07, + "loss": 0.0574, + "step": 1685 + }, + { + "epoch": 1.67, + "grad_norm": 0.26441919617238413, + "learning_rate": 7.138658724641417e-07, + "loss": 0.0091, + "step": 1686 + }, + { + "epoch": 1.67, + "grad_norm": 0.9709198286425481, + "learning_rate": 7.097508116207169e-07, + "loss": 0.0261, + "step": 1687 + }, + { + "epoch": 1.67, + "grad_norm": 0.3166787822548412, + "learning_rate": 7.056467393572158e-07, + "loss": 0.0178, + "step": 1688 + }, + { + "epoch": 1.67, + "grad_norm": 0.721525302234989, + "learning_rate": 7.015536661853656e-07, + "loss": 0.0241, + "step": 1689 + }, + { + "epoch": 1.67, + "grad_norm": 0.5768351297093026, + "learning_rate": 6.974716025887207e-07, + "loss": 0.0194, + "step": 1690 + }, + { + "epoch": 1.67, + "grad_norm": 0.3058055162169507, + "learning_rate": 6.934005590226361e-07, + "loss": 0.0137, + "step": 1691 + }, + { + "epoch": 1.67, + "grad_norm": 0.6765565581741332, + "learning_rate": 6.893405459142439e-07, + "loss": 0.02, + "step": 1692 + }, + { + "epoch": 1.67, + "grad_norm": 0.46982186330771497, + "learning_rate": 6.85291573662421e-07, + "loss": 0.0157, + "step": 1693 + }, + { + "epoch": 1.67, + "grad_norm": 0.8793025805347607, + "learning_rate": 6.812536526377673e-07, + "loss": 0.0214, + "step": 1694 + }, + { + "epoch": 1.67, + "grad_norm": 0.6662828745104178, + "learning_rate": 6.772267931825765e-07, + "loss": 0.0226, + "step": 1695 + }, + { + "epoch": 1.68, + "grad_norm": 0.491202037016575, + "learning_rate": 6.732110056108105e-07, + "loss": 0.02, + "step": 1696 + }, + { + "epoch": 1.68, + "grad_norm": 0.676251852333797, + "learning_rate": 6.692063002080717e-07, + "loss": 0.0303, + "step": 1697 + }, + { + "epoch": 1.68, + "grad_norm": 0.6695814761840161, + "learning_rate": 6.652126872315812e-07, + "loss": 0.0236, + "step": 1698 + }, + { + "epoch": 1.68, + "grad_norm": 0.33574986126203843, + "learning_rate": 6.612301769101464e-07, + "loss": 0.012, + "step": 1699 + }, + { + "epoch": 1.68, + "grad_norm": 0.4367587595934483, + "learning_rate": 6.572587794441381e-07, + "loss": 0.0172, + "step": 1700 + }, + { + "epoch": 1.68, + "grad_norm": 0.62403592781587, + "learning_rate": 6.532985050054635e-07, + "loss": 0.0208, + "step": 1701 + }, + { + "epoch": 1.68, + "grad_norm": 1.0886268911836297, + "learning_rate": 6.493493637375414e-07, + "loss": 0.0266, + "step": 1702 + }, + { + "epoch": 1.68, + "grad_norm": 0.7851741637225625, + "learning_rate": 6.45411365755273e-07, + "loss": 0.0186, + "step": 1703 + }, + { + "epoch": 1.68, + "grad_norm": 0.39687201481838075, + "learning_rate": 6.414845211450243e-07, + "loss": 0.0168, + "step": 1704 + }, + { + "epoch": 1.68, + "grad_norm": 0.858159783352387, + "learning_rate": 6.375688399645863e-07, + "loss": 0.0226, + "step": 1705 + }, + { + "epoch": 1.69, + "grad_norm": 0.569237008902663, + "learning_rate": 6.336643322431624e-07, + "loss": 0.0224, + "step": 1706 + }, + { + "epoch": 1.69, + "grad_norm": 1.092683438598888, + "learning_rate": 6.297710079813346e-07, + "loss": 0.0296, + "step": 1707 + }, + { + "epoch": 1.69, + "grad_norm": 0.7351588601818313, + "learning_rate": 6.258888771510441e-07, + "loss": 0.0195, + "step": 1708 + }, + { + "epoch": 1.69, + "grad_norm": 0.6404335577191063, + "learning_rate": 6.220179496955608e-07, + "loss": 0.0295, + "step": 1709 + }, + { + "epoch": 1.69, + "grad_norm": 1.170165515233352, + "learning_rate": 6.181582355294579e-07, + "loss": 0.0175, + "step": 1710 + }, + { + "epoch": 1.69, + "grad_norm": 0.9322068050386567, + "learning_rate": 6.143097445385904e-07, + "loss": 0.0202, + "step": 1711 + }, + { + "epoch": 1.69, + "grad_norm": 1.7698760246873417, + "learning_rate": 6.104724865800665e-07, + "loss": 0.0218, + "step": 1712 + }, + { + "epoch": 1.69, + "grad_norm": 0.7113173705764911, + "learning_rate": 6.066464714822224e-07, + "loss": 0.017, + "step": 1713 + }, + { + "epoch": 1.69, + "grad_norm": 0.42364594714097703, + "learning_rate": 6.028317090446006e-07, + "loss": 0.0146, + "step": 1714 + }, + { + "epoch": 1.69, + "grad_norm": 1.3595453398378823, + "learning_rate": 5.990282090379201e-07, + "loss": 0.0182, + "step": 1715 + }, + { + "epoch": 1.7, + "grad_norm": 0.6382144279458921, + "learning_rate": 5.952359812040548e-07, + "loss": 0.0266, + "step": 1716 + }, + { + "epoch": 1.7, + "grad_norm": 1.2344671026355398, + "learning_rate": 5.91455035256005e-07, + "loss": 0.0266, + "step": 1717 + }, + { + "epoch": 1.7, + "grad_norm": 0.44343690963147425, + "learning_rate": 5.876853808778782e-07, + "loss": 0.0164, + "step": 1718 + }, + { + "epoch": 1.7, + "grad_norm": 1.3139772726167909, + "learning_rate": 5.839270277248565e-07, + "loss": 0.0297, + "step": 1719 + }, + { + "epoch": 1.7, + "grad_norm": 0.5404404381299035, + "learning_rate": 5.801799854231826e-07, + "loss": 0.0218, + "step": 1720 + }, + { + "epoch": 1.7, + "grad_norm": 0.9120055408801463, + "learning_rate": 5.764442635701229e-07, + "loss": 0.0263, + "step": 1721 + }, + { + "epoch": 1.7, + "grad_norm": 0.7776723247223076, + "learning_rate": 5.727198717339511e-07, + "loss": 0.0166, + "step": 1722 + }, + { + "epoch": 1.7, + "grad_norm": 1.9105654312944207, + "learning_rate": 5.690068194539217e-07, + "loss": 0.0259, + "step": 1723 + }, + { + "epoch": 1.7, + "grad_norm": 0.5182676413757116, + "learning_rate": 5.653051162402445e-07, + "loss": 0.021, + "step": 1724 + }, + { + "epoch": 1.7, + "grad_norm": 0.9792005660232671, + "learning_rate": 5.616147715740611e-07, + "loss": 0.0161, + "step": 1725 + }, + { + "epoch": 1.71, + "grad_norm": 0.37941653477897735, + "learning_rate": 5.579357949074221e-07, + "loss": 0.0119, + "step": 1726 + }, + { + "epoch": 1.71, + "grad_norm": 1.1030201633413343, + "learning_rate": 5.542681956632601e-07, + "loss": 0.0253, + "step": 1727 + }, + { + "epoch": 1.71, + "grad_norm": 0.5322457268616182, + "learning_rate": 5.506119832353662e-07, + "loss": 0.0201, + "step": 1728 + }, + { + "epoch": 1.71, + "grad_norm": 0.787967539652566, + "learning_rate": 5.469671669883675e-07, + "loss": 0.0146, + "step": 1729 + }, + { + "epoch": 1.71, + "grad_norm": 1.3041562263159943, + "learning_rate": 5.433337562577018e-07, + "loss": 0.0244, + "step": 1730 + }, + { + "epoch": 1.71, + "grad_norm": 0.33548978851264033, + "learning_rate": 5.397117603495927e-07, + "loss": 0.0163, + "step": 1731 + }, + { + "epoch": 1.71, + "grad_norm": 0.2796190944664051, + "learning_rate": 5.361011885410311e-07, + "loss": 0.0142, + "step": 1732 + }, + { + "epoch": 1.71, + "grad_norm": 1.4321550120605657, + "learning_rate": 5.325020500797434e-07, + "loss": 0.0251, + "step": 1733 + }, + { + "epoch": 1.71, + "grad_norm": 1.2378411684030717, + "learning_rate": 5.289143541841735e-07, + "loss": 0.0239, + "step": 1734 + }, + { + "epoch": 1.71, + "grad_norm": 0.6006956440056487, + "learning_rate": 5.253381100434574e-07, + "loss": 0.0171, + "step": 1735 + }, + { + "epoch": 1.72, + "grad_norm": 0.8446344159688172, + "learning_rate": 5.217733268173996e-07, + "loss": 0.0167, + "step": 1736 + }, + { + "epoch": 1.72, + "grad_norm": 0.3860698594231629, + "learning_rate": 5.182200136364491e-07, + "loss": 0.0116, + "step": 1737 + }, + { + "epoch": 1.72, + "grad_norm": 0.6743659201298501, + "learning_rate": 5.146781796016798e-07, + "loss": 0.0228, + "step": 1738 + }, + { + "epoch": 1.72, + "grad_norm": 0.6578276056168991, + "learning_rate": 5.111478337847603e-07, + "loss": 0.0145, + "step": 1739 + }, + { + "epoch": 1.72, + "grad_norm": 0.4444967706503058, + "learning_rate": 5.076289852279375e-07, + "loss": 0.0199, + "step": 1740 + }, + { + "epoch": 1.72, + "grad_norm": 0.8905759201198322, + "learning_rate": 5.041216429440088e-07, + "loss": 0.0202, + "step": 1741 + }, + { + "epoch": 1.72, + "grad_norm": 0.8652719179398571, + "learning_rate": 5.006258159163007e-07, + "loss": 0.026, + "step": 1742 + }, + { + "epoch": 1.72, + "grad_norm": 0.920438619981302, + "learning_rate": 4.971415130986457e-07, + "loss": 0.0256, + "step": 1743 + }, + { + "epoch": 1.72, + "grad_norm": 0.4690467940337577, + "learning_rate": 4.936687434153619e-07, + "loss": 0.0139, + "step": 1744 + }, + { + "epoch": 1.72, + "grad_norm": 1.0857169067280574, + "learning_rate": 4.902075157612241e-07, + "loss": 0.0268, + "step": 1745 + }, + { + "epoch": 1.73, + "grad_norm": 0.9071954914058828, + "learning_rate": 4.867578390014466e-07, + "loss": 0.021, + "step": 1746 + }, + { + "epoch": 1.73, + "grad_norm": 2.0027348201423516, + "learning_rate": 4.833197219716595e-07, + "loss": 0.0242, + "step": 1747 + }, + { + "epoch": 1.73, + "grad_norm": 1.7208333147985326, + "learning_rate": 4.798931734778794e-07, + "loss": 0.0225, + "step": 1748 + }, + { + "epoch": 1.73, + "grad_norm": 0.3340296826164514, + "learning_rate": 4.764782022965014e-07, + "loss": 0.014, + "step": 1749 + }, + { + "epoch": 1.73, + "grad_norm": 0.41348222258487555, + "learning_rate": 4.730748171742611e-07, + "loss": 0.0234, + "step": 1750 + }, + { + "epoch": 1.73, + "grad_norm": 1.783608172436412, + "learning_rate": 4.696830268282204e-07, + "loss": 0.0207, + "step": 1751 + }, + { + "epoch": 1.73, + "grad_norm": 0.5594126597283506, + "learning_rate": 4.6630283994574475e-07, + "loss": 0.0188, + "step": 1752 + }, + { + "epoch": 1.73, + "grad_norm": 0.4130436506101321, + "learning_rate": 4.629342651844787e-07, + "loss": 0.0149, + "step": 1753 + }, + { + "epoch": 1.73, + "grad_norm": 0.47117699815033703, + "learning_rate": 4.595773111723245e-07, + "loss": 0.0157, + "step": 1754 + }, + { + "epoch": 1.73, + "grad_norm": 1.06468324325383, + "learning_rate": 4.562319865074222e-07, + "loss": 0.0218, + "step": 1755 + }, + { + "epoch": 1.74, + "grad_norm": 0.4979626901475594, + "learning_rate": 4.528982997581233e-07, + "loss": 0.0201, + "step": 1756 + }, + { + "epoch": 1.74, + "grad_norm": 0.988935035787326, + "learning_rate": 4.4957625946297266e-07, + "loss": 0.0178, + "step": 1757 + }, + { + "epoch": 1.74, + "grad_norm": 0.637273291889344, + "learning_rate": 4.462658741306847e-07, + "loss": 0.016, + "step": 1758 + }, + { + "epoch": 1.74, + "grad_norm": 0.7542039279222552, + "learning_rate": 4.4296715224012187e-07, + "loss": 0.0169, + "step": 1759 + }, + { + "epoch": 1.74, + "grad_norm": 0.7968353487232488, + "learning_rate": 4.3968010224027247e-07, + "loss": 0.0147, + "step": 1760 + }, + { + "epoch": 1.74, + "grad_norm": 0.9266624597530224, + "learning_rate": 4.364047325502324e-07, + "loss": 0.0193, + "step": 1761 + }, + { + "epoch": 1.74, + "grad_norm": 0.4242221780268279, + "learning_rate": 4.331410515591783e-07, + "loss": 0.0136, + "step": 1762 + }, + { + "epoch": 1.74, + "grad_norm": 2.2405700148688243, + "learning_rate": 4.298890676263495e-07, + "loss": 0.0238, + "step": 1763 + }, + { + "epoch": 1.74, + "grad_norm": 2.236346074001916, + "learning_rate": 4.2664878908102556e-07, + "loss": 0.0355, + "step": 1764 + }, + { + "epoch": 1.74, + "grad_norm": 0.8651345063698643, + "learning_rate": 4.2342022422250553e-07, + "loss": 0.0258, + "step": 1765 + }, + { + "epoch": 1.75, + "grad_norm": 0.3813222697994349, + "learning_rate": 4.2020338132008454e-07, + "loss": 0.0121, + "step": 1766 + }, + { + "epoch": 1.75, + "grad_norm": 0.7925799529634308, + "learning_rate": 4.1699826861303804e-07, + "loss": 0.0249, + "step": 1767 + }, + { + "epoch": 1.75, + "grad_norm": 0.9035223172963347, + "learning_rate": 4.138048943105938e-07, + "loss": 0.0237, + "step": 1768 + }, + { + "epoch": 1.75, + "grad_norm": 0.7287711613306799, + "learning_rate": 4.106232665919152e-07, + "loss": 0.0109, + "step": 1769 + }, + { + "epoch": 1.75, + "grad_norm": 0.6004627965162954, + "learning_rate": 4.0745339360607927e-07, + "loss": 0.0136, + "step": 1770 + }, + { + "epoch": 1.75, + "grad_norm": 0.5499328026562983, + "learning_rate": 4.042952834720548e-07, + "loss": 0.0203, + "step": 1771 + }, + { + "epoch": 1.75, + "grad_norm": 0.44565309449455925, + "learning_rate": 4.011489442786831e-07, + "loss": 0.0147, + "step": 1772 + }, + { + "epoch": 1.75, + "grad_norm": 0.3121101306415952, + "learning_rate": 3.9801438408465895e-07, + "loss": 0.0136, + "step": 1773 + }, + { + "epoch": 1.75, + "grad_norm": 0.9336561135203006, + "learning_rate": 3.9489161091850413e-07, + "loss": 0.0228, + "step": 1774 + }, + { + "epoch": 1.75, + "grad_norm": 1.1439685088962397, + "learning_rate": 3.917806327785517e-07, + "loss": 0.0194, + "step": 1775 + }, + { + "epoch": 1.75, + "grad_norm": 0.7667463909847458, + "learning_rate": 3.886814576329245e-07, + "loss": 0.0343, + "step": 1776 + }, + { + "epoch": 1.76, + "grad_norm": 0.39481524338438234, + "learning_rate": 3.855940934195146e-07, + "loss": 0.0182, + "step": 1777 + }, + { + "epoch": 1.76, + "grad_norm": 0.4682956312455625, + "learning_rate": 3.825185480459614e-07, + "loss": 0.0142, + "step": 1778 + }, + { + "epoch": 1.76, + "grad_norm": 0.49595668042956637, + "learning_rate": 3.794548293896355e-07, + "loss": 0.0132, + "step": 1779 + }, + { + "epoch": 1.76, + "grad_norm": 0.21011637441168096, + "learning_rate": 3.7640294529761424e-07, + "loss": 0.0081, + "step": 1780 + }, + { + "epoch": 1.76, + "grad_norm": 5.79580408555057, + "learning_rate": 3.7336290358666206e-07, + "loss": 0.0291, + "step": 1781 + }, + { + "epoch": 1.76, + "grad_norm": 0.4241950547346609, + "learning_rate": 3.703347120432138e-07, + "loss": 0.0164, + "step": 1782 + }, + { + "epoch": 1.76, + "grad_norm": 0.5231212609907914, + "learning_rate": 3.6731837842335085e-07, + "loss": 0.0233, + "step": 1783 + }, + { + "epoch": 1.76, + "grad_norm": 1.9115936016118895, + "learning_rate": 3.6431391045278263e-07, + "loss": 0.028, + "step": 1784 + }, + { + "epoch": 1.76, + "grad_norm": 0.7705099493152656, + "learning_rate": 3.6132131582683086e-07, + "loss": 0.0178, + "step": 1785 + }, + { + "epoch": 1.76, + "grad_norm": 0.7642604555944872, + "learning_rate": 3.583406022104019e-07, + "loss": 0.0203, + "step": 1786 + }, + { + "epoch": 1.77, + "grad_norm": 0.3350098895459638, + "learning_rate": 3.5537177723797335e-07, + "loss": 0.0106, + "step": 1787 + }, + { + "epoch": 1.77, + "grad_norm": 0.6748864749922056, + "learning_rate": 3.52414848513572e-07, + "loss": 0.0232, + "step": 1788 + }, + { + "epoch": 1.77, + "grad_norm": 0.7794439927916365, + "learning_rate": 3.4946982361075524e-07, + "loss": 0.0146, + "step": 1789 + }, + { + "epoch": 1.77, + "grad_norm": 1.0614980884129646, + "learning_rate": 3.4653671007259084e-07, + "loss": 0.0259, + "step": 1790 + }, + { + "epoch": 1.77, + "grad_norm": 3.4421989854447395, + "learning_rate": 3.436155154116383e-07, + "loss": 0.0122, + "step": 1791 + }, + { + "epoch": 1.77, + "grad_norm": 0.9290405002699073, + "learning_rate": 3.407062471099298e-07, + "loss": 0.021, + "step": 1792 + }, + { + "epoch": 1.77, + "grad_norm": 1.1172006958050862, + "learning_rate": 3.3780891261895043e-07, + "loss": 0.0281, + "step": 1793 + }, + { + "epoch": 1.77, + "grad_norm": 2.227579498950002, + "learning_rate": 3.349235193596184e-07, + "loss": 0.0226, + "step": 1794 + }, + { + "epoch": 1.77, + "grad_norm": 0.7512967509133619, + "learning_rate": 3.320500747222677e-07, + "loss": 0.0172, + "step": 1795 + }, + { + "epoch": 1.77, + "grad_norm": 0.4290770474862749, + "learning_rate": 3.291885860666294e-07, + "loss": 0.0173, + "step": 1796 + }, + { + "epoch": 1.78, + "grad_norm": 1.250875314305571, + "learning_rate": 3.263390607218103e-07, + "loss": 0.025, + "step": 1797 + }, + { + "epoch": 1.78, + "grad_norm": 1.944211346524395, + "learning_rate": 3.235015059862767e-07, + "loss": 0.0241, + "step": 1798 + }, + { + "epoch": 1.78, + "grad_norm": 0.4028666512919997, + "learning_rate": 3.206759291278333e-07, + "loss": 0.0133, + "step": 1799 + }, + { + "epoch": 1.78, + "grad_norm": 0.8746495137789382, + "learning_rate": 3.178623373836076e-07, + "loss": 0.0209, + "step": 1800 + }, + { + "epoch": 1.78, + "grad_norm": 0.915974115336977, + "learning_rate": 3.1506073796002734e-07, + "loss": 0.0196, + "step": 1801 + }, + { + "epoch": 1.78, + "grad_norm": 1.713113489833011, + "learning_rate": 3.1227113803280863e-07, + "loss": 0.0423, + "step": 1802 + }, + { + "epoch": 1.78, + "grad_norm": 0.5203902674476263, + "learning_rate": 3.0949354474692937e-07, + "loss": 0.0155, + "step": 1803 + }, + { + "epoch": 1.78, + "grad_norm": 0.37043619740618705, + "learning_rate": 3.0672796521661663e-07, + "loss": 0.0131, + "step": 1804 + }, + { + "epoch": 1.78, + "grad_norm": 1.600749381569276, + "learning_rate": 3.0397440652532585e-07, + "loss": 0.0246, + "step": 1805 + }, + { + "epoch": 1.78, + "grad_norm": 0.5272728545861523, + "learning_rate": 3.0123287572572545e-07, + "loss": 0.0168, + "step": 1806 + }, + { + "epoch": 1.79, + "grad_norm": 1.255654485553308, + "learning_rate": 2.985033798396736e-07, + "loss": 0.0196, + "step": 1807 + }, + { + "epoch": 1.79, + "grad_norm": 1.9078529994683149, + "learning_rate": 2.9578592585820856e-07, + "loss": 0.0237, + "step": 1808 + }, + { + "epoch": 1.79, + "grad_norm": 0.40880978967914217, + "learning_rate": 2.930805207415205e-07, + "loss": 0.0138, + "step": 1809 + }, + { + "epoch": 1.79, + "grad_norm": 0.8889728807411933, + "learning_rate": 2.9038717141894266e-07, + "loss": 0.023, + "step": 1810 + }, + { + "epoch": 1.79, + "grad_norm": 0.887903401671168, + "learning_rate": 2.8770588478892805e-07, + "loss": 0.0218, + "step": 1811 + }, + { + "epoch": 1.79, + "grad_norm": 1.5935263702090745, + "learning_rate": 2.850366677190336e-07, + "loss": 0.0127, + "step": 1812 + }, + { + "epoch": 1.79, + "grad_norm": 0.8674794845738173, + "learning_rate": 2.823795270459029e-07, + "loss": 0.0148, + "step": 1813 + }, + { + "epoch": 1.79, + "grad_norm": 1.4074621926720314, + "learning_rate": 2.797344695752491e-07, + "loss": 0.0364, + "step": 1814 + }, + { + "epoch": 1.79, + "grad_norm": 1.0369490988278562, + "learning_rate": 2.771015020818363e-07, + "loss": 0.0206, + "step": 1815 + }, + { + "epoch": 1.79, + "grad_norm": 0.42829931436334806, + "learning_rate": 2.7448063130946224e-07, + "loss": 0.0153, + "step": 1816 + }, + { + "epoch": 1.8, + "grad_norm": 0.23602416053185918, + "learning_rate": 2.718718639709411e-07, + "loss": 0.0074, + "step": 1817 + }, + { + "epoch": 1.8, + "grad_norm": 5.629591449198259, + "learning_rate": 2.69275206748088e-07, + "loss": 0.0207, + "step": 1818 + }, + { + "epoch": 1.8, + "grad_norm": 0.7405988875650557, + "learning_rate": 2.666906662916985e-07, + "loss": 0.0154, + "step": 1819 + }, + { + "epoch": 1.8, + "grad_norm": 0.461232966739589, + "learning_rate": 2.641182492215361e-07, + "loss": 0.0116, + "step": 1820 + }, + { + "epoch": 1.8, + "grad_norm": 0.7075154031682652, + "learning_rate": 2.615579621263109e-07, + "loss": 0.0211, + "step": 1821 + }, + { + "epoch": 1.8, + "grad_norm": 1.1371845028463687, + "learning_rate": 2.590098115636658e-07, + "loss": 0.0316, + "step": 1822 + }, + { + "epoch": 1.8, + "grad_norm": 0.6774639388623163, + "learning_rate": 2.5647380406015665e-07, + "loss": 0.0187, + "step": 1823 + }, + { + "epoch": 1.8, + "grad_norm": 0.9274279811411574, + "learning_rate": 2.5394994611123934e-07, + "loss": 0.0179, + "step": 1824 + }, + { + "epoch": 1.8, + "grad_norm": 1.306866211141696, + "learning_rate": 2.514382441812502e-07, + "loss": 0.0184, + "step": 1825 + }, + { + "epoch": 1.8, + "grad_norm": 0.603304233210114, + "learning_rate": 2.489387047033909e-07, + "loss": 0.015, + "step": 1826 + }, + { + "epoch": 1.81, + "grad_norm": 0.5328595053040174, + "learning_rate": 2.464513340797114e-07, + "loss": 0.0226, + "step": 1827 + }, + { + "epoch": 1.81, + "grad_norm": 0.866341356192597, + "learning_rate": 2.439761386810935e-07, + "loss": 0.0172, + "step": 1828 + }, + { + "epoch": 1.81, + "grad_norm": 0.5610663210635402, + "learning_rate": 2.4151312484723465e-07, + "loss": 0.0133, + "step": 1829 + }, + { + "epoch": 1.81, + "grad_norm": 0.2493186264229105, + "learning_rate": 2.39062298886632e-07, + "loss": 0.0128, + "step": 1830 + }, + { + "epoch": 1.81, + "grad_norm": 0.6844264402500051, + "learning_rate": 2.3662366707656537e-07, + "loss": 0.0176, + "step": 1831 + }, + { + "epoch": 1.81, + "grad_norm": 1.4233840001854265, + "learning_rate": 2.341972356630845e-07, + "loss": 0.0144, + "step": 1832 + }, + { + "epoch": 1.81, + "grad_norm": 0.7240753632477378, + "learning_rate": 2.3178301086098532e-07, + "loss": 0.0143, + "step": 1833 + }, + { + "epoch": 1.81, + "grad_norm": 1.866915589742617, + "learning_rate": 2.293809988538037e-07, + "loss": 0.0246, + "step": 1834 + }, + { + "epoch": 1.81, + "grad_norm": 1.1967068684214617, + "learning_rate": 2.2699120579379174e-07, + "loss": 0.0261, + "step": 1835 + }, + { + "epoch": 1.81, + "grad_norm": 1.1879307349406338, + "learning_rate": 2.246136378019087e-07, + "loss": 0.0268, + "step": 1836 + }, + { + "epoch": 1.82, + "grad_norm": 0.5166219459046876, + "learning_rate": 2.2224830096779837e-07, + "loss": 0.0195, + "step": 1837 + }, + { + "epoch": 1.82, + "grad_norm": 1.1229789289094003, + "learning_rate": 2.198952013497796e-07, + "loss": 0.0213, + "step": 1838 + }, + { + "epoch": 1.82, + "grad_norm": 0.8698757679037399, + "learning_rate": 2.175543449748263e-07, + "loss": 0.0171, + "step": 1839 + }, + { + "epoch": 1.82, + "grad_norm": 1.0318428935613762, + "learning_rate": 2.1522573783855473e-07, + "loss": 0.0094, + "step": 1840 + }, + { + "epoch": 1.82, + "grad_norm": 1.4064528369110108, + "learning_rate": 2.129093859052067e-07, + "loss": 0.03, + "step": 1841 + }, + { + "epoch": 1.82, + "grad_norm": 0.961657681832751, + "learning_rate": 2.106052951076365e-07, + "loss": 0.0211, + "step": 1842 + }, + { + "epoch": 1.82, + "grad_norm": 1.9449980636138309, + "learning_rate": 2.083134713472923e-07, + "loss": 0.0192, + "step": 1843 + }, + { + "epoch": 1.82, + "grad_norm": 0.7800742218320303, + "learning_rate": 2.0603392049420357e-07, + "loss": 0.0211, + "step": 1844 + }, + { + "epoch": 1.82, + "grad_norm": 1.0808053808935367, + "learning_rate": 2.0376664838696546e-07, + "loss": 0.0109, + "step": 1845 + }, + { + "epoch": 1.82, + "grad_norm": 0.6351185948935967, + "learning_rate": 2.0151166083272222e-07, + "loss": 0.0137, + "step": 1846 + }, + { + "epoch": 1.83, + "grad_norm": 0.5968180495011286, + "learning_rate": 1.9926896360715542e-07, + "loss": 0.0161, + "step": 1847 + }, + { + "epoch": 1.83, + "grad_norm": 0.5208311636465429, + "learning_rate": 1.9703856245446795e-07, + "loss": 0.0146, + "step": 1848 + }, + { + "epoch": 1.83, + "grad_norm": 0.5894146185232907, + "learning_rate": 1.948204630873668e-07, + "loss": 0.0181, + "step": 1849 + }, + { + "epoch": 1.83, + "grad_norm": 0.2825054473544255, + "learning_rate": 1.9261467118705246e-07, + "loss": 0.0106, + "step": 1850 + }, + { + "epoch": 1.83, + "grad_norm": 0.2878943348884669, + "learning_rate": 1.9042119240320067e-07, + "loss": 0.0103, + "step": 1851 + }, + { + "epoch": 1.83, + "grad_norm": 0.5635184171884943, + "learning_rate": 1.8824003235395128e-07, + "loss": 0.0223, + "step": 1852 + }, + { + "epoch": 1.83, + "grad_norm": 0.5714917480875857, + "learning_rate": 1.8607119662589045e-07, + "loss": 0.0214, + "step": 1853 + }, + { + "epoch": 1.83, + "grad_norm": 0.5819822583931356, + "learning_rate": 1.8391469077404134e-07, + "loss": 0.0222, + "step": 1854 + }, + { + "epoch": 1.83, + "grad_norm": 0.7490706862618906, + "learning_rate": 1.8177052032184285e-07, + "loss": 0.014, + "step": 1855 + }, + { + "epoch": 1.83, + "grad_norm": 0.9170654518824781, + "learning_rate": 1.7963869076114193e-07, + "loss": 0.018, + "step": 1856 + }, + { + "epoch": 1.83, + "grad_norm": 0.9597599706808101, + "learning_rate": 1.7751920755217532e-07, + "loss": 0.0235, + "step": 1857 + }, + { + "epoch": 1.84, + "grad_norm": 1.8393265995727257, + "learning_rate": 1.7541207612355894e-07, + "loss": 0.0237, + "step": 1858 + }, + { + "epoch": 1.84, + "grad_norm": 0.5720599297711036, + "learning_rate": 1.7331730187227003e-07, + "loss": 0.0159, + "step": 1859 + }, + { + "epoch": 1.84, + "grad_norm": 0.38535609643181973, + "learning_rate": 1.7123489016363793e-07, + "loss": 0.0123, + "step": 1860 + }, + { + "epoch": 1.84, + "grad_norm": 0.6143963637658444, + "learning_rate": 1.6916484633132558e-07, + "loss": 0.0224, + "step": 1861 + }, + { + "epoch": 1.84, + "grad_norm": 0.5189176558087583, + "learning_rate": 1.671071756773196e-07, + "loss": 0.0171, + "step": 1862 + }, + { + "epoch": 1.84, + "grad_norm": 0.6668235969578423, + "learning_rate": 1.6506188347191477e-07, + "loss": 0.0215, + "step": 1863 + }, + { + "epoch": 1.84, + "grad_norm": 1.1154353763845224, + "learning_rate": 1.6302897495370175e-07, + "loss": 0.0282, + "step": 1864 + }, + { + "epoch": 1.84, + "grad_norm": 0.6912898158788077, + "learning_rate": 1.61008455329551e-07, + "loss": 0.0214, + "step": 1865 + }, + { + "epoch": 1.84, + "grad_norm": 0.755595253610713, + "learning_rate": 1.59000329774604e-07, + "loss": 0.0243, + "step": 1866 + }, + { + "epoch": 1.84, + "grad_norm": 0.43189963611385784, + "learning_rate": 1.5700460343225644e-07, + "loss": 0.0144, + "step": 1867 + }, + { + "epoch": 1.85, + "grad_norm": 0.7980037217430461, + "learning_rate": 1.5502128141414496e-07, + "loss": 0.0232, + "step": 1868 + }, + { + "epoch": 1.85, + "grad_norm": 0.42317850471550683, + "learning_rate": 1.5305036880013612e-07, + "loss": 0.0144, + "step": 1869 + }, + { + "epoch": 1.85, + "grad_norm": 0.3440586152342501, + "learning_rate": 1.5109187063831243e-07, + "loss": 0.0151, + "step": 1870 + }, + { + "epoch": 1.85, + "grad_norm": 0.9689038545074031, + "learning_rate": 1.4914579194495794e-07, + "loss": 0.025, + "step": 1871 + }, + { + "epoch": 1.85, + "grad_norm": 1.0026706796663856, + "learning_rate": 1.4721213770454988e-07, + "loss": 0.0193, + "step": 1872 + }, + { + "epoch": 1.85, + "grad_norm": 1.3776628922159566, + "learning_rate": 1.4529091286973994e-07, + "loss": 0.0323, + "step": 1873 + }, + { + "epoch": 1.85, + "grad_norm": 1.1160811515237832, + "learning_rate": 1.4338212236134518e-07, + "loss": 0.0182, + "step": 1874 + }, + { + "epoch": 1.85, + "grad_norm": 0.27113768389532256, + "learning_rate": 1.414857710683365e-07, + "loss": 0.0141, + "step": 1875 + }, + { + "epoch": 1.85, + "grad_norm": 0.4976224971778871, + "learning_rate": 1.3960186384782025e-07, + "loss": 0.0174, + "step": 1876 + }, + { + "epoch": 1.85, + "grad_norm": 0.5627076451492086, + "learning_rate": 1.3773040552503447e-07, + "loss": 0.0139, + "step": 1877 + }, + { + "epoch": 1.86, + "grad_norm": 0.6144894015224014, + "learning_rate": 1.3587140089332984e-07, + "loss": 0.0235, + "step": 1878 + }, + { + "epoch": 1.86, + "grad_norm": 0.37920150421252663, + "learning_rate": 1.3402485471415871e-07, + "loss": 0.0115, + "step": 1879 + }, + { + "epoch": 1.86, + "grad_norm": 0.3543116999305515, + "learning_rate": 1.3219077171706563e-07, + "loss": 0.0114, + "step": 1880 + }, + { + "epoch": 1.86, + "grad_norm": 0.6932990480913824, + "learning_rate": 1.303691565996712e-07, + "loss": 0.0214, + "step": 1881 + }, + { + "epoch": 1.86, + "grad_norm": 0.48280266582056625, + "learning_rate": 1.2856001402766384e-07, + "loss": 0.0123, + "step": 1882 + }, + { + "epoch": 1.86, + "grad_norm": 1.3600213634007312, + "learning_rate": 1.267633486347858e-07, + "loss": 0.018, + "step": 1883 + }, + { + "epoch": 1.86, + "grad_norm": 0.6233047924767592, + "learning_rate": 1.2497916502282104e-07, + "loss": 0.0118, + "step": 1884 + }, + { + "epoch": 1.86, + "grad_norm": 0.6557622650641682, + "learning_rate": 1.232074677615841e-07, + "loss": 0.0144, + "step": 1885 + }, + { + "epoch": 1.86, + "grad_norm": 0.68680008251895, + "learning_rate": 1.2144826138890897e-07, + "loss": 0.014, + "step": 1886 + }, + { + "epoch": 1.86, + "grad_norm": 1.6753520125550132, + "learning_rate": 1.1970155041063636e-07, + "loss": 0.0185, + "step": 1887 + }, + { + "epoch": 1.87, + "grad_norm": 0.8061089749043806, + "learning_rate": 1.1796733930060256e-07, + "loss": 0.027, + "step": 1888 + }, + { + "epoch": 1.87, + "grad_norm": 0.9511108204414935, + "learning_rate": 1.1624563250062836e-07, + "loss": 0.0249, + "step": 1889 + }, + { + "epoch": 1.87, + "grad_norm": 0.5887898728183794, + "learning_rate": 1.1453643442050744e-07, + "loss": 0.0211, + "step": 1890 + }, + { + "epoch": 1.87, + "grad_norm": 0.4905414507949429, + "learning_rate": 1.1283974943799403e-07, + "loss": 0.0138, + "step": 1891 + }, + { + "epoch": 1.87, + "grad_norm": 0.9247168896017007, + "learning_rate": 1.1115558189879416e-07, + "loss": 0.0122, + "step": 1892 + }, + { + "epoch": 1.87, + "grad_norm": 0.6215487472298206, + "learning_rate": 1.0948393611655229e-07, + "loss": 0.0163, + "step": 1893 + }, + { + "epoch": 1.87, + "grad_norm": 0.7473599642814582, + "learning_rate": 1.0782481637284014e-07, + "loss": 0.0157, + "step": 1894 + }, + { + "epoch": 1.87, + "grad_norm": 1.0402363260233884, + "learning_rate": 1.0617822691714796e-07, + "loss": 0.0153, + "step": 1895 + }, + { + "epoch": 1.87, + "grad_norm": 0.5761737409888579, + "learning_rate": 1.0454417196687216e-07, + "loss": 0.0186, + "step": 1896 + }, + { + "epoch": 1.87, + "grad_norm": 0.5818796082089626, + "learning_rate": 1.0292265570730431e-07, + "loss": 0.0173, + "step": 1897 + }, + { + "epoch": 1.88, + "grad_norm": 0.4937356890575318, + "learning_rate": 1.0131368229161997e-07, + "loss": 0.0116, + "step": 1898 + }, + { + "epoch": 1.88, + "grad_norm": 0.595595287827361, + "learning_rate": 9.971725584086933e-08, + "loss": 0.0182, + "step": 1899 + }, + { + "epoch": 1.88, + "grad_norm": 0.704730771852314, + "learning_rate": 9.813338044396714e-08, + "loss": 0.0166, + "step": 1900 + }, + { + "epoch": 1.88, + "grad_norm": 0.9696721850434132, + "learning_rate": 9.656206015768e-08, + "loss": 0.0162, + "step": 1901 + }, + { + "epoch": 1.88, + "grad_norm": 0.6552313791421823, + "learning_rate": 9.500329900661742e-08, + "loss": 0.0185, + "step": 1902 + }, + { + "epoch": 1.88, + "grad_norm": 0.6329810385027012, + "learning_rate": 9.345710098322247e-08, + "loss": 0.0141, + "step": 1903 + }, + { + "epoch": 1.88, + "grad_norm": 0.47133660028286867, + "learning_rate": 9.192347004775781e-08, + "loss": 0.0258, + "step": 1904 + }, + { + "epoch": 1.88, + "grad_norm": 0.37229356852218276, + "learning_rate": 9.040241012830131e-08, + "loss": 0.0172, + "step": 1905 + }, + { + "epoch": 1.88, + "grad_norm": 1.55541532428076, + "learning_rate": 8.889392512072992e-08, + "loss": 0.0225, + "step": 1906 + }, + { + "epoch": 1.88, + "grad_norm": 0.5276375856301501, + "learning_rate": 8.739801888871468e-08, + "loss": 0.0211, + "step": 1907 + }, + { + "epoch": 1.89, + "grad_norm": 0.4956939691451788, + "learning_rate": 8.591469526370799e-08, + "loss": 0.0156, + "step": 1908 + }, + { + "epoch": 1.89, + "grad_norm": 0.7859967160273817, + "learning_rate": 8.444395804493411e-08, + "loss": 0.0197, + "step": 1909 + }, + { + "epoch": 1.89, + "grad_norm": 0.5770278583139444, + "learning_rate": 8.298581099937975e-08, + "loss": 0.0161, + "step": 1910 + }, + { + "epoch": 1.89, + "grad_norm": 1.1811644683909457, + "learning_rate": 8.154025786178577e-08, + "loss": 0.0281, + "step": 1911 + }, + { + "epoch": 1.89, + "grad_norm": 0.48198193891981583, + "learning_rate": 8.010730233463493e-08, + "loss": 0.0212, + "step": 1912 + }, + { + "epoch": 1.89, + "grad_norm": 0.8232670905597198, + "learning_rate": 7.86869480881447e-08, + "loss": 0.027, + "step": 1913 + }, + { + "epoch": 1.89, + "grad_norm": 1.7847367471690805, + "learning_rate": 7.727919876025669e-08, + "loss": 0.0224, + "step": 1914 + }, + { + "epoch": 1.89, + "grad_norm": 0.3673488997507827, + "learning_rate": 7.588405795662779e-08, + "loss": 0.0147, + "step": 1915 + }, + { + "epoch": 1.89, + "grad_norm": 0.5712286156889304, + "learning_rate": 7.450152925062015e-08, + "loss": 0.016, + "step": 1916 + }, + { + "epoch": 1.89, + "grad_norm": 1.584801234518443, + "learning_rate": 7.3131616183294e-08, + "loss": 0.0221, + "step": 1917 + }, + { + "epoch": 1.9, + "grad_norm": 1.1547907543640876, + "learning_rate": 7.177432226339542e-08, + "loss": 0.0326, + "step": 1918 + }, + { + "epoch": 1.9, + "grad_norm": 0.5996020279426367, + "learning_rate": 7.042965096735076e-08, + "loss": 0.0173, + "step": 1919 + }, + { + "epoch": 1.9, + "grad_norm": 0.9630804667184417, + "learning_rate": 6.909760573925561e-08, + "loss": 0.0194, + "step": 1920 + }, + { + "epoch": 1.9, + "grad_norm": 0.7923608884650656, + "learning_rate": 6.777818999086582e-08, + "loss": 0.0242, + "step": 1921 + }, + { + "epoch": 1.9, + "grad_norm": 0.6136669614491637, + "learning_rate": 6.647140710159039e-08, + "loss": 0.0231, + "step": 1922 + }, + { + "epoch": 1.9, + "grad_norm": 2.444098778295829, + "learning_rate": 6.51772604184825e-08, + "loss": 0.0227, + "step": 1923 + }, + { + "epoch": 1.9, + "grad_norm": 0.7236056147422347, + "learning_rate": 6.389575325622787e-08, + "loss": 0.0245, + "step": 1924 + }, + { + "epoch": 1.9, + "grad_norm": 0.5610244487794093, + "learning_rate": 6.262688889714152e-08, + "loss": 0.0189, + "step": 1925 + }, + { + "epoch": 1.9, + "grad_norm": 0.6533403737022699, + "learning_rate": 6.137067059115431e-08, + "loss": 0.0223, + "step": 1926 + }, + { + "epoch": 1.9, + "grad_norm": 1.2250995109120701, + "learning_rate": 6.012710155580858e-08, + "loss": 0.0266, + "step": 1927 + }, + { + "epoch": 1.91, + "grad_norm": 0.5322808366260482, + "learning_rate": 5.889618497624649e-08, + "loss": 0.0142, + "step": 1928 + }, + { + "epoch": 1.91, + "grad_norm": 0.6284323476920674, + "learning_rate": 5.767792400520556e-08, + "loss": 0.0219, + "step": 1929 + }, + { + "epoch": 1.91, + "grad_norm": 1.5611338062761357, + "learning_rate": 5.647232176300754e-08, + "loss": 0.0192, + "step": 1930 + }, + { + "epoch": 1.91, + "grad_norm": 0.5508596623052354, + "learning_rate": 5.5279381337551286e-08, + "loss": 0.0186, + "step": 1931 + }, + { + "epoch": 1.91, + "grad_norm": 0.6227099224632395, + "learning_rate": 5.409910578430488e-08, + "loss": 0.0132, + "step": 1932 + }, + { + "epoch": 1.91, + "grad_norm": 0.5567745537204065, + "learning_rate": 5.2931498126298495e-08, + "loss": 0.0097, + "step": 1933 + }, + { + "epoch": 1.91, + "grad_norm": 0.5576063453236663, + "learning_rate": 5.177656135411657e-08, + "loss": 0.0161, + "step": 1934 + }, + { + "epoch": 1.91, + "grad_norm": 1.3370801447420126, + "learning_rate": 5.063429842588841e-08, + "loss": 0.0265, + "step": 1935 + }, + { + "epoch": 1.91, + "grad_norm": 0.7544241187243449, + "learning_rate": 4.950471226728371e-08, + "loss": 0.0182, + "step": 1936 + }, + { + "epoch": 1.91, + "grad_norm": 0.5253352786844406, + "learning_rate": 4.838780577150093e-08, + "loss": 0.0174, + "step": 1937 + }, + { + "epoch": 1.92, + "grad_norm": 0.7944907580697882, + "learning_rate": 4.728358179926451e-08, + "loss": 0.0189, + "step": 1938 + }, + { + "epoch": 1.92, + "grad_norm": 1.9651599809552647, + "learning_rate": 4.619204317881376e-08, + "loss": 0.0278, + "step": 1939 + }, + { + "epoch": 1.92, + "grad_norm": 0.6170478992323329, + "learning_rate": 4.511319270589731e-08, + "loss": 0.0187, + "step": 1940 + }, + { + "epoch": 1.92, + "grad_norm": 0.5413410426193749, + "learning_rate": 4.404703314376646e-08, + "loss": 0.0212, + "step": 1941 + }, + { + "epoch": 1.92, + "grad_norm": 0.3773729241544756, + "learning_rate": 4.299356722316683e-08, + "loss": 0.0157, + "step": 1942 + }, + { + "epoch": 1.92, + "grad_norm": 0.5028399673629241, + "learning_rate": 4.1952797642331736e-08, + "loss": 0.0161, + "step": 1943 + }, + { + "epoch": 1.92, + "grad_norm": 0.7421325043346971, + "learning_rate": 4.092472706697603e-08, + "loss": 0.02, + "step": 1944 + }, + { + "epoch": 1.92, + "grad_norm": 0.5828564139036714, + "learning_rate": 3.99093581302884e-08, + "loss": 0.0159, + "step": 1945 + }, + { + "epoch": 1.92, + "grad_norm": 0.3703638920114571, + "learning_rate": 3.890669343292464e-08, + "loss": 0.0153, + "step": 1946 + }, + { + "epoch": 1.92, + "grad_norm": 0.7208034989055797, + "learning_rate": 3.791673554300157e-08, + "loss": 0.0166, + "step": 1947 + }, + { + "epoch": 1.92, + "grad_norm": 0.7710647513091973, + "learning_rate": 3.6939486996090953e-08, + "loss": 0.0229, + "step": 1948 + }, + { + "epoch": 1.93, + "grad_norm": 0.6997720856631476, + "learning_rate": 3.597495029521059e-08, + "loss": 0.0223, + "step": 1949 + }, + { + "epoch": 1.93, + "grad_norm": 0.44352812503571115, + "learning_rate": 3.5023127910820966e-08, + "loss": 0.0106, + "step": 1950 + }, + { + "epoch": 1.93, + "grad_norm": 0.7756413157400694, + "learning_rate": 3.408402228081642e-08, + "loss": 0.0233, + "step": 1951 + }, + { + "epoch": 1.93, + "grad_norm": 0.5743501883431086, + "learning_rate": 3.315763581052067e-08, + "loss": 0.0136, + "step": 1952 + }, + { + "epoch": 1.93, + "grad_norm": 0.662378100288348, + "learning_rate": 3.224397087267961e-08, + "loss": 0.0172, + "step": 1953 + }, + { + "epoch": 1.93, + "grad_norm": 0.45464868521172774, + "learning_rate": 3.1343029807456296e-08, + "loss": 0.0162, + "step": 1954 + }, + { + "epoch": 1.93, + "grad_norm": 0.3151847873751775, + "learning_rate": 3.045481492242319e-08, + "loss": 0.0088, + "step": 1955 + }, + { + "epoch": 1.93, + "grad_norm": 0.9179713108290374, + "learning_rate": 2.9579328492557734e-08, + "loss": 0.0188, + "step": 1956 + }, + { + "epoch": 1.93, + "grad_norm": 0.7442297593300125, + "learning_rate": 2.8716572760236205e-08, + "loss": 0.0176, + "step": 1957 + }, + { + "epoch": 1.93, + "grad_norm": 0.6467132296202227, + "learning_rate": 2.7866549935227638e-08, + "loss": 0.0262, + "step": 1958 + }, + { + "epoch": 1.94, + "grad_norm": 0.4271184977588041, + "learning_rate": 2.702926219468882e-08, + "loss": 0.0181, + "step": 1959 + }, + { + "epoch": 1.94, + "grad_norm": 0.34858153463525593, + "learning_rate": 2.620471168315819e-08, + "loss": 0.0114, + "step": 1960 + }, + { + "epoch": 1.94, + "grad_norm": 0.5191191049249508, + "learning_rate": 2.5392900512549168e-08, + "loss": 0.0148, + "step": 1961 + }, + { + "epoch": 1.94, + "grad_norm": 0.7904320889688824, + "learning_rate": 2.459383076214794e-08, + "loss": 0.0227, + "step": 1962 + }, + { + "epoch": 1.94, + "grad_norm": 0.40664887186710424, + "learning_rate": 2.3807504478604583e-08, + "loss": 0.014, + "step": 1963 + }, + { + "epoch": 1.94, + "grad_norm": 2.404072714467341, + "learning_rate": 2.303392367593027e-08, + "loss": 0.0302, + "step": 1964 + }, + { + "epoch": 1.94, + "grad_norm": 0.8069295138522137, + "learning_rate": 2.2273090335491744e-08, + "loss": 0.0241, + "step": 1965 + }, + { + "epoch": 1.94, + "grad_norm": 0.3797343238694821, + "learning_rate": 2.152500640600519e-08, + "loss": 0.015, + "step": 1966 + }, + { + "epoch": 1.94, + "grad_norm": 0.8463632953185086, + "learning_rate": 2.0789673803530696e-08, + "loss": 0.0201, + "step": 1967 + }, + { + "epoch": 1.94, + "grad_norm": 1.0329783325265296, + "learning_rate": 2.006709441147059e-08, + "loss": 0.0236, + "step": 1968 + }, + { + "epoch": 1.95, + "grad_norm": 0.4942200195158771, + "learning_rate": 1.9357270080561654e-08, + "loss": 0.0204, + "step": 1969 + }, + { + "epoch": 1.95, + "grad_norm": 0.4577934210496395, + "learning_rate": 1.866020262887014e-08, + "loss": 0.0214, + "step": 1970 + }, + { + "epoch": 1.95, + "grad_norm": 0.9348702790058298, + "learning_rate": 1.7975893841790105e-08, + "loss": 0.0175, + "step": 1971 + }, + { + "epoch": 1.95, + "grad_norm": 0.563106283801156, + "learning_rate": 1.7304345472035634e-08, + "loss": 0.0142, + "step": 1972 + }, + { + "epoch": 1.95, + "grad_norm": 0.8118174545670076, + "learning_rate": 1.6645559239638066e-08, + "loss": 0.0139, + "step": 1973 + }, + { + "epoch": 1.95, + "grad_norm": 0.7418989057261591, + "learning_rate": 1.5999536831941e-08, + "loss": 0.0152, + "step": 1974 + }, + { + "epoch": 1.95, + "grad_norm": 1.0174044425443258, + "learning_rate": 1.536627990359585e-08, + "loss": 0.0196, + "step": 1975 + }, + { + "epoch": 1.95, + "grad_norm": 0.5660161998151887, + "learning_rate": 1.474579007655963e-08, + "loss": 0.0149, + "step": 1976 + }, + { + "epoch": 1.95, + "grad_norm": 0.6964625660138856, + "learning_rate": 1.413806894008718e-08, + "loss": 0.0278, + "step": 1977 + }, + { + "epoch": 1.95, + "grad_norm": 0.7449047694433539, + "learning_rate": 1.3543118050730053e-08, + "loss": 0.0095, + "step": 1978 + }, + { + "epoch": 1.96, + "grad_norm": 0.5775894499740872, + "learning_rate": 1.2960938932329858e-08, + "loss": 0.0194, + "step": 1979 + }, + { + "epoch": 1.96, + "grad_norm": 0.5028520606659571, + "learning_rate": 1.2391533076018258e-08, + "loss": 0.0185, + "step": 1980 + }, + { + "epoch": 1.96, + "grad_norm": 0.8258924922257007, + "learning_rate": 1.1834901940209752e-08, + "loss": 0.0302, + "step": 1981 + }, + { + "epoch": 1.96, + "grad_norm": 1.404199557513869, + "learning_rate": 1.129104695059835e-08, + "loss": 0.0208, + "step": 1982 + }, + { + "epoch": 1.96, + "grad_norm": 0.7722768594707403, + "learning_rate": 1.0759969500155897e-08, + "loss": 0.018, + "step": 1983 + }, + { + "epoch": 1.96, + "grad_norm": 1.2411521975184556, + "learning_rate": 1.0241670949127091e-08, + "loss": 0.0228, + "step": 1984 + }, + { + "epoch": 1.96, + "grad_norm": 0.40738726676339176, + "learning_rate": 9.73615262502503e-09, + "loss": 0.0166, + "step": 1985 + }, + { + "epoch": 1.96, + "grad_norm": 0.5418543637769685, + "learning_rate": 9.243415822629553e-09, + "loss": 0.0137, + "step": 1986 + }, + { + "epoch": 1.96, + "grad_norm": 0.5457350500278516, + "learning_rate": 8.763461803983907e-09, + "loss": 0.0168, + "step": 1987 + }, + { + "epoch": 1.96, + "grad_norm": 2.046562261481058, + "learning_rate": 8.29629179839031e-09, + "loss": 0.0236, + "step": 1988 + }, + { + "epoch": 1.97, + "grad_norm": 0.5594564303475206, + "learning_rate": 7.841907002407723e-09, + "loss": 0.0188, + "step": 1989 + }, + { + "epoch": 1.97, + "grad_norm": 0.4520631765372096, + "learning_rate": 7.40030857984686e-09, + "loss": 0.0175, + "step": 1990 + }, + { + "epoch": 1.97, + "grad_norm": 0.7392882226317737, + "learning_rate": 6.971497661771854e-09, + "loss": 0.0201, + "step": 1991 + }, + { + "epoch": 1.97, + "grad_norm": 0.7067878390043856, + "learning_rate": 6.555475346491369e-09, + "loss": 0.019, + "step": 1992 + }, + { + "epoch": 1.97, + "grad_norm": 1.4829992263795246, + "learning_rate": 6.152242699560273e-09, + "loss": 0.023, + "step": 1993 + }, + { + "epoch": 1.97, + "grad_norm": 0.557053916058929, + "learning_rate": 5.761800753775193e-09, + "loss": 0.0181, + "step": 1994 + }, + { + "epoch": 1.97, + "grad_norm": 0.4902359995960106, + "learning_rate": 5.384150509171737e-09, + "loss": 0.0106, + "step": 1995 + }, + { + "epoch": 1.97, + "grad_norm": 0.6514146661773347, + "learning_rate": 5.019292933022279e-09, + "loss": 0.022, + "step": 1996 + }, + { + "epoch": 1.97, + "grad_norm": 0.6339899864140751, + "learning_rate": 4.6672289598337364e-09, + "loss": 0.0182, + "step": 1997 + }, + { + "epoch": 1.97, + "grad_norm": 0.563678714006939, + "learning_rate": 4.327959491344791e-09, + "loss": 0.019, + "step": 1998 + }, + { + "epoch": 1.98, + "grad_norm": 0.46035552482503506, + "learning_rate": 4.001485396523119e-09, + "loss": 0.0158, + "step": 1999 + }, + { + "epoch": 1.98, + "grad_norm": 0.8784396812135417, + "learning_rate": 3.6878075115642785e-09, + "loss": 0.0144, + "step": 2000 + } + ], + "logging_steps": 1.0, + "max_steps": 2024, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 1000, + "total_flos": 85478773166080.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}