{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.925925925925926, "eval_steps": 500, "global_step": 324, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.018518518518518517, "grad_norm": 11.543508930958351, "learning_rate": 1.3333333333333334e-07, "loss": 2.5502, "step": 1 }, { "epoch": 0.037037037037037035, "grad_norm": 11.036573442393484, "learning_rate": 2.6666666666666667e-07, "loss": 2.524, "step": 2 }, { "epoch": 0.05555555555555555, "grad_norm": 10.633243381981275, "learning_rate": 4e-07, "loss": 2.204, "step": 3 }, { "epoch": 0.07407407407407407, "grad_norm": 10.844156107788931, "learning_rate": 5.333333333333333e-07, "loss": 2.6556, "step": 4 }, { "epoch": 0.09259259259259259, "grad_norm": 10.31689604512179, "learning_rate": 6.666666666666666e-07, "loss": 2.3083, "step": 5 }, { "epoch": 0.1111111111111111, "grad_norm": 9.817262372273788, "learning_rate": 8e-07, "loss": 2.4079, "step": 6 }, { "epoch": 0.12962962962962962, "grad_norm": 9.116167654173315, "learning_rate": 9.333333333333333e-07, "loss": 2.3343, "step": 7 }, { "epoch": 0.14814814814814814, "grad_norm": 7.891190287353295, "learning_rate": 1.0666666666666667e-06, "loss": 2.3883, "step": 8 }, { "epoch": 0.16666666666666666, "grad_norm": 8.42233222280676, "learning_rate": 1.2e-06, "loss": 2.4733, "step": 9 }, { "epoch": 0.18518518518518517, "grad_norm": 6.5902867588718825, "learning_rate": 1.3333333333333332e-06, "loss": 2.2598, "step": 10 }, { "epoch": 0.2037037037037037, "grad_norm": 7.468618276890062, "learning_rate": 1.4666666666666665e-06, "loss": 2.6818, "step": 11 }, { "epoch": 0.2222222222222222, "grad_norm": 6.524430399848726, "learning_rate": 1.6e-06, "loss": 2.0609, "step": 12 }, { "epoch": 0.24074074074074073, "grad_norm": 15.819198637332978, "learning_rate": 1.7333333333333334e-06, "loss": 1.8734, "step": 13 }, { "epoch": 0.25925925925925924, "grad_norm": 12.251404296601525, "learning_rate": 1.8666666666666667e-06, "loss": 2.3952, "step": 14 }, { "epoch": 0.2777777777777778, "grad_norm": 12.014341658055084, "learning_rate": 2e-06, "loss": 2.0763, "step": 15 }, { "epoch": 0.2962962962962963, "grad_norm": 9.119171460936416, "learning_rate": 1.999948316841124e-06, "loss": 2.2581, "step": 16 }, { "epoch": 0.3148148148148148, "grad_norm": 7.0075699362300785, "learning_rate": 1.999793272706794e-06, "loss": 2.3189, "step": 17 }, { "epoch": 0.3333333333333333, "grad_norm": 8.434551205593468, "learning_rate": 1.9995348836233515e-06, "loss": 2.2956, "step": 18 }, { "epoch": 0.35185185185185186, "grad_norm": 9.802253199544783, "learning_rate": 1.999173176299524e-06, "loss": 2.1106, "step": 19 }, { "epoch": 0.37037037037037035, "grad_norm": 7.364889431202562, "learning_rate": 1.9987081881236665e-06, "loss": 2.4001, "step": 20 }, { "epoch": 0.3888888888888889, "grad_norm": 5.5030313904087995, "learning_rate": 1.9981399671598938e-06, "loss": 2.0534, "step": 21 }, { "epoch": 0.4074074074074074, "grad_norm": 4.022498450217217, "learning_rate": 1.997468572143115e-06, "loss": 1.9262, "step": 22 }, { "epoch": 0.42592592592592593, "grad_norm": 4.237115597250525, "learning_rate": 1.9966940724729603e-06, "loss": 2.2743, "step": 23 }, { "epoch": 0.4444444444444444, "grad_norm": 4.300566273621826, "learning_rate": 1.995816548206609e-06, "loss": 2.028, "step": 24 }, { "epoch": 0.46296296296296297, "grad_norm": 4.157703163471443, "learning_rate": 1.994836090050514e-06, "loss": 2.2021, "step": 25 }, { "epoch": 0.48148148148148145, "grad_norm": 4.59475590188255, "learning_rate": 1.993752799351023e-06, "loss": 2.1409, "step": 26 }, { "epoch": 0.5, "grad_norm": 3.553829762084, "learning_rate": 1.992566788083908e-06, "loss": 2.1277, "step": 27 }, { "epoch": 0.5185185185185185, "grad_norm": 2.808767466788676, "learning_rate": 1.9912781788427856e-06, "loss": 2.074, "step": 28 }, { "epoch": 0.5370370370370371, "grad_norm": 2.945994143903197, "learning_rate": 1.989887104826449e-06, "loss": 1.9894, "step": 29 }, { "epoch": 0.5555555555555556, "grad_norm": 2.9859402190241, "learning_rate": 1.988393709825096e-06, "loss": 2.1096, "step": 30 }, { "epoch": 0.5740740740740741, "grad_norm": 2.788646179800959, "learning_rate": 1.9867981482054697e-06, "loss": 2.315, "step": 31 }, { "epoch": 0.5925925925925926, "grad_norm": 2.428878990731119, "learning_rate": 1.9851005848948986e-06, "loss": 2.1129, "step": 32 }, { "epoch": 0.6111111111111112, "grad_norm": 2.326070514005508, "learning_rate": 1.983301195364252e-06, "loss": 2.3507, "step": 33 }, { "epoch": 0.6296296296296297, "grad_norm": 2.2448623338584524, "learning_rate": 1.9814001656098e-06, "loss": 2.2176, "step": 34 }, { "epoch": 0.6481481481481481, "grad_norm": 3.345489216172997, "learning_rate": 1.9793976921339876e-06, "loss": 2.0352, "step": 35 }, { "epoch": 0.6666666666666666, "grad_norm": 2.454245882780074, "learning_rate": 1.9772939819251245e-06, "loss": 1.7644, "step": 36 }, { "epoch": 0.6851851851851852, "grad_norm": 2.2823601110851115, "learning_rate": 1.9750892524359894e-06, "loss": 2.0044, "step": 37 }, { "epoch": 0.7037037037037037, "grad_norm": 2.378703420397497, "learning_rate": 1.9727837315613503e-06, "loss": 1.9992, "step": 38 }, { "epoch": 0.7222222222222222, "grad_norm": 2.2038000284491392, "learning_rate": 1.9703776576144106e-06, "loss": 2.1248, "step": 39 }, { "epoch": 0.7407407407407407, "grad_norm": 1.6625652175528476, "learning_rate": 1.9678712793021747e-06, "loss": 1.7908, "step": 40 }, { "epoch": 0.7592592592592593, "grad_norm": 1.649500064167637, "learning_rate": 1.9652648556997396e-06, "loss": 2.0346, "step": 41 }, { "epoch": 0.7777777777777778, "grad_norm": 2.127402784391995, "learning_rate": 1.962558656223516e-06, "loss": 2.1544, "step": 42 }, { "epoch": 0.7962962962962963, "grad_norm": 2.4572023559040668, "learning_rate": 1.959752960603378e-06, "loss": 1.9295, "step": 43 }, { "epoch": 0.8148148148148148, "grad_norm": 1.511188510592738, "learning_rate": 1.956848058853751e-06, "loss": 2.1473, "step": 44 }, { "epoch": 0.8333333333333334, "grad_norm": 2.6425186462750276, "learning_rate": 1.9538442512436325e-06, "loss": 1.7632, "step": 45 }, { "epoch": 0.8518518518518519, "grad_norm": 2.528104013708182, "learning_rate": 1.9507418482655546e-06, "loss": 1.9125, "step": 46 }, { "epoch": 0.8703703703703703, "grad_norm": 2.660072260955662, "learning_rate": 1.947541170603488e-06, "loss": 1.9839, "step": 47 }, { "epoch": 0.8888888888888888, "grad_norm": 2.303424321729968, "learning_rate": 1.9442425490996984e-06, "loss": 1.8381, "step": 48 }, { "epoch": 0.9074074074074074, "grad_norm": 1.7413263437826438, "learning_rate": 1.940846324720544e-06, "loss": 2.2322, "step": 49 }, { "epoch": 0.9259259259259259, "grad_norm": 3.681741007928878, "learning_rate": 1.9373528485212327e-06, "loss": 2.1221, "step": 50 }, { "epoch": 0.9444444444444444, "grad_norm": 2.729258330107977, "learning_rate": 1.9337624816095357e-06, "loss": 1.8567, "step": 51 }, { "epoch": 0.9629629629629629, "grad_norm": 1.9607649593150183, "learning_rate": 1.9300755951084592e-06, "loss": 2.0553, "step": 52 }, { "epoch": 0.9814814814814815, "grad_norm": 2.119362131138027, "learning_rate": 1.9262925701178863e-06, "loss": 1.936, "step": 53 }, { "epoch": 1.0, "grad_norm": 2.057082578120893, "learning_rate": 1.9224137976751793e-06, "loss": 1.9584, "step": 54 }, { "epoch": 1.0185185185185186, "grad_norm": 2.0207421134902708, "learning_rate": 1.918439678714763e-06, "loss": 1.9837, "step": 55 }, { "epoch": 1.0185185185185186, "grad_norm": 1.938684997881939, "learning_rate": 1.9143706240266807e-06, "loss": 1.9354, "step": 56 }, { "epoch": 1.037037037037037, "grad_norm": 2.0601195298871398, "learning_rate": 1.910207054214133e-06, "loss": 2.0174, "step": 57 }, { "epoch": 1.0555555555555556, "grad_norm": 2.041620934780644, "learning_rate": 1.9059493996499985e-06, "loss": 1.7447, "step": 58 }, { "epoch": 1.074074074074074, "grad_norm": 1.5682604954979573, "learning_rate": 1.9015981004323534e-06, "loss": 2.0106, "step": 59 }, { "epoch": 1.0925925925925926, "grad_norm": 2.865965004078874, "learning_rate": 1.8971536063389742e-06, "loss": 2.2393, "step": 60 }, { "epoch": 1.1111111111111112, "grad_norm": 2.7462581398678787, "learning_rate": 1.89261637678085e-06, "loss": 1.7421, "step": 61 }, { "epoch": 1.1296296296296295, "grad_norm": 3.120548437283878, "learning_rate": 1.8879868807546932e-06, "loss": 1.9877, "step": 62 }, { "epoch": 1.1481481481481481, "grad_norm": 3.242255359642735, "learning_rate": 1.8832655967944605e-06, "loss": 1.9799, "step": 63 }, { "epoch": 1.1666666666666667, "grad_norm": 2.2159733738020275, "learning_rate": 1.8784530129218907e-06, "loss": 2.0581, "step": 64 }, { "epoch": 1.1851851851851851, "grad_norm": 2.08989006018966, "learning_rate": 1.873549626596057e-06, "loss": 1.8653, "step": 65 }, { "epoch": 1.2037037037037037, "grad_norm": 1.4837874153680628, "learning_rate": 1.8685559446619487e-06, "loss": 1.9734, "step": 66 }, { "epoch": 1.2222222222222223, "grad_norm": 2.1071721482630403, "learning_rate": 1.863472483298079e-06, "loss": 1.7762, "step": 67 }, { "epoch": 1.2407407407407407, "grad_norm": 2.6554851825477646, "learning_rate": 1.858299767963131e-06, "loss": 2.2267, "step": 68 }, { "epoch": 1.2592592592592593, "grad_norm": 2.135758261049139, "learning_rate": 1.8530383333416415e-06, "loss": 2.0624, "step": 69 }, { "epoch": 1.2777777777777777, "grad_norm": 2.256153463268274, "learning_rate": 1.847688723288733e-06, "loss": 2.0254, "step": 70 }, { "epoch": 1.2962962962962963, "grad_norm": 1.9270711341308566, "learning_rate": 1.8422514907738986e-06, "loss": 2.0873, "step": 71 }, { "epoch": 1.3148148148148149, "grad_norm": 1.3698407936967985, "learning_rate": 1.8367271978238418e-06, "loss": 1.5655, "step": 72 }, { "epoch": 1.3333333333333333, "grad_norm": 1.7934950271719698, "learning_rate": 1.8311164154643833e-06, "loss": 2.1081, "step": 73 }, { "epoch": 1.3518518518518519, "grad_norm": 1.7554770045810462, "learning_rate": 1.8254197236614353e-06, "loss": 1.6326, "step": 74 }, { "epoch": 1.3703703703703702, "grad_norm": 1.7910726004582642, "learning_rate": 1.8196377112610524e-06, "loss": 1.9896, "step": 75 }, { "epoch": 1.3888888888888888, "grad_norm": 1.43155366985165, "learning_rate": 1.8137709759285662e-06, "loss": 1.8557, "step": 76 }, { "epoch": 1.4074074074074074, "grad_norm": 1.816009532890727, "learning_rate": 1.8078201240868048e-06, "loss": 1.7878, "step": 77 }, { "epoch": 1.425925925925926, "grad_norm": 1.612331881267257, "learning_rate": 1.8017857708534106e-06, "loss": 1.982, "step": 78 }, { "epoch": 1.4444444444444444, "grad_norm": 1.8628647966869196, "learning_rate": 1.7956685399772576e-06, "loss": 1.9704, "step": 79 }, { "epoch": 1.462962962962963, "grad_norm": 1.9936817464029801, "learning_rate": 1.7894690637739762e-06, "loss": 1.8299, "step": 80 }, { "epoch": 1.4814814814814814, "grad_norm": 2.463393533692339, "learning_rate": 1.7831879830605936e-06, "loss": 2.0444, "step": 81 }, { "epoch": 1.5, "grad_norm": 2.4979859149192305, "learning_rate": 1.776825947089294e-06, "loss": 2.0278, "step": 82 }, { "epoch": 1.5185185185185186, "grad_norm": 2.7584711281071606, "learning_rate": 1.7703836134803102e-06, "loss": 1.8715, "step": 83 }, { "epoch": 1.5370370370370372, "grad_norm": 1.9266117476771798, "learning_rate": 1.7638616481539448e-06, "loss": 2.3658, "step": 84 }, { "epoch": 1.5555555555555556, "grad_norm": 2.7609401761288908, "learning_rate": 1.7572607252617377e-06, "loss": 1.9736, "step": 85 }, { "epoch": 1.574074074074074, "grad_norm": 2.114937054090088, "learning_rate": 1.7505815271167822e-06, "loss": 2.0398, "step": 86 }, { "epoch": 1.5925925925925926, "grad_norm": 2.0664911123203513, "learning_rate": 1.743824744123196e-06, "loss": 2.1056, "step": 87 }, { "epoch": 1.6111111111111112, "grad_norm": 2.5153483082090213, "learning_rate": 1.7369910747047571e-06, "loss": 1.8765, "step": 88 }, { "epoch": 1.6296296296296298, "grad_norm": 1.8949983903048848, "learning_rate": 1.7300812252327102e-06, "loss": 2.1245, "step": 89 }, { "epoch": 1.6481481481481481, "grad_norm": 2.7037983362018565, "learning_rate": 1.723095909952751e-06, "loss": 1.5174, "step": 90 }, { "epoch": 1.6666666666666665, "grad_norm": 2.3396050215927673, "learning_rate": 1.7160358509111989e-06, "loss": 2.0559, "step": 91 }, { "epoch": 1.6851851851851851, "grad_norm": 2.1357187531056976, "learning_rate": 1.7089017778803595e-06, "loss": 1.8264, "step": 92 }, { "epoch": 1.7037037037037037, "grad_norm": 2.5298502653457358, "learning_rate": 1.701694428283093e-06, "loss": 2.1282, "step": 93 }, { "epoch": 1.7222222222222223, "grad_norm": 2.0789215851330343, "learning_rate": 1.6944145471165881e-06, "loss": 2.1829, "step": 94 }, { "epoch": 1.7407407407407407, "grad_norm": 1.8110067836025452, "learning_rate": 1.6870628868753545e-06, "loss": 1.7584, "step": 95 }, { "epoch": 1.7592592592592593, "grad_norm": 2.7069181555694666, "learning_rate": 1.6796402074734402e-06, "loss": 1.897, "step": 96 }, { "epoch": 1.7777777777777777, "grad_norm": 2.3956521553142176, "learning_rate": 1.6721472761658836e-06, "loss": 1.9119, "step": 97 }, { "epoch": 1.7962962962962963, "grad_norm": 1.3732811625669847, "learning_rate": 1.664584867469403e-06, "loss": 1.6848, "step": 98 }, { "epoch": 1.8148148148148149, "grad_norm": 1.9512817035138257, "learning_rate": 1.6569537630823382e-06, "loss": 2.0185, "step": 99 }, { "epoch": 1.8333333333333335, "grad_norm": 1.864374052494234, "learning_rate": 1.6492547518038503e-06, "loss": 1.925, "step": 100 }, { "epoch": 1.8518518518518519, "grad_norm": 1.7728078338576356, "learning_rate": 1.6414886294523857e-06, "loss": 1.8965, "step": 101 }, { "epoch": 1.8703703703703702, "grad_norm": 1.8362690886038369, "learning_rate": 1.6336561987834151e-06, "loss": 1.8881, "step": 102 }, { "epoch": 1.8888888888888888, "grad_norm": 3.120191999390615, "learning_rate": 1.6257582694064556e-06, "loss": 1.7192, "step": 103 }, { "epoch": 1.9074074074074074, "grad_norm": 2.3586839267066044, "learning_rate": 1.6177956577013846e-06, "loss": 1.9387, "step": 104 }, { "epoch": 1.925925925925926, "grad_norm": 2.779686602481001, "learning_rate": 1.6097691867340543e-06, "loss": 1.9497, "step": 105 }, { "epoch": 1.9444444444444444, "grad_norm": 2.321935224272705, "learning_rate": 1.6016796861712125e-06, "loss": 1.9367, "step": 106 }, { "epoch": 1.9629629629629628, "grad_norm": 2.3211469537338276, "learning_rate": 1.5935279921947451e-06, "loss": 1.9765, "step": 107 }, { "epoch": 1.9814814814814814, "grad_norm": 1.8048838385036454, "learning_rate": 1.585314947415242e-06, "loss": 2.1524, "step": 108 }, { "epoch": 2.0, "grad_norm": 2.2432536623121866, "learning_rate": 1.5770414007848994e-06, "loss": 1.7596, "step": 109 }, { "epoch": 2.0185185185185186, "grad_norm": 2.1527401042322984, "learning_rate": 1.5687082075097674e-06, "loss": 2.1903, "step": 110 }, { "epoch": 2.0185185185185186, "grad_norm": 2.822074512897879, "learning_rate": 1.5603162289613501e-06, "loss": 2.0324, "step": 111 }, { "epoch": 2.037037037037037, "grad_norm": 1.9685786022400997, "learning_rate": 1.551866332587568e-06, "loss": 1.8009, "step": 112 }, { "epoch": 2.0555555555555554, "grad_norm": 2.515751939304619, "learning_rate": 1.5433593918230955e-06, "loss": 1.9487, "step": 113 }, { "epoch": 2.074074074074074, "grad_norm": 2.349862710312166, "learning_rate": 1.5347962859990742e-06, "loss": 1.9967, "step": 114 }, { "epoch": 2.0925925925925926, "grad_norm": 3.1803776539735233, "learning_rate": 1.5261779002522216e-06, "loss": 2.0633, "step": 115 }, { "epoch": 2.111111111111111, "grad_norm": 2.6762686321709372, "learning_rate": 1.517505125433338e-06, "loss": 2.1631, "step": 116 }, { "epoch": 2.1296296296296298, "grad_norm": 3.17350275984332, "learning_rate": 1.5087788580152206e-06, "loss": 1.7666, "step": 117 }, { "epoch": 2.148148148148148, "grad_norm": 2.7374508335058128, "learning_rate": 1.5e-06, "loss": 1.6363, "step": 118 }, { "epoch": 2.1666666666666665, "grad_norm": 2.518836889589819, "learning_rate": 1.4911694588259037e-06, "loss": 2.0306, "step": 119 }, { "epoch": 2.185185185185185, "grad_norm": 2.0516490709057438, "learning_rate": 1.482288147273456e-06, "loss": 1.7322, "step": 120 }, { "epoch": 2.2037037037037037, "grad_norm": 2.143653181079979, "learning_rate": 1.4733569833711299e-06, "loss": 1.9715, "step": 121 }, { "epoch": 2.2222222222222223, "grad_norm": 1.5702663497071736, "learning_rate": 1.4643768903004504e-06, "loss": 1.6725, "step": 122 }, { "epoch": 2.240740740740741, "grad_norm": 2.1780515590527045, "learning_rate": 1.455348796300571e-06, "loss": 1.8871, "step": 123 }, { "epoch": 2.259259259259259, "grad_norm": 1.7384200856334007, "learning_rate": 1.4462736345723259e-06, "loss": 1.8607, "step": 124 }, { "epoch": 2.2777777777777777, "grad_norm": 2.602970978377197, "learning_rate": 1.437152343181765e-06, "loss": 2.0933, "step": 125 }, { "epoch": 2.2962962962962963, "grad_norm": 2.2409639030493516, "learning_rate": 1.4279858649631928e-06, "loss": 2.1028, "step": 126 }, { "epoch": 2.314814814814815, "grad_norm": 2.083427934167806, "learning_rate": 1.4187751474217096e-06, "loss": 1.7588, "step": 127 }, { "epoch": 2.3333333333333335, "grad_norm": 1.710343556502238, "learning_rate": 1.4095211426352718e-06, "loss": 1.8985, "step": 128 }, { "epoch": 2.351851851851852, "grad_norm": 2.4282958584597645, "learning_rate": 1.4002248071562778e-06, "loss": 1.8267, "step": 129 }, { "epoch": 2.3703703703703702, "grad_norm": 2.2052175185263936, "learning_rate": 1.3908871019126954e-06, "loss": 2.254, "step": 130 }, { "epoch": 2.388888888888889, "grad_norm": 2.4962771616425745, "learning_rate": 1.3815089921087315e-06, "loss": 1.8375, "step": 131 }, { "epoch": 2.4074074074074074, "grad_norm": 2.420921240604477, "learning_rate": 1.3720914471250642e-06, "loss": 1.9705, "step": 132 }, { "epoch": 2.425925925925926, "grad_norm": 1.6871014164962779, "learning_rate": 1.3626354404186404e-06, "loss": 1.866, "step": 133 }, { "epoch": 2.4444444444444446, "grad_norm": 1.5220778910671986, "learning_rate": 1.3531419494220545e-06, "loss": 2.0116, "step": 134 }, { "epoch": 2.462962962962963, "grad_norm": 1.9736590287767704, "learning_rate": 1.343611955442513e-06, "loss": 1.7881, "step": 135 }, { "epoch": 2.4814814814814814, "grad_norm": 1.357453526449638, "learning_rate": 1.334046443560402e-06, "loss": 1.7624, "step": 136 }, { "epoch": 2.5, "grad_norm": 1.7906511346102865, "learning_rate": 1.324446402527462e-06, "loss": 1.7147, "step": 137 }, { "epoch": 2.5185185185185186, "grad_norm": 2.0256913340352, "learning_rate": 1.3148128246645848e-06, "loss": 1.657, "step": 138 }, { "epoch": 2.537037037037037, "grad_norm": 2.4368648915605786, "learning_rate": 1.3051467057592413e-06, "loss": 1.848, "step": 139 }, { "epoch": 2.5555555555555554, "grad_norm": 1.7920760208344662, "learning_rate": 1.2954490449625491e-06, "loss": 2.2794, "step": 140 }, { "epoch": 2.574074074074074, "grad_norm": 2.5934703428783115, "learning_rate": 1.2857208446859957e-06, "loss": 2.1465, "step": 141 }, { "epoch": 2.5925925925925926, "grad_norm": 1.788260906958661, "learning_rate": 1.2759631104978224e-06, "loss": 2.067, "step": 142 }, { "epoch": 2.611111111111111, "grad_norm": 2.7522723362234474, "learning_rate": 1.2661768510190816e-06, "loss": 1.8176, "step": 143 }, { "epoch": 2.6296296296296298, "grad_norm": 2.43143502900473, "learning_rate": 1.2563630778193802e-06, "loss": 2.3366, "step": 144 }, { "epoch": 2.648148148148148, "grad_norm": 1.7241238478367036, "learning_rate": 1.2465228053123172e-06, "loss": 1.9895, "step": 145 }, { "epoch": 2.6666666666666665, "grad_norm": 2.0266143160589802, "learning_rate": 1.2366570506506268e-06, "loss": 1.7781, "step": 146 }, { "epoch": 2.685185185185185, "grad_norm": 1.9459670874156856, "learning_rate": 1.226766833621041e-06, "loss": 2.3116, "step": 147 }, { "epoch": 2.7037037037037037, "grad_norm": 2.248556130449579, "learning_rate": 1.2168531765388755e-06, "loss": 1.8032, "step": 148 }, { "epoch": 2.7222222222222223, "grad_norm": 1.711136470727862, "learning_rate": 1.2069171041423583e-06, "loss": 1.6228, "step": 149 }, { "epoch": 2.7407407407407405, "grad_norm": 2.4614425382704352, "learning_rate": 1.1969596434867062e-06, "loss": 1.9709, "step": 150 }, { "epoch": 2.7592592592592595, "grad_norm": 2.3445742482429788, "learning_rate": 1.186981823837961e-06, "loss": 2.0597, "step": 151 }, { "epoch": 2.7777777777777777, "grad_norm": 1.6706837512637804, "learning_rate": 1.1769846765665992e-06, "loss": 1.6263, "step": 152 }, { "epoch": 2.7962962962962963, "grad_norm": 1.6603060271536991, "learning_rate": 1.1669692350409222e-06, "loss": 1.8723, "step": 153 }, { "epoch": 2.814814814814815, "grad_norm": 1.7552257393882156, "learning_rate": 1.1569365345202413e-06, "loss": 2.224, "step": 154 }, { "epoch": 2.8333333333333335, "grad_norm": 1.3677514217091091, "learning_rate": 1.1468876120478662e-06, "loss": 1.897, "step": 155 }, { "epoch": 2.851851851851852, "grad_norm": 1.4681588115995392, "learning_rate": 1.1368235063439102e-06, "loss": 1.7654, "step": 156 }, { "epoch": 2.8703703703703702, "grad_norm": 1.4166676047405766, "learning_rate": 1.1267452576979218e-06, "loss": 1.7817, "step": 157 }, { "epoch": 2.888888888888889, "grad_norm": 1.5999665116208726, "learning_rate": 1.1166539078613525e-06, "loss": 1.814, "step": 158 }, { "epoch": 2.9074074074074074, "grad_norm": 1.8734358713251535, "learning_rate": 1.106550499939876e-06, "loss": 2.0783, "step": 159 }, { "epoch": 2.925925925925926, "grad_norm": 1.7212322982329384, "learning_rate": 1.0964360782855666e-06, "loss": 2.0753, "step": 160 }, { "epoch": 2.9444444444444446, "grad_norm": 2.144799198000555, "learning_rate": 1.086311688388946e-06, "loss": 1.8936, "step": 161 }, { "epoch": 2.962962962962963, "grad_norm": 1.578076988317517, "learning_rate": 1.076178376770918e-06, "loss": 1.8926, "step": 162 }, { "epoch": 2.9814814814814814, "grad_norm": 2.092387225323448, "learning_rate": 1.0660371908745908e-06, "loss": 1.8244, "step": 163 }, { "epoch": 3.0, "grad_norm": 1.91051937209127, "learning_rate": 1.0558891789570082e-06, "loss": 1.8447, "step": 164 }, { "epoch": 3.0185185185185186, "grad_norm": 2.011878655711519, "learning_rate": 1.0457353899807946e-06, "loss": 1.6429, "step": 165 }, { "epoch": 3.0185185185185186, "grad_norm": 1.501437779159261, "learning_rate": 1.0355768735057273e-06, "loss": 1.8726, "step": 166 }, { "epoch": 3.037037037037037, "grad_norm": 2.2762397392089597, "learning_rate": 1.0254146795802495e-06, "loss": 1.8501, "step": 167 }, { "epoch": 3.0555555555555554, "grad_norm": 1.711019377794848, "learning_rate": 1.015249858632926e-06, "loss": 1.9443, "step": 168 }, { "epoch": 3.074074074074074, "grad_norm": 2.1218173803583733, "learning_rate": 1.0050834613638694e-06, "loss": 1.5682, "step": 169 }, { "epoch": 3.0925925925925926, "grad_norm": 2.2421674612074383, "learning_rate": 9.949165386361303e-07, "loss": 1.8014, "step": 170 }, { "epoch": 3.111111111111111, "grad_norm": 2.0898372243057706, "learning_rate": 9.847501413670742e-07, "loss": 1.8711, "step": 171 }, { "epoch": 3.1296296296296298, "grad_norm": 2.367436693252952, "learning_rate": 9.745853204197508e-07, "loss": 1.9004, "step": 172 }, { "epoch": 3.148148148148148, "grad_norm": 1.756679866289546, "learning_rate": 9.644231264942724e-07, "loss": 1.8121, "step": 173 }, { "epoch": 3.1666666666666665, "grad_norm": 1.8172318892802939, "learning_rate": 9.542646100192055e-07, "loss": 1.9013, "step": 174 }, { "epoch": 3.185185185185185, "grad_norm": 1.7124997061951257, "learning_rate": 9.441108210429921e-07, "loss": 1.7851, "step": 175 }, { "epoch": 3.2037037037037037, "grad_norm": 1.4438554381375786, "learning_rate": 9.339628091254091e-07, "loss": 1.5955, "step": 176 }, { "epoch": 3.2222222222222223, "grad_norm": 1.4447841103018684, "learning_rate": 9.238216232290821e-07, "loss": 2.0907, "step": 177 }, { "epoch": 3.240740740740741, "grad_norm": 1.6937928047736799, "learning_rate": 9.136883116110541e-07, "loss": 1.915, "step": 178 }, { "epoch": 3.259259259259259, "grad_norm": 1.306322824987709, "learning_rate": 9.035639217144334e-07, "loss": 2.0679, "step": 179 }, { "epoch": 3.2777777777777777, "grad_norm": 1.850877358174252, "learning_rate": 8.93449500060124e-07, "loss": 2.024, "step": 180 }, { "epoch": 3.2962962962962963, "grad_norm": 1.5601775038920753, "learning_rate": 8.833460921386477e-07, "loss": 1.9335, "step": 181 }, { "epoch": 3.314814814814815, "grad_norm": 2.270038996895677, "learning_rate": 8.732547423020784e-07, "loss": 2.3019, "step": 182 }, { "epoch": 3.3333333333333335, "grad_norm": 1.3421300711986788, "learning_rate": 8.631764936560899e-07, "loss": 1.8503, "step": 183 }, { "epoch": 3.351851851851852, "grad_norm": 2.071755001265988, "learning_rate": 8.53112387952134e-07, "loss": 1.838, "step": 184 }, { "epoch": 3.3703703703703702, "grad_norm": 1.6174575169544287, "learning_rate": 8.430634654797588e-07, "loss": 2.2364, "step": 185 }, { "epoch": 3.388888888888889, "grad_norm": 1.801580774474325, "learning_rate": 8.330307649590779e-07, "loss": 1.7633, "step": 186 }, { "epoch": 3.4074074074074074, "grad_norm": 2.058657705709402, "learning_rate": 8.230153234334007e-07, "loss": 2.2177, "step": 187 }, { "epoch": 3.425925925925926, "grad_norm": 1.5267427939756337, "learning_rate": 8.130181761620392e-07, "loss": 1.8588, "step": 188 }, { "epoch": 3.4444444444444446, "grad_norm": 1.8491296560891988, "learning_rate": 8.030403565132942e-07, "loss": 2.0561, "step": 189 }, { "epoch": 3.462962962962963, "grad_norm": 1.1987453530026493, "learning_rate": 7.930828958576417e-07, "loss": 2.0565, "step": 190 }, { "epoch": 3.4814814814814814, "grad_norm": 1.7195298906541316, "learning_rate": 7.831468234611247e-07, "loss": 2.0798, "step": 191 }, { "epoch": 3.5, "grad_norm": 1.20797833272688, "learning_rate": 7.73233166378959e-07, "loss": 1.8627, "step": 192 }, { "epoch": 3.5185185185185186, "grad_norm": 1.5640684128902402, "learning_rate": 7.633429493493729e-07, "loss": 2.0137, "step": 193 }, { "epoch": 3.537037037037037, "grad_norm": 1.6824510280578688, "learning_rate": 7.53477194687683e-07, "loss": 2.1517, "step": 194 }, { "epoch": 3.5555555555555554, "grad_norm": 1.4155640553151332, "learning_rate": 7.4363692218062e-07, "loss": 1.9426, "step": 195 }, { "epoch": 3.574074074074074, "grad_norm": 1.3939742232946681, "learning_rate": 7.338231489809182e-07, "loss": 1.7207, "step": 196 }, { "epoch": 3.5925925925925926, "grad_norm": 1.4589160544776356, "learning_rate": 7.240368895021775e-07, "loss": 1.8217, "step": 197 }, { "epoch": 3.611111111111111, "grad_norm": 1.3991775241667967, "learning_rate": 7.142791553140044e-07, "loss": 1.9021, "step": 198 }, { "epoch": 3.6296296296296298, "grad_norm": 1.5300112446112555, "learning_rate": 7.045509550374509e-07, "loss": 1.9647, "step": 199 }, { "epoch": 3.648148148148148, "grad_norm": 1.449273309005635, "learning_rate": 6.948532942407587e-07, "loss": 1.9613, "step": 200 }, { "epoch": 3.6666666666666665, "grad_norm": 1.069899380500529, "learning_rate": 6.851871753354153e-07, "loss": 1.7452, "step": 201 }, { "epoch": 3.685185185185185, "grad_norm": 1.5579308530316032, "learning_rate": 6.755535974725379e-07, "loss": 1.9134, "step": 202 }, { "epoch": 3.7037037037037037, "grad_norm": 1.0814459794670248, "learning_rate": 6.659535564395982e-07, "loss": 1.6609, "step": 203 }, { "epoch": 3.7222222222222223, "grad_norm": 1.8876967693657951, "learning_rate": 6.563880445574872e-07, "loss": 2.0948, "step": 204 }, { "epoch": 3.7407407407407405, "grad_norm": 1.6093595543167938, "learning_rate": 6.468580505779455e-07, "loss": 1.6327, "step": 205 }, { "epoch": 3.7592592592592595, "grad_norm": 1.9559640817344714, "learning_rate": 6.373645595813596e-07, "loss": 1.6376, "step": 206 }, { "epoch": 3.7777777777777777, "grad_norm": 2.0405778845643288, "learning_rate": 6.27908552874936e-07, "loss": 2.1409, "step": 207 }, { "epoch": 3.7962962962962963, "grad_norm": 1.230340254163767, "learning_rate": 6.184910078912686e-07, "loss": 1.686, "step": 208 }, { "epoch": 3.814814814814815, "grad_norm": 2.171420345125834, "learning_rate": 6.091128980873045e-07, "loss": 1.9347, "step": 209 }, { "epoch": 3.8333333333333335, "grad_norm": 1.8008532771859842, "learning_rate": 5.997751928437219e-07, "loss": 2.1292, "step": 210 }, { "epoch": 3.851851851851852, "grad_norm": 1.502892647903443, "learning_rate": 5.904788573647282e-07, "loss": 1.7302, "step": 211 }, { "epoch": 3.8703703703703702, "grad_norm": 1.4720170454603325, "learning_rate": 5.812248525782901e-07, "loss": 1.6652, "step": 212 }, { "epoch": 3.888888888888889, "grad_norm": 1.4078435809618528, "learning_rate": 5.720141350368072e-07, "loss": 1.7847, "step": 213 }, { "epoch": 3.9074074074074074, "grad_norm": 1.2860107867972834, "learning_rate": 5.628476568182349e-07, "loss": 1.818, "step": 214 }, { "epoch": 3.925925925925926, "grad_norm": 1.5761560916907795, "learning_rate": 5.537263654276743e-07, "loss": 1.787, "step": 215 }, { "epoch": 3.9444444444444446, "grad_norm": 1.463921943518727, "learning_rate": 5.446512036994286e-07, "loss": 1.9223, "step": 216 }, { "epoch": 3.962962962962963, "grad_norm": 1.2770391505323755, "learning_rate": 5.356231096995499e-07, "loss": 1.5593, "step": 217 }, { "epoch": 3.9814814814814814, "grad_norm": 1.4711865688844035, "learning_rate": 5.266430166288704e-07, "loss": 2.0863, "step": 218 }, { "epoch": 4.0, "grad_norm": 1.1447313661292717, "learning_rate": 5.177118527265437e-07, "loss": 1.9428, "step": 219 }, { "epoch": 4.018518518518519, "grad_norm": 1.6196943319397998, "learning_rate": 5.088305411740965e-07, "loss": 2.2068, "step": 220 }, { "epoch": 4.018518518518518, "grad_norm": 1.2766493962889875, "learning_rate": 5.000000000000002e-07, "loss": 1.7437, "step": 221 }, { "epoch": 4.037037037037037, "grad_norm": 1.594306405599087, "learning_rate": 4.912211419847793e-07, "loss": 2.0219, "step": 222 }, { "epoch": 4.055555555555555, "grad_norm": 1.227716475966799, "learning_rate": 4.82494874566662e-07, "loss": 2.187, "step": 223 }, { "epoch": 4.074074074074074, "grad_norm": 1.2852396998354376, "learning_rate": 4.738220997477784e-07, "loss": 1.8363, "step": 224 }, { "epoch": 4.092592592592593, "grad_norm": 1.0923893050000644, "learning_rate": 4.6520371400092584e-07, "loss": 1.7177, "step": 225 }, { "epoch": 4.111111111111111, "grad_norm": 1.1495819987216884, "learning_rate": 4.5664060817690476e-07, "loss": 2.0734, "step": 226 }, { "epoch": 4.12962962962963, "grad_norm": 1.1120083230916684, "learning_rate": 4.481336674124323e-07, "loss": 1.7847, "step": 227 }, { "epoch": 4.148148148148148, "grad_norm": 0.9789098979808262, "learning_rate": 4.3968377103865016e-07, "loss": 1.7989, "step": 228 }, { "epoch": 4.166666666666667, "grad_norm": 0.9342477457439083, "learning_rate": 4.3129179249023274e-07, "loss": 1.6785, "step": 229 }, { "epoch": 4.185185185185185, "grad_norm": 1.0718449337061247, "learning_rate": 4.229585992151006e-07, "loss": 1.7953, "step": 230 }, { "epoch": 4.203703703703703, "grad_norm": 1.1500516991492213, "learning_rate": 4.1468505258475784e-07, "loss": 1.3975, "step": 231 }, { "epoch": 4.222222222222222, "grad_norm": 0.9650831232767911, "learning_rate": 4.0647200780525483e-07, "loss": 1.8603, "step": 232 }, { "epoch": 4.2407407407407405, "grad_norm": 1.0207088687244406, "learning_rate": 3.983203138287876e-07, "loss": 1.9807, "step": 233 }, { "epoch": 4.2592592592592595, "grad_norm": 1.1991752171611891, "learning_rate": 3.9023081326594564e-07, "loss": 2.2322, "step": 234 }, { "epoch": 4.277777777777778, "grad_norm": 1.0807801212200088, "learning_rate": 3.822043422986153e-07, "loss": 1.6295, "step": 235 }, { "epoch": 4.296296296296296, "grad_norm": 1.0103392155699495, "learning_rate": 3.742417305935442e-07, "loss": 1.7882, "step": 236 }, { "epoch": 4.314814814814815, "grad_norm": 1.0657639750720669, "learning_rate": 3.663438012165848e-07, "loss": 1.6027, "step": 237 }, { "epoch": 4.333333333333333, "grad_norm": 0.9495451533397854, "learning_rate": 3.5851137054761426e-07, "loss": 1.8212, "step": 238 }, { "epoch": 4.351851851851852, "grad_norm": 1.0780389016215326, "learning_rate": 3.507452481961495e-07, "loss": 1.6304, "step": 239 }, { "epoch": 4.37037037037037, "grad_norm": 1.0244203325558825, "learning_rate": 3.430462369176619e-07, "loss": 1.9347, "step": 240 }, { "epoch": 4.388888888888889, "grad_norm": 0.9762810523750869, "learning_rate": 3.3541513253059726e-07, "loss": 2.0351, "step": 241 }, { "epoch": 4.407407407407407, "grad_norm": 0.8894982063199672, "learning_rate": 3.278527238341163e-07, "loss": 1.7788, "step": 242 }, { "epoch": 4.425925925925926, "grad_norm": 0.9573443483478868, "learning_rate": 3.2035979252655976e-07, "loss": 1.6824, "step": 243 }, { "epoch": 4.444444444444445, "grad_norm": 0.878347387417952, "learning_rate": 3.129371131246459e-07, "loss": 1.7893, "step": 244 }, { "epoch": 4.462962962962963, "grad_norm": 1.2131347174643223, "learning_rate": 3.05585452883412e-07, "loss": 2.4755, "step": 245 }, { "epoch": 4.481481481481482, "grad_norm": 0.9278993006726863, "learning_rate": 2.9830557171690693e-07, "loss": 2.051, "step": 246 }, { "epoch": 4.5, "grad_norm": 0.9769923688632531, "learning_rate": 2.910982221196404e-07, "loss": 1.8307, "step": 247 }, { "epoch": 4.518518518518518, "grad_norm": 1.0084007217465136, "learning_rate": 2.8396414908880095e-07, "loss": 2.0386, "step": 248 }, { "epoch": 4.537037037037037, "grad_norm": 1.0273787706173494, "learning_rate": 2.769040900472488e-07, "loss": 1.9072, "step": 249 }, { "epoch": 4.555555555555555, "grad_norm": 0.8621559648712259, "learning_rate": 2.6991877476728985e-07, "loss": 1.706, "step": 250 }, { "epoch": 4.574074074074074, "grad_norm": 0.8247377172080764, "learning_rate": 2.6300892529524264e-07, "loss": 1.8414, "step": 251 }, { "epoch": 4.592592592592593, "grad_norm": 0.8925073470001154, "learning_rate": 2.56175255876804e-07, "loss": 1.9007, "step": 252 }, { "epoch": 4.611111111111111, "grad_norm": 0.7860274094152706, "learning_rate": 2.494184728832179e-07, "loss": 1.8654, "step": 253 }, { "epoch": 4.62962962962963, "grad_norm": 0.8936613069940655, "learning_rate": 2.427392747382623e-07, "loss": 1.6996, "step": 254 }, { "epoch": 4.648148148148148, "grad_norm": 1.0827181264619206, "learning_rate": 2.3613835184605523e-07, "loss": 1.9413, "step": 255 }, { "epoch": 4.666666666666667, "grad_norm": 0.8918696543620299, "learning_rate": 2.2961638651968974e-07, "loss": 1.856, "step": 256 }, { "epoch": 4.685185185185185, "grad_norm": 0.9976782397503938, "learning_rate": 2.2317405291070567e-07, "loss": 1.8228, "step": 257 }, { "epoch": 4.703703703703704, "grad_norm": 1.021922767232776, "learning_rate": 2.1681201693940666e-07, "loss": 2.0057, "step": 258 }, { "epoch": 4.722222222222222, "grad_norm": 1.1673943142630625, "learning_rate": 2.1053093622602402e-07, "loss": 1.9204, "step": 259 }, { "epoch": 4.7407407407407405, "grad_norm": 1.0912049168909328, "learning_rate": 2.043314600227425e-07, "loss": 1.8173, "step": 260 }, { "epoch": 4.7592592592592595, "grad_norm": 0.9358084522077252, "learning_rate": 1.9821422914658957e-07, "loss": 2.0846, "step": 261 }, { "epoch": 4.777777777777778, "grad_norm": 1.0481784665647413, "learning_rate": 1.921798759131953e-07, "loss": 1.9789, "step": 262 }, { "epoch": 4.796296296296296, "grad_norm": 0.983048254792995, "learning_rate": 1.8622902407143392e-07, "loss": 1.9294, "step": 263 }, { "epoch": 4.814814814814815, "grad_norm": 0.8359638487960833, "learning_rate": 1.8036228873894744e-07, "loss": 1.7806, "step": 264 }, { "epoch": 4.833333333333333, "grad_norm": 1.1295927764034195, "learning_rate": 1.7458027633856475e-07, "loss": 1.9495, "step": 265 }, { "epoch": 4.851851851851852, "grad_norm": 1.1032897990848558, "learning_rate": 1.6888358453561646e-07, "loss": 2.0724, "step": 266 }, { "epoch": 4.87037037037037, "grad_norm": 0.855002738874884, "learning_rate": 1.632728021761579e-07, "loss": 2.102, "step": 267 }, { "epoch": 4.888888888888889, "grad_norm": 1.0646161730662291, "learning_rate": 1.5774850922610116e-07, "loss": 1.9046, "step": 268 }, { "epoch": 4.907407407407407, "grad_norm": 1.0109654313968932, "learning_rate": 1.5231127671126676e-07, "loss": 2.0854, "step": 269 }, { "epoch": 4.925925925925926, "grad_norm": 0.9390534047671891, "learning_rate": 1.4696166665835852e-07, "loss": 2.1436, "step": 270 }, { "epoch": 4.944444444444445, "grad_norm": 0.9838446669064714, "learning_rate": 1.4170023203686875e-07, "loss": 1.9317, "step": 271 }, { "epoch": 4.962962962962963, "grad_norm": 1.0678273880700424, "learning_rate": 1.3652751670192075e-07, "loss": 1.8309, "step": 272 }, { "epoch": 4.981481481481482, "grad_norm": 1.1853311551704062, "learning_rate": 1.3144405533805136e-07, "loss": 1.948, "step": 273 }, { "epoch": 5.0, "grad_norm": 1.0844767215232378, "learning_rate": 1.2645037340394281e-07, "loss": 2.1066, "step": 274 }, { "epoch": 5.018518518518518, "grad_norm": 0.8509695959322425, "learning_rate": 1.2154698707810928e-07, "loss": 1.9217, "step": 275 }, { "epoch": 5.037037037037037, "grad_norm": 0.9599815386335595, "learning_rate": 1.167344032055394e-07, "loss": 1.9898, "step": 276 }, { "epoch": 5.055555555555555, "grad_norm": 0.9561022219351966, "learning_rate": 1.1201311924530688e-07, "loss": 1.6967, "step": 277 }, { "epoch": 5.074074074074074, "grad_norm": 0.8614534074294055, "learning_rate": 1.0738362321914995e-07, "loss": 1.7586, "step": 278 }, { "epoch": 5.092592592592593, "grad_norm": 0.884706815883145, "learning_rate": 1.0284639366102598e-07, "loss": 1.8692, "step": 279 }, { "epoch": 5.111111111111111, "grad_norm": 0.8641496604329509, "learning_rate": 9.840189956764677e-08, "loss": 2.1101, "step": 280 }, { "epoch": 5.12962962962963, "grad_norm": 0.8465414034017087, "learning_rate": 9.405060035000134e-08, "loss": 1.7827, "step": 281 }, { "epoch": 5.148148148148148, "grad_norm": 0.6966794157650356, "learning_rate": 8.979294578586738e-08, "loss": 1.6446, "step": 282 }, { "epoch": 5.166666666666667, "grad_norm": 0.8581271311276034, "learning_rate": 8.562937597331898e-08, "loss": 1.7243, "step": 283 }, { "epoch": 5.185185185185185, "grad_norm": 0.9976947326325505, "learning_rate": 8.156032128523694e-08, "loss": 1.8994, "step": 284 }, { "epoch": 5.203703703703703, "grad_norm": 0.9786757162446749, "learning_rate": 7.758620232482083e-08, "loss": 1.8625, "step": 285 }, { "epoch": 5.222222222222222, "grad_norm": 0.7563393752170862, "learning_rate": 7.370742988211364e-08, "loss": 1.7512, "step": 286 }, { "epoch": 5.2407407407407405, "grad_norm": 0.7955178168012043, "learning_rate": 6.99244048915405e-08, "loss": 2.2105, "step": 287 }, { "epoch": 5.2592592592592595, "grad_norm": 0.8951178929520269, "learning_rate": 6.623751839046455e-08, "loss": 1.8276, "step": 288 }, { "epoch": 5.277777777777778, "grad_norm": 0.9912120605663316, "learning_rate": 6.264715147876742e-08, "loss": 2.2784, "step": 289 }, { "epoch": 5.296296296296296, "grad_norm": 0.759976000502015, "learning_rate": 5.915367527945614e-08, "loss": 1.9346, "step": 290 }, { "epoch": 5.314814814814815, "grad_norm": 0.7423091105639062, "learning_rate": 5.575745090030137e-08, "loss": 1.8795, "step": 291 }, { "epoch": 5.333333333333333, "grad_norm": 0.7811530255930925, "learning_rate": 5.245882939651181e-08, "loss": 2.0584, "step": 292 }, { "epoch": 5.351851851851852, "grad_norm": 0.9202352755672565, "learning_rate": 4.9258151734445694e-08, "loss": 2.0563, "step": 293 }, { "epoch": 5.37037037037037, "grad_norm": 0.7972657702760176, "learning_rate": 4.6155748756367294e-08, "loss": 1.8333, "step": 294 }, { "epoch": 5.388888888888889, "grad_norm": 0.6829451582697305, "learning_rate": 4.3151941146248873e-08, "loss": 1.9896, "step": 295 }, { "epoch": 5.407407407407407, "grad_norm": 0.7886670762082094, "learning_rate": 4.0247039396622e-08, "loss": 1.8183, "step": 296 }, { "epoch": 5.425925925925926, "grad_norm": 0.8840244220041553, "learning_rate": 3.7441343776484113e-08, "loss": 1.9354, "step": 297 }, { "epoch": 5.444444444444445, "grad_norm": 0.71587738270711, "learning_rate": 3.4735144300260255e-08, "loss": 2.0167, "step": 298 }, { "epoch": 5.462962962962963, "grad_norm": 0.7108094024246895, "learning_rate": 3.212872069782513e-08, "loss": 1.7169, "step": 299 }, { "epoch": 5.481481481481482, "grad_norm": 0.6662930242485889, "learning_rate": 2.962234238558925e-08, "loss": 2.2062, "step": 300 }, { "epoch": 5.5, "grad_norm": 0.7122621954506775, "learning_rate": 2.721626843864977e-08, "loss": 2.0591, "step": 301 }, { "epoch": 5.518518518518518, "grad_norm": 0.626318180659774, "learning_rate": 2.491074756401068e-08, "loss": 1.5866, "step": 302 }, { "epoch": 5.537037037037037, "grad_norm": 0.6909592708288532, "learning_rate": 2.2706018074875043e-08, "loss": 1.9005, "step": 303 }, { "epoch": 5.555555555555555, "grad_norm": 0.7144569439769612, "learning_rate": 2.0602307866012246e-08, "loss": 2.0294, "step": 304 }, { "epoch": 5.574074074074074, "grad_norm": 0.684647174393133, "learning_rate": 1.8599834390199853e-08, "loss": 1.6046, "step": 305 }, { "epoch": 5.592592592592593, "grad_norm": 0.7752801436279185, "learning_rate": 1.6698804635747576e-08, "loss": 1.7937, "step": 306 }, { "epoch": 5.611111111111111, "grad_norm": 0.6862611972609113, "learning_rate": 1.4899415105101066e-08, "loss": 1.7256, "step": 307 }, { "epoch": 5.62962962962963, "grad_norm": 0.6608135193001434, "learning_rate": 1.3201851794530371e-08, "loss": 1.7763, "step": 308 }, { "epoch": 5.648148148148148, "grad_norm": 0.7625095579861546, "learning_rate": 1.1606290174903888e-08, "loss": 2.0082, "step": 309 }, { "epoch": 5.666666666666667, "grad_norm": 0.6914220267730987, "learning_rate": 1.0112895173551183e-08, "loss": 1.9359, "step": 310 }, { "epoch": 5.685185185185185, "grad_norm": 0.6505975431309626, "learning_rate": 8.721821157214316e-09, "loss": 1.9317, "step": 311 }, { "epoch": 5.703703703703704, "grad_norm": 0.6947915176450158, "learning_rate": 7.433211916092141e-09, "loss": 1.6243, "step": 312 }, { "epoch": 5.722222222222222, "grad_norm": 0.6360099423433963, "learning_rate": 6.247200648976991e-09, "loss": 1.9931, "step": 313 }, { "epoch": 5.7407407407407405, "grad_norm": 0.6796797146249973, "learning_rate": 5.163909949486233e-09, "loss": 1.9858, "step": 314 }, { "epoch": 5.7592592592592595, "grad_norm": 0.7636965994787633, "learning_rate": 4.183451793390747e-09, "loss": 1.8201, "step": 315 }, { "epoch": 5.777777777777778, "grad_norm": 0.6434704456483539, "learning_rate": 3.30592752703962e-09, "loss": 1.5983, "step": 316 }, { "epoch": 5.796296296296296, "grad_norm": 0.6697682736960676, "learning_rate": 2.531427856885093e-09, "loss": 1.985, "step": 317 }, { "epoch": 5.814814814814815, "grad_norm": 0.657234650874368, "learning_rate": 1.8600328401061627e-09, "loss": 2.0918, "step": 318 }, { "epoch": 5.833333333333333, "grad_norm": 0.6249721935624161, "learning_rate": 1.2918118763335372e-09, "loss": 2.1123, "step": 319 }, { "epoch": 5.851851851851852, "grad_norm": 0.7274585554347512, "learning_rate": 8.268237004757095e-10, "loss": 2.2962, "step": 320 }, { "epoch": 5.87037037037037, "grad_norm": 0.6060864357328691, "learning_rate": 4.651163766484778e-10, "loss": 1.6461, "step": 321 }, { "epoch": 5.888888888888889, "grad_norm": 0.6626618247650778, "learning_rate": 2.0672729320581063e-10, "loss": 2.0178, "step": 322 }, { "epoch": 5.907407407407407, "grad_norm": 0.5905608542721459, "learning_rate": 5.1683158875936994e-11, "loss": 1.7269, "step": 323 }, { "epoch": 5.925925925925926, "grad_norm": 0.7138681736753105, "learning_rate": 0.0, "loss": 2.11, "step": 324 } ], "logging_steps": 1, "max_steps": 324, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 54, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 128024720179200.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }