Safetensors
Japanese
llama
sarashina-2.1-1b-sft / trainer_state.json
googlefan's picture
Upload folder using huggingface_hub
cd36ded verified
raw
history blame
171 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9990426041168023,
"eval_steps": 500,
"global_step": 1044,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0019147917663954045,
"grad_norm": 61.0,
"learning_rate": 3.773584905660378e-07,
"loss": 3.0718,
"step": 1
},
{
"epoch": 0.003829583532790809,
"grad_norm": 62.0,
"learning_rate": 7.547169811320755e-07,
"loss": 3.1052,
"step": 2
},
{
"epoch": 0.0057443752991862135,
"grad_norm": 62.5,
"learning_rate": 1.1320754716981133e-06,
"loss": 3.1376,
"step": 3
},
{
"epoch": 0.007659167065581618,
"grad_norm": 103.5,
"learning_rate": 1.509433962264151e-06,
"loss": 3.0909,
"step": 4
},
{
"epoch": 0.009573958831977022,
"grad_norm": 62.0,
"learning_rate": 1.8867924528301889e-06,
"loss": 3.1064,
"step": 5
},
{
"epoch": 0.011488750598372427,
"grad_norm": 60.5,
"learning_rate": 2.2641509433962266e-06,
"loss": 3.0511,
"step": 6
},
{
"epoch": 0.013403542364767831,
"grad_norm": 58.75,
"learning_rate": 2.6415094339622644e-06,
"loss": 3.0435,
"step": 7
},
{
"epoch": 0.015318334131163236,
"grad_norm": 54.0,
"learning_rate": 3.018867924528302e-06,
"loss": 2.9226,
"step": 8
},
{
"epoch": 0.01723312589755864,
"grad_norm": 56.0,
"learning_rate": 3.3962264150943395e-06,
"loss": 3.0539,
"step": 9
},
{
"epoch": 0.019147917663954045,
"grad_norm": 50.5,
"learning_rate": 3.7735849056603777e-06,
"loss": 2.8775,
"step": 10
},
{
"epoch": 0.02106270943034945,
"grad_norm": 46.75,
"learning_rate": 4.150943396226416e-06,
"loss": 2.8774,
"step": 11
},
{
"epoch": 0.022977501196744854,
"grad_norm": 42.5,
"learning_rate": 4.528301886792453e-06,
"loss": 2.6977,
"step": 12
},
{
"epoch": 0.02489229296314026,
"grad_norm": 35.5,
"learning_rate": 4.905660377358491e-06,
"loss": 2.6109,
"step": 13
},
{
"epoch": 0.026807084729535663,
"grad_norm": 30.0,
"learning_rate": 5.283018867924529e-06,
"loss": 2.5524,
"step": 14
},
{
"epoch": 0.028721876495931067,
"grad_norm": 24.125,
"learning_rate": 5.660377358490566e-06,
"loss": 2.5467,
"step": 15
},
{
"epoch": 0.030636668262326472,
"grad_norm": 19.625,
"learning_rate": 6.037735849056604e-06,
"loss": 2.3976,
"step": 16
},
{
"epoch": 0.032551460028721876,
"grad_norm": 17.25,
"learning_rate": 6.415094339622642e-06,
"loss": 2.349,
"step": 17
},
{
"epoch": 0.03446625179511728,
"grad_norm": 13.625,
"learning_rate": 6.792452830188679e-06,
"loss": 2.3495,
"step": 18
},
{
"epoch": 0.036381043561512685,
"grad_norm": 10.375,
"learning_rate": 7.169811320754717e-06,
"loss": 2.3404,
"step": 19
},
{
"epoch": 0.03829583532790809,
"grad_norm": 10.4375,
"learning_rate": 7.5471698113207555e-06,
"loss": 2.2341,
"step": 20
},
{
"epoch": 0.040210627094303494,
"grad_norm": 7.96875,
"learning_rate": 7.924528301886793e-06,
"loss": 2.1499,
"step": 21
},
{
"epoch": 0.0421254188606989,
"grad_norm": 7.125,
"learning_rate": 8.301886792452832e-06,
"loss": 2.1879,
"step": 22
},
{
"epoch": 0.0440402106270943,
"grad_norm": 5.875,
"learning_rate": 8.67924528301887e-06,
"loss": 2.147,
"step": 23
},
{
"epoch": 0.04595500239348971,
"grad_norm": 15.375,
"learning_rate": 9.056603773584907e-06,
"loss": 2.2175,
"step": 24
},
{
"epoch": 0.04786979415988511,
"grad_norm": 7.65625,
"learning_rate": 9.433962264150944e-06,
"loss": 2.1928,
"step": 25
},
{
"epoch": 0.04978458592628052,
"grad_norm": 4.125,
"learning_rate": 9.811320754716981e-06,
"loss": 2.1355,
"step": 26
},
{
"epoch": 0.05169937769267592,
"grad_norm": 3.8125,
"learning_rate": 1.018867924528302e-05,
"loss": 2.117,
"step": 27
},
{
"epoch": 0.053614169459071326,
"grad_norm": 3.890625,
"learning_rate": 1.0566037735849058e-05,
"loss": 2.0604,
"step": 28
},
{
"epoch": 0.05552896122546673,
"grad_norm": 3.5,
"learning_rate": 1.0943396226415095e-05,
"loss": 2.0949,
"step": 29
},
{
"epoch": 0.057443752991862135,
"grad_norm": 3.203125,
"learning_rate": 1.1320754716981132e-05,
"loss": 2.0517,
"step": 30
},
{
"epoch": 0.05935854475825754,
"grad_norm": 2.96875,
"learning_rate": 1.169811320754717e-05,
"loss": 2.1012,
"step": 31
},
{
"epoch": 0.061273336524652944,
"grad_norm": 3.40625,
"learning_rate": 1.2075471698113209e-05,
"loss": 2.0485,
"step": 32
},
{
"epoch": 0.06318812829104835,
"grad_norm": 2.734375,
"learning_rate": 1.2452830188679246e-05,
"loss": 1.9389,
"step": 33
},
{
"epoch": 0.06510292005744375,
"grad_norm": 3.0,
"learning_rate": 1.2830188679245283e-05,
"loss": 2.0672,
"step": 34
},
{
"epoch": 0.06701771182383916,
"grad_norm": 2.765625,
"learning_rate": 1.320754716981132e-05,
"loss": 2.0884,
"step": 35
},
{
"epoch": 0.06893250359023456,
"grad_norm": 2.6875,
"learning_rate": 1.3584905660377358e-05,
"loss": 2.0209,
"step": 36
},
{
"epoch": 0.07084729535662997,
"grad_norm": 2.71875,
"learning_rate": 1.3962264150943397e-05,
"loss": 1.952,
"step": 37
},
{
"epoch": 0.07276208712302537,
"grad_norm": 2.640625,
"learning_rate": 1.4339622641509435e-05,
"loss": 1.9835,
"step": 38
},
{
"epoch": 0.07467687888942078,
"grad_norm": 2.5625,
"learning_rate": 1.4716981132075472e-05,
"loss": 1.962,
"step": 39
},
{
"epoch": 0.07659167065581618,
"grad_norm": 2.625,
"learning_rate": 1.5094339622641511e-05,
"loss": 1.9884,
"step": 40
},
{
"epoch": 0.07850646242221158,
"grad_norm": 6.9375,
"learning_rate": 1.547169811320755e-05,
"loss": 2.0301,
"step": 41
},
{
"epoch": 0.08042125418860699,
"grad_norm": 2.609375,
"learning_rate": 1.5849056603773586e-05,
"loss": 1.9727,
"step": 42
},
{
"epoch": 0.0823360459550024,
"grad_norm": 2.671875,
"learning_rate": 1.6226415094339625e-05,
"loss": 2.0236,
"step": 43
},
{
"epoch": 0.0842508377213978,
"grad_norm": 2.640625,
"learning_rate": 1.6603773584905664e-05,
"loss": 1.9287,
"step": 44
},
{
"epoch": 0.0861656294877932,
"grad_norm": 2.6875,
"learning_rate": 1.69811320754717e-05,
"loss": 1.9009,
"step": 45
},
{
"epoch": 0.0880804212541886,
"grad_norm": 2.609375,
"learning_rate": 1.735849056603774e-05,
"loss": 1.9898,
"step": 46
},
{
"epoch": 0.08999521302058401,
"grad_norm": 2.640625,
"learning_rate": 1.7735849056603774e-05,
"loss": 1.9054,
"step": 47
},
{
"epoch": 0.09191000478697942,
"grad_norm": 2.71875,
"learning_rate": 1.8113207547169813e-05,
"loss": 1.9189,
"step": 48
},
{
"epoch": 0.09382479655337482,
"grad_norm": 2.78125,
"learning_rate": 1.8490566037735852e-05,
"loss": 1.9284,
"step": 49
},
{
"epoch": 0.09573958831977022,
"grad_norm": 2.703125,
"learning_rate": 1.8867924528301888e-05,
"loss": 1.8944,
"step": 50
},
{
"epoch": 0.09765438008616563,
"grad_norm": 2.796875,
"learning_rate": 1.9245283018867927e-05,
"loss": 1.9709,
"step": 51
},
{
"epoch": 0.09956917185256103,
"grad_norm": 2.734375,
"learning_rate": 1.9622641509433963e-05,
"loss": 1.9177,
"step": 52
},
{
"epoch": 0.10148396361895644,
"grad_norm": 2.828125,
"learning_rate": 2e-05,
"loss": 1.9661,
"step": 53
},
{
"epoch": 0.10339875538535184,
"grad_norm": 2.828125,
"learning_rate": 1.9999949751618577e-05,
"loss": 1.8821,
"step": 54
},
{
"epoch": 0.10531354715174725,
"grad_norm": 2.796875,
"learning_rate": 1.9999799006979282e-05,
"loss": 1.922,
"step": 55
},
{
"epoch": 0.10722833891814265,
"grad_norm": 2.875,
"learning_rate": 1.9999547767597055e-05,
"loss": 2.01,
"step": 56
},
{
"epoch": 0.10914313068453806,
"grad_norm": 2.796875,
"learning_rate": 1.9999196035996768e-05,
"loss": 1.9654,
"step": 57
},
{
"epoch": 0.11105792245093346,
"grad_norm": 2.765625,
"learning_rate": 1.999874381571321e-05,
"loss": 1.9159,
"step": 58
},
{
"epoch": 0.11297271421732887,
"grad_norm": 2.734375,
"learning_rate": 1.999819111129105e-05,
"loss": 1.9388,
"step": 59
},
{
"epoch": 0.11488750598372427,
"grad_norm": 2.71875,
"learning_rate": 1.9997537928284783e-05,
"loss": 1.8953,
"step": 60
},
{
"epoch": 0.11680229775011967,
"grad_norm": 2.578125,
"learning_rate": 1.9996784273258688e-05,
"loss": 1.8523,
"step": 61
},
{
"epoch": 0.11871708951651508,
"grad_norm": 2.703125,
"learning_rate": 1.999593015378676e-05,
"loss": 1.9153,
"step": 62
},
{
"epoch": 0.12063188128291048,
"grad_norm": 2.4375,
"learning_rate": 1.9994975578452626e-05,
"loss": 1.817,
"step": 63
},
{
"epoch": 0.12254667304930589,
"grad_norm": 2.359375,
"learning_rate": 1.9993920556849448e-05,
"loss": 1.845,
"step": 64
},
{
"epoch": 0.12446146481570129,
"grad_norm": 2.359375,
"learning_rate": 1.9992765099579853e-05,
"loss": 1.8886,
"step": 65
},
{
"epoch": 0.1263762565820967,
"grad_norm": 2.203125,
"learning_rate": 1.999150921825582e-05,
"loss": 1.8955,
"step": 66
},
{
"epoch": 0.12829104834849211,
"grad_norm": 2.25,
"learning_rate": 1.9990152925498545e-05,
"loss": 1.8308,
"step": 67
},
{
"epoch": 0.1302058401148875,
"grad_norm": 2.078125,
"learning_rate": 1.9988696234938333e-05,
"loss": 1.9141,
"step": 68
},
{
"epoch": 0.13212063188128292,
"grad_norm": 2.140625,
"learning_rate": 1.998713916121445e-05,
"loss": 1.9814,
"step": 69
},
{
"epoch": 0.13403542364767831,
"grad_norm": 2.046875,
"learning_rate": 1.9985481719974985e-05,
"loss": 1.7735,
"step": 70
},
{
"epoch": 0.13595021541407373,
"grad_norm": 2.140625,
"learning_rate": 1.9983723927876685e-05,
"loss": 1.8418,
"step": 71
},
{
"epoch": 0.13786500718046912,
"grad_norm": 2.0625,
"learning_rate": 1.9981865802584795e-05,
"loss": 1.8867,
"step": 72
},
{
"epoch": 0.13977979894686454,
"grad_norm": 2.265625,
"learning_rate": 1.9979907362772865e-05,
"loss": 1.8945,
"step": 73
},
{
"epoch": 0.14169459071325993,
"grad_norm": 2.125,
"learning_rate": 1.997784862812259e-05,
"loss": 1.8312,
"step": 74
},
{
"epoch": 0.14360938247965535,
"grad_norm": 2.0,
"learning_rate": 1.997568961932358e-05,
"loss": 1.8979,
"step": 75
},
{
"epoch": 0.14552417424605074,
"grad_norm": 2.0,
"learning_rate": 1.997343035807318e-05,
"loss": 1.8536,
"step": 76
},
{
"epoch": 0.14743896601244616,
"grad_norm": 2.171875,
"learning_rate": 1.9971070867076227e-05,
"loss": 1.814,
"step": 77
},
{
"epoch": 0.14935375777884155,
"grad_norm": 1.9375,
"learning_rate": 1.996861117004485e-05,
"loss": 1.8618,
"step": 78
},
{
"epoch": 0.15126854954523697,
"grad_norm": 2.046875,
"learning_rate": 1.9966051291698202e-05,
"loss": 1.8376,
"step": 79
},
{
"epoch": 0.15318334131163236,
"grad_norm": 2.140625,
"learning_rate": 1.9963391257762234e-05,
"loss": 1.8344,
"step": 80
},
{
"epoch": 0.15509813307802778,
"grad_norm": 2.203125,
"learning_rate": 1.9960631094969424e-05,
"loss": 1.8546,
"step": 81
},
{
"epoch": 0.15701292484442317,
"grad_norm": 2.140625,
"learning_rate": 1.9957770831058518e-05,
"loss": 1.8692,
"step": 82
},
{
"epoch": 0.1589277166108186,
"grad_norm": 2.109375,
"learning_rate": 1.995481049477424e-05,
"loss": 1.8659,
"step": 83
},
{
"epoch": 0.16084250837721398,
"grad_norm": 2.15625,
"learning_rate": 1.9951750115867013e-05,
"loss": 1.8511,
"step": 84
},
{
"epoch": 0.1627573001436094,
"grad_norm": 2.03125,
"learning_rate": 1.994858972509265e-05,
"loss": 1.8137,
"step": 85
},
{
"epoch": 0.1646720919100048,
"grad_norm": 2.078125,
"learning_rate": 1.994532935421206e-05,
"loss": 1.9099,
"step": 86
},
{
"epoch": 0.1665868836764002,
"grad_norm": 2.0,
"learning_rate": 1.9941969035990913e-05,
"loss": 1.8185,
"step": 87
},
{
"epoch": 0.1685016754427956,
"grad_norm": 2.046875,
"learning_rate": 1.9938508804199322e-05,
"loss": 1.8309,
"step": 88
},
{
"epoch": 0.170416467209191,
"grad_norm": 2.03125,
"learning_rate": 1.9934948693611495e-05,
"loss": 1.7971,
"step": 89
},
{
"epoch": 0.1723312589755864,
"grad_norm": 2.078125,
"learning_rate": 1.9931288740005388e-05,
"loss": 1.8312,
"step": 90
},
{
"epoch": 0.17424605074198182,
"grad_norm": 2.078125,
"learning_rate": 1.992752898016235e-05,
"loss": 1.7379,
"step": 91
},
{
"epoch": 0.1761608425083772,
"grad_norm": 2.34375,
"learning_rate": 1.9923669451866753e-05,
"loss": 1.8071,
"step": 92
},
{
"epoch": 0.17807563427477263,
"grad_norm": 2.0625,
"learning_rate": 1.9919710193905607e-05,
"loss": 1.8601,
"step": 93
},
{
"epoch": 0.17999042604116802,
"grad_norm": 2.140625,
"learning_rate": 1.9915651246068176e-05,
"loss": 1.8539,
"step": 94
},
{
"epoch": 0.18190521780756344,
"grad_norm": 2.125,
"learning_rate": 1.991149264914556e-05,
"loss": 1.8179,
"step": 95
},
{
"epoch": 0.18382000957395883,
"grad_norm": 1.9921875,
"learning_rate": 1.9907234444930328e-05,
"loss": 1.8305,
"step": 96
},
{
"epoch": 0.18573480134035425,
"grad_norm": 2.1875,
"learning_rate": 1.9902876676216044e-05,
"loss": 1.8367,
"step": 97
},
{
"epoch": 0.18764959310674964,
"grad_norm": 2.1875,
"learning_rate": 1.989841938679687e-05,
"loss": 1.8328,
"step": 98
},
{
"epoch": 0.18956438487314506,
"grad_norm": 1.984375,
"learning_rate": 1.989386262146713e-05,
"loss": 1.8388,
"step": 99
},
{
"epoch": 0.19147917663954045,
"grad_norm": 2.046875,
"learning_rate": 1.9889206426020837e-05,
"loss": 1.834,
"step": 100
},
{
"epoch": 0.19339396840593587,
"grad_norm": 1.9765625,
"learning_rate": 1.988445084725125e-05,
"loss": 1.8004,
"step": 101
},
{
"epoch": 0.19530876017233126,
"grad_norm": 1.9375,
"learning_rate": 1.987959593295039e-05,
"loss": 1.8231,
"step": 102
},
{
"epoch": 0.19722355193872668,
"grad_norm": 1.984375,
"learning_rate": 1.987464173190858e-05,
"loss": 1.8879,
"step": 103
},
{
"epoch": 0.19913834370512207,
"grad_norm": 1.9609375,
"learning_rate": 1.9869588293913932e-05,
"loss": 1.8918,
"step": 104
},
{
"epoch": 0.20105313547151749,
"grad_norm": 2.109375,
"learning_rate": 1.986443566975187e-05,
"loss": 1.8015,
"step": 105
},
{
"epoch": 0.20296792723791288,
"grad_norm": 2.015625,
"learning_rate": 1.9859183911204588e-05,
"loss": 1.8785,
"step": 106
},
{
"epoch": 0.2048827190043083,
"grad_norm": 2.015625,
"learning_rate": 1.9853833071050567e-05,
"loss": 1.8675,
"step": 107
},
{
"epoch": 0.20679751077070369,
"grad_norm": 1.9765625,
"learning_rate": 1.9848383203064018e-05,
"loss": 1.8918,
"step": 108
},
{
"epoch": 0.2087123025370991,
"grad_norm": 2.03125,
"learning_rate": 1.984283436201435e-05,
"loss": 1.8418,
"step": 109
},
{
"epoch": 0.2106270943034945,
"grad_norm": 2.0625,
"learning_rate": 1.9837186603665615e-05,
"loss": 1.8626,
"step": 110
},
{
"epoch": 0.2125418860698899,
"grad_norm": 2.25,
"learning_rate": 1.983143998477596e-05,
"loss": 1.858,
"step": 111
},
{
"epoch": 0.2144566778362853,
"grad_norm": 2.0625,
"learning_rate": 1.9825594563097043e-05,
"loss": 1.8028,
"step": 112
},
{
"epoch": 0.21637146960268072,
"grad_norm": 2.015625,
"learning_rate": 1.981965039737346e-05,
"loss": 1.7789,
"step": 113
},
{
"epoch": 0.2182862613690761,
"grad_norm": 2.125,
"learning_rate": 1.9813607547342152e-05,
"loss": 1.7911,
"step": 114
},
{
"epoch": 0.22020105313547153,
"grad_norm": 2.140625,
"learning_rate": 1.9807466073731806e-05,
"loss": 1.7719,
"step": 115
},
{
"epoch": 0.22211584490186692,
"grad_norm": 1.8984375,
"learning_rate": 1.9801226038262244e-05,
"loss": 1.8903,
"step": 116
},
{
"epoch": 0.22403063666826234,
"grad_norm": 2.015625,
"learning_rate": 1.9794887503643805e-05,
"loss": 1.8288,
"step": 117
},
{
"epoch": 0.22594542843465773,
"grad_norm": 1.9375,
"learning_rate": 1.9788450533576708e-05,
"loss": 1.7571,
"step": 118
},
{
"epoch": 0.22786022020105315,
"grad_norm": 1.8671875,
"learning_rate": 1.9781915192750413e-05,
"loss": 1.8572,
"step": 119
},
{
"epoch": 0.22977501196744854,
"grad_norm": 1.9453125,
"learning_rate": 1.9775281546842985e-05,
"loss": 1.7588,
"step": 120
},
{
"epoch": 0.23168980373384396,
"grad_norm": 2.03125,
"learning_rate": 1.976854966252042e-05,
"loss": 1.8695,
"step": 121
},
{
"epoch": 0.23360459550023935,
"grad_norm": 2.078125,
"learning_rate": 1.9761719607435973e-05,
"loss": 1.8384,
"step": 122
},
{
"epoch": 0.23551938726663477,
"grad_norm": 1.9921875,
"learning_rate": 1.9754791450229485e-05,
"loss": 1.8699,
"step": 123
},
{
"epoch": 0.23743417903303016,
"grad_norm": 1.953125,
"learning_rate": 1.9747765260526696e-05,
"loss": 1.8323,
"step": 124
},
{
"epoch": 0.23934897079942558,
"grad_norm": 2.0625,
"learning_rate": 1.9740641108938538e-05,
"loss": 1.7859,
"step": 125
},
{
"epoch": 0.24126376256582097,
"grad_norm": 1.96875,
"learning_rate": 1.9733419067060427e-05,
"loss": 1.8269,
"step": 126
},
{
"epoch": 0.24317855433221638,
"grad_norm": 1.9765625,
"learning_rate": 1.972609920747155e-05,
"loss": 1.8094,
"step": 127
},
{
"epoch": 0.24509334609861178,
"grad_norm": 1.984375,
"learning_rate": 1.9718681603734124e-05,
"loss": 1.8463,
"step": 128
},
{
"epoch": 0.2470081378650072,
"grad_norm": 2.03125,
"learning_rate": 1.971116633039266e-05,
"loss": 1.9069,
"step": 129
},
{
"epoch": 0.24892292963140258,
"grad_norm": 1.859375,
"learning_rate": 1.9703553462973227e-05,
"loss": 1.7865,
"step": 130
},
{
"epoch": 0.250837721397798,
"grad_norm": 2.0,
"learning_rate": 1.9695843077982677e-05,
"loss": 1.8369,
"step": 131
},
{
"epoch": 0.2527525131641934,
"grad_norm": 1.8828125,
"learning_rate": 1.9688035252907888e-05,
"loss": 1.823,
"step": 132
},
{
"epoch": 0.2546673049305888,
"grad_norm": 2.0625,
"learning_rate": 1.9680130066214968e-05,
"loss": 1.8506,
"step": 133
},
{
"epoch": 0.25658209669698423,
"grad_norm": 1.96875,
"learning_rate": 1.9672127597348486e-05,
"loss": 1.8261,
"step": 134
},
{
"epoch": 0.2584968884633796,
"grad_norm": 1.890625,
"learning_rate": 1.9664027926730664e-05,
"loss": 1.7482,
"step": 135
},
{
"epoch": 0.260411680229775,
"grad_norm": 1.96875,
"learning_rate": 1.965583113576057e-05,
"loss": 1.8663,
"step": 136
},
{
"epoch": 0.26232647199617043,
"grad_norm": 2.046875,
"learning_rate": 1.9647537306813303e-05,
"loss": 1.8714,
"step": 137
},
{
"epoch": 0.26424126376256585,
"grad_norm": 1.96875,
"learning_rate": 1.9639146523239156e-05,
"loss": 1.7767,
"step": 138
},
{
"epoch": 0.2661560555289612,
"grad_norm": 1.8671875,
"learning_rate": 1.9630658869362786e-05,
"loss": 1.7756,
"step": 139
},
{
"epoch": 0.26807084729535663,
"grad_norm": 1.9609375,
"learning_rate": 1.9622074430482366e-05,
"loss": 1.8538,
"step": 140
},
{
"epoch": 0.26998563906175205,
"grad_norm": 2.125,
"learning_rate": 1.9613393292868735e-05,
"loss": 1.8367,
"step": 141
},
{
"epoch": 0.27190043082814747,
"grad_norm": 2.203125,
"learning_rate": 1.9604615543764506e-05,
"loss": 1.8716,
"step": 142
},
{
"epoch": 0.27381522259454283,
"grad_norm": 1.890625,
"learning_rate": 1.9595741271383225e-05,
"loss": 1.8014,
"step": 143
},
{
"epoch": 0.27573001436093825,
"grad_norm": 1.8671875,
"learning_rate": 1.9586770564908453e-05,
"loss": 1.7858,
"step": 144
},
{
"epoch": 0.27764480612733367,
"grad_norm": 2.21875,
"learning_rate": 1.9577703514492886e-05,
"loss": 1.7948,
"step": 145
},
{
"epoch": 0.2795595978937291,
"grad_norm": 1.96875,
"learning_rate": 1.9568540211257444e-05,
"loss": 1.8298,
"step": 146
},
{
"epoch": 0.28147438966012445,
"grad_norm": 1.8984375,
"learning_rate": 1.9559280747290362e-05,
"loss": 1.8202,
"step": 147
},
{
"epoch": 0.28338918142651986,
"grad_norm": 2.125,
"learning_rate": 1.954992521564625e-05,
"loss": 1.8162,
"step": 148
},
{
"epoch": 0.2853039731929153,
"grad_norm": 1.9375,
"learning_rate": 1.9540473710345177e-05,
"loss": 1.8356,
"step": 149
},
{
"epoch": 0.2872187649593107,
"grad_norm": 1.9296875,
"learning_rate": 1.9530926326371712e-05,
"loss": 1.8004,
"step": 150
},
{
"epoch": 0.28913355672570606,
"grad_norm": 1.9140625,
"learning_rate": 1.952128315967397e-05,
"loss": 1.8221,
"step": 151
},
{
"epoch": 0.2910483484921015,
"grad_norm": 1.828125,
"learning_rate": 1.9511544307162656e-05,
"loss": 1.8628,
"step": 152
},
{
"epoch": 0.2929631402584969,
"grad_norm": 2.015625,
"learning_rate": 1.9501709866710086e-05,
"loss": 1.7806,
"step": 153
},
{
"epoch": 0.2948779320248923,
"grad_norm": 1.90625,
"learning_rate": 1.9491779937149204e-05,
"loss": 1.8573,
"step": 154
},
{
"epoch": 0.2967927237912877,
"grad_norm": 2.0625,
"learning_rate": 1.9481754618272585e-05,
"loss": 1.8113,
"step": 155
},
{
"epoch": 0.2987075155576831,
"grad_norm": 2.046875,
"learning_rate": 1.947163401083144e-05,
"loss": 1.8203,
"step": 156
},
{
"epoch": 0.3006223073240785,
"grad_norm": 1.8515625,
"learning_rate": 1.9461418216534594e-05,
"loss": 1.8106,
"step": 157
},
{
"epoch": 0.30253709909047394,
"grad_norm": 1.984375,
"learning_rate": 1.9451107338047478e-05,
"loss": 1.792,
"step": 158
},
{
"epoch": 0.3044518908568693,
"grad_norm": 1.984375,
"learning_rate": 1.9440701478991078e-05,
"loss": 1.8266,
"step": 159
},
{
"epoch": 0.3063666826232647,
"grad_norm": 1.9296875,
"learning_rate": 1.9430200743940915e-05,
"loss": 1.862,
"step": 160
},
{
"epoch": 0.30828147438966014,
"grad_norm": 1.984375,
"learning_rate": 1.9419605238425975e-05,
"loss": 1.8587,
"step": 161
},
{
"epoch": 0.31019626615605556,
"grad_norm": 1.9140625,
"learning_rate": 1.9408915068927653e-05,
"loss": 1.7967,
"step": 162
},
{
"epoch": 0.3121110579224509,
"grad_norm": 1.9765625,
"learning_rate": 1.93981303428787e-05,
"loss": 1.7357,
"step": 163
},
{
"epoch": 0.31402584968884634,
"grad_norm": 1.8984375,
"learning_rate": 1.9387251168662114e-05,
"loss": 1.8246,
"step": 164
},
{
"epoch": 0.31594064145524176,
"grad_norm": 1.984375,
"learning_rate": 1.937627765561008e-05,
"loss": 1.8022,
"step": 165
},
{
"epoch": 0.3178554332216372,
"grad_norm": 1.9140625,
"learning_rate": 1.936520991400285e-05,
"loss": 1.8312,
"step": 166
},
{
"epoch": 0.31977022498803254,
"grad_norm": 1.96875,
"learning_rate": 1.935404805506764e-05,
"loss": 1.7942,
"step": 167
},
{
"epoch": 0.32168501675442795,
"grad_norm": 2.09375,
"learning_rate": 1.9342792190977532e-05,
"loss": 1.8574,
"step": 168
},
{
"epoch": 0.3235998085208234,
"grad_norm": 2.0,
"learning_rate": 1.93314424348503e-05,
"loss": 1.78,
"step": 169
},
{
"epoch": 0.3255146002872188,
"grad_norm": 2.0,
"learning_rate": 1.9319998900747327e-05,
"loss": 1.8314,
"step": 170
},
{
"epoch": 0.32742939205361415,
"grad_norm": 1.9453125,
"learning_rate": 1.930846170367243e-05,
"loss": 1.8051,
"step": 171
},
{
"epoch": 0.3293441838200096,
"grad_norm": 2.015625,
"learning_rate": 1.9296830959570697e-05,
"loss": 1.9012,
"step": 172
},
{
"epoch": 0.331258975586405,
"grad_norm": 1.9765625,
"learning_rate": 1.9285106785327345e-05,
"loss": 1.8383,
"step": 173
},
{
"epoch": 0.3331737673528004,
"grad_norm": 1.9609375,
"learning_rate": 1.9273289298766532e-05,
"loss": 1.808,
"step": 174
},
{
"epoch": 0.3350885591191958,
"grad_norm": 1.9296875,
"learning_rate": 1.9261378618650166e-05,
"loss": 1.7936,
"step": 175
},
{
"epoch": 0.3370033508855912,
"grad_norm": 1.8671875,
"learning_rate": 1.9249374864676733e-05,
"loss": 1.8398,
"step": 176
},
{
"epoch": 0.3389181426519866,
"grad_norm": 1.8671875,
"learning_rate": 1.9237278157480074e-05,
"loss": 1.7898,
"step": 177
},
{
"epoch": 0.340832934418382,
"grad_norm": 2.015625,
"learning_rate": 1.922508861862818e-05,
"loss": 1.8345,
"step": 178
},
{
"epoch": 0.3427477261847774,
"grad_norm": 1.8984375,
"learning_rate": 1.9212806370621963e-05,
"loss": 1.8619,
"step": 179
},
{
"epoch": 0.3446625179511728,
"grad_norm": 1.984375,
"learning_rate": 1.920043153689405e-05,
"loss": 1.8286,
"step": 180
},
{
"epoch": 0.3465773097175682,
"grad_norm": 2.265625,
"learning_rate": 1.9187964241807508e-05,
"loss": 1.8079,
"step": 181
},
{
"epoch": 0.34849210148396365,
"grad_norm": 1.9609375,
"learning_rate": 1.917540461065462e-05,
"loss": 1.7952,
"step": 182
},
{
"epoch": 0.350406893250359,
"grad_norm": 1.9921875,
"learning_rate": 1.916275276965561e-05,
"loss": 1.8172,
"step": 183
},
{
"epoch": 0.3523216850167544,
"grad_norm": 2.171875,
"learning_rate": 1.9150008845957388e-05,
"loss": 1.7986,
"step": 184
},
{
"epoch": 0.35423647678314985,
"grad_norm": 2.078125,
"learning_rate": 1.9137172967632262e-05,
"loss": 1.7909,
"step": 185
},
{
"epoch": 0.35615126854954526,
"grad_norm": 1.9453125,
"learning_rate": 1.912424526367665e-05,
"loss": 1.7713,
"step": 186
},
{
"epoch": 0.3580660603159406,
"grad_norm": 1.8984375,
"learning_rate": 1.9111225864009794e-05,
"loss": 1.8165,
"step": 187
},
{
"epoch": 0.35998085208233604,
"grad_norm": 2.0625,
"learning_rate": 1.9098114899472443e-05,
"loss": 1.8618,
"step": 188
},
{
"epoch": 0.36189564384873146,
"grad_norm": 2.0,
"learning_rate": 1.9084912501825554e-05,
"loss": 1.8634,
"step": 189
},
{
"epoch": 0.3638104356151269,
"grad_norm": 1.828125,
"learning_rate": 1.9071618803748945e-05,
"loss": 1.8141,
"step": 190
},
{
"epoch": 0.36572522738152224,
"grad_norm": 1.9140625,
"learning_rate": 1.9058233938839975e-05,
"loss": 1.7962,
"step": 191
},
{
"epoch": 0.36764001914791766,
"grad_norm": 2.125,
"learning_rate": 1.9044758041612207e-05,
"loss": 1.8472,
"step": 192
},
{
"epoch": 0.3695548109143131,
"grad_norm": 2.03125,
"learning_rate": 1.9031191247494046e-05,
"loss": 1.7234,
"step": 193
},
{
"epoch": 0.3714696026807085,
"grad_norm": 1.9140625,
"learning_rate": 1.9017533692827383e-05,
"loss": 1.8239,
"step": 194
},
{
"epoch": 0.37338439444710386,
"grad_norm": 2.109375,
"learning_rate": 1.9003785514866214e-05,
"loss": 1.7942,
"step": 195
},
{
"epoch": 0.3752991862134993,
"grad_norm": 2.15625,
"learning_rate": 1.8989946851775287e-05,
"loss": 1.8161,
"step": 196
},
{
"epoch": 0.3772139779798947,
"grad_norm": 1.8359375,
"learning_rate": 1.8976017842628677e-05,
"loss": 1.8224,
"step": 197
},
{
"epoch": 0.3791287697462901,
"grad_norm": 1.9609375,
"learning_rate": 1.8961998627408424e-05,
"loss": 1.8211,
"step": 198
},
{
"epoch": 0.3810435615126855,
"grad_norm": 1.953125,
"learning_rate": 1.89478893470031e-05,
"loss": 1.8292,
"step": 199
},
{
"epoch": 0.3829583532790809,
"grad_norm": 1.9140625,
"learning_rate": 1.89336901432064e-05,
"loss": 1.8522,
"step": 200
},
{
"epoch": 0.3848731450454763,
"grad_norm": 1.890625,
"learning_rate": 1.891940115871574e-05,
"loss": 1.8274,
"step": 201
},
{
"epoch": 0.38678793681187174,
"grad_norm": 1.9609375,
"learning_rate": 1.8905022537130774e-05,
"loss": 1.8349,
"step": 202
},
{
"epoch": 0.3887027285782671,
"grad_norm": 1.875,
"learning_rate": 1.8890554422952e-05,
"loss": 1.8058,
"step": 203
},
{
"epoch": 0.3906175203446625,
"grad_norm": 1.9140625,
"learning_rate": 1.8875996961579282e-05,
"loss": 1.8383,
"step": 204
},
{
"epoch": 0.39253231211105793,
"grad_norm": 1.8828125,
"learning_rate": 1.8861350299310397e-05,
"loss": 1.7721,
"step": 205
},
{
"epoch": 0.39444710387745335,
"grad_norm": 1.90625,
"learning_rate": 1.8846614583339555e-05,
"loss": 1.8389,
"step": 206
},
{
"epoch": 0.3963618956438487,
"grad_norm": 1.890625,
"learning_rate": 1.883178996175593e-05,
"loss": 1.8014,
"step": 207
},
{
"epoch": 0.39827668741024413,
"grad_norm": 2.046875,
"learning_rate": 1.8816876583542177e-05,
"loss": 1.8612,
"step": 208
},
{
"epoch": 0.40019147917663955,
"grad_norm": 1.9453125,
"learning_rate": 1.8801874598572918e-05,
"loss": 1.7983,
"step": 209
},
{
"epoch": 0.40210627094303497,
"grad_norm": 1.90625,
"learning_rate": 1.8786784157613242e-05,
"loss": 1.8277,
"step": 210
},
{
"epoch": 0.40402106270943033,
"grad_norm": 2.0,
"learning_rate": 1.8771605412317192e-05,
"loss": 1.815,
"step": 211
},
{
"epoch": 0.40593585447582575,
"grad_norm": 1.8359375,
"learning_rate": 1.875633851522625e-05,
"loss": 1.7652,
"step": 212
},
{
"epoch": 0.40785064624222117,
"grad_norm": 2.125,
"learning_rate": 1.874098361976779e-05,
"loss": 1.8605,
"step": 213
},
{
"epoch": 0.4097654380086166,
"grad_norm": 1.984375,
"learning_rate": 1.872554088025354e-05,
"loss": 1.7953,
"step": 214
},
{
"epoch": 0.41168022977501195,
"grad_norm": 1.8203125,
"learning_rate": 1.8710010451878036e-05,
"loss": 1.7491,
"step": 215
},
{
"epoch": 0.41359502154140737,
"grad_norm": 2.03125,
"learning_rate": 1.869439249071705e-05,
"loss": 1.7363,
"step": 216
},
{
"epoch": 0.4155098133078028,
"grad_norm": 2.328125,
"learning_rate": 1.8678687153726037e-05,
"loss": 1.8131,
"step": 217
},
{
"epoch": 0.4174246050741982,
"grad_norm": 2.015625,
"learning_rate": 1.866289459873855e-05,
"loss": 1.8631,
"step": 218
},
{
"epoch": 0.41933939684059357,
"grad_norm": 1.8828125,
"learning_rate": 1.8647014984464657e-05,
"loss": 1.8336,
"step": 219
},
{
"epoch": 0.421254188606989,
"grad_norm": 2.25,
"learning_rate": 1.8631048470489343e-05,
"loss": 1.7929,
"step": 220
},
{
"epoch": 0.4231689803733844,
"grad_norm": 1.9609375,
"learning_rate": 1.8614995217270893e-05,
"loss": 1.7582,
"step": 221
},
{
"epoch": 0.4250837721397798,
"grad_norm": 1.921875,
"learning_rate": 1.859885538613932e-05,
"loss": 1.8399,
"step": 222
},
{
"epoch": 0.4269985639061752,
"grad_norm": 1.9609375,
"learning_rate": 1.8582629139294693e-05,
"loss": 1.8379,
"step": 223
},
{
"epoch": 0.4289133556725706,
"grad_norm": 1.828125,
"learning_rate": 1.8566316639805543e-05,
"loss": 1.711,
"step": 224
},
{
"epoch": 0.430828147438966,
"grad_norm": 1.96875,
"learning_rate": 1.854991805160721e-05,
"loss": 1.8693,
"step": 225
},
{
"epoch": 0.43274293920536144,
"grad_norm": 1.984375,
"learning_rate": 1.8533433539500195e-05,
"loss": 1.7657,
"step": 226
},
{
"epoch": 0.4346577309717568,
"grad_norm": 1.96875,
"learning_rate": 1.851686326914851e-05,
"loss": 1.7919,
"step": 227
},
{
"epoch": 0.4365725227381522,
"grad_norm": 1.8828125,
"learning_rate": 1.850020740707801e-05,
"loss": 1.7852,
"step": 228
},
{
"epoch": 0.43848731450454764,
"grad_norm": 1.8203125,
"learning_rate": 1.8483466120674708e-05,
"loss": 1.7779,
"step": 229
},
{
"epoch": 0.44040210627094306,
"grad_norm": 2.0625,
"learning_rate": 1.846663957818312e-05,
"loss": 1.8108,
"step": 230
},
{
"epoch": 0.4423168980373384,
"grad_norm": 1.875,
"learning_rate": 1.8449727948704556e-05,
"loss": 1.7799,
"step": 231
},
{
"epoch": 0.44423168980373384,
"grad_norm": 1.9765625,
"learning_rate": 1.843273140219541e-05,
"loss": 1.7874,
"step": 232
},
{
"epoch": 0.44614648157012926,
"grad_norm": 1.859375,
"learning_rate": 1.8415650109465473e-05,
"loss": 1.7549,
"step": 233
},
{
"epoch": 0.4480612733365247,
"grad_norm": 1.9609375,
"learning_rate": 1.8398484242176206e-05,
"loss": 1.7661,
"step": 234
},
{
"epoch": 0.44997606510292004,
"grad_norm": 1.8984375,
"learning_rate": 1.8381233972839027e-05,
"loss": 1.8091,
"step": 235
},
{
"epoch": 0.45189085686931546,
"grad_norm": 1.9765625,
"learning_rate": 1.836389947481355e-05,
"loss": 1.7503,
"step": 236
},
{
"epoch": 0.4538056486357109,
"grad_norm": 1.890625,
"learning_rate": 1.834648092230587e-05,
"loss": 1.829,
"step": 237
},
{
"epoch": 0.4557204404021063,
"grad_norm": 1.9609375,
"learning_rate": 1.8328978490366804e-05,
"loss": 1.8481,
"step": 238
},
{
"epoch": 0.45763523216850166,
"grad_norm": 1.9921875,
"learning_rate": 1.831139235489013e-05,
"loss": 1.8335,
"step": 239
},
{
"epoch": 0.4595500239348971,
"grad_norm": 1.859375,
"learning_rate": 1.8293722692610805e-05,
"loss": 1.7653,
"step": 240
},
{
"epoch": 0.4614648157012925,
"grad_norm": 1.8203125,
"learning_rate": 1.8275969681103226e-05,
"loss": 1.8049,
"step": 241
},
{
"epoch": 0.4633796074676879,
"grad_norm": 1.8984375,
"learning_rate": 1.8258133498779407e-05,
"loss": 1.8257,
"step": 242
},
{
"epoch": 0.4652943992340833,
"grad_norm": 1.984375,
"learning_rate": 1.8240214324887212e-05,
"loss": 1.8437,
"step": 243
},
{
"epoch": 0.4672091910004787,
"grad_norm": 1.8125,
"learning_rate": 1.8222212339508528e-05,
"loss": 1.8295,
"step": 244
},
{
"epoch": 0.4691239827668741,
"grad_norm": 1.8046875,
"learning_rate": 1.820412772355749e-05,
"loss": 1.7919,
"step": 245
},
{
"epoch": 0.47103877453326953,
"grad_norm": 1.8984375,
"learning_rate": 1.818596065877863e-05,
"loss": 1.8298,
"step": 246
},
{
"epoch": 0.4729535662996649,
"grad_norm": 1.984375,
"learning_rate": 1.816771132774507e-05,
"loss": 1.7665,
"step": 247
},
{
"epoch": 0.4748683580660603,
"grad_norm": 1.9453125,
"learning_rate": 1.814937991385667e-05,
"loss": 1.8966,
"step": 248
},
{
"epoch": 0.47678314983245573,
"grad_norm": 1.9453125,
"learning_rate": 1.813096660133822e-05,
"loss": 1.7429,
"step": 249
},
{
"epoch": 0.47869794159885115,
"grad_norm": 2.09375,
"learning_rate": 1.811247157523754e-05,
"loss": 1.8369,
"step": 250
},
{
"epoch": 0.4806127333652465,
"grad_norm": 1.9375,
"learning_rate": 1.8093895021423658e-05,
"loss": 1.7983,
"step": 251
},
{
"epoch": 0.48252752513164193,
"grad_norm": 1.828125,
"learning_rate": 1.807523712658493e-05,
"loss": 1.764,
"step": 252
},
{
"epoch": 0.48444231689803735,
"grad_norm": 2.3125,
"learning_rate": 1.8056498078227152e-05,
"loss": 1.8022,
"step": 253
},
{
"epoch": 0.48635710866443277,
"grad_norm": 1.8984375,
"learning_rate": 1.8037678064671702e-05,
"loss": 1.7991,
"step": 254
},
{
"epoch": 0.48827190043082813,
"grad_norm": 1.8515625,
"learning_rate": 1.801877727505362e-05,
"loss": 1.8253,
"step": 255
},
{
"epoch": 0.49018669219722355,
"grad_norm": 1.984375,
"learning_rate": 1.7999795899319718e-05,
"loss": 1.8086,
"step": 256
},
{
"epoch": 0.49210148396361897,
"grad_norm": 1.96875,
"learning_rate": 1.7980734128226685e-05,
"loss": 1.8171,
"step": 257
},
{
"epoch": 0.4940162757300144,
"grad_norm": 2.125,
"learning_rate": 1.7961592153339146e-05,
"loss": 1.8484,
"step": 258
},
{
"epoch": 0.49593106749640975,
"grad_norm": 1.7890625,
"learning_rate": 1.7942370167027756e-05,
"loss": 1.8499,
"step": 259
},
{
"epoch": 0.49784585926280517,
"grad_norm": 2.140625,
"learning_rate": 1.7923068362467252e-05,
"loss": 1.8069,
"step": 260
},
{
"epoch": 0.4997606510292006,
"grad_norm": 1.890625,
"learning_rate": 1.7903686933634522e-05,
"loss": 1.8241,
"step": 261
},
{
"epoch": 0.501675442795596,
"grad_norm": 1.7890625,
"learning_rate": 1.7884226075306652e-05,
"loss": 1.7349,
"step": 262
},
{
"epoch": 0.5035902345619914,
"grad_norm": 1.9140625,
"learning_rate": 1.786468598305897e-05,
"loss": 1.8709,
"step": 263
},
{
"epoch": 0.5055050263283868,
"grad_norm": 1.796875,
"learning_rate": 1.7845066853263068e-05,
"loss": 1.7636,
"step": 264
},
{
"epoch": 0.5074198180947822,
"grad_norm": 1.9140625,
"learning_rate": 1.7825368883084865e-05,
"loss": 1.8139,
"step": 265
},
{
"epoch": 0.5093346098611776,
"grad_norm": 1.859375,
"learning_rate": 1.7805592270482576e-05,
"loss": 1.7859,
"step": 266
},
{
"epoch": 0.511249401627573,
"grad_norm": 1.859375,
"learning_rate": 1.7785737214204752e-05,
"loss": 1.7915,
"step": 267
},
{
"epoch": 0.5131641933939685,
"grad_norm": 1.8828125,
"learning_rate": 1.776580391378829e-05,
"loss": 1.7832,
"step": 268
},
{
"epoch": 0.5150789851603638,
"grad_norm": 1.890625,
"learning_rate": 1.7745792569556402e-05,
"loss": 1.7741,
"step": 269
},
{
"epoch": 0.5169937769267592,
"grad_norm": 1.8671875,
"learning_rate": 1.772570338261662e-05,
"loss": 1.8348,
"step": 270
},
{
"epoch": 0.5189085686931546,
"grad_norm": 1.84375,
"learning_rate": 1.7705536554858768e-05,
"loss": 1.768,
"step": 271
},
{
"epoch": 0.52082336045955,
"grad_norm": 1.9453125,
"learning_rate": 1.768529228895294e-05,
"loss": 1.8223,
"step": 272
},
{
"epoch": 0.5227381522259454,
"grad_norm": 1.953125,
"learning_rate": 1.7664970788347454e-05,
"loss": 1.8459,
"step": 273
},
{
"epoch": 0.5246529439923409,
"grad_norm": 1.8515625,
"learning_rate": 1.7644572257266807e-05,
"loss": 1.7417,
"step": 274
},
{
"epoch": 0.5265677357587363,
"grad_norm": 1.921875,
"learning_rate": 1.762409690070964e-05,
"loss": 1.8374,
"step": 275
},
{
"epoch": 0.5284825275251317,
"grad_norm": 1.9765625,
"learning_rate": 1.7603544924446655e-05,
"loss": 1.8115,
"step": 276
},
{
"epoch": 0.530397319291527,
"grad_norm": 1.8671875,
"learning_rate": 1.7582916535018567e-05,
"loss": 1.8349,
"step": 277
},
{
"epoch": 0.5323121110579224,
"grad_norm": 1.9375,
"learning_rate": 1.7562211939734e-05,
"loss": 1.7624,
"step": 278
},
{
"epoch": 0.5342269028243178,
"grad_norm": 1.9609375,
"learning_rate": 1.7541431346667447e-05,
"loss": 1.765,
"step": 279
},
{
"epoch": 0.5361416945907133,
"grad_norm": 1.921875,
"learning_rate": 1.752057496465713e-05,
"loss": 1.8085,
"step": 280
},
{
"epoch": 0.5380564863571087,
"grad_norm": 1.859375,
"learning_rate": 1.7499643003302943e-05,
"loss": 1.7753,
"step": 281
},
{
"epoch": 0.5399712781235041,
"grad_norm": 1.9765625,
"learning_rate": 1.7478635672964324e-05,
"loss": 1.8409,
"step": 282
},
{
"epoch": 0.5418860698898995,
"grad_norm": 2.078125,
"learning_rate": 1.7457553184758135e-05,
"loss": 1.7598,
"step": 283
},
{
"epoch": 0.5438008616562949,
"grad_norm": 2.234375,
"learning_rate": 1.7436395750556563e-05,
"loss": 1.7612,
"step": 284
},
{
"epoch": 0.5457156534226902,
"grad_norm": 2.03125,
"learning_rate": 1.7415163582984972e-05,
"loss": 1.8075,
"step": 285
},
{
"epoch": 0.5476304451890857,
"grad_norm": 1.953125,
"learning_rate": 1.739385689541977e-05,
"loss": 1.775,
"step": 286
},
{
"epoch": 0.5495452369554811,
"grad_norm": 1.890625,
"learning_rate": 1.7372475901986275e-05,
"loss": 1.8337,
"step": 287
},
{
"epoch": 0.5514600287218765,
"grad_norm": 1.8984375,
"learning_rate": 1.7351020817556548e-05,
"loss": 1.8364,
"step": 288
},
{
"epoch": 0.5533748204882719,
"grad_norm": 1.9609375,
"learning_rate": 1.732949185774724e-05,
"loss": 1.7762,
"step": 289
},
{
"epoch": 0.5552896122546673,
"grad_norm": 2.015625,
"learning_rate": 1.7307889238917424e-05,
"loss": 1.8312,
"step": 290
},
{
"epoch": 0.5572044040210627,
"grad_norm": 1.90625,
"learning_rate": 1.7286213178166434e-05,
"loss": 1.7795,
"step": 291
},
{
"epoch": 0.5591191957874582,
"grad_norm": 1.9375,
"learning_rate": 1.726446389333166e-05,
"loss": 1.7856,
"step": 292
},
{
"epoch": 0.5610339875538535,
"grad_norm": 1.96875,
"learning_rate": 1.7242641602986376e-05,
"loss": 1.7926,
"step": 293
},
{
"epoch": 0.5629487793202489,
"grad_norm": 1.9765625,
"learning_rate": 1.7220746526437535e-05,
"loss": 1.7724,
"step": 294
},
{
"epoch": 0.5648635710866443,
"grad_norm": 1.9140625,
"learning_rate": 1.7198778883723567e-05,
"loss": 1.7524,
"step": 295
},
{
"epoch": 0.5667783628530397,
"grad_norm": 1.75,
"learning_rate": 1.717673889561217e-05,
"loss": 1.7831,
"step": 296
},
{
"epoch": 0.5686931546194351,
"grad_norm": 1.9453125,
"learning_rate": 1.7154626783598092e-05,
"loss": 1.7599,
"step": 297
},
{
"epoch": 0.5706079463858306,
"grad_norm": 1.8125,
"learning_rate": 1.7132442769900898e-05,
"loss": 1.8121,
"step": 298
},
{
"epoch": 0.572522738152226,
"grad_norm": 1.8828125,
"learning_rate": 1.7110187077462747e-05,
"loss": 1.7459,
"step": 299
},
{
"epoch": 0.5744375299186214,
"grad_norm": 1.8515625,
"learning_rate": 1.7087859929946144e-05,
"loss": 1.7934,
"step": 300
},
{
"epoch": 0.5763523216850167,
"grad_norm": 1.8359375,
"learning_rate": 1.706546155173169e-05,
"loss": 1.7771,
"step": 301
},
{
"epoch": 0.5782671134514121,
"grad_norm": 1.8671875,
"learning_rate": 1.7042992167915836e-05,
"loss": 1.8167,
"step": 302
},
{
"epoch": 0.5801819052178075,
"grad_norm": 1.8203125,
"learning_rate": 1.7020452004308622e-05,
"loss": 1.7243,
"step": 303
},
{
"epoch": 0.582096696984203,
"grad_norm": 1.7734375,
"learning_rate": 1.699784128743139e-05,
"loss": 1.7949,
"step": 304
},
{
"epoch": 0.5840114887505984,
"grad_norm": 1.8671875,
"learning_rate": 1.6975160244514522e-05,
"loss": 1.8373,
"step": 305
},
{
"epoch": 0.5859262805169938,
"grad_norm": 1.859375,
"learning_rate": 1.6952409103495163e-05,
"loss": 1.8333,
"step": 306
},
{
"epoch": 0.5878410722833892,
"grad_norm": 2.0,
"learning_rate": 1.6929588093014916e-05,
"loss": 1.8455,
"step": 307
},
{
"epoch": 0.5897558640497846,
"grad_norm": 1.90625,
"learning_rate": 1.6906697442417547e-05,
"loss": 1.8232,
"step": 308
},
{
"epoch": 0.59167065581618,
"grad_norm": 1.890625,
"learning_rate": 1.688373738174668e-05,
"loss": 1.7521,
"step": 309
},
{
"epoch": 0.5935854475825754,
"grad_norm": 1.7890625,
"learning_rate": 1.6860708141743498e-05,
"loss": 1.7724,
"step": 310
},
{
"epoch": 0.5955002393489708,
"grad_norm": 1.890625,
"learning_rate": 1.6837609953844406e-05,
"loss": 1.7663,
"step": 311
},
{
"epoch": 0.5974150311153662,
"grad_norm": 1.921875,
"learning_rate": 1.6814443050178713e-05,
"loss": 1.8029,
"step": 312
},
{
"epoch": 0.5993298228817616,
"grad_norm": 1.9296875,
"learning_rate": 1.6791207663566307e-05,
"loss": 1.805,
"step": 313
},
{
"epoch": 0.601244614648157,
"grad_norm": 1.8125,
"learning_rate": 1.67679040275153e-05,
"loss": 1.7871,
"step": 314
},
{
"epoch": 0.6031594064145525,
"grad_norm": 1.9609375,
"learning_rate": 1.6744532376219688e-05,
"loss": 1.7786,
"step": 315
},
{
"epoch": 0.6050741981809479,
"grad_norm": 1.8515625,
"learning_rate": 1.6721092944557003e-05,
"loss": 1.7525,
"step": 316
},
{
"epoch": 0.6069889899473432,
"grad_norm": 1.8671875,
"learning_rate": 1.6697585968085942e-05,
"loss": 1.7923,
"step": 317
},
{
"epoch": 0.6089037817137386,
"grad_norm": 1.90625,
"learning_rate": 1.667401168304401e-05,
"loss": 1.7879,
"step": 318
},
{
"epoch": 0.610818573480134,
"grad_norm": 1.921875,
"learning_rate": 1.6650370326345146e-05,
"loss": 1.797,
"step": 319
},
{
"epoch": 0.6127333652465294,
"grad_norm": 1.84375,
"learning_rate": 1.6626662135577324e-05,
"loss": 1.7889,
"step": 320
},
{
"epoch": 0.6146481570129249,
"grad_norm": 1.8828125,
"learning_rate": 1.660288734900019e-05,
"loss": 1.7892,
"step": 321
},
{
"epoch": 0.6165629487793203,
"grad_norm": 1.8359375,
"learning_rate": 1.6579046205542656e-05,
"loss": 1.8961,
"step": 322
},
{
"epoch": 0.6184777405457157,
"grad_norm": 2.078125,
"learning_rate": 1.655513894480049e-05,
"loss": 1.7397,
"step": 323
},
{
"epoch": 0.6203925323121111,
"grad_norm": 1.8515625,
"learning_rate": 1.653116580703393e-05,
"loss": 1.8193,
"step": 324
},
{
"epoch": 0.6223073240785064,
"grad_norm": 1.953125,
"learning_rate": 1.6507127033165243e-05,
"loss": 1.8165,
"step": 325
},
{
"epoch": 0.6242221158449018,
"grad_norm": 1.8125,
"learning_rate": 1.6483022864776327e-05,
"loss": 1.7779,
"step": 326
},
{
"epoch": 0.6261369076112973,
"grad_norm": 1.8828125,
"learning_rate": 1.645885354410627e-05,
"loss": 1.8215,
"step": 327
},
{
"epoch": 0.6280516993776927,
"grad_norm": 1.84375,
"learning_rate": 1.643461931404893e-05,
"loss": 1.8383,
"step": 328
},
{
"epoch": 0.6299664911440881,
"grad_norm": 1.90625,
"learning_rate": 1.641032041815046e-05,
"loss": 1.7761,
"step": 329
},
{
"epoch": 0.6318812829104835,
"grad_norm": 1.890625,
"learning_rate": 1.6385957100606912e-05,
"loss": 1.7577,
"step": 330
},
{
"epoch": 0.6337960746768789,
"grad_norm": 1.890625,
"learning_rate": 1.636152960626173e-05,
"loss": 1.7726,
"step": 331
},
{
"epoch": 0.6357108664432743,
"grad_norm": 1.796875,
"learning_rate": 1.6337038180603332e-05,
"loss": 1.7556,
"step": 332
},
{
"epoch": 0.6376256582096697,
"grad_norm": 1.8203125,
"learning_rate": 1.631248306976261e-05,
"loss": 1.8344,
"step": 333
},
{
"epoch": 0.6395404499760651,
"grad_norm": 1.8359375,
"learning_rate": 1.6287864520510483e-05,
"loss": 1.7506,
"step": 334
},
{
"epoch": 0.6414552417424605,
"grad_norm": 2.046875,
"learning_rate": 1.62631827802554e-05,
"loss": 1.84,
"step": 335
},
{
"epoch": 0.6433700335088559,
"grad_norm": 2.046875,
"learning_rate": 1.6238438097040866e-05,
"loss": 1.6987,
"step": 336
},
{
"epoch": 0.6452848252752513,
"grad_norm": 1.8671875,
"learning_rate": 1.621363071954293e-05,
"loss": 1.7537,
"step": 337
},
{
"epoch": 0.6471996170416467,
"grad_norm": 1.8984375,
"learning_rate": 1.6188760897067712e-05,
"loss": 1.7974,
"step": 338
},
{
"epoch": 0.6491144088080422,
"grad_norm": 2.046875,
"learning_rate": 1.616382887954887e-05,
"loss": 1.7815,
"step": 339
},
{
"epoch": 0.6510292005744376,
"grad_norm": 1.9609375,
"learning_rate": 1.6138834917545115e-05,
"loss": 1.8334,
"step": 340
},
{
"epoch": 0.6529439923408329,
"grad_norm": 1.8984375,
"learning_rate": 1.611377926223767e-05,
"loss": 1.7674,
"step": 341
},
{
"epoch": 0.6548587841072283,
"grad_norm": 1.8984375,
"learning_rate": 1.6088662165427767e-05,
"loss": 1.789,
"step": 342
},
{
"epoch": 0.6567735758736237,
"grad_norm": 1.9140625,
"learning_rate": 1.6063483879534092e-05,
"loss": 1.7917,
"step": 343
},
{
"epoch": 0.6586883676400191,
"grad_norm": 1.90625,
"learning_rate": 1.6038244657590266e-05,
"loss": 1.7639,
"step": 344
},
{
"epoch": 0.6606031594064146,
"grad_norm": 1.890625,
"learning_rate": 1.6012944753242304e-05,
"loss": 1.7936,
"step": 345
},
{
"epoch": 0.66251795117281,
"grad_norm": 2.078125,
"learning_rate": 1.598758442074605e-05,
"loss": 1.7632,
"step": 346
},
{
"epoch": 0.6644327429392054,
"grad_norm": 1.8046875,
"learning_rate": 1.5962163914964644e-05,
"loss": 1.7101,
"step": 347
},
{
"epoch": 0.6663475347056008,
"grad_norm": 1.8984375,
"learning_rate": 1.5936683491365933e-05,
"loss": 1.7726,
"step": 348
},
{
"epoch": 0.6682623264719961,
"grad_norm": 1.8828125,
"learning_rate": 1.5911143406019926e-05,
"loss": 1.7999,
"step": 349
},
{
"epoch": 0.6701771182383915,
"grad_norm": 1.9296875,
"learning_rate": 1.5885543915596215e-05,
"loss": 1.863,
"step": 350
},
{
"epoch": 0.672091910004787,
"grad_norm": 1.875,
"learning_rate": 1.5859885277361395e-05,
"loss": 1.8356,
"step": 351
},
{
"epoch": 0.6740067017711824,
"grad_norm": 1.8984375,
"learning_rate": 1.583416774917647e-05,
"loss": 1.7902,
"step": 352
},
{
"epoch": 0.6759214935375778,
"grad_norm": 2.09375,
"learning_rate": 1.580839158949427e-05,
"loss": 1.7567,
"step": 353
},
{
"epoch": 0.6778362853039732,
"grad_norm": 1.9140625,
"learning_rate": 1.5782557057356866e-05,
"loss": 1.7907,
"step": 354
},
{
"epoch": 0.6797510770703686,
"grad_norm": 1.84375,
"learning_rate": 1.575666441239293e-05,
"loss": 1.7513,
"step": 355
},
{
"epoch": 0.681665868836764,
"grad_norm": 1.859375,
"learning_rate": 1.5730713914815174e-05,
"loss": 1.853,
"step": 356
},
{
"epoch": 0.6835806606031594,
"grad_norm": 1.96875,
"learning_rate": 1.570470582541769e-05,
"loss": 1.829,
"step": 357
},
{
"epoch": 0.6854954523695548,
"grad_norm": 1.9140625,
"learning_rate": 1.5678640405573357e-05,
"loss": 1.8129,
"step": 358
},
{
"epoch": 0.6874102441359502,
"grad_norm": 1.8984375,
"learning_rate": 1.5652517917231212e-05,
"loss": 1.8097,
"step": 359
},
{
"epoch": 0.6893250359023456,
"grad_norm": 1.859375,
"learning_rate": 1.5626338622913807e-05,
"loss": 1.7673,
"step": 360
},
{
"epoch": 0.691239827668741,
"grad_norm": 1.890625,
"learning_rate": 1.5600102785714567e-05,
"loss": 1.7461,
"step": 361
},
{
"epoch": 0.6931546194351365,
"grad_norm": 1.90625,
"learning_rate": 1.5573810669295176e-05,
"loss": 1.7969,
"step": 362
},
{
"epoch": 0.6950694112015319,
"grad_norm": 1.859375,
"learning_rate": 1.554746253788288e-05,
"loss": 1.7485,
"step": 363
},
{
"epoch": 0.6969842029679273,
"grad_norm": 1.828125,
"learning_rate": 1.5521058656267874e-05,
"loss": 1.7933,
"step": 364
},
{
"epoch": 0.6988989947343226,
"grad_norm": 1.90625,
"learning_rate": 1.5494599289800627e-05,
"loss": 1.8053,
"step": 365
},
{
"epoch": 0.700813786500718,
"grad_norm": 1.8828125,
"learning_rate": 1.54680847043892e-05,
"loss": 1.7844,
"step": 366
},
{
"epoch": 0.7027285782671134,
"grad_norm": 1.828125,
"learning_rate": 1.5441515166496593e-05,
"loss": 1.8336,
"step": 367
},
{
"epoch": 0.7046433700335089,
"grad_norm": 1.859375,
"learning_rate": 1.5414890943138068e-05,
"loss": 1.8394,
"step": 368
},
{
"epoch": 0.7065581617999043,
"grad_norm": 1.875,
"learning_rate": 1.5388212301878445e-05,
"loss": 1.7923,
"step": 369
},
{
"epoch": 0.7084729535662997,
"grad_norm": 2.109375,
"learning_rate": 1.5361479510829434e-05,
"loss": 1.7798,
"step": 370
},
{
"epoch": 0.7103877453326951,
"grad_norm": 1.9609375,
"learning_rate": 1.5334692838646927e-05,
"loss": 1.7606,
"step": 371
},
{
"epoch": 0.7123025370990905,
"grad_norm": 1.7265625,
"learning_rate": 1.5307852554528318e-05,
"loss": 1.7701,
"step": 372
},
{
"epoch": 0.7142173288654858,
"grad_norm": 1.8046875,
"learning_rate": 1.5280958928209763e-05,
"loss": 1.7918,
"step": 373
},
{
"epoch": 0.7161321206318813,
"grad_norm": 1.8828125,
"learning_rate": 1.5254012229963509e-05,
"loss": 1.7718,
"step": 374
},
{
"epoch": 0.7180469123982767,
"grad_norm": 2.109375,
"learning_rate": 1.5227012730595146e-05,
"loss": 1.7738,
"step": 375
},
{
"epoch": 0.7199617041646721,
"grad_norm": 1.8359375,
"learning_rate": 1.5199960701440902e-05,
"loss": 1.7932,
"step": 376
},
{
"epoch": 0.7218764959310675,
"grad_norm": 1.8515625,
"learning_rate": 1.5172856414364916e-05,
"loss": 1.7774,
"step": 377
},
{
"epoch": 0.7237912876974629,
"grad_norm": 2.015625,
"learning_rate": 1.5145700141756496e-05,
"loss": 1.7727,
"step": 378
},
{
"epoch": 0.7257060794638583,
"grad_norm": 2.046875,
"learning_rate": 1.5118492156527395e-05,
"loss": 1.8104,
"step": 379
},
{
"epoch": 0.7276208712302538,
"grad_norm": 1.78125,
"learning_rate": 1.5091232732109053e-05,
"loss": 1.7703,
"step": 380
},
{
"epoch": 0.7295356629966491,
"grad_norm": 1.8203125,
"learning_rate": 1.5063922142449857e-05,
"loss": 1.7688,
"step": 381
},
{
"epoch": 0.7314504547630445,
"grad_norm": 1.8359375,
"learning_rate": 1.5036560662012405e-05,
"loss": 1.7204,
"step": 382
},
{
"epoch": 0.7333652465294399,
"grad_norm": 1.8984375,
"learning_rate": 1.5009148565770707e-05,
"loss": 1.7462,
"step": 383
},
{
"epoch": 0.7352800382958353,
"grad_norm": 2.0,
"learning_rate": 1.498168612920746e-05,
"loss": 1.7404,
"step": 384
},
{
"epoch": 0.7371948300622307,
"grad_norm": 1.8046875,
"learning_rate": 1.4954173628311262e-05,
"loss": 1.7549,
"step": 385
},
{
"epoch": 0.7391096218286262,
"grad_norm": 2.015625,
"learning_rate": 1.492661133957384e-05,
"loss": 1.8004,
"step": 386
},
{
"epoch": 0.7410244135950216,
"grad_norm": 1.921875,
"learning_rate": 1.4898999539987273e-05,
"loss": 1.7897,
"step": 387
},
{
"epoch": 0.742939205361417,
"grad_norm": 1.921875,
"learning_rate": 1.487133850704121e-05,
"loss": 1.7573,
"step": 388
},
{
"epoch": 0.7448539971278123,
"grad_norm": 1.8203125,
"learning_rate": 1.4843628518720076e-05,
"loss": 1.8918,
"step": 389
},
{
"epoch": 0.7467687888942077,
"grad_norm": 1.890625,
"learning_rate": 1.4815869853500286e-05,
"loss": 1.7759,
"step": 390
},
{
"epoch": 0.7486835806606031,
"grad_norm": 2.03125,
"learning_rate": 1.4788062790347437e-05,
"loss": 1.8615,
"step": 391
},
{
"epoch": 0.7505983724269986,
"grad_norm": 1.890625,
"learning_rate": 1.4760207608713515e-05,
"loss": 1.7365,
"step": 392
},
{
"epoch": 0.752513164193394,
"grad_norm": 1.875,
"learning_rate": 1.4732304588534073e-05,
"loss": 1.783,
"step": 393
},
{
"epoch": 0.7544279559597894,
"grad_norm": 1.875,
"learning_rate": 1.4704354010225436e-05,
"loss": 1.7981,
"step": 394
},
{
"epoch": 0.7563427477261848,
"grad_norm": 1.984375,
"learning_rate": 1.4676356154681867e-05,
"loss": 1.8287,
"step": 395
},
{
"epoch": 0.7582575394925802,
"grad_norm": 1.8125,
"learning_rate": 1.464831130327275e-05,
"loss": 1.7761,
"step": 396
},
{
"epoch": 0.7601723312589755,
"grad_norm": 1.75,
"learning_rate": 1.4620219737839766e-05,
"loss": 1.8013,
"step": 397
},
{
"epoch": 0.762087123025371,
"grad_norm": 1.8828125,
"learning_rate": 1.4592081740694051e-05,
"loss": 1.8035,
"step": 398
},
{
"epoch": 0.7640019147917664,
"grad_norm": 1.8203125,
"learning_rate": 1.4563897594613368e-05,
"loss": 1.8472,
"step": 399
},
{
"epoch": 0.7659167065581618,
"grad_norm": 1.765625,
"learning_rate": 1.453566758283926e-05,
"loss": 1.7201,
"step": 400
},
{
"epoch": 0.7678314983245572,
"grad_norm": 1.71875,
"learning_rate": 1.450739198907421e-05,
"loss": 1.7396,
"step": 401
},
{
"epoch": 0.7697462900909526,
"grad_norm": 1.84375,
"learning_rate": 1.4479071097478778e-05,
"loss": 1.8413,
"step": 402
},
{
"epoch": 0.771661081857348,
"grad_norm": 1.984375,
"learning_rate": 1.4450705192668763e-05,
"loss": 1.8091,
"step": 403
},
{
"epoch": 0.7735758736237435,
"grad_norm": 1.8828125,
"learning_rate": 1.442229455971232e-05,
"loss": 1.7785,
"step": 404
},
{
"epoch": 0.7754906653901388,
"grad_norm": 1.796875,
"learning_rate": 1.4393839484127117e-05,
"loss": 1.7627,
"step": 405
},
{
"epoch": 0.7774054571565342,
"grad_norm": 1.7421875,
"learning_rate": 1.4365340251877446e-05,
"loss": 1.7348,
"step": 406
},
{
"epoch": 0.7793202489229296,
"grad_norm": 1.9453125,
"learning_rate": 1.4336797149371377e-05,
"loss": 1.8185,
"step": 407
},
{
"epoch": 0.781235040689325,
"grad_norm": 1.875,
"learning_rate": 1.4308210463457842e-05,
"loss": 1.7564,
"step": 408
},
{
"epoch": 0.7831498324557205,
"grad_norm": 1.7890625,
"learning_rate": 1.4279580481423778e-05,
"loss": 1.7314,
"step": 409
},
{
"epoch": 0.7850646242221159,
"grad_norm": 1.8359375,
"learning_rate": 1.4250907490991244e-05,
"loss": 1.7787,
"step": 410
},
{
"epoch": 0.7869794159885113,
"grad_norm": 1.8828125,
"learning_rate": 1.4222191780314508e-05,
"loss": 1.7753,
"step": 411
},
{
"epoch": 0.7888942077549067,
"grad_norm": 1.7890625,
"learning_rate": 1.4193433637977165e-05,
"loss": 1.7371,
"step": 412
},
{
"epoch": 0.790808999521302,
"grad_norm": 1.7734375,
"learning_rate": 1.416463335298924e-05,
"loss": 1.7923,
"step": 413
},
{
"epoch": 0.7927237912876974,
"grad_norm": 1.7734375,
"learning_rate": 1.4135791214784272e-05,
"loss": 1.8241,
"step": 414
},
{
"epoch": 0.7946385830540929,
"grad_norm": 1.7421875,
"learning_rate": 1.4106907513216412e-05,
"loss": 1.7248,
"step": 415
},
{
"epoch": 0.7965533748204883,
"grad_norm": 1.8359375,
"learning_rate": 1.4077982538557511e-05,
"loss": 1.8105,
"step": 416
},
{
"epoch": 0.7984681665868837,
"grad_norm": 1.8359375,
"learning_rate": 1.4049016581494204e-05,
"loss": 1.8315,
"step": 417
},
{
"epoch": 0.8003829583532791,
"grad_norm": 1.765625,
"learning_rate": 1.402000993312498e-05,
"loss": 1.7119,
"step": 418
},
{
"epoch": 0.8022977501196745,
"grad_norm": 1.7421875,
"learning_rate": 1.3990962884957267e-05,
"loss": 1.7673,
"step": 419
},
{
"epoch": 0.8042125418860699,
"grad_norm": 1.8046875,
"learning_rate": 1.3961875728904495e-05,
"loss": 1.8066,
"step": 420
},
{
"epoch": 0.8061273336524652,
"grad_norm": 1.921875,
"learning_rate": 1.3932748757283165e-05,
"loss": 1.7593,
"step": 421
},
{
"epoch": 0.8080421254188607,
"grad_norm": 1.828125,
"learning_rate": 1.3903582262809918e-05,
"loss": 1.8116,
"step": 422
},
{
"epoch": 0.8099569171852561,
"grad_norm": 1.8203125,
"learning_rate": 1.3874376538598574e-05,
"loss": 1.789,
"step": 423
},
{
"epoch": 0.8118717089516515,
"grad_norm": 1.765625,
"learning_rate": 1.3845131878157214e-05,
"loss": 1.7977,
"step": 424
},
{
"epoch": 0.8137865007180469,
"grad_norm": 1.7734375,
"learning_rate": 1.3815848575385207e-05,
"loss": 1.8037,
"step": 425
},
{
"epoch": 0.8157012924844423,
"grad_norm": 1.84375,
"learning_rate": 1.3786526924570262e-05,
"loss": 1.7475,
"step": 426
},
{
"epoch": 0.8176160842508378,
"grad_norm": 1.7890625,
"learning_rate": 1.3757167220385483e-05,
"loss": 1.7709,
"step": 427
},
{
"epoch": 0.8195308760172332,
"grad_norm": 1.7109375,
"learning_rate": 1.3727769757886388e-05,
"loss": 1.7576,
"step": 428
},
{
"epoch": 0.8214456677836285,
"grad_norm": 1.8203125,
"learning_rate": 1.3698334832507962e-05,
"loss": 1.8164,
"step": 429
},
{
"epoch": 0.8233604595500239,
"grad_norm": 1.8046875,
"learning_rate": 1.366886274006168e-05,
"loss": 1.7753,
"step": 430
},
{
"epoch": 0.8252752513164193,
"grad_norm": 1.828125,
"learning_rate": 1.3639353776732523e-05,
"loss": 1.7188,
"step": 431
},
{
"epoch": 0.8271900430828147,
"grad_norm": 1.7890625,
"learning_rate": 1.3609808239076025e-05,
"loss": 1.7456,
"step": 432
},
{
"epoch": 0.8291048348492102,
"grad_norm": 1.78125,
"learning_rate": 1.3580226424015273e-05,
"loss": 1.7591,
"step": 433
},
{
"epoch": 0.8310196266156056,
"grad_norm": 1.75,
"learning_rate": 1.3550608628837933e-05,
"loss": 1.8017,
"step": 434
},
{
"epoch": 0.832934418382001,
"grad_norm": 1.8203125,
"learning_rate": 1.352095515119326e-05,
"loss": 1.7858,
"step": 435
},
{
"epoch": 0.8348492101483964,
"grad_norm": 1.90625,
"learning_rate": 1.3491266289089107e-05,
"loss": 1.7865,
"step": 436
},
{
"epoch": 0.8367640019147917,
"grad_norm": 1.765625,
"learning_rate": 1.3461542340888921e-05,
"loss": 1.7595,
"step": 437
},
{
"epoch": 0.8386787936811871,
"grad_norm": 1.8671875,
"learning_rate": 1.3431783605308761e-05,
"loss": 1.8015,
"step": 438
},
{
"epoch": 0.8405935854475826,
"grad_norm": 1.7890625,
"learning_rate": 1.3401990381414287e-05,
"loss": 1.7645,
"step": 439
},
{
"epoch": 0.842508377213978,
"grad_norm": 1.953125,
"learning_rate": 1.3372162968617757e-05,
"loss": 1.7324,
"step": 440
},
{
"epoch": 0.8444231689803734,
"grad_norm": 1.78125,
"learning_rate": 1.3342301666675013e-05,
"loss": 1.7304,
"step": 441
},
{
"epoch": 0.8463379607467688,
"grad_norm": 1.7890625,
"learning_rate": 1.3312406775682471e-05,
"loss": 1.814,
"step": 442
},
{
"epoch": 0.8482527525131642,
"grad_norm": 1.8203125,
"learning_rate": 1.328247859607411e-05,
"loss": 1.7776,
"step": 443
},
{
"epoch": 0.8501675442795597,
"grad_norm": 1.796875,
"learning_rate": 1.3252517428618448e-05,
"loss": 1.7183,
"step": 444
},
{
"epoch": 0.852082336045955,
"grad_norm": 1.875,
"learning_rate": 1.3222523574415516e-05,
"loss": 1.7699,
"step": 445
},
{
"epoch": 0.8539971278123504,
"grad_norm": 1.8359375,
"learning_rate": 1.3192497334893842e-05,
"loss": 1.8766,
"step": 446
},
{
"epoch": 0.8559119195787458,
"grad_norm": 1.75,
"learning_rate": 1.316243901180741e-05,
"loss": 1.7422,
"step": 447
},
{
"epoch": 0.8578267113451412,
"grad_norm": 1.765625,
"learning_rate": 1.3132348907232639e-05,
"loss": 1.7802,
"step": 448
},
{
"epoch": 0.8597415031115366,
"grad_norm": 1.78125,
"learning_rate": 1.310222732356534e-05,
"loss": 1.7016,
"step": 449
},
{
"epoch": 0.861656294877932,
"grad_norm": 1.8828125,
"learning_rate": 1.3072074563517676e-05,
"loss": 1.7917,
"step": 450
},
{
"epoch": 0.8635710866443275,
"grad_norm": 1.8203125,
"learning_rate": 1.3041890930115125e-05,
"loss": 1.7344,
"step": 451
},
{
"epoch": 0.8654858784107229,
"grad_norm": 1.796875,
"learning_rate": 1.3011676726693432e-05,
"loss": 1.7282,
"step": 452
},
{
"epoch": 0.8674006701771182,
"grad_norm": 1.765625,
"learning_rate": 1.298143225689556e-05,
"loss": 1.7817,
"step": 453
},
{
"epoch": 0.8693154619435136,
"grad_norm": 1.7265625,
"learning_rate": 1.2951157824668645e-05,
"loss": 1.7465,
"step": 454
},
{
"epoch": 0.871230253709909,
"grad_norm": 1.828125,
"learning_rate": 1.2920853734260925e-05,
"loss": 1.7836,
"step": 455
},
{
"epoch": 0.8731450454763044,
"grad_norm": 1.8046875,
"learning_rate": 1.2890520290218698e-05,
"loss": 1.762,
"step": 456
},
{
"epoch": 0.8750598372426999,
"grad_norm": 1.8125,
"learning_rate": 1.286015779738326e-05,
"loss": 1.7934,
"step": 457
},
{
"epoch": 0.8769746290090953,
"grad_norm": 1.8046875,
"learning_rate": 1.2829766560887837e-05,
"loss": 1.7687,
"step": 458
},
{
"epoch": 0.8788894207754907,
"grad_norm": 1.7890625,
"learning_rate": 1.2799346886154513e-05,
"loss": 1.8095,
"step": 459
},
{
"epoch": 0.8808042125418861,
"grad_norm": 1.8125,
"learning_rate": 1.2768899078891174e-05,
"loss": 1.7651,
"step": 460
},
{
"epoch": 0.8827190043082814,
"grad_norm": 1.71875,
"learning_rate": 1.2738423445088429e-05,
"loss": 1.7002,
"step": 461
},
{
"epoch": 0.8846337960746768,
"grad_norm": 1.765625,
"learning_rate": 1.2707920291016526e-05,
"loss": 1.804,
"step": 462
},
{
"epoch": 0.8865485878410723,
"grad_norm": 1.7734375,
"learning_rate": 1.2677389923222297e-05,
"loss": 1.7597,
"step": 463
},
{
"epoch": 0.8884633796074677,
"grad_norm": 1.8125,
"learning_rate": 1.2646832648526048e-05,
"loss": 1.7853,
"step": 464
},
{
"epoch": 0.8903781713738631,
"grad_norm": 1.8515625,
"learning_rate": 1.2616248774018503e-05,
"loss": 1.7303,
"step": 465
},
{
"epoch": 0.8922929631402585,
"grad_norm": 1.8203125,
"learning_rate": 1.2585638607057698e-05,
"loss": 1.7358,
"step": 466
},
{
"epoch": 0.8942077549066539,
"grad_norm": 1.7890625,
"learning_rate": 1.25550024552659e-05,
"loss": 1.8593,
"step": 467
},
{
"epoch": 0.8961225466730494,
"grad_norm": 1.8125,
"learning_rate": 1.2524340626526521e-05,
"loss": 1.8011,
"step": 468
},
{
"epoch": 0.8980373384394447,
"grad_norm": 1.8125,
"learning_rate": 1.2493653428981014e-05,
"loss": 1.7807,
"step": 469
},
{
"epoch": 0.8999521302058401,
"grad_norm": 1.8359375,
"learning_rate": 1.2462941171025777e-05,
"loss": 1.7492,
"step": 470
},
{
"epoch": 0.9018669219722355,
"grad_norm": 1.734375,
"learning_rate": 1.2432204161309063e-05,
"loss": 1.7557,
"step": 471
},
{
"epoch": 0.9037817137386309,
"grad_norm": 1.78125,
"learning_rate": 1.2401442708727869e-05,
"loss": 1.779,
"step": 472
},
{
"epoch": 0.9056965055050263,
"grad_norm": 2.109375,
"learning_rate": 1.2370657122424835e-05,
"loss": 1.7955,
"step": 473
},
{
"epoch": 0.9076112972714218,
"grad_norm": 1.953125,
"learning_rate": 1.2339847711785139e-05,
"loss": 1.7179,
"step": 474
},
{
"epoch": 0.9095260890378172,
"grad_norm": 1.78125,
"learning_rate": 1.2309014786433381e-05,
"loss": 1.8023,
"step": 475
},
{
"epoch": 0.9114408808042126,
"grad_norm": 1.8828125,
"learning_rate": 1.2278158656230486e-05,
"loss": 1.8111,
"step": 476
},
{
"epoch": 0.9133556725706079,
"grad_norm": 2.078125,
"learning_rate": 1.224727963127057e-05,
"loss": 1.7841,
"step": 477
},
{
"epoch": 0.9152704643370033,
"grad_norm": 2.0,
"learning_rate": 1.2216378021877835e-05,
"loss": 1.775,
"step": 478
},
{
"epoch": 0.9171852561033987,
"grad_norm": 1.8046875,
"learning_rate": 1.2185454138603458e-05,
"loss": 1.7291,
"step": 479
},
{
"epoch": 0.9191000478697942,
"grad_norm": 1.8359375,
"learning_rate": 1.215450829222245e-05,
"loss": 1.7993,
"step": 480
},
{
"epoch": 0.9210148396361896,
"grad_norm": 1.890625,
"learning_rate": 1.2123540793730554e-05,
"loss": 1.7748,
"step": 481
},
{
"epoch": 0.922929631402585,
"grad_norm": 1.765625,
"learning_rate": 1.2092551954341104e-05,
"loss": 1.7842,
"step": 482
},
{
"epoch": 0.9248444231689804,
"grad_norm": 1.8359375,
"learning_rate": 1.2061542085481904e-05,
"loss": 1.7519,
"step": 483
},
{
"epoch": 0.9267592149353758,
"grad_norm": 1.84375,
"learning_rate": 1.2030511498792095e-05,
"loss": 1.8114,
"step": 484
},
{
"epoch": 0.9286740067017711,
"grad_norm": 1.78125,
"learning_rate": 1.199946050611903e-05,
"loss": 1.7339,
"step": 485
},
{
"epoch": 0.9305887984681666,
"grad_norm": 1.8046875,
"learning_rate": 1.1968389419515134e-05,
"loss": 1.8057,
"step": 486
},
{
"epoch": 0.932503590234562,
"grad_norm": 1.796875,
"learning_rate": 1.1937298551234769e-05,
"loss": 1.7458,
"step": 487
},
{
"epoch": 0.9344183820009574,
"grad_norm": 2.015625,
"learning_rate": 1.1906188213731099e-05,
"loss": 1.7539,
"step": 488
},
{
"epoch": 0.9363331737673528,
"grad_norm": 2.328125,
"learning_rate": 1.187505871965294e-05,
"loss": 1.8346,
"step": 489
},
{
"epoch": 0.9382479655337482,
"grad_norm": 1.8359375,
"learning_rate": 1.1843910381841637e-05,
"loss": 1.7451,
"step": 490
},
{
"epoch": 0.9401627573001436,
"grad_norm": 1.8046875,
"learning_rate": 1.1812743513327896e-05,
"loss": 1.7659,
"step": 491
},
{
"epoch": 0.9420775490665391,
"grad_norm": 2.03125,
"learning_rate": 1.1781558427328662e-05,
"loss": 1.8026,
"step": 492
},
{
"epoch": 0.9439923408329344,
"grad_norm": 1.9921875,
"learning_rate": 1.1750355437243947e-05,
"loss": 1.7427,
"step": 493
},
{
"epoch": 0.9459071325993298,
"grad_norm": 1.8359375,
"learning_rate": 1.1719134856653704e-05,
"loss": 1.7436,
"step": 494
},
{
"epoch": 0.9478219243657252,
"grad_norm": 1.7578125,
"learning_rate": 1.1687896999314663e-05,
"loss": 1.8219,
"step": 495
},
{
"epoch": 0.9497367161321206,
"grad_norm": 1.7421875,
"learning_rate": 1.1656642179157173e-05,
"loss": 1.757,
"step": 496
},
{
"epoch": 0.951651507898516,
"grad_norm": 1.9765625,
"learning_rate": 1.1625370710282067e-05,
"loss": 1.7482,
"step": 497
},
{
"epoch": 0.9535662996649115,
"grad_norm": 1.984375,
"learning_rate": 1.1594082906957478e-05,
"loss": 1.8083,
"step": 498
},
{
"epoch": 0.9554810914313069,
"grad_norm": 1.9140625,
"learning_rate": 1.1562779083615702e-05,
"loss": 1.8082,
"step": 499
},
{
"epoch": 0.9573958831977023,
"grad_norm": 1.8125,
"learning_rate": 1.153145955485003e-05,
"loss": 1.7942,
"step": 500
},
{
"epoch": 0.9593106749640976,
"grad_norm": 1.78125,
"learning_rate": 1.1500124635411592e-05,
"loss": 1.7538,
"step": 501
},
{
"epoch": 0.961225466730493,
"grad_norm": 1.7109375,
"learning_rate": 1.146877464020618e-05,
"loss": 1.7099,
"step": 502
},
{
"epoch": 0.9631402584968884,
"grad_norm": 1.8828125,
"learning_rate": 1.1437409884291097e-05,
"loss": 1.821,
"step": 503
},
{
"epoch": 0.9650550502632839,
"grad_norm": 1.84375,
"learning_rate": 1.140603068287199e-05,
"loss": 1.8018,
"step": 504
},
{
"epoch": 0.9669698420296793,
"grad_norm": 1.7578125,
"learning_rate": 1.1374637351299672e-05,
"loss": 1.8382,
"step": 505
},
{
"epoch": 0.9688846337960747,
"grad_norm": 1.765625,
"learning_rate": 1.1343230205066963e-05,
"loss": 1.8094,
"step": 506
},
{
"epoch": 0.9707994255624701,
"grad_norm": 1.78125,
"learning_rate": 1.131180955980552e-05,
"loss": 1.7568,
"step": 507
},
{
"epoch": 0.9727142173288655,
"grad_norm": 1.9375,
"learning_rate": 1.128037573128265e-05,
"loss": 1.743,
"step": 508
},
{
"epoch": 0.9746290090952608,
"grad_norm": 1.8125,
"learning_rate": 1.124892903539816e-05,
"loss": 1.8106,
"step": 509
},
{
"epoch": 0.9765438008616563,
"grad_norm": 1.75,
"learning_rate": 1.1217469788181158e-05,
"loss": 1.7705,
"step": 510
},
{
"epoch": 0.9784585926280517,
"grad_norm": 1.859375,
"learning_rate": 1.1185998305786902e-05,
"loss": 1.8163,
"step": 511
},
{
"epoch": 0.9803733843944471,
"grad_norm": 1.8203125,
"learning_rate": 1.1154514904493599e-05,
"loss": 1.8028,
"step": 512
},
{
"epoch": 0.9822881761608425,
"grad_norm": 1.84375,
"learning_rate": 1.1123019900699239e-05,
"loss": 1.8154,
"step": 513
},
{
"epoch": 0.9842029679272379,
"grad_norm": 1.828125,
"learning_rate": 1.1091513610918415e-05,
"loss": 1.7655,
"step": 514
},
{
"epoch": 0.9861177596936334,
"grad_norm": 1.7265625,
"learning_rate": 1.1059996351779139e-05,
"loss": 1.7482,
"step": 515
},
{
"epoch": 0.9880325514600288,
"grad_norm": 1.75,
"learning_rate": 1.1028468440019666e-05,
"loss": 1.7829,
"step": 516
},
{
"epoch": 0.9899473432264241,
"grad_norm": 1.7578125,
"learning_rate": 1.0996930192485302e-05,
"loss": 1.7006,
"step": 517
},
{
"epoch": 0.9918621349928195,
"grad_norm": 1.75,
"learning_rate": 1.0965381926125224e-05,
"loss": 1.7737,
"step": 518
},
{
"epoch": 0.9937769267592149,
"grad_norm": 1.734375,
"learning_rate": 1.0933823957989298e-05,
"loss": 1.7464,
"step": 519
},
{
"epoch": 0.9956917185256103,
"grad_norm": 1.765625,
"learning_rate": 1.0902256605224885e-05,
"loss": 1.8521,
"step": 520
},
{
"epoch": 0.9976065102920058,
"grad_norm": 1.7890625,
"learning_rate": 1.0870680185073666e-05,
"loss": 1.7895,
"step": 521
},
{
"epoch": 0.9995213020584012,
"grad_norm": 1.7265625,
"learning_rate": 1.083909501486844e-05,
"loss": 1.7476,
"step": 522
},
{
"epoch": 1.0014360938247966,
"grad_norm": 6.3125,
"learning_rate": 1.080750141202994e-05,
"loss": 3.1048,
"step": 523
},
{
"epoch": 1.003350885591192,
"grad_norm": 1.8671875,
"learning_rate": 1.0775899694063649e-05,
"loss": 1.6357,
"step": 524
},
{
"epoch": 1.0052656773575874,
"grad_norm": 1.8203125,
"learning_rate": 1.0744290178556604e-05,
"loss": 1.7283,
"step": 525
},
{
"epoch": 1.0071804691239827,
"grad_norm": 1.7265625,
"learning_rate": 1.0712673183174205e-05,
"loss": 1.7224,
"step": 526
},
{
"epoch": 1.0090952608903783,
"grad_norm": 1.7265625,
"learning_rate": 1.0681049025657015e-05,
"loss": 1.6712,
"step": 527
},
{
"epoch": 1.0110100526567736,
"grad_norm": 1.921875,
"learning_rate": 1.0649418023817583e-05,
"loss": 1.7204,
"step": 528
},
{
"epoch": 1.0129248444231689,
"grad_norm": 1.953125,
"learning_rate": 1.061778049553724e-05,
"loss": 1.6806,
"step": 529
},
{
"epoch": 1.0148396361895644,
"grad_norm": 1.7890625,
"learning_rate": 1.0586136758762902e-05,
"loss": 1.7295,
"step": 530
},
{
"epoch": 1.0167544279559597,
"grad_norm": 1.796875,
"learning_rate": 1.0554487131503874e-05,
"loss": 1.7294,
"step": 531
},
{
"epoch": 1.0186692197223552,
"grad_norm": 1.8125,
"learning_rate": 1.0522831931828677e-05,
"loss": 1.674,
"step": 532
},
{
"epoch": 1.0205840114887506,
"grad_norm": 1.7734375,
"learning_rate": 1.049117147786181e-05,
"loss": 1.6801,
"step": 533
},
{
"epoch": 1.022498803255146,
"grad_norm": 1.84375,
"learning_rate": 1.0459506087780593e-05,
"loss": 1.6725,
"step": 534
},
{
"epoch": 1.0244135950215414,
"grad_norm": 1.78125,
"learning_rate": 1.042783607981194e-05,
"loss": 1.6826,
"step": 535
},
{
"epoch": 1.026328386787937,
"grad_norm": 1.84375,
"learning_rate": 1.0396161772229185e-05,
"loss": 1.6825,
"step": 536
},
{
"epoch": 1.0282431785543322,
"grad_norm": 1.8203125,
"learning_rate": 1.0364483483348859e-05,
"loss": 1.7038,
"step": 537
},
{
"epoch": 1.0301579703207275,
"grad_norm": 1.796875,
"learning_rate": 1.0332801531527516e-05,
"loss": 1.7607,
"step": 538
},
{
"epoch": 1.032072762087123,
"grad_norm": 1.75,
"learning_rate": 1.0301116235158516e-05,
"loss": 1.7785,
"step": 539
},
{
"epoch": 1.0339875538535184,
"grad_norm": 1.734375,
"learning_rate": 1.0269427912668826e-05,
"loss": 1.6801,
"step": 540
},
{
"epoch": 1.035902345619914,
"grad_norm": 1.765625,
"learning_rate": 1.0237736882515832e-05,
"loss": 1.6535,
"step": 541
},
{
"epoch": 1.0378171373863092,
"grad_norm": 1.84375,
"learning_rate": 1.0206043463184127e-05,
"loss": 1.7229,
"step": 542
},
{
"epoch": 1.0397319291527047,
"grad_norm": 1.7578125,
"learning_rate": 1.0174347973182318e-05,
"loss": 1.6847,
"step": 543
},
{
"epoch": 1.0416467209191,
"grad_norm": 1.8125,
"learning_rate": 1.0142650731039815e-05,
"loss": 1.6263,
"step": 544
},
{
"epoch": 1.0435615126854954,
"grad_norm": 1.734375,
"learning_rate": 1.0110952055303647e-05,
"loss": 1.6602,
"step": 545
},
{
"epoch": 1.0454763044518909,
"grad_norm": 1.7890625,
"learning_rate": 1.0079252264535237e-05,
"loss": 1.6483,
"step": 546
},
{
"epoch": 1.0473910962182862,
"grad_norm": 1.7265625,
"learning_rate": 1.0047551677307226e-05,
"loss": 1.7304,
"step": 547
},
{
"epoch": 1.0493058879846817,
"grad_norm": 1.7578125,
"learning_rate": 1.0015850612200249e-05,
"loss": 1.7147,
"step": 548
},
{
"epoch": 1.051220679751077,
"grad_norm": 1.75,
"learning_rate": 9.984149387799754e-06,
"loss": 1.655,
"step": 549
},
{
"epoch": 1.0531354715174726,
"grad_norm": 1.7109375,
"learning_rate": 9.952448322692776e-06,
"loss": 1.7411,
"step": 550
},
{
"epoch": 1.0550502632838679,
"grad_norm": 1.7890625,
"learning_rate": 9.920747735464766e-06,
"loss": 1.658,
"step": 551
},
{
"epoch": 1.0569650550502634,
"grad_norm": 1.6953125,
"learning_rate": 9.889047944696354e-06,
"loss": 1.6857,
"step": 552
},
{
"epoch": 1.0588798468166587,
"grad_norm": 1.7421875,
"learning_rate": 9.857349268960186e-06,
"loss": 1.7655,
"step": 553
},
{
"epoch": 1.060794638583054,
"grad_norm": 1.78125,
"learning_rate": 9.825652026817683e-06,
"loss": 1.6613,
"step": 554
},
{
"epoch": 1.0627094303494495,
"grad_norm": 1.8828125,
"learning_rate": 9.793956536815874e-06,
"loss": 1.7267,
"step": 555
},
{
"epoch": 1.0646242221158448,
"grad_norm": 1.765625,
"learning_rate": 9.76226311748417e-06,
"loss": 1.788,
"step": 556
},
{
"epoch": 1.0665390138822404,
"grad_norm": 1.75,
"learning_rate": 9.730572087331177e-06,
"loss": 1.6566,
"step": 557
},
{
"epoch": 1.0684538056486357,
"grad_norm": 1.734375,
"learning_rate": 9.698883764841484e-06,
"loss": 1.707,
"step": 558
},
{
"epoch": 1.0703685974150312,
"grad_norm": 1.796875,
"learning_rate": 9.667198468472485e-06,
"loss": 1.7831,
"step": 559
},
{
"epoch": 1.0722833891814265,
"grad_norm": 1.71875,
"learning_rate": 9.63551651665114e-06,
"loss": 1.7095,
"step": 560
},
{
"epoch": 1.0741981809478218,
"grad_norm": 1.7421875,
"learning_rate": 9.603838227770819e-06,
"loss": 1.7534,
"step": 561
},
{
"epoch": 1.0761129727142174,
"grad_norm": 1.6875,
"learning_rate": 9.57216392018806e-06,
"loss": 1.5954,
"step": 562
},
{
"epoch": 1.0780277644806127,
"grad_norm": 1.8046875,
"learning_rate": 9.54049391221941e-06,
"loss": 1.6686,
"step": 563
},
{
"epoch": 1.0799425562470082,
"grad_norm": 1.7109375,
"learning_rate": 9.508828522138191e-06,
"loss": 1.7209,
"step": 564
},
{
"epoch": 1.0818573480134035,
"grad_norm": 1.71875,
"learning_rate": 9.477168068171326e-06,
"loss": 1.6337,
"step": 565
},
{
"epoch": 1.083772139779799,
"grad_norm": 1.765625,
"learning_rate": 9.445512868496128e-06,
"loss": 1.6905,
"step": 566
},
{
"epoch": 1.0856869315461943,
"grad_norm": 1.765625,
"learning_rate": 9.413863241237101e-06,
"loss": 1.6972,
"step": 567
},
{
"epoch": 1.0876017233125896,
"grad_norm": 1.6953125,
"learning_rate": 9.382219504462766e-06,
"loss": 1.7069,
"step": 568
},
{
"epoch": 1.0895165150789852,
"grad_norm": 1.6875,
"learning_rate": 9.350581976182418e-06,
"loss": 1.6554,
"step": 569
},
{
"epoch": 1.0914313068453805,
"grad_norm": 1.6953125,
"learning_rate": 9.31895097434299e-06,
"loss": 1.6556,
"step": 570
},
{
"epoch": 1.093346098611776,
"grad_norm": 1.7265625,
"learning_rate": 9.287326816825799e-06,
"loss": 1.7767,
"step": 571
},
{
"epoch": 1.0952608903781713,
"grad_norm": 1.71875,
"learning_rate": 9.255709821443399e-06,
"loss": 1.6971,
"step": 572
},
{
"epoch": 1.0971756821445668,
"grad_norm": 1.7265625,
"learning_rate": 9.224100305936353e-06,
"loss": 1.6232,
"step": 573
},
{
"epoch": 1.0990904739109622,
"grad_norm": 1.7890625,
"learning_rate": 9.192498587970065e-06,
"loss": 1.6941,
"step": 574
},
{
"epoch": 1.1010052656773577,
"grad_norm": 1.7421875,
"learning_rate": 9.160904985131564e-06,
"loss": 1.6419,
"step": 575
},
{
"epoch": 1.102920057443753,
"grad_norm": 1.828125,
"learning_rate": 9.129319814926339e-06,
"loss": 1.7512,
"step": 576
},
{
"epoch": 1.1048348492101483,
"grad_norm": 1.7265625,
"learning_rate": 9.097743394775116e-06,
"loss": 1.749,
"step": 577
},
{
"epoch": 1.1067496409765438,
"grad_norm": 1.75,
"learning_rate": 9.066176042010705e-06,
"loss": 1.723,
"step": 578
},
{
"epoch": 1.1086644327429391,
"grad_norm": 1.8359375,
"learning_rate": 9.034618073874777e-06,
"loss": 1.7473,
"step": 579
},
{
"epoch": 1.1105792245093347,
"grad_norm": 1.875,
"learning_rate": 9.003069807514702e-06,
"loss": 1.6308,
"step": 580
},
{
"epoch": 1.11249401627573,
"grad_norm": 1.7890625,
"learning_rate": 8.971531559980334e-06,
"loss": 1.7228,
"step": 581
},
{
"epoch": 1.1144088080421255,
"grad_norm": 1.6953125,
"learning_rate": 8.940003648220863e-06,
"loss": 1.6956,
"step": 582
},
{
"epoch": 1.1163235998085208,
"grad_norm": 1.7265625,
"learning_rate": 8.908486389081587e-06,
"loss": 1.7137,
"step": 583
},
{
"epoch": 1.1182383915749163,
"grad_norm": 1.734375,
"learning_rate": 8.876980099300764e-06,
"loss": 1.7155,
"step": 584
},
{
"epoch": 1.1201531833413116,
"grad_norm": 1.9140625,
"learning_rate": 8.845485095506401e-06,
"loss": 1.6971,
"step": 585
},
{
"epoch": 1.122067975107707,
"grad_norm": 1.9375,
"learning_rate": 8.8140016942131e-06,
"loss": 1.7349,
"step": 586
},
{
"epoch": 1.1239827668741025,
"grad_norm": 1.78125,
"learning_rate": 8.78253021181884e-06,
"loss": 1.7291,
"step": 587
},
{
"epoch": 1.1258975586404978,
"grad_norm": 1.75,
"learning_rate": 8.751070964601845e-06,
"loss": 1.6818,
"step": 588
},
{
"epoch": 1.1278123504068933,
"grad_norm": 1.703125,
"learning_rate": 8.719624268717351e-06,
"loss": 1.6302,
"step": 589
},
{
"epoch": 1.1297271421732886,
"grad_norm": 1.7421875,
"learning_rate": 8.688190440194483e-06,
"loss": 1.7592,
"step": 590
},
{
"epoch": 1.1316419339396842,
"grad_norm": 1.734375,
"learning_rate": 8.656769794933042e-06,
"loss": 1.7508,
"step": 591
},
{
"epoch": 1.1335567257060795,
"grad_norm": 1.8125,
"learning_rate": 8.625362648700332e-06,
"loss": 1.6968,
"step": 592
},
{
"epoch": 1.135471517472475,
"grad_norm": 1.890625,
"learning_rate": 8.593969317128015e-06,
"loss": 1.7048,
"step": 593
},
{
"epoch": 1.1373863092388703,
"grad_norm": 1.6484375,
"learning_rate": 8.562590115708906e-06,
"loss": 1.6671,
"step": 594
},
{
"epoch": 1.1393011010052656,
"grad_norm": 1.96875,
"learning_rate": 8.531225359793825e-06,
"loss": 1.7416,
"step": 595
},
{
"epoch": 1.1412158927716611,
"grad_norm": 1.7421875,
"learning_rate": 8.49987536458841e-06,
"loss": 1.7597,
"step": 596
},
{
"epoch": 1.1431306845380564,
"grad_norm": 1.6796875,
"learning_rate": 8.468540445149973e-06,
"loss": 1.6507,
"step": 597
},
{
"epoch": 1.145045476304452,
"grad_norm": 1.8515625,
"learning_rate": 8.437220916384301e-06,
"loss": 1.6958,
"step": 598
},
{
"epoch": 1.1469602680708473,
"grad_norm": 1.8203125,
"learning_rate": 8.405917093042526e-06,
"loss": 1.72,
"step": 599
},
{
"epoch": 1.1488750598372426,
"grad_norm": 1.9296875,
"learning_rate": 8.374629289717937e-06,
"loss": 1.7362,
"step": 600
},
{
"epoch": 1.1507898516036381,
"grad_norm": 1.765625,
"learning_rate": 8.343357820842829e-06,
"loss": 1.6909,
"step": 601
},
{
"epoch": 1.1527046433700334,
"grad_norm": 1.671875,
"learning_rate": 8.31210300068534e-06,
"loss": 1.641,
"step": 602
},
{
"epoch": 1.154619435136429,
"grad_norm": 1.78125,
"learning_rate": 8.280865143346301e-06,
"loss": 1.7413,
"step": 603
},
{
"epoch": 1.1565342269028243,
"grad_norm": 1.7265625,
"learning_rate": 8.249644562756056e-06,
"loss": 1.6597,
"step": 604
},
{
"epoch": 1.1584490186692198,
"grad_norm": 1.7890625,
"learning_rate": 8.218441572671343e-06,
"loss": 1.6769,
"step": 605
},
{
"epoch": 1.160363810435615,
"grad_norm": 1.8671875,
"learning_rate": 8.187256486672106e-06,
"loss": 1.6976,
"step": 606
},
{
"epoch": 1.1622786022020106,
"grad_norm": 1.640625,
"learning_rate": 8.156089618158366e-06,
"loss": 1.6065,
"step": 607
},
{
"epoch": 1.164193393968406,
"grad_norm": 1.75,
"learning_rate": 8.12494128034706e-06,
"loss": 1.7632,
"step": 608
},
{
"epoch": 1.1661081857348012,
"grad_norm": 1.7421875,
"learning_rate": 8.093811786268905e-06,
"loss": 1.6849,
"step": 609
},
{
"epoch": 1.1680229775011968,
"grad_norm": 1.796875,
"learning_rate": 8.06270144876523e-06,
"loss": 1.7192,
"step": 610
},
{
"epoch": 1.169937769267592,
"grad_norm": 1.6796875,
"learning_rate": 8.03161058048487e-06,
"loss": 1.6864,
"step": 611
},
{
"epoch": 1.1718525610339876,
"grad_norm": 1.703125,
"learning_rate": 8.000539493880972e-06,
"loss": 1.704,
"step": 612
},
{
"epoch": 1.173767352800383,
"grad_norm": 1.765625,
"learning_rate": 7.969488501207909e-06,
"loss": 1.6947,
"step": 613
},
{
"epoch": 1.1756821445667784,
"grad_norm": 1.703125,
"learning_rate": 7.938457914518098e-06,
"loss": 1.7126,
"step": 614
},
{
"epoch": 1.1775969363331737,
"grad_norm": 1.765625,
"learning_rate": 7.907448045658899e-06,
"loss": 1.7502,
"step": 615
},
{
"epoch": 1.1795117280995693,
"grad_norm": 1.7578125,
"learning_rate": 7.876459206269446e-06,
"loss": 1.7348,
"step": 616
},
{
"epoch": 1.1814265198659646,
"grad_norm": 1.7109375,
"learning_rate": 7.845491707777551e-06,
"loss": 1.6578,
"step": 617
},
{
"epoch": 1.18334131163236,
"grad_norm": 1.734375,
"learning_rate": 7.814545861396543e-06,
"loss": 1.7479,
"step": 618
},
{
"epoch": 1.1852561033987554,
"grad_norm": 1.7421875,
"learning_rate": 7.783621978122167e-06,
"loss": 1.7027,
"step": 619
},
{
"epoch": 1.1871708951651507,
"grad_norm": 1.7265625,
"learning_rate": 7.752720368729436e-06,
"loss": 1.6828,
"step": 620
},
{
"epoch": 1.1890856869315463,
"grad_norm": 1.703125,
"learning_rate": 7.721841343769518e-06,
"loss": 1.689,
"step": 621
},
{
"epoch": 1.1910004786979416,
"grad_norm": 1.6796875,
"learning_rate": 7.69098521356662e-06,
"loss": 1.6498,
"step": 622
},
{
"epoch": 1.192915270464337,
"grad_norm": 1.6796875,
"learning_rate": 7.660152288214865e-06,
"loss": 1.6855,
"step": 623
},
{
"epoch": 1.1948300622307324,
"grad_norm": 1.7109375,
"learning_rate": 7.629342877575169e-06,
"loss": 1.7234,
"step": 624
},
{
"epoch": 1.196744853997128,
"grad_norm": 1.7734375,
"learning_rate": 7.598557291272133e-06,
"loss": 1.7746,
"step": 625
},
{
"epoch": 1.1986596457635232,
"grad_norm": 1.765625,
"learning_rate": 7.567795838690941e-06,
"loss": 1.7283,
"step": 626
},
{
"epoch": 1.2005744375299185,
"grad_norm": 1.6953125,
"learning_rate": 7.537058828974226e-06,
"loss": 1.7204,
"step": 627
},
{
"epoch": 1.202489229296314,
"grad_norm": 1.7109375,
"learning_rate": 7.506346571018992e-06,
"loss": 1.7417,
"step": 628
},
{
"epoch": 1.2044040210627094,
"grad_norm": 1.7109375,
"learning_rate": 7.475659373473481e-06,
"loss": 1.6638,
"step": 629
},
{
"epoch": 1.206318812829105,
"grad_norm": 1.6796875,
"learning_rate": 7.444997544734105e-06,
"loss": 1.7001,
"step": 630
},
{
"epoch": 1.2082336045955002,
"grad_norm": 1.6953125,
"learning_rate": 7.414361392942307e-06,
"loss": 1.7401,
"step": 631
},
{
"epoch": 1.2101483963618955,
"grad_norm": 1.765625,
"learning_rate": 7.383751225981503e-06,
"loss": 1.7466,
"step": 632
},
{
"epoch": 1.212063188128291,
"grad_norm": 1.875,
"learning_rate": 7.353167351473955e-06,
"loss": 1.6681,
"step": 633
},
{
"epoch": 1.2139779798946864,
"grad_norm": 1.71875,
"learning_rate": 7.322610076777707e-06,
"loss": 1.7014,
"step": 634
},
{
"epoch": 1.215892771661082,
"grad_norm": 1.734375,
"learning_rate": 7.292079708983475e-06,
"loss": 1.7218,
"step": 635
},
{
"epoch": 1.2178075634274772,
"grad_norm": 1.7578125,
"learning_rate": 7.261576554911575e-06,
"loss": 1.7206,
"step": 636
},
{
"epoch": 1.2197223551938727,
"grad_norm": 1.7421875,
"learning_rate": 7.2311009211088255e-06,
"loss": 1.6895,
"step": 637
},
{
"epoch": 1.221637146960268,
"grad_norm": 1.734375,
"learning_rate": 7.20065311384549e-06,
"loss": 1.744,
"step": 638
},
{
"epoch": 1.2235519387266636,
"grad_norm": 1.703125,
"learning_rate": 7.170233439112164e-06,
"loss": 1.7217,
"step": 639
},
{
"epoch": 1.2254667304930589,
"grad_norm": 1.6875,
"learning_rate": 7.139842202616741e-06,
"loss": 1.6799,
"step": 640
},
{
"epoch": 1.2273815222594542,
"grad_norm": 1.71875,
"learning_rate": 7.109479709781302e-06,
"loss": 1.7117,
"step": 641
},
{
"epoch": 1.2292963140258497,
"grad_norm": 1.734375,
"learning_rate": 7.079146265739079e-06,
"loss": 1.6948,
"step": 642
},
{
"epoch": 1.231211105792245,
"grad_norm": 1.78125,
"learning_rate": 7.048842175331356e-06,
"loss": 1.7343,
"step": 643
},
{
"epoch": 1.2331258975586405,
"grad_norm": 1.7578125,
"learning_rate": 7.0185677431044404e-06,
"loss": 1.7078,
"step": 644
},
{
"epoch": 1.2350406893250359,
"grad_norm": 1.671875,
"learning_rate": 6.988323273306569e-06,
"loss": 1.7168,
"step": 645
},
{
"epoch": 1.2369554810914314,
"grad_norm": 1.6875,
"learning_rate": 6.958109069884879e-06,
"loss": 1.6997,
"step": 646
},
{
"epoch": 1.2388702728578267,
"grad_norm": 1.6640625,
"learning_rate": 6.9279254364823265e-06,
"loss": 1.6204,
"step": 647
},
{
"epoch": 1.2407850646242222,
"grad_norm": 1.7265625,
"learning_rate": 6.897772676434663e-06,
"loss": 1.727,
"step": 648
},
{
"epoch": 1.2426998563906175,
"grad_norm": 1.71875,
"learning_rate": 6.867651092767366e-06,
"loss": 1.7278,
"step": 649
},
{
"epoch": 1.2446146481570128,
"grad_norm": 1.703125,
"learning_rate": 6.837560988192593e-06,
"loss": 1.7087,
"step": 650
},
{
"epoch": 1.2465294399234084,
"grad_norm": 1.65625,
"learning_rate": 6.807502665106164e-06,
"loss": 1.6614,
"step": 651
},
{
"epoch": 1.2484442316898037,
"grad_norm": 1.7578125,
"learning_rate": 6.777476425584486e-06,
"loss": 1.7264,
"step": 652
},
{
"epoch": 1.2503590234561992,
"grad_norm": 1.7109375,
"learning_rate": 6.747482571381556e-06,
"loss": 1.701,
"step": 653
},
{
"epoch": 1.2522738152225945,
"grad_norm": 1.6875,
"learning_rate": 6.717521403925892e-06,
"loss": 1.7061,
"step": 654
},
{
"epoch": 1.2541886069889898,
"grad_norm": 1.7890625,
"learning_rate": 6.687593224317533e-06,
"loss": 1.6551,
"step": 655
},
{
"epoch": 1.2561033987553853,
"grad_norm": 1.6796875,
"learning_rate": 6.657698333324991e-06,
"loss": 1.731,
"step": 656
},
{
"epoch": 1.2580181905217809,
"grad_norm": 1.65625,
"learning_rate": 6.627837031382246e-06,
"loss": 1.7075,
"step": 657
},
{
"epoch": 1.2599329822881762,
"grad_norm": 1.65625,
"learning_rate": 6.598009618585717e-06,
"loss": 1.6765,
"step": 658
},
{
"epoch": 1.2618477740545715,
"grad_norm": 1.765625,
"learning_rate": 6.568216394691245e-06,
"loss": 1.7378,
"step": 659
},
{
"epoch": 1.263762565820967,
"grad_norm": 1.78125,
"learning_rate": 6.538457659111084e-06,
"loss": 1.7609,
"step": 660
},
{
"epoch": 1.2656773575873623,
"grad_norm": 1.6953125,
"learning_rate": 6.5087337109109e-06,
"loss": 1.7036,
"step": 661
},
{
"epoch": 1.2675921493537579,
"grad_norm": 1.6875,
"learning_rate": 6.479044848806739e-06,
"loss": 1.6546,
"step": 662
},
{
"epoch": 1.2695069411201532,
"grad_norm": 1.7421875,
"learning_rate": 6.4493913711620685e-06,
"loss": 1.7018,
"step": 663
},
{
"epoch": 1.2714217328865485,
"grad_norm": 1.703125,
"learning_rate": 6.419773575984727e-06,
"loss": 1.7357,
"step": 664
},
{
"epoch": 1.273336524652944,
"grad_norm": 1.6875,
"learning_rate": 6.390191760923978e-06,
"loss": 1.6928,
"step": 665
},
{
"epoch": 1.2752513164193395,
"grad_norm": 1.7734375,
"learning_rate": 6.360646223267477e-06,
"loss": 1.6623,
"step": 666
},
{
"epoch": 1.2771661081857348,
"grad_norm": 1.84375,
"learning_rate": 6.3311372599383245e-06,
"loss": 1.6921,
"step": 667
},
{
"epoch": 1.2790808999521301,
"grad_norm": 1.78125,
"learning_rate": 6.301665167492037e-06,
"loss": 1.7036,
"step": 668
},
{
"epoch": 1.2809956917185257,
"grad_norm": 1.7578125,
"learning_rate": 6.272230242113613e-06,
"loss": 1.7099,
"step": 669
},
{
"epoch": 1.282910483484921,
"grad_norm": 1.6875,
"learning_rate": 6.242832779614521e-06,
"loss": 1.6826,
"step": 670
},
{
"epoch": 1.2848252752513165,
"grad_norm": 1.8046875,
"learning_rate": 6.213473075429741e-06,
"loss": 1.7058,
"step": 671
},
{
"epoch": 1.2867400670177118,
"grad_norm": 1.8515625,
"learning_rate": 6.184151424614795e-06,
"loss": 1.6677,
"step": 672
},
{
"epoch": 1.2886548587841071,
"grad_norm": 1.7578125,
"learning_rate": 6.154868121842788e-06,
"loss": 1.7125,
"step": 673
},
{
"epoch": 1.2905696505505027,
"grad_norm": 1.7734375,
"learning_rate": 6.1256234614014256e-06,
"loss": 1.6581,
"step": 674
},
{
"epoch": 1.292484442316898,
"grad_norm": 1.78125,
"learning_rate": 6.096417737190085e-06,
"loss": 1.7104,
"step": 675
},
{
"epoch": 1.2943992340832935,
"grad_norm": 1.671875,
"learning_rate": 6.067251242716838e-06,
"loss": 1.6612,
"step": 676
},
{
"epoch": 1.2963140258496888,
"grad_norm": 1.75,
"learning_rate": 6.038124271095507e-06,
"loss": 1.6501,
"step": 677
},
{
"epoch": 1.2982288176160843,
"grad_norm": 1.734375,
"learning_rate": 6.0090371150427375e-06,
"loss": 1.7283,
"step": 678
},
{
"epoch": 1.3001436093824796,
"grad_norm": 1.7265625,
"learning_rate": 5.979990066875022e-06,
"loss": 1.699,
"step": 679
},
{
"epoch": 1.3020584011488752,
"grad_norm": 1.84375,
"learning_rate": 5.950983418505799e-06,
"loss": 1.7458,
"step": 680
},
{
"epoch": 1.3039731929152705,
"grad_norm": 1.703125,
"learning_rate": 5.922017461442492e-06,
"loss": 1.6889,
"step": 681
},
{
"epoch": 1.3058879846816658,
"grad_norm": 1.7265625,
"learning_rate": 5.893092486783594e-06,
"loss": 1.6935,
"step": 682
},
{
"epoch": 1.3078027764480613,
"grad_norm": 1.7109375,
"learning_rate": 5.864208785215732e-06,
"loss": 1.6641,
"step": 683
},
{
"epoch": 1.3097175682144566,
"grad_norm": 1.734375,
"learning_rate": 5.835366647010767e-06,
"loss": 1.7062,
"step": 684
},
{
"epoch": 1.3116323599808521,
"grad_norm": 1.6796875,
"learning_rate": 5.8065663620228404e-06,
"loss": 1.7008,
"step": 685
},
{
"epoch": 1.3135471517472475,
"grad_norm": 1.7265625,
"learning_rate": 5.777808219685496e-06,
"loss": 1.7002,
"step": 686
},
{
"epoch": 1.3154619435136428,
"grad_norm": 1.7265625,
"learning_rate": 5.749092509008761e-06,
"loss": 1.6896,
"step": 687
},
{
"epoch": 1.3173767352800383,
"grad_norm": 1.7578125,
"learning_rate": 5.720419518576223e-06,
"loss": 1.7014,
"step": 688
},
{
"epoch": 1.3192915270464338,
"grad_norm": 1.75,
"learning_rate": 5.691789536542161e-06,
"loss": 1.6799,
"step": 689
},
{
"epoch": 1.3212063188128291,
"grad_norm": 1.7265625,
"learning_rate": 5.6632028506286266e-06,
"loss": 1.6558,
"step": 690
},
{
"epoch": 1.3231211105792244,
"grad_norm": 1.6640625,
"learning_rate": 5.634659748122552e-06,
"loss": 1.6286,
"step": 691
},
{
"epoch": 1.32503590234562,
"grad_norm": 1.6796875,
"learning_rate": 5.606160515872886e-06,
"loss": 1.6983,
"step": 692
},
{
"epoch": 1.3269506941120153,
"grad_norm": 1.8359375,
"learning_rate": 5.57770544028768e-06,
"loss": 1.7115,
"step": 693
},
{
"epoch": 1.3288654858784108,
"grad_norm": 1.7265625,
"learning_rate": 5.5492948073312406e-06,
"loss": 1.719,
"step": 694
},
{
"epoch": 1.330780277644806,
"grad_norm": 1.7421875,
"learning_rate": 5.520928902521221e-06,
"loss": 1.7074,
"step": 695
},
{
"epoch": 1.3326950694112014,
"grad_norm": 1.65625,
"learning_rate": 5.492608010925793e-06,
"loss": 1.7135,
"step": 696
},
{
"epoch": 1.334609861177597,
"grad_norm": 1.7109375,
"learning_rate": 5.46433241716074e-06,
"loss": 1.6395,
"step": 697
},
{
"epoch": 1.3365246529439925,
"grad_norm": 1.90625,
"learning_rate": 5.436102405386636e-06,
"loss": 1.7543,
"step": 698
},
{
"epoch": 1.3384394447103878,
"grad_norm": 1.78125,
"learning_rate": 5.407918259305951e-06,
"loss": 1.6431,
"step": 699
},
{
"epoch": 1.340354236476783,
"grad_norm": 1.7109375,
"learning_rate": 5.379780262160237e-06,
"loss": 1.7222,
"step": 700
},
{
"epoch": 1.3422690282431786,
"grad_norm": 1.734375,
"learning_rate": 5.3516886967272485e-06,
"loss": 1.7227,
"step": 701
},
{
"epoch": 1.344183820009574,
"grad_norm": 1.75,
"learning_rate": 5.323643845318135e-06,
"loss": 1.7426,
"step": 702
},
{
"epoch": 1.3460986117759695,
"grad_norm": 1.6875,
"learning_rate": 5.295645989774565e-06,
"loss": 1.7319,
"step": 703
},
{
"epoch": 1.3480134035423648,
"grad_norm": 1.75,
"learning_rate": 5.26769541146593e-06,
"loss": 1.6958,
"step": 704
},
{
"epoch": 1.34992819530876,
"grad_norm": 1.734375,
"learning_rate": 5.239792391286492e-06,
"loss": 1.682,
"step": 705
},
{
"epoch": 1.3518429870751556,
"grad_norm": 1.65625,
"learning_rate": 5.211937209652567e-06,
"loss": 1.6474,
"step": 706
},
{
"epoch": 1.353757778841551,
"grad_norm": 1.734375,
"learning_rate": 5.1841301464997206e-06,
"loss": 1.6762,
"step": 707
},
{
"epoch": 1.3556725706079464,
"grad_norm": 1.7734375,
"learning_rate": 5.156371481279928e-06,
"loss": 1.6729,
"step": 708
},
{
"epoch": 1.3575873623743417,
"grad_norm": 1.7109375,
"learning_rate": 5.128661492958793e-06,
"loss": 1.7092,
"step": 709
},
{
"epoch": 1.3595021541407373,
"grad_norm": 1.7890625,
"learning_rate": 5.101000460012731e-06,
"loss": 1.6943,
"step": 710
},
{
"epoch": 1.3614169459071326,
"grad_norm": 1.671875,
"learning_rate": 5.073388660426164e-06,
"loss": 1.6758,
"step": 711
},
{
"epoch": 1.363331737673528,
"grad_norm": 1.78125,
"learning_rate": 5.04582637168874e-06,
"loss": 1.671,
"step": 712
},
{
"epoch": 1.3652465294399234,
"grad_norm": 1.7109375,
"learning_rate": 5.018313870792544e-06,
"loss": 1.6675,
"step": 713
},
{
"epoch": 1.3671613212063187,
"grad_norm": 1.7109375,
"learning_rate": 4.990851434229295e-06,
"loss": 1.7074,
"step": 714
},
{
"epoch": 1.3690761129727143,
"grad_norm": 1.7265625,
"learning_rate": 4.9634393379875986e-06,
"loss": 1.6558,
"step": 715
},
{
"epoch": 1.3709909047391096,
"grad_norm": 1.703125,
"learning_rate": 4.936077857550141e-06,
"loss": 1.7064,
"step": 716
},
{
"epoch": 1.372905696505505,
"grad_norm": 1.640625,
"learning_rate": 4.908767267890952e-06,
"loss": 1.646,
"step": 717
},
{
"epoch": 1.3748204882719004,
"grad_norm": 1.7109375,
"learning_rate": 4.8815078434726075e-06,
"loss": 1.7207,
"step": 718
},
{
"epoch": 1.3767352800382957,
"grad_norm": 1.734375,
"learning_rate": 4.854299858243505e-06,
"loss": 1.7459,
"step": 719
},
{
"epoch": 1.3786500718046912,
"grad_norm": 1.6796875,
"learning_rate": 4.827143585635085e-06,
"loss": 1.677,
"step": 720
},
{
"epoch": 1.3805648635710868,
"grad_norm": 1.796875,
"learning_rate": 4.800039298559101e-06,
"loss": 1.682,
"step": 721
},
{
"epoch": 1.382479655337482,
"grad_norm": 1.7265625,
"learning_rate": 4.772987269404855e-06,
"loss": 1.6784,
"step": 722
},
{
"epoch": 1.3843944471038774,
"grad_norm": 1.703125,
"learning_rate": 4.745987770036494e-06,
"loss": 1.7062,
"step": 723
},
{
"epoch": 1.386309238870273,
"grad_norm": 1.6953125,
"learning_rate": 4.719041071790238e-06,
"loss": 1.6879,
"step": 724
},
{
"epoch": 1.3882240306366682,
"grad_norm": 1.6875,
"learning_rate": 4.692147445471687e-06,
"loss": 1.7091,
"step": 725
},
{
"epoch": 1.3901388224030637,
"grad_norm": 1.7265625,
"learning_rate": 4.665307161353073e-06,
"loss": 1.721,
"step": 726
},
{
"epoch": 1.392053614169459,
"grad_norm": 1.671875,
"learning_rate": 4.638520489170572e-06,
"loss": 1.6974,
"step": 727
},
{
"epoch": 1.3939684059358544,
"grad_norm": 1.71875,
"learning_rate": 4.611787698121558e-06,
"loss": 1.7036,
"step": 728
},
{
"epoch": 1.39588319770225,
"grad_norm": 1.7109375,
"learning_rate": 4.585109056861936e-06,
"loss": 1.6918,
"step": 729
},
{
"epoch": 1.3977979894686454,
"grad_norm": 1.75,
"learning_rate": 4.558484833503407e-06,
"loss": 1.6522,
"step": 730
},
{
"epoch": 1.3997127812350407,
"grad_norm": 1.6953125,
"learning_rate": 4.531915295610805e-06,
"loss": 1.682,
"step": 731
},
{
"epoch": 1.401627573001436,
"grad_norm": 1.7109375,
"learning_rate": 4.505400710199376e-06,
"loss": 1.7143,
"step": 732
},
{
"epoch": 1.4035423647678316,
"grad_norm": 1.78125,
"learning_rate": 4.478941343732125e-06,
"loss": 1.7184,
"step": 733
},
{
"epoch": 1.4054571565342269,
"grad_norm": 1.7734375,
"learning_rate": 4.452537462117123e-06,
"loss": 1.7365,
"step": 734
},
{
"epoch": 1.4073719483006224,
"grad_norm": 1.7578125,
"learning_rate": 4.426189330704826e-06,
"loss": 1.6457,
"step": 735
},
{
"epoch": 1.4092867400670177,
"grad_norm": 1.703125,
"learning_rate": 4.3998972142854334e-06,
"loss": 1.6866,
"step": 736
},
{
"epoch": 1.411201531833413,
"grad_norm": 1.6796875,
"learning_rate": 4.373661377086195e-06,
"loss": 1.6959,
"step": 737
},
{
"epoch": 1.4131163235998085,
"grad_norm": 1.71875,
"learning_rate": 4.3474820827687894e-06,
"loss": 1.7323,
"step": 738
},
{
"epoch": 1.4150311153662039,
"grad_norm": 1.6953125,
"learning_rate": 4.321359594426644e-06,
"loss": 1.6982,
"step": 739
},
{
"epoch": 1.4169459071325994,
"grad_norm": 1.671875,
"learning_rate": 4.295294174582315e-06,
"loss": 1.6859,
"step": 740
},
{
"epoch": 1.4188606988989947,
"grad_norm": 1.6953125,
"learning_rate": 4.2692860851848295e-06,
"loss": 1.6616,
"step": 741
},
{
"epoch": 1.4207754906653902,
"grad_norm": 1.75,
"learning_rate": 4.243335587607074e-06,
"loss": 1.6983,
"step": 742
},
{
"epoch": 1.4226902824317855,
"grad_norm": 1.6875,
"learning_rate": 4.217442942643138e-06,
"loss": 1.6405,
"step": 743
},
{
"epoch": 1.424605074198181,
"grad_norm": 1.65625,
"learning_rate": 4.191608410505732e-06,
"loss": 1.6706,
"step": 744
},
{
"epoch": 1.4265198659645764,
"grad_norm": 1.875,
"learning_rate": 4.165832250823534e-06,
"loss": 1.7247,
"step": 745
},
{
"epoch": 1.4284346577309717,
"grad_norm": 1.7578125,
"learning_rate": 4.140114722638609e-06,
"loss": 1.713,
"step": 746
},
{
"epoch": 1.4303494494973672,
"grad_norm": 1.671875,
"learning_rate": 4.114456084403785e-06,
"loss": 1.7519,
"step": 747
},
{
"epoch": 1.4322642412637625,
"grad_norm": 1.765625,
"learning_rate": 4.088856593980078e-06,
"loss": 1.7403,
"step": 748
},
{
"epoch": 1.434179033030158,
"grad_norm": 1.65625,
"learning_rate": 4.06331650863407e-06,
"loss": 1.6786,
"step": 749
},
{
"epoch": 1.4360938247965533,
"grad_norm": 1.7265625,
"learning_rate": 4.03783608503536e-06,
"loss": 1.6476,
"step": 750
},
{
"epoch": 1.4380086165629486,
"grad_norm": 1.796875,
"learning_rate": 4.0124155792539496e-06,
"loss": 1.8036,
"step": 751
},
{
"epoch": 1.4399234083293442,
"grad_norm": 1.703125,
"learning_rate": 3.987055246757701e-06,
"loss": 1.7387,
"step": 752
},
{
"epoch": 1.4418382000957397,
"grad_norm": 1.6875,
"learning_rate": 3.961755342409737e-06,
"loss": 1.7148,
"step": 753
},
{
"epoch": 1.443752991862135,
"grad_norm": 1.6484375,
"learning_rate": 3.936516120465914e-06,
"loss": 1.621,
"step": 754
},
{
"epoch": 1.4456677836285303,
"grad_norm": 1.65625,
"learning_rate": 3.911337834572235e-06,
"loss": 1.6647,
"step": 755
},
{
"epoch": 1.4475825753949259,
"grad_norm": 1.75,
"learning_rate": 3.886220737762328e-06,
"loss": 1.6833,
"step": 756
},
{
"epoch": 1.4494973671613212,
"grad_norm": 1.734375,
"learning_rate": 3.861165082454888e-06,
"loss": 1.7302,
"step": 757
},
{
"epoch": 1.4514121589277167,
"grad_norm": 1.6953125,
"learning_rate": 3.836171120451131e-06,
"loss": 1.7396,
"step": 758
},
{
"epoch": 1.453326950694112,
"grad_norm": 1.7734375,
"learning_rate": 3.811239102932289e-06,
"loss": 1.7763,
"step": 759
},
{
"epoch": 1.4552417424605073,
"grad_norm": 1.7109375,
"learning_rate": 3.7863692804570707e-06,
"loss": 1.734,
"step": 760
},
{
"epoch": 1.4571565342269028,
"grad_norm": 1.734375,
"learning_rate": 3.761561902959139e-06,
"loss": 1.6783,
"step": 761
},
{
"epoch": 1.4590713259932984,
"grad_norm": 1.703125,
"learning_rate": 3.7368172197446007e-06,
"loss": 1.6689,
"step": 762
},
{
"epoch": 1.4609861177596937,
"grad_norm": 1.7265625,
"learning_rate": 3.7121354794895216e-06,
"loss": 1.6886,
"step": 763
},
{
"epoch": 1.462900909526089,
"grad_norm": 1.65625,
"learning_rate": 3.6875169302373938e-06,
"loss": 1.6309,
"step": 764
},
{
"epoch": 1.4648157012924845,
"grad_norm": 1.7421875,
"learning_rate": 3.6629618193966744e-06,
"loss": 1.7063,
"step": 765
},
{
"epoch": 1.4667304930588798,
"grad_norm": 1.7265625,
"learning_rate": 3.6384703937382714e-06,
"loss": 1.7162,
"step": 766
},
{
"epoch": 1.4686452848252753,
"grad_norm": 1.71875,
"learning_rate": 3.6140428993930922e-06,
"loss": 1.7338,
"step": 767
},
{
"epoch": 1.4705600765916707,
"grad_norm": 1.734375,
"learning_rate": 3.589679581849539e-06,
"loss": 1.6977,
"step": 768
},
{
"epoch": 1.472474868358066,
"grad_norm": 1.71875,
"learning_rate": 3.5653806859510743e-06,
"loss": 1.6789,
"step": 769
},
{
"epoch": 1.4743896601244615,
"grad_norm": 1.7421875,
"learning_rate": 3.5411464558937302e-06,
"loss": 1.6898,
"step": 770
},
{
"epoch": 1.4763044518908568,
"grad_norm": 1.703125,
"learning_rate": 3.5169771352236782e-06,
"loss": 1.7292,
"step": 771
},
{
"epoch": 1.4782192436572523,
"grad_norm": 1.703125,
"learning_rate": 3.4928729668347616e-06,
"loss": 1.7046,
"step": 772
},
{
"epoch": 1.4801340354236476,
"grad_norm": 1.734375,
"learning_rate": 3.4688341929660776e-06,
"loss": 1.7028,
"step": 773
},
{
"epoch": 1.4820488271900432,
"grad_norm": 1.6875,
"learning_rate": 3.444861055199512e-06,
"loss": 1.6834,
"step": 774
},
{
"epoch": 1.4839636189564385,
"grad_norm": 1.6875,
"learning_rate": 3.420953794457349e-06,
"loss": 1.7299,
"step": 775
},
{
"epoch": 1.485878410722834,
"grad_norm": 1.6640625,
"learning_rate": 3.397112650999811e-06,
"loss": 1.6711,
"step": 776
},
{
"epoch": 1.4877932024892293,
"grad_norm": 1.7265625,
"learning_rate": 3.37333786442268e-06,
"loss": 1.6332,
"step": 777
},
{
"epoch": 1.4897079942556246,
"grad_norm": 1.671875,
"learning_rate": 3.349629673654858e-06,
"loss": 1.673,
"step": 778
},
{
"epoch": 1.4916227860220201,
"grad_norm": 1.65625,
"learning_rate": 3.32598831695599e-06,
"loss": 1.6594,
"step": 779
},
{
"epoch": 1.4935375777884154,
"grad_norm": 1.671875,
"learning_rate": 3.3024140319140617e-06,
"loss": 1.6547,
"step": 780
},
{
"epoch": 1.495452369554811,
"grad_norm": 1.6875,
"learning_rate": 3.2789070554430003e-06,
"loss": 1.6371,
"step": 781
},
{
"epoch": 1.4973671613212063,
"grad_norm": 1.7578125,
"learning_rate": 3.2554676237803117e-06,
"loss": 1.7326,
"step": 782
},
{
"epoch": 1.4992819530876016,
"grad_norm": 1.6953125,
"learning_rate": 3.2320959724847e-06,
"loss": 1.7118,
"step": 783
},
{
"epoch": 1.5011967448539971,
"grad_norm": 1.7109375,
"learning_rate": 3.2087923364336904e-06,
"loss": 1.8037,
"step": 784
},
{
"epoch": 1.5031115366203927,
"grad_norm": 1.703125,
"learning_rate": 3.1855569498212857e-06,
"loss": 1.7526,
"step": 785
},
{
"epoch": 1.505026328386788,
"grad_norm": 1.703125,
"learning_rate": 3.1623900461555933e-06,
"loss": 1.665,
"step": 786
},
{
"epoch": 1.5069411201531833,
"grad_norm": 1.7890625,
"learning_rate": 3.1392918582565037e-06,
"loss": 1.6528,
"step": 787
},
{
"epoch": 1.5088559119195788,
"grad_norm": 1.703125,
"learning_rate": 3.1162626182533207e-06,
"loss": 1.7152,
"step": 788
},
{
"epoch": 1.510770703685974,
"grad_norm": 1.6796875,
"learning_rate": 3.093302557582457e-06,
"loss": 1.661,
"step": 789
},
{
"epoch": 1.5126854954523696,
"grad_norm": 1.6953125,
"learning_rate": 3.070411906985088e-06,
"loss": 1.7286,
"step": 790
},
{
"epoch": 1.514600287218765,
"grad_norm": 1.703125,
"learning_rate": 3.0475908965048374e-06,
"loss": 1.7394,
"step": 791
},
{
"epoch": 1.5165150789851602,
"grad_norm": 1.6796875,
"learning_rate": 3.0248397554854813e-06,
"loss": 1.6925,
"step": 792
},
{
"epoch": 1.5184298707515558,
"grad_norm": 1.6640625,
"learning_rate": 3.002158712568615e-06,
"loss": 1.6607,
"step": 793
},
{
"epoch": 1.5203446625179513,
"grad_norm": 1.71875,
"learning_rate": 2.979547995691383e-06,
"loss": 1.6779,
"step": 794
},
{
"epoch": 1.5222594542843466,
"grad_norm": 1.7265625,
"learning_rate": 2.9570078320841644e-06,
"loss": 1.7353,
"step": 795
},
{
"epoch": 1.524174246050742,
"grad_norm": 1.703125,
"learning_rate": 2.9345384482683148e-06,
"loss": 1.7012,
"step": 796
},
{
"epoch": 1.5260890378171372,
"grad_norm": 1.65625,
"learning_rate": 2.9121400700538593e-06,
"loss": 1.6287,
"step": 797
},
{
"epoch": 1.5280038295835328,
"grad_norm": 1.6953125,
"learning_rate": 2.8898129225372564e-06,
"loss": 1.6926,
"step": 798
},
{
"epoch": 1.5299186213499283,
"grad_norm": 1.71875,
"learning_rate": 2.867557230099104e-06,
"loss": 1.6822,
"step": 799
},
{
"epoch": 1.5318334131163236,
"grad_norm": 1.6640625,
"learning_rate": 2.845373216401913e-06,
"loss": 1.7203,
"step": 800
},
{
"epoch": 1.533748204882719,
"grad_norm": 1.71875,
"learning_rate": 2.823261104387833e-06,
"loss": 1.7103,
"step": 801
},
{
"epoch": 1.5356629966491144,
"grad_norm": 1.703125,
"learning_rate": 2.801221116276436e-06,
"loss": 1.6772,
"step": 802
},
{
"epoch": 1.53757778841551,
"grad_norm": 1.6953125,
"learning_rate": 2.7792534735624687e-06,
"loss": 1.7132,
"step": 803
},
{
"epoch": 1.5394925801819053,
"grad_norm": 1.75,
"learning_rate": 2.757358397013625e-06,
"loss": 1.8025,
"step": 804
},
{
"epoch": 1.5414073719483006,
"grad_norm": 1.6796875,
"learning_rate": 2.7355361066683393e-06,
"loss": 1.6785,
"step": 805
},
{
"epoch": 1.5433221637146959,
"grad_norm": 1.703125,
"learning_rate": 2.7137868218335674e-06,
"loss": 1.6791,
"step": 806
},
{
"epoch": 1.5452369554810914,
"grad_norm": 1.734375,
"learning_rate": 2.692110761082577e-06,
"loss": 1.7253,
"step": 807
},
{
"epoch": 1.547151747247487,
"grad_norm": 1.6796875,
"learning_rate": 2.670508142252766e-06,
"loss": 1.7035,
"step": 808
},
{
"epoch": 1.5490665390138822,
"grad_norm": 1.7421875,
"learning_rate": 2.648979182443454e-06,
"loss": 1.7488,
"step": 809
},
{
"epoch": 1.5509813307802776,
"grad_norm": 1.671875,
"learning_rate": 2.6275240980137272e-06,
"loss": 1.704,
"step": 810
},
{
"epoch": 1.552896122546673,
"grad_norm": 1.7109375,
"learning_rate": 2.6061431045802286e-06,
"loss": 1.7235,
"step": 811
},
{
"epoch": 1.5548109143130686,
"grad_norm": 1.6875,
"learning_rate": 2.5848364170150307e-06,
"loss": 1.6652,
"step": 812
},
{
"epoch": 1.556725706079464,
"grad_norm": 1.703125,
"learning_rate": 2.563604249443438e-06,
"loss": 1.6524,
"step": 813
},
{
"epoch": 1.5586404978458592,
"grad_norm": 1.71875,
"learning_rate": 2.542446815241867e-06,
"loss": 1.697,
"step": 814
},
{
"epoch": 1.5605552896122545,
"grad_norm": 1.640625,
"learning_rate": 2.521364327035678e-06,
"loss": 1.6973,
"step": 815
},
{
"epoch": 1.56247008137865,
"grad_norm": 1.6953125,
"learning_rate": 2.5003569966970574e-06,
"loss": 1.7513,
"step": 816
},
{
"epoch": 1.5643848731450456,
"grad_norm": 1.6796875,
"learning_rate": 2.4794250353428707e-06,
"loss": 1.6145,
"step": 817
},
{
"epoch": 1.566299664911441,
"grad_norm": 1.7890625,
"learning_rate": 2.458568653332557e-06,
"loss": 1.7153,
"step": 818
},
{
"epoch": 1.5682144566778362,
"grad_norm": 1.6953125,
"learning_rate": 2.437788060266002e-06,
"loss": 1.6331,
"step": 819
},
{
"epoch": 1.5701292484442317,
"grad_norm": 1.671875,
"learning_rate": 2.4170834649814366e-06,
"loss": 1.6747,
"step": 820
},
{
"epoch": 1.572044040210627,
"grad_norm": 1.671875,
"learning_rate": 2.3964550755533468e-06,
"loss": 1.6055,
"step": 821
},
{
"epoch": 1.5739588319770226,
"grad_norm": 1.6796875,
"learning_rate": 2.375903099290362e-06,
"loss": 1.6992,
"step": 822
},
{
"epoch": 1.5758736237434179,
"grad_norm": 1.65625,
"learning_rate": 2.355427742733197e-06,
"loss": 1.6433,
"step": 823
},
{
"epoch": 1.5777884155098132,
"grad_norm": 1.6875,
"learning_rate": 2.335029211652552e-06,
"loss": 1.7133,
"step": 824
},
{
"epoch": 1.5797032072762087,
"grad_norm": 1.6875,
"learning_rate": 2.314707711047063e-06,
"loss": 1.7327,
"step": 825
},
{
"epoch": 1.5816179990426043,
"grad_norm": 1.6796875,
"learning_rate": 2.294463445141233e-06,
"loss": 1.7631,
"step": 826
},
{
"epoch": 1.5835327908089996,
"grad_norm": 1.6953125,
"learning_rate": 2.2742966173833835e-06,
"loss": 1.7577,
"step": 827
},
{
"epoch": 1.5854475825753949,
"grad_norm": 1.6875,
"learning_rate": 2.254207430443599e-06,
"loss": 1.7896,
"step": 828
},
{
"epoch": 1.5873623743417902,
"grad_norm": 1.7109375,
"learning_rate": 2.2341960862117118e-06,
"loss": 1.6763,
"step": 829
},
{
"epoch": 1.5892771661081857,
"grad_norm": 1.640625,
"learning_rate": 2.214262785795248e-06,
"loss": 1.6878,
"step": 830
},
{
"epoch": 1.5911919578745812,
"grad_norm": 1.640625,
"learning_rate": 2.1944077295174284e-06,
"loss": 1.6532,
"step": 831
},
{
"epoch": 1.5931067496409765,
"grad_norm": 1.765625,
"learning_rate": 2.174631116915137e-06,
"loss": 1.7702,
"step": 832
},
{
"epoch": 1.5950215414073718,
"grad_norm": 1.6796875,
"learning_rate": 2.1549331467369327e-06,
"loss": 1.7115,
"step": 833
},
{
"epoch": 1.5969363331737674,
"grad_norm": 1.6484375,
"learning_rate": 2.1353140169410347e-06,
"loss": 1.6486,
"step": 834
},
{
"epoch": 1.598851124940163,
"grad_norm": 1.6484375,
"learning_rate": 2.1157739246933507e-06,
"loss": 1.7097,
"step": 835
},
{
"epoch": 1.6007659167065582,
"grad_norm": 1.6875,
"learning_rate": 2.0963130663654785e-06,
"loss": 1.7174,
"step": 836
},
{
"epoch": 1.6026807084729535,
"grad_norm": 1.671875,
"learning_rate": 2.0769316375327497e-06,
"loss": 1.6954,
"step": 837
},
{
"epoch": 1.6045955002393488,
"grad_norm": 1.671875,
"learning_rate": 2.0576298329722445e-06,
"loss": 1.6773,
"step": 838
},
{
"epoch": 1.6065102920057444,
"grad_norm": 1.65625,
"learning_rate": 2.038407846660855e-06,
"loss": 1.6202,
"step": 839
},
{
"epoch": 1.6084250837721399,
"grad_norm": 1.71875,
"learning_rate": 2.019265871773316e-06,
"loss": 1.7177,
"step": 840
},
{
"epoch": 1.6103398755385352,
"grad_norm": 1.6875,
"learning_rate": 2.0002041006802843e-06,
"loss": 1.7824,
"step": 841
},
{
"epoch": 1.6122546673049305,
"grad_norm": 1.6953125,
"learning_rate": 1.981222724946383e-06,
"loss": 1.7061,
"step": 842
},
{
"epoch": 1.614169459071326,
"grad_norm": 1.6875,
"learning_rate": 1.9623219353283005e-06,
"loss": 1.7551,
"step": 843
},
{
"epoch": 1.6160842508377216,
"grad_norm": 1.6953125,
"learning_rate": 1.943501921772848e-06,
"loss": 1.7165,
"step": 844
},
{
"epoch": 1.6179990426041169,
"grad_norm": 1.6796875,
"learning_rate": 1.9247628734150725e-06,
"loss": 1.6687,
"step": 845
},
{
"epoch": 1.6199138343705122,
"grad_norm": 1.6640625,
"learning_rate": 1.9061049785763419e-06,
"loss": 1.6888,
"step": 846
},
{
"epoch": 1.6218286261369075,
"grad_norm": 1.6484375,
"learning_rate": 1.8875284247624625e-06,
"loss": 1.6674,
"step": 847
},
{
"epoch": 1.623743417903303,
"grad_norm": 1.734375,
"learning_rate": 1.8690333986617827e-06,
"loss": 1.7384,
"step": 848
},
{
"epoch": 1.6256582096696985,
"grad_norm": 1.65625,
"learning_rate": 1.8506200861433287e-06,
"loss": 1.6367,
"step": 849
},
{
"epoch": 1.6275730014360938,
"grad_norm": 1.6640625,
"learning_rate": 1.832288672254936e-06,
"loss": 1.6592,
"step": 850
},
{
"epoch": 1.6294877932024892,
"grad_norm": 1.6953125,
"learning_rate": 1.8140393412213719e-06,
"loss": 1.7263,
"step": 851
},
{
"epoch": 1.6314025849688847,
"grad_norm": 1.671875,
"learning_rate": 1.7958722764425119e-06,
"loss": 1.6543,
"step": 852
},
{
"epoch": 1.63331737673528,
"grad_norm": 1.671875,
"learning_rate": 1.7777876604914712e-06,
"loss": 1.7082,
"step": 853
},
{
"epoch": 1.6352321685016755,
"grad_norm": 1.6953125,
"learning_rate": 1.7597856751127919e-06,
"loss": 1.7153,
"step": 854
},
{
"epoch": 1.6371469602680708,
"grad_norm": 1.6796875,
"learning_rate": 1.7418665012205927e-06,
"loss": 1.65,
"step": 855
},
{
"epoch": 1.6390617520344661,
"grad_norm": 1.6875,
"learning_rate": 1.7240303188967767e-06,
"loss": 1.6985,
"step": 856
},
{
"epoch": 1.6409765438008617,
"grad_norm": 1.703125,
"learning_rate": 1.7062773073891958e-06,
"loss": 1.6766,
"step": 857
},
{
"epoch": 1.6428913355672572,
"grad_norm": 1.6875,
"learning_rate": 1.6886076451098766e-06,
"loss": 1.6786,
"step": 858
},
{
"epoch": 1.6448061273336525,
"grad_norm": 1.6953125,
"learning_rate": 1.6710215096331971e-06,
"loss": 1.7329,
"step": 859
},
{
"epoch": 1.6467209191000478,
"grad_norm": 1.6953125,
"learning_rate": 1.6535190776941323e-06,
"loss": 1.7428,
"step": 860
},
{
"epoch": 1.6486357108664431,
"grad_norm": 1.671875,
"learning_rate": 1.6361005251864525e-06,
"loss": 1.6936,
"step": 861
},
{
"epoch": 1.6505505026328386,
"grad_norm": 1.6875,
"learning_rate": 1.6187660271609773e-06,
"loss": 1.7386,
"step": 862
},
{
"epoch": 1.6524652943992342,
"grad_norm": 1.7734375,
"learning_rate": 1.6015157578237939e-06,
"loss": 1.7213,
"step": 863
},
{
"epoch": 1.6543800861656295,
"grad_norm": 1.75,
"learning_rate": 1.584349890534531e-06,
"loss": 1.6877,
"step": 864
},
{
"epoch": 1.6562948779320248,
"grad_norm": 1.6953125,
"learning_rate": 1.5672685978045931e-06,
"loss": 1.7153,
"step": 865
},
{
"epoch": 1.6582096696984203,
"grad_norm": 1.734375,
"learning_rate": 1.5502720512954472e-06,
"loss": 1.7155,
"step": 866
},
{
"epoch": 1.6601244614648158,
"grad_norm": 1.671875,
"learning_rate": 1.5333604218168785e-06,
"loss": 1.7235,
"step": 867
},
{
"epoch": 1.6620392532312112,
"grad_norm": 1.6484375,
"learning_rate": 1.5165338793252937e-06,
"loss": 1.6423,
"step": 868
},
{
"epoch": 1.6639540449976065,
"grad_norm": 1.765625,
"learning_rate": 1.4997925929219937e-06,
"loss": 1.7088,
"step": 869
},
{
"epoch": 1.6658688367640018,
"grad_norm": 1.65625,
"learning_rate": 1.483136730851492e-06,
"loss": 1.6486,
"step": 870
},
{
"epoch": 1.6677836285303973,
"grad_norm": 1.6796875,
"learning_rate": 1.4665664604998053e-06,
"loss": 1.6938,
"step": 871
},
{
"epoch": 1.6696984202967928,
"grad_norm": 1.671875,
"learning_rate": 1.4500819483927898e-06,
"loss": 1.6819,
"step": 872
},
{
"epoch": 1.6716132120631881,
"grad_norm": 1.6953125,
"learning_rate": 1.4336833601944577e-06,
"loss": 1.6385,
"step": 873
},
{
"epoch": 1.6735280038295834,
"grad_norm": 1.6875,
"learning_rate": 1.4173708607053071e-06,
"loss": 1.6798,
"step": 874
},
{
"epoch": 1.675442795595979,
"grad_norm": 1.6796875,
"learning_rate": 1.4011446138606822e-06,
"loss": 1.7299,
"step": 875
},
{
"epoch": 1.6773575873623745,
"grad_norm": 1.71875,
"learning_rate": 1.3850047827291057e-06,
"loss": 1.715,
"step": 876
},
{
"epoch": 1.6792723791287698,
"grad_norm": 1.6953125,
"learning_rate": 1.3689515295106626e-06,
"loss": 1.6405,
"step": 877
},
{
"epoch": 1.6811871708951651,
"grad_norm": 1.65625,
"learning_rate": 1.352985015535344e-06,
"loss": 1.7398,
"step": 878
},
{
"epoch": 1.6831019626615604,
"grad_norm": 1.671875,
"learning_rate": 1.3371054012614527e-06,
"loss": 1.6731,
"step": 879
},
{
"epoch": 1.685016754427956,
"grad_norm": 1.6953125,
"learning_rate": 1.3213128462739656e-06,
"loss": 1.6651,
"step": 880
},
{
"epoch": 1.6869315461943515,
"grad_norm": 1.6640625,
"learning_rate": 1.3056075092829546e-06,
"loss": 1.6424,
"step": 881
},
{
"epoch": 1.6888463379607468,
"grad_norm": 1.6796875,
"learning_rate": 1.2899895481219672e-06,
"loss": 1.6476,
"step": 882
},
{
"epoch": 1.690761129727142,
"grad_norm": 1.7109375,
"learning_rate": 1.2744591197464618e-06,
"loss": 1.7422,
"step": 883
},
{
"epoch": 1.6926759214935376,
"grad_norm": 1.6640625,
"learning_rate": 1.2590163802322108e-06,
"loss": 1.6761,
"step": 884
},
{
"epoch": 1.694590713259933,
"grad_norm": 1.6953125,
"learning_rate": 1.2436614847737526e-06,
"loss": 1.7296,
"step": 885
},
{
"epoch": 1.6965055050263285,
"grad_norm": 1.6640625,
"learning_rate": 1.2283945876828107e-06,
"loss": 1.671,
"step": 886
},
{
"epoch": 1.6984202967927238,
"grad_norm": 1.6875,
"learning_rate": 1.2132158423867645e-06,
"loss": 1.7288,
"step": 887
},
{
"epoch": 1.700335088559119,
"grad_norm": 1.734375,
"learning_rate": 1.198125401427085e-06,
"loss": 1.7769,
"step": 888
},
{
"epoch": 1.7022498803255146,
"grad_norm": 1.6953125,
"learning_rate": 1.1831234164578242e-06,
"loss": 1.7237,
"step": 889
},
{
"epoch": 1.7041646720919101,
"grad_norm": 1.7421875,
"learning_rate": 1.1682100382440686e-06,
"loss": 1.7282,
"step": 890
},
{
"epoch": 1.7060794638583054,
"grad_norm": 1.7578125,
"learning_rate": 1.1533854166604486e-06,
"loss": 1.7385,
"step": 891
},
{
"epoch": 1.7079942556247008,
"grad_norm": 1.6875,
"learning_rate": 1.1386497006896058e-06,
"loss": 1.6813,
"step": 892
},
{
"epoch": 1.709909047391096,
"grad_norm": 1.6953125,
"learning_rate": 1.1240030384207202e-06,
"loss": 1.7269,
"step": 893
},
{
"epoch": 1.7118238391574916,
"grad_norm": 1.6796875,
"learning_rate": 1.1094455770480017e-06,
"loss": 1.6812,
"step": 894
},
{
"epoch": 1.7137386309238871,
"grad_norm": 1.671875,
"learning_rate": 1.0949774628692278e-06,
"loss": 1.6795,
"step": 895
},
{
"epoch": 1.7156534226902824,
"grad_norm": 1.6484375,
"learning_rate": 1.0805988412842638e-06,
"loss": 1.7112,
"step": 896
},
{
"epoch": 1.7175682144566777,
"grad_norm": 1.6796875,
"learning_rate": 1.0663098567935981e-06,
"loss": 1.7174,
"step": 897
},
{
"epoch": 1.7194830062230733,
"grad_norm": 1.6796875,
"learning_rate": 1.0521106529969016e-06,
"loss": 1.6405,
"step": 898
},
{
"epoch": 1.7213977979894688,
"grad_norm": 1.6953125,
"learning_rate": 1.0380013725915783e-06,
"loss": 1.7008,
"step": 899
},
{
"epoch": 1.723312589755864,
"grad_norm": 1.78125,
"learning_rate": 1.0239821573713228e-06,
"loss": 1.6936,
"step": 900
},
{
"epoch": 1.7252273815222594,
"grad_norm": 1.75,
"learning_rate": 1.0100531482247155e-06,
"loss": 1.7914,
"step": 901
},
{
"epoch": 1.7271421732886547,
"grad_norm": 1.640625,
"learning_rate": 9.962144851337863e-07,
"loss": 1.654,
"step": 902
},
{
"epoch": 1.7290569650550502,
"grad_norm": 1.7578125,
"learning_rate": 9.824663071726204e-07,
"loss": 1.7272,
"step": 903
},
{
"epoch": 1.7309717568214458,
"grad_norm": 1.671875,
"learning_rate": 9.688087525059575e-07,
"loss": 1.7288,
"step": 904
},
{
"epoch": 1.732886548587841,
"grad_norm": 1.6484375,
"learning_rate": 9.55241958387796e-07,
"loss": 1.6932,
"step": 905
},
{
"epoch": 1.7348013403542364,
"grad_norm": 1.6640625,
"learning_rate": 9.417660611600299e-07,
"loss": 1.6952,
"step": 906
},
{
"epoch": 1.736716132120632,
"grad_norm": 1.6796875,
"learning_rate": 9.283811962510603e-07,
"loss": 1.7722,
"step": 907
},
{
"epoch": 1.7386309238870274,
"grad_norm": 1.6953125,
"learning_rate": 9.150874981744507e-07,
"loss": 1.6527,
"step": 908
},
{
"epoch": 1.7405457156534228,
"grad_norm": 1.6875,
"learning_rate": 9.018851005275586e-07,
"loss": 1.7071,
"step": 909
},
{
"epoch": 1.742460507419818,
"grad_norm": 1.7109375,
"learning_rate": 8.887741359902113e-07,
"loss": 1.7559,
"step": 910
},
{
"epoch": 1.7443752991862134,
"grad_norm": 1.703125,
"learning_rate": 8.757547363233543e-07,
"loss": 1.6998,
"step": 911
},
{
"epoch": 1.746290090952609,
"grad_norm": 1.65625,
"learning_rate": 8.628270323677424e-07,
"loss": 1.593,
"step": 912
},
{
"epoch": 1.7482048827190044,
"grad_norm": 1.765625,
"learning_rate": 8.499911540426131e-07,
"loss": 1.7532,
"step": 913
},
{
"epoch": 1.7501196744853997,
"grad_norm": 1.6796875,
"learning_rate": 8.372472303443924e-07,
"loss": 1.696,
"step": 914
},
{
"epoch": 1.752034466251795,
"grad_norm": 1.703125,
"learning_rate": 8.245953893453829e-07,
"loss": 1.7278,
"step": 915
},
{
"epoch": 1.7539492580181906,
"grad_norm": 1.6953125,
"learning_rate": 8.120357581924931e-07,
"loss": 1.7215,
"step": 916
},
{
"epoch": 1.7558640497845859,
"grad_norm": 1.6640625,
"learning_rate": 7.99568463105953e-07,
"loss": 1.6702,
"step": 917
},
{
"epoch": 1.7577788415509814,
"grad_norm": 1.65625,
"learning_rate": 7.87193629378038e-07,
"loss": 1.6721,
"step": 918
},
{
"epoch": 1.7596936333173767,
"grad_norm": 1.6796875,
"learning_rate": 7.749113813718234e-07,
"loss": 1.7008,
"step": 919
},
{
"epoch": 1.761608425083772,
"grad_norm": 1.6328125,
"learning_rate": 7.627218425199278e-07,
"loss": 1.6697,
"step": 920
},
{
"epoch": 1.7635232168501676,
"grad_norm": 1.734375,
"learning_rate": 7.506251353232663e-07,
"loss": 1.7305,
"step": 921
},
{
"epoch": 1.765438008616563,
"grad_norm": 1.75,
"learning_rate": 7.386213813498344e-07,
"loss": 1.7425,
"step": 922
},
{
"epoch": 1.7673528003829584,
"grad_norm": 1.7421875,
"learning_rate": 7.267107012334707e-07,
"loss": 1.7188,
"step": 923
},
{
"epoch": 1.7692675921493537,
"grad_norm": 1.7109375,
"learning_rate": 7.148932146726572e-07,
"loss": 1.668,
"step": 924
},
{
"epoch": 1.771182383915749,
"grad_norm": 1.6953125,
"learning_rate": 7.031690404293046e-07,
"loss": 1.6772,
"step": 925
},
{
"epoch": 1.7730971756821445,
"grad_norm": 1.6953125,
"learning_rate": 6.915382963275741e-07,
"loss": 1.704,
"step": 926
},
{
"epoch": 1.77501196744854,
"grad_norm": 1.65625,
"learning_rate": 6.800010992526729e-07,
"loss": 1.6764,
"step": 927
},
{
"epoch": 1.7769267592149354,
"grad_norm": 1.6796875,
"learning_rate": 6.685575651497022e-07,
"loss": 1.7017,
"step": 928
},
{
"epoch": 1.7788415509813307,
"grad_norm": 1.671875,
"learning_rate": 6.572078090224721e-07,
"loss": 1.7396,
"step": 929
},
{
"epoch": 1.7807563427477262,
"grad_norm": 1.671875,
"learning_rate": 6.459519449323592e-07,
"loss": 1.7057,
"step": 930
},
{
"epoch": 1.7826711345141217,
"grad_norm": 1.703125,
"learning_rate": 6.347900859971534e-07,
"loss": 1.6723,
"step": 931
},
{
"epoch": 1.784585926280517,
"grad_norm": 1.7109375,
"learning_rate": 6.237223443899221e-07,
"loss": 1.7167,
"step": 932
},
{
"epoch": 1.7865007180469124,
"grad_norm": 1.7734375,
"learning_rate": 6.127488313378894e-07,
"loss": 1.7802,
"step": 933
},
{
"epoch": 1.7884155098133077,
"grad_norm": 1.6953125,
"learning_rate": 6.018696571213045e-07,
"loss": 1.6742,
"step": 934
},
{
"epoch": 1.7903303015797032,
"grad_norm": 1.6875,
"learning_rate": 5.910849310723499e-07,
"loss": 1.7288,
"step": 935
},
{
"epoch": 1.7922450933460987,
"grad_norm": 1.65625,
"learning_rate": 5.803947615740291e-07,
"loss": 1.6607,
"step": 936
},
{
"epoch": 1.794159885112494,
"grad_norm": 1.6796875,
"learning_rate": 5.697992560590882e-07,
"loss": 1.7431,
"step": 937
},
{
"epoch": 1.7960746768788893,
"grad_norm": 1.6796875,
"learning_rate": 5.59298521008923e-07,
"loss": 1.6916,
"step": 938
},
{
"epoch": 1.7979894686452849,
"grad_norm": 1.6875,
"learning_rate": 5.488926619525248e-07,
"loss": 1.6678,
"step": 939
},
{
"epoch": 1.7999042604116804,
"grad_norm": 1.703125,
"learning_rate": 5.385817834654095e-07,
"loss": 1.673,
"step": 940
},
{
"epoch": 1.8018190521780757,
"grad_norm": 1.6796875,
"learning_rate": 5.283659891685656e-07,
"loss": 1.7494,
"step": 941
},
{
"epoch": 1.803733843944471,
"grad_norm": 1.671875,
"learning_rate": 5.18245381727418e-07,
"loss": 1.667,
"step": 942
},
{
"epoch": 1.8056486357108663,
"grad_norm": 1.6796875,
"learning_rate": 5.082200628507994e-07,
"loss": 1.7009,
"step": 943
},
{
"epoch": 1.8075634274772618,
"grad_norm": 1.6640625,
"learning_rate": 4.982901332899159e-07,
"loss": 1.681,
"step": 944
},
{
"epoch": 1.8094782192436574,
"grad_norm": 1.671875,
"learning_rate": 4.884556928373462e-07,
"loss": 1.6666,
"step": 945
},
{
"epoch": 1.8113930110100527,
"grad_norm": 1.6953125,
"learning_rate": 4.787168403260323e-07,
"loss": 1.6944,
"step": 946
},
{
"epoch": 1.813307802776448,
"grad_norm": 1.6953125,
"learning_rate": 4.690736736282908e-07,
"loss": 1.6373,
"step": 947
},
{
"epoch": 1.8152225945428435,
"grad_norm": 1.6484375,
"learning_rate": 4.595262896548236e-07,
"loss": 1.709,
"step": 948
},
{
"epoch": 1.8171373863092388,
"grad_norm": 1.6875,
"learning_rate": 4.500747843537523e-07,
"loss": 1.7181,
"step": 949
},
{
"epoch": 1.8190521780756344,
"grad_norm": 1.71875,
"learning_rate": 4.407192527096404e-07,
"loss": 1.6296,
"step": 950
},
{
"epoch": 1.8209669698420297,
"grad_norm": 1.7265625,
"learning_rate": 4.3145978874255757e-07,
"loss": 1.6935,
"step": 951
},
{
"epoch": 1.822881761608425,
"grad_norm": 1.71875,
"learning_rate": 4.222964855071154e-07,
"loss": 1.6726,
"step": 952
},
{
"epoch": 1.8247965533748205,
"grad_norm": 1.671875,
"learning_rate": 4.1322943509154887e-07,
"loss": 1.6841,
"step": 953
},
{
"epoch": 1.826711345141216,
"grad_norm": 1.703125,
"learning_rate": 4.042587286167754e-07,
"loss": 1.6808,
"step": 954
},
{
"epoch": 1.8286261369076113,
"grad_norm": 1.7265625,
"learning_rate": 3.953844562354936e-07,
"loss": 1.7918,
"step": 955
},
{
"epoch": 1.8305409286740066,
"grad_norm": 1.71875,
"learning_rate": 3.8660670713126735e-07,
"loss": 1.7129,
"step": 956
},
{
"epoch": 1.832455720440402,
"grad_norm": 1.6796875,
"learning_rate": 3.7792556951763424e-07,
"loss": 1.6843,
"step": 957
},
{
"epoch": 1.8343705122067975,
"grad_norm": 1.7109375,
"learning_rate": 3.6934113063721634e-07,
"loss": 1.7275,
"step": 958
},
{
"epoch": 1.836285303973193,
"grad_norm": 1.6875,
"learning_rate": 3.6085347676084736e-07,
"loss": 1.7158,
"step": 959
},
{
"epoch": 1.8382000957395883,
"grad_norm": 1.625,
"learning_rate": 3.5246269318669924e-07,
"loss": 1.6048,
"step": 960
},
{
"epoch": 1.8401148875059836,
"grad_norm": 1.875,
"learning_rate": 3.441688642394292e-07,
"loss": 1.6986,
"step": 961
},
{
"epoch": 1.8420296792723792,
"grad_norm": 1.6640625,
"learning_rate": 3.359720732693361e-07,
"loss": 1.6519,
"step": 962
},
{
"epoch": 1.8439444710387747,
"grad_norm": 1.6796875,
"learning_rate": 3.2787240265151674e-07,
"loss": 1.7094,
"step": 963
},
{
"epoch": 1.84585926280517,
"grad_norm": 1.671875,
"learning_rate": 3.1986993378503526e-07,
"loss": 1.678,
"step": 964
},
{
"epoch": 1.8477740545715653,
"grad_norm": 1.6640625,
"learning_rate": 3.11964747092115e-07,
"loss": 1.6928,
"step": 965
},
{
"epoch": 1.8496888463379606,
"grad_norm": 1.6796875,
"learning_rate": 3.041569220173235e-07,
"loss": 1.7484,
"step": 966
},
{
"epoch": 1.8516036381043561,
"grad_norm": 1.75,
"learning_rate": 2.9644653702677553e-07,
"loss": 1.7354,
"step": 967
},
{
"epoch": 1.8535184298707517,
"grad_norm": 1.6875,
"learning_rate": 2.888336696073435e-07,
"loss": 1.6876,
"step": 968
},
{
"epoch": 1.855433221637147,
"grad_norm": 1.671875,
"learning_rate": 2.8131839626588056e-07,
"loss": 1.6774,
"step": 969
},
{
"epoch": 1.8573480134035423,
"grad_norm": 1.6875,
"learning_rate": 2.7390079252845205e-07,
"loss": 1.6936,
"step": 970
},
{
"epoch": 1.8592628051699378,
"grad_norm": 1.6953125,
"learning_rate": 2.6658093293957187e-07,
"loss": 1.7162,
"step": 971
},
{
"epoch": 1.8611775969363333,
"grad_norm": 1.6796875,
"learning_rate": 2.5935889106146305e-07,
"loss": 1.7387,
"step": 972
},
{
"epoch": 1.8630923887027286,
"grad_norm": 1.6875,
"learning_rate": 2.522347394733049e-07,
"loss": 1.6896,
"step": 973
},
{
"epoch": 1.865007180469124,
"grad_norm": 1.671875,
"learning_rate": 2.45208549770517e-07,
"loss": 1.7292,
"step": 974
},
{
"epoch": 1.8669219722355193,
"grad_norm": 1.6796875,
"learning_rate": 2.382803925640309e-07,
"loss": 1.7433,
"step": 975
},
{
"epoch": 1.8688367640019148,
"grad_norm": 1.7109375,
"learning_rate": 2.314503374795829e-07,
"loss": 1.6975,
"step": 976
},
{
"epoch": 1.8707515557683103,
"grad_norm": 1.703125,
"learning_rate": 2.247184531570168e-07,
"loss": 1.7252,
"step": 977
},
{
"epoch": 1.8726663475347056,
"grad_norm": 1.71875,
"learning_rate": 2.1808480724959004e-07,
"loss": 1.7407,
"step": 978
},
{
"epoch": 1.874581139301101,
"grad_norm": 1.6796875,
"learning_rate": 2.1154946642329644e-07,
"loss": 1.7345,
"step": 979
},
{
"epoch": 1.8764959310674965,
"grad_norm": 1.703125,
"learning_rate": 2.051124963561979e-07,
"loss": 1.7497,
"step": 980
},
{
"epoch": 1.8784107228338918,
"grad_norm": 1.6953125,
"learning_rate": 1.9877396173775598e-07,
"loss": 1.7163,
"step": 981
},
{
"epoch": 1.8803255146002873,
"grad_norm": 1.6875,
"learning_rate": 1.9253392626819468e-07,
"loss": 1.6973,
"step": 982
},
{
"epoch": 1.8822403063666826,
"grad_norm": 1.6796875,
"learning_rate": 1.8639245265784866e-07,
"loss": 1.6974,
"step": 983
},
{
"epoch": 1.884155098133078,
"grad_norm": 1.7109375,
"learning_rate": 1.8034960262654276e-07,
"loss": 1.7159,
"step": 984
},
{
"epoch": 1.8860698898994734,
"grad_norm": 1.6484375,
"learning_rate": 1.744054369029591e-07,
"loss": 1.6499,
"step": 985
},
{
"epoch": 1.887984681665869,
"grad_norm": 1.671875,
"learning_rate": 1.6856001522404296e-07,
"loss": 1.7192,
"step": 986
},
{
"epoch": 1.8898994734322643,
"grad_norm": 1.6796875,
"learning_rate": 1.6281339633438698e-07,
"loss": 1.7233,
"step": 987
},
{
"epoch": 1.8918142651986596,
"grad_norm": 1.6875,
"learning_rate": 1.5716563798565232e-07,
"loss": 1.7212,
"step": 988
},
{
"epoch": 1.893729056965055,
"grad_norm": 1.6953125,
"learning_rate": 1.5161679693598274e-07,
"loss": 1.6976,
"step": 989
},
{
"epoch": 1.8956438487314504,
"grad_norm": 1.7109375,
"learning_rate": 1.4616692894943274e-07,
"loss": 1.7309,
"step": 990
},
{
"epoch": 1.897558640497846,
"grad_norm": 1.734375,
"learning_rate": 1.4081608879541241e-07,
"loss": 1.7336,
"step": 991
},
{
"epoch": 1.8994734322642413,
"grad_norm": 1.65625,
"learning_rate": 1.3556433024813353e-07,
"loss": 1.7123,
"step": 992
},
{
"epoch": 1.9013882240306366,
"grad_norm": 1.671875,
"learning_rate": 1.304117060860688e-07,
"loss": 1.6961,
"step": 993
},
{
"epoch": 1.903303015797032,
"grad_norm": 1.6640625,
"learning_rate": 1.2535826809142339e-07,
"loss": 1.6591,
"step": 994
},
{
"epoch": 1.9052178075634276,
"grad_norm": 1.6796875,
"learning_rate": 1.2040406704961316e-07,
"loss": 1.6986,
"step": 995
},
{
"epoch": 1.907132599329823,
"grad_norm": 1.71875,
"learning_rate": 1.15549152748754e-07,
"loss": 1.6602,
"step": 996
},
{
"epoch": 1.9090473910962182,
"grad_norm": 1.6484375,
"learning_rate": 1.1079357397916435e-07,
"loss": 1.6829,
"step": 997
},
{
"epoch": 1.9109621828626135,
"grad_norm": 1.6640625,
"learning_rate": 1.061373785328701e-07,
"loss": 1.688,
"step": 998
},
{
"epoch": 1.912876974629009,
"grad_norm": 1.671875,
"learning_rate": 1.015806132031305e-07,
"loss": 1.7246,
"step": 999
},
{
"epoch": 1.9147917663954046,
"grad_norm": 1.6796875,
"learning_rate": 9.712332378395861e-08,
"loss": 1.7222,
"step": 1000
},
{
"epoch": 1.9167065581618,
"grad_norm": 1.6640625,
"learning_rate": 9.276555506967378e-08,
"loss": 1.7414,
"step": 1001
},
{
"epoch": 1.9186213499281952,
"grad_norm": 1.6796875,
"learning_rate": 8.850735085443763e-08,
"loss": 1.6952,
"step": 1002
},
{
"epoch": 1.9205361416945907,
"grad_norm": 1.671875,
"learning_rate": 8.434875393182662e-08,
"loss": 1.671,
"step": 1003
},
{
"epoch": 1.9224509334609863,
"grad_norm": 1.671875,
"learning_rate": 8.028980609439241e-08,
"loss": 1.6717,
"step": 1004
},
{
"epoch": 1.9243657252273816,
"grad_norm": 1.703125,
"learning_rate": 7.633054813324769e-08,
"loss": 1.6772,
"step": 1005
},
{
"epoch": 1.926280516993777,
"grad_norm": 1.6640625,
"learning_rate": 7.247101983765104e-08,
"loss": 1.7314,
"step": 1006
},
{
"epoch": 1.9281953087601722,
"grad_norm": 1.6484375,
"learning_rate": 6.871125999461604e-08,
"loss": 1.5927,
"step": 1007
},
{
"epoch": 1.9301101005265677,
"grad_norm": 1.671875,
"learning_rate": 6.505130638850831e-08,
"loss": 1.6729,
"step": 1008
},
{
"epoch": 1.9320248922929633,
"grad_norm": 1.6640625,
"learning_rate": 6.14911958006792e-08,
"loss": 1.7042,
"step": 1009
},
{
"epoch": 1.9339396840593586,
"grad_norm": 1.703125,
"learning_rate": 5.803096400908703e-08,
"loss": 1.8112,
"step": 1010
},
{
"epoch": 1.9358544758257539,
"grad_norm": 1.71875,
"learning_rate": 5.46706457879409e-08,
"loss": 1.7101,
"step": 1011
},
{
"epoch": 1.9377692675921494,
"grad_norm": 1.671875,
"learning_rate": 5.141027490735195e-08,
"loss": 1.782,
"step": 1012
},
{
"epoch": 1.9396840593585447,
"grad_norm": 1.6640625,
"learning_rate": 4.824988413299037e-08,
"loss": 1.6928,
"step": 1013
},
{
"epoch": 1.9415988511249402,
"grad_norm": 1.7734375,
"learning_rate": 4.5189505225762266e-08,
"loss": 1.7131,
"step": 1014
},
{
"epoch": 1.9435136428913355,
"grad_norm": 1.6875,
"learning_rate": 4.2229168941484434e-08,
"loss": 1.7265,
"step": 1015
},
{
"epoch": 1.9454284346577309,
"grad_norm": 1.65625,
"learning_rate": 3.9368905030578994e-08,
"loss": 1.6424,
"step": 1016
},
{
"epoch": 1.9473432264241264,
"grad_norm": 1.71875,
"learning_rate": 3.6608742237769227e-08,
"loss": 1.7135,
"step": 1017
},
{
"epoch": 1.949258018190522,
"grad_norm": 1.7109375,
"learning_rate": 3.394870830180197e-08,
"loss": 1.7292,
"step": 1018
},
{
"epoch": 1.9511728099569172,
"grad_norm": 1.671875,
"learning_rate": 3.1388829955153466e-08,
"loss": 1.6791,
"step": 1019
},
{
"epoch": 1.9530876017233125,
"grad_norm": 1.71875,
"learning_rate": 2.892913292377508e-08,
"loss": 1.6906,
"step": 1020
},
{
"epoch": 1.9550023934897078,
"grad_norm": 1.671875,
"learning_rate": 2.656964192682354e-08,
"loss": 1.698,
"step": 1021
},
{
"epoch": 1.9569171852561034,
"grad_norm": 1.6953125,
"learning_rate": 2.431038067642111e-08,
"loss": 1.6888,
"step": 1022
},
{
"epoch": 1.958831977022499,
"grad_norm": 1.6875,
"learning_rate": 2.2151371877412452e-08,
"loss": 1.6964,
"step": 1023
},
{
"epoch": 1.9607467687888942,
"grad_norm": 1.671875,
"learning_rate": 2.0092637227134836e-08,
"loss": 1.6933,
"step": 1024
},
{
"epoch": 1.9626615605552895,
"grad_norm": 1.6796875,
"learning_rate": 1.8134197415207165e-08,
"loss": 1.689,
"step": 1025
},
{
"epoch": 1.964576352321685,
"grad_norm": 1.6328125,
"learning_rate": 1.627607212331572e-08,
"loss": 1.6823,
"step": 1026
},
{
"epoch": 1.9664911440880806,
"grad_norm": 1.6953125,
"learning_rate": 1.451828002501654e-08,
"loss": 1.7396,
"step": 1027
},
{
"epoch": 1.9684059358544759,
"grad_norm": 1.640625,
"learning_rate": 1.286083878555111e-08,
"loss": 1.6459,
"step": 1028
},
{
"epoch": 1.9703207276208712,
"grad_norm": 1.671875,
"learning_rate": 1.1303765061668748e-08,
"loss": 1.6741,
"step": 1029
},
{
"epoch": 1.9722355193872665,
"grad_norm": 1.6796875,
"learning_rate": 9.847074501456722e-09,
"loss": 1.6616,
"step": 1030
},
{
"epoch": 1.974150311153662,
"grad_norm": 1.65625,
"learning_rate": 8.490781744181498e-09,
"loss": 1.7448,
"step": 1031
},
{
"epoch": 1.9760651029200575,
"grad_norm": 1.6640625,
"learning_rate": 7.234900420147739e-09,
"loss": 1.6673,
"step": 1032
},
{
"epoch": 1.9779798946864529,
"grad_norm": 1.65625,
"learning_rate": 6.079443150556197e-09,
"loss": 1.7032,
"step": 1033
},
{
"epoch": 1.9798946864528482,
"grad_norm": 1.6640625,
"learning_rate": 5.0244215473782556e-09,
"loss": 1.6774,
"step": 1034
},
{
"epoch": 1.9818094782192437,
"grad_norm": 1.71875,
"learning_rate": 4.069846213238249e-09,
"loss": 1.7234,
"step": 1035
},
{
"epoch": 1.9837242699856392,
"grad_norm": 1.734375,
"learning_rate": 3.2157267413113203e-09,
"loss": 1.6599,
"step": 1036
},
{
"epoch": 1.9856390617520345,
"grad_norm": 1.6875,
"learning_rate": 2.4620717152201713e-09,
"loss": 1.7449,
"step": 1037
},
{
"epoch": 1.9875538535184298,
"grad_norm": 1.640625,
"learning_rate": 1.8088887089551255e-09,
"loss": 1.6684,
"step": 1038
},
{
"epoch": 1.9894686452848251,
"grad_norm": 1.6875,
"learning_rate": 1.256184286793083e-09,
"loss": 1.641,
"step": 1039
},
{
"epoch": 1.9913834370512207,
"grad_norm": 1.6796875,
"learning_rate": 8.039640032342366e-10,
"loss": 1.7045,
"step": 1040
},
{
"epoch": 1.9932982288176162,
"grad_norm": 1.65625,
"learning_rate": 4.522324029465619e-10,
"loss": 1.7222,
"step": 1041
},
{
"epoch": 1.9952130205840115,
"grad_norm": 1.71875,
"learning_rate": 2.0099302071807658e-10,
"loss": 1.6946,
"step": 1042
},
{
"epoch": 1.9971278123504068,
"grad_norm": 1.6796875,
"learning_rate": 5.024838142464461e-11,
"loss": 1.7026,
"step": 1043
},
{
"epoch": 1.9990426041168023,
"grad_norm": 1.671875,
"learning_rate": 0.0,
"loss": 1.7507,
"step": 1044
}
],
"logging_steps": 1,
"max_steps": 1044,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6.28007906539733e+17,
"train_batch_size": 10,
"trial_name": null,
"trial_params": null
}