{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.8330817989737396, "eval_steps": 500, "global_step": 4692, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006036824630244491, "grad_norm": 0.876740574836731, "learning_rate": 1.3333333333333334e-06, "loss": 1.3437, "step": 1 }, { "epoch": 0.0012073649260488982, "grad_norm": 0.8783963322639465, "learning_rate": 2.666666666666667e-06, "loss": 1.4745, "step": 2 }, { "epoch": 0.0018110473890733474, "grad_norm": 0.8955765962600708, "learning_rate": 4.000000000000001e-06, "loss": 1.6589, "step": 3 }, { "epoch": 0.0024147298520977964, "grad_norm": 1.0829285383224487, "learning_rate": 5.333333333333334e-06, "loss": 1.5662, "step": 4 }, { "epoch": 0.003018412315122246, "grad_norm": 1.1975041627883911, "learning_rate": 6.666666666666667e-06, "loss": 1.4658, "step": 5 }, { "epoch": 0.003622094778146695, "grad_norm": 1.0904583930969238, "learning_rate": 8.000000000000001e-06, "loss": 1.6691, "step": 6 }, { "epoch": 0.004225777241171144, "grad_norm": 0.6331161260604858, "learning_rate": 9.333333333333334e-06, "loss": 1.5705, "step": 7 }, { "epoch": 0.004829459704195593, "grad_norm": 0.3753370940685272, "learning_rate": 1.0666666666666667e-05, "loss": 1.5175, "step": 8 }, { "epoch": 0.005433142167220043, "grad_norm": 2.936147451400757, "learning_rate": 1.2e-05, "loss": 1.5404, "step": 9 }, { "epoch": 0.006036824630244492, "grad_norm": 2.635810613632202, "learning_rate": 1.3333333333333333e-05, "loss": 1.5605, "step": 10 }, { "epoch": 0.006640507093268941, "grad_norm": 1.9180281162261963, "learning_rate": 1.4666666666666668e-05, "loss": 1.5239, "step": 11 }, { "epoch": 0.00724418955629339, "grad_norm": 3.021333932876587, "learning_rate": 1.6000000000000003e-05, "loss": 1.5081, "step": 12 }, { "epoch": 0.00784787201931784, "grad_norm": 1.9040541648864746, "learning_rate": 1.7333333333333336e-05, "loss": 1.4366, "step": 13 }, { "epoch": 0.008451554482342288, "grad_norm": 1.0555269718170166, "learning_rate": 1.866666666666667e-05, "loss": 1.6606, "step": 14 }, { "epoch": 0.009055236945366738, "grad_norm": 0.5196813344955444, "learning_rate": 2e-05, "loss": 1.7105, "step": 15 }, { "epoch": 0.009658919408391186, "grad_norm": 1.1674487590789795, "learning_rate": 2.1333333333333335e-05, "loss": 1.3767, "step": 16 }, { "epoch": 0.010262601871415635, "grad_norm": 1.3149781227111816, "learning_rate": 2.2666666666666668e-05, "loss": 1.2422, "step": 17 }, { "epoch": 0.010866284334440085, "grad_norm": 0.5175302028656006, "learning_rate": 2.4e-05, "loss": 1.3791, "step": 18 }, { "epoch": 0.011469966797464533, "grad_norm": 0.3626837432384491, "learning_rate": 2.5333333333333337e-05, "loss": 1.4945, "step": 19 }, { "epoch": 0.012073649260488983, "grad_norm": 0.26292356848716736, "learning_rate": 2.6666666666666667e-05, "loss": 1.3598, "step": 20 }, { "epoch": 0.012677331723513431, "grad_norm": 0.2678084671497345, "learning_rate": 2.8000000000000003e-05, "loss": 1.3896, "step": 21 }, { "epoch": 0.013281014186537881, "grad_norm": 0.34738680720329285, "learning_rate": 2.9333333333333336e-05, "loss": 1.3734, "step": 22 }, { "epoch": 0.01388469664956233, "grad_norm": 0.3626849949359894, "learning_rate": 3.066666666666667e-05, "loss": 1.306, "step": 23 }, { "epoch": 0.01448837911258678, "grad_norm": 0.2588627338409424, "learning_rate": 3.2000000000000005e-05, "loss": 1.3373, "step": 24 }, { "epoch": 0.01509206157561123, "grad_norm": 0.6032963395118713, "learning_rate": 3.3333333333333335e-05, "loss": 1.1394, "step": 25 }, { "epoch": 0.01569574403863568, "grad_norm": 0.194705531001091, "learning_rate": 3.466666666666667e-05, "loss": 1.0727, "step": 26 }, { "epoch": 0.016299426501660125, "grad_norm": 0.3403484523296356, "learning_rate": 3.6e-05, "loss": 1.045, "step": 27 }, { "epoch": 0.016903108964684575, "grad_norm": 0.3210083544254303, "learning_rate": 3.733333333333334e-05, "loss": 1.285, "step": 28 }, { "epoch": 0.017506791427709025, "grad_norm": 0.6296122074127197, "learning_rate": 3.866666666666667e-05, "loss": 1.15, "step": 29 }, { "epoch": 0.018110473890733475, "grad_norm": 0.4753726124763489, "learning_rate": 4e-05, "loss": 1.0053, "step": 30 }, { "epoch": 0.018714156353757925, "grad_norm": 0.19821159541606903, "learning_rate": 4.133333333333333e-05, "loss": 1.1965, "step": 31 }, { "epoch": 0.01931783881678237, "grad_norm": 0.2836245000362396, "learning_rate": 4.266666666666667e-05, "loss": 1.0633, "step": 32 }, { "epoch": 0.01992152127980682, "grad_norm": 0.19199256598949432, "learning_rate": 4.4000000000000006e-05, "loss": 0.9728, "step": 33 }, { "epoch": 0.02052520374283127, "grad_norm": 0.17820274829864502, "learning_rate": 4.5333333333333335e-05, "loss": 1.0132, "step": 34 }, { "epoch": 0.02112888620585572, "grad_norm": 0.20217391848564148, "learning_rate": 4.666666666666667e-05, "loss": 1.0202, "step": 35 }, { "epoch": 0.02173256866888017, "grad_norm": 0.18383841216564178, "learning_rate": 4.8e-05, "loss": 0.9666, "step": 36 }, { "epoch": 0.022336251131904617, "grad_norm": 0.17872729897499084, "learning_rate": 4.933333333333334e-05, "loss": 0.9385, "step": 37 }, { "epoch": 0.022939933594929067, "grad_norm": 0.14148443937301636, "learning_rate": 5.0666666666666674e-05, "loss": 0.9338, "step": 38 }, { "epoch": 0.023543616057953517, "grad_norm": 0.14587634801864624, "learning_rate": 5.2000000000000004e-05, "loss": 0.964, "step": 39 }, { "epoch": 0.024147298520977967, "grad_norm": 0.17298458516597748, "learning_rate": 5.333333333333333e-05, "loss": 0.9934, "step": 40 }, { "epoch": 0.024750980984002413, "grad_norm": 0.1523771435022354, "learning_rate": 5.466666666666666e-05, "loss": 0.9995, "step": 41 }, { "epoch": 0.025354663447026863, "grad_norm": 0.16784684360027313, "learning_rate": 5.6000000000000006e-05, "loss": 0.9621, "step": 42 }, { "epoch": 0.025958345910051313, "grad_norm": 0.18494223058223724, "learning_rate": 5.7333333333333336e-05, "loss": 0.9181, "step": 43 }, { "epoch": 0.026562028373075763, "grad_norm": 0.15912078320980072, "learning_rate": 5.866666666666667e-05, "loss": 0.9311, "step": 44 }, { "epoch": 0.027165710836100213, "grad_norm": 0.17092670500278473, "learning_rate": 6e-05, "loss": 0.8838, "step": 45 }, { "epoch": 0.02776939329912466, "grad_norm": 0.2552148401737213, "learning_rate": 6.133333333333334e-05, "loss": 0.9121, "step": 46 }, { "epoch": 0.02837307576214911, "grad_norm": 0.20719660818576813, "learning_rate": 6.266666666666667e-05, "loss": 0.8881, "step": 47 }, { "epoch": 0.02897675822517356, "grad_norm": 0.25345510244369507, "learning_rate": 6.400000000000001e-05, "loss": 0.8592, "step": 48 }, { "epoch": 0.02958044068819801, "grad_norm": 0.23677493631839752, "learning_rate": 6.533333333333334e-05, "loss": 0.8428, "step": 49 }, { "epoch": 0.03018412315122246, "grad_norm": 0.3329518437385559, "learning_rate": 6.666666666666667e-05, "loss": 0.7946, "step": 50 }, { "epoch": 0.030787805614246905, "grad_norm": 0.1412961632013321, "learning_rate": 6.800000000000001e-05, "loss": 0.9836, "step": 51 }, { "epoch": 0.03139148807727136, "grad_norm": 0.1430910974740982, "learning_rate": 6.933333333333334e-05, "loss": 1.0782, "step": 52 }, { "epoch": 0.031995170540295804, "grad_norm": 0.19594986736774445, "learning_rate": 7.066666666666667e-05, "loss": 0.9402, "step": 53 }, { "epoch": 0.03259885300332025, "grad_norm": 0.15413224697113037, "learning_rate": 7.2e-05, "loss": 1.0387, "step": 54 }, { "epoch": 0.033202535466344704, "grad_norm": 0.15673322975635529, "learning_rate": 7.333333333333333e-05, "loss": 0.9795, "step": 55 }, { "epoch": 0.03380621792936915, "grad_norm": 0.14926938712596893, "learning_rate": 7.466666666666667e-05, "loss": 1.088, "step": 56 }, { "epoch": 0.034409900392393604, "grad_norm": 0.20912237465381622, "learning_rate": 7.6e-05, "loss": 0.9247, "step": 57 }, { "epoch": 0.03501358285541805, "grad_norm": 0.1995040327310562, "learning_rate": 7.733333333333333e-05, "loss": 0.9202, "step": 58 }, { "epoch": 0.0356172653184425, "grad_norm": 0.20646536350250244, "learning_rate": 7.866666666666666e-05, "loss": 0.8604, "step": 59 }, { "epoch": 0.03622094778146695, "grad_norm": 0.2546747624874115, "learning_rate": 8e-05, "loss": 0.955, "step": 60 }, { "epoch": 0.036824630244491396, "grad_norm": 0.25772592425346375, "learning_rate": 8.133333333333334e-05, "loss": 0.9363, "step": 61 }, { "epoch": 0.03742831270751585, "grad_norm": 0.4138891398906708, "learning_rate": 8.266666666666667e-05, "loss": 0.83, "step": 62 }, { "epoch": 0.038031995170540296, "grad_norm": 0.2324582189321518, "learning_rate": 8.4e-05, "loss": 0.95, "step": 63 }, { "epoch": 0.03863567763356474, "grad_norm": 0.22521646320819855, "learning_rate": 8.533333333333334e-05, "loss": 0.8359, "step": 64 }, { "epoch": 0.039239360096589196, "grad_norm": 0.19713546335697174, "learning_rate": 8.666666666666667e-05, "loss": 0.9552, "step": 65 }, { "epoch": 0.03984304255961364, "grad_norm": 0.1569695621728897, "learning_rate": 8.800000000000001e-05, "loss": 0.9283, "step": 66 }, { "epoch": 0.040446725022638096, "grad_norm": 0.13675987720489502, "learning_rate": 8.933333333333334e-05, "loss": 0.9144, "step": 67 }, { "epoch": 0.04105040748566254, "grad_norm": 0.08005592226982117, "learning_rate": 9.066666666666667e-05, "loss": 0.9661, "step": 68 }, { "epoch": 0.04165408994868699, "grad_norm": 0.17173191905021667, "learning_rate": 9.200000000000001e-05, "loss": 1.1486, "step": 69 }, { "epoch": 0.04225777241171144, "grad_norm": 0.2080589085817337, "learning_rate": 9.333333333333334e-05, "loss": 0.8091, "step": 70 }, { "epoch": 0.04286145487473589, "grad_norm": 0.21545596420764923, "learning_rate": 9.466666666666667e-05, "loss": 0.8606, "step": 71 }, { "epoch": 0.04346513733776034, "grad_norm": 0.21326108276844025, "learning_rate": 9.6e-05, "loss": 0.9271, "step": 72 }, { "epoch": 0.04406881980078479, "grad_norm": 0.21748293936252594, "learning_rate": 9.733333333333335e-05, "loss": 1.1042, "step": 73 }, { "epoch": 0.044672502263809234, "grad_norm": 0.20910663902759552, "learning_rate": 9.866666666666668e-05, "loss": 0.8998, "step": 74 }, { "epoch": 0.04527618472683369, "grad_norm": 0.19514353573322296, "learning_rate": 0.0001, "loss": 0.8976, "step": 75 }, { "epoch": 0.045879867189858134, "grad_norm": 0.17271868884563446, "learning_rate": 0.00010133333333333335, "loss": 0.8919, "step": 76 }, { "epoch": 0.04648354965288258, "grad_norm": 0.17146329581737518, "learning_rate": 0.00010266666666666666, "loss": 0.8703, "step": 77 }, { "epoch": 0.047087232115907034, "grad_norm": 0.12326353043317795, "learning_rate": 0.00010400000000000001, "loss": 1.0872, "step": 78 }, { "epoch": 0.04769091457893148, "grad_norm": 0.08894680440425873, "learning_rate": 0.00010533333333333332, "loss": 0.9586, "step": 79 }, { "epoch": 0.04829459704195593, "grad_norm": 0.07784762233495712, "learning_rate": 0.00010666666666666667, "loss": 0.9304, "step": 80 }, { "epoch": 0.04889827950498038, "grad_norm": 0.08525023609399796, "learning_rate": 0.00010800000000000001, "loss": 1.1813, "step": 81 }, { "epoch": 0.049501961968004826, "grad_norm": 0.11610118299722672, "learning_rate": 0.00010933333333333333, "loss": 0.8561, "step": 82 }, { "epoch": 0.05010564443102928, "grad_norm": 0.12873214483261108, "learning_rate": 0.00011066666666666667, "loss": 0.8394, "step": 83 }, { "epoch": 0.050709326894053726, "grad_norm": 0.11888518184423447, "learning_rate": 0.00011200000000000001, "loss": 0.873, "step": 84 }, { "epoch": 0.05131300935707818, "grad_norm": 0.12383485585451126, "learning_rate": 0.00011333333333333334, "loss": 0.803, "step": 85 }, { "epoch": 0.051916691820102626, "grad_norm": 0.13411828875541687, "learning_rate": 0.00011466666666666667, "loss": 0.8332, "step": 86 }, { "epoch": 0.05252037428312707, "grad_norm": 0.12794946134090424, "learning_rate": 0.000116, "loss": 0.8405, "step": 87 }, { "epoch": 0.053124056746151525, "grad_norm": 0.12056224048137665, "learning_rate": 0.00011733333333333334, "loss": 0.9108, "step": 88 }, { "epoch": 0.05372773920917597, "grad_norm": 0.11550690233707428, "learning_rate": 0.00011866666666666669, "loss": 0.9052, "step": 89 }, { "epoch": 0.054331421672200425, "grad_norm": 0.09730254113674164, "learning_rate": 0.00012, "loss": 0.9104, "step": 90 }, { "epoch": 0.05493510413522487, "grad_norm": 0.10430070012807846, "learning_rate": 0.00012133333333333335, "loss": 0.8333, "step": 91 }, { "epoch": 0.05553878659824932, "grad_norm": 0.10595888644456863, "learning_rate": 0.00012266666666666668, "loss": 0.7792, "step": 92 }, { "epoch": 0.05614246906127377, "grad_norm": 0.13614441454410553, "learning_rate": 0.000124, "loss": 0.8317, "step": 93 }, { "epoch": 0.05674615152429822, "grad_norm": 0.1545422077178955, "learning_rate": 0.00012533333333333334, "loss": 0.8139, "step": 94 }, { "epoch": 0.05734983398732267, "grad_norm": 0.17710909247398376, "learning_rate": 0.00012666666666666666, "loss": 0.8969, "step": 95 }, { "epoch": 0.05795351645034712, "grad_norm": 0.17680825293064117, "learning_rate": 0.00012800000000000002, "loss": 0.7812, "step": 96 }, { "epoch": 0.058557198913371564, "grad_norm": 0.17637892067432404, "learning_rate": 0.00012933333333333332, "loss": 0.7518, "step": 97 }, { "epoch": 0.05916088137639602, "grad_norm": 0.14965130388736725, "learning_rate": 0.00013066666666666668, "loss": 0.6467, "step": 98 }, { "epoch": 0.05976456383942046, "grad_norm": 0.15980888903141022, "learning_rate": 0.000132, "loss": 0.724, "step": 99 }, { "epoch": 0.06036824630244492, "grad_norm": 0.1575448364019394, "learning_rate": 0.00013333333333333334, "loss": 0.523, "step": 100 }, { "epoch": 0.06097192876546936, "grad_norm": 0.0918162539601326, "learning_rate": 0.00013466666666666667, "loss": 0.8158, "step": 101 }, { "epoch": 0.06157561122849381, "grad_norm": 0.09651898592710495, "learning_rate": 0.00013600000000000003, "loss": 1.1298, "step": 102 }, { "epoch": 0.06217929369151826, "grad_norm": 0.10406182706356049, "learning_rate": 0.00013733333333333333, "loss": 1.2655, "step": 103 }, { "epoch": 0.06278297615454272, "grad_norm": 0.09533622860908508, "learning_rate": 0.00013866666666666669, "loss": 0.9408, "step": 104 }, { "epoch": 0.06338665861756716, "grad_norm": 0.1048218235373497, "learning_rate": 0.00014, "loss": 0.9564, "step": 105 }, { "epoch": 0.06399034108059161, "grad_norm": 0.09610721468925476, "learning_rate": 0.00014133333333333334, "loss": 0.8791, "step": 106 }, { "epoch": 0.06459402354361606, "grad_norm": 0.08564590662717819, "learning_rate": 0.00014266666666666667, "loss": 1.0284, "step": 107 }, { "epoch": 0.0651977060066405, "grad_norm": 0.07987428456544876, "learning_rate": 0.000144, "loss": 1.1595, "step": 108 }, { "epoch": 0.06580138846966496, "grad_norm": 0.09171764552593231, "learning_rate": 0.00014533333333333333, "loss": 0.9098, "step": 109 }, { "epoch": 0.06640507093268941, "grad_norm": 0.08003644645214081, "learning_rate": 0.00014666666666666666, "loss": 1.0118, "step": 110 }, { "epoch": 0.06700875339571385, "grad_norm": 0.08099836111068726, "learning_rate": 0.000148, "loss": 1.0011, "step": 111 }, { "epoch": 0.0676124358587383, "grad_norm": 0.08993902802467346, "learning_rate": 0.00014933333333333335, "loss": 1.2031, "step": 112 }, { "epoch": 0.06821611832176275, "grad_norm": 0.0858883187174797, "learning_rate": 0.00015066666666666668, "loss": 0.7726, "step": 113 }, { "epoch": 0.06881980078478721, "grad_norm": 0.07739171385765076, "learning_rate": 0.000152, "loss": 0.8213, "step": 114 }, { "epoch": 0.06942348324781165, "grad_norm": 0.09992550313472748, "learning_rate": 0.00015333333333333334, "loss": 1.2648, "step": 115 }, { "epoch": 0.0700271657108361, "grad_norm": 0.08734551072120667, "learning_rate": 0.00015466666666666667, "loss": 0.9113, "step": 116 }, { "epoch": 0.07063084817386055, "grad_norm": 0.13743777573108673, "learning_rate": 0.00015600000000000002, "loss": 1.2323, "step": 117 }, { "epoch": 0.071234530636885, "grad_norm": 0.08608172088861465, "learning_rate": 0.00015733333333333333, "loss": 1.0779, "step": 118 }, { "epoch": 0.07183821309990945, "grad_norm": 0.07633006572723389, "learning_rate": 0.00015866666666666668, "loss": 0.9064, "step": 119 }, { "epoch": 0.0724418955629339, "grad_norm": 0.07484789192676544, "learning_rate": 0.00016, "loss": 0.8816, "step": 120 }, { "epoch": 0.07304557802595835, "grad_norm": 0.06947381794452667, "learning_rate": 0.00016133333333333334, "loss": 0.9609, "step": 121 }, { "epoch": 0.07364926048898279, "grad_norm": 0.09331609308719635, "learning_rate": 0.00016266666666666667, "loss": 0.899, "step": 122 }, { "epoch": 0.07425294295200724, "grad_norm": 0.0736856609582901, "learning_rate": 0.000164, "loss": 0.8749, "step": 123 }, { "epoch": 0.0748566254150317, "grad_norm": 0.08171521127223969, "learning_rate": 0.00016533333333333333, "loss": 0.8952, "step": 124 }, { "epoch": 0.07546030787805615, "grad_norm": 0.0707223191857338, "learning_rate": 0.0001666666666666667, "loss": 0.9492, "step": 125 }, { "epoch": 0.07606399034108059, "grad_norm": 0.07196628302335739, "learning_rate": 0.000168, "loss": 1.0499, "step": 126 }, { "epoch": 0.07666767280410504, "grad_norm": 0.07485999912023544, "learning_rate": 0.00016933333333333335, "loss": 0.8929, "step": 127 }, { "epoch": 0.07727135526712949, "grad_norm": 0.06874241679906845, "learning_rate": 0.00017066666666666668, "loss": 0.8589, "step": 128 }, { "epoch": 0.07787503773015395, "grad_norm": 0.07588055729866028, "learning_rate": 0.000172, "loss": 0.9973, "step": 129 }, { "epoch": 0.07847872019317839, "grad_norm": 0.07442892342805862, "learning_rate": 0.00017333333333333334, "loss": 1.3857, "step": 130 }, { "epoch": 0.07908240265620284, "grad_norm": 0.07991475611925125, "learning_rate": 0.00017466666666666667, "loss": 0.8986, "step": 131 }, { "epoch": 0.07968608511922728, "grad_norm": 0.07599324733018875, "learning_rate": 0.00017600000000000002, "loss": 0.874, "step": 132 }, { "epoch": 0.08028976758225173, "grad_norm": 0.08738681674003601, "learning_rate": 0.00017733333333333335, "loss": 0.8635, "step": 133 }, { "epoch": 0.08089345004527619, "grad_norm": 0.08158082515001297, "learning_rate": 0.00017866666666666668, "loss": 0.8143, "step": 134 }, { "epoch": 0.08149713250830064, "grad_norm": 0.08535363525152206, "learning_rate": 0.00018, "loss": 0.7629, "step": 135 }, { "epoch": 0.08210081497132508, "grad_norm": 0.08319278806447983, "learning_rate": 0.00018133333333333334, "loss": 0.7807, "step": 136 }, { "epoch": 0.08270449743434953, "grad_norm": 0.10717540234327316, "learning_rate": 0.00018266666666666667, "loss": 0.7731, "step": 137 }, { "epoch": 0.08330817989737398, "grad_norm": 0.08758540451526642, "learning_rate": 0.00018400000000000003, "loss": 0.7561, "step": 138 }, { "epoch": 0.08391186236039844, "grad_norm": 0.09366493672132492, "learning_rate": 0.00018533333333333333, "loss": 0.7703, "step": 139 }, { "epoch": 0.08451554482342288, "grad_norm": 0.10316765308380127, "learning_rate": 0.0001866666666666667, "loss": 0.7798, "step": 140 }, { "epoch": 0.08511922728644733, "grad_norm": 0.09655182808637619, "learning_rate": 0.000188, "loss": 0.8053, "step": 141 }, { "epoch": 0.08572290974947178, "grad_norm": 0.09309862554073334, "learning_rate": 0.00018933333333333335, "loss": 0.7777, "step": 142 }, { "epoch": 0.08632659221249622, "grad_norm": 0.11444847285747528, "learning_rate": 0.00019066666666666668, "loss": 0.8681, "step": 143 }, { "epoch": 0.08693027467552068, "grad_norm": 0.09801818430423737, "learning_rate": 0.000192, "loss": 0.6858, "step": 144 }, { "epoch": 0.08753395713854513, "grad_norm": 0.10028098523616791, "learning_rate": 0.00019333333333333333, "loss": 0.8063, "step": 145 }, { "epoch": 0.08813763960156958, "grad_norm": 0.11404330283403397, "learning_rate": 0.0001946666666666667, "loss": 0.8138, "step": 146 }, { "epoch": 0.08874132206459402, "grad_norm": 0.12089970707893372, "learning_rate": 0.000196, "loss": 0.653, "step": 147 }, { "epoch": 0.08934500452761847, "grad_norm": 0.13996511697769165, "learning_rate": 0.00019733333333333335, "loss": 0.6567, "step": 148 }, { "epoch": 0.08994868699064293, "grad_norm": 0.1570245325565338, "learning_rate": 0.00019866666666666668, "loss": 0.6688, "step": 149 }, { "epoch": 0.09055236945366738, "grad_norm": 0.14232978224754333, "learning_rate": 0.0002, "loss": 0.5686, "step": 150 }, { "epoch": 0.09115605191669182, "grad_norm": 0.09911559522151947, "learning_rate": 0.00019995848899958488, "loss": 0.9231, "step": 151 }, { "epoch": 0.09175973437971627, "grad_norm": 0.0942869484424591, "learning_rate": 0.0001999169779991698, "loss": 0.9987, "step": 152 }, { "epoch": 0.09236341684274071, "grad_norm": 0.08815930783748627, "learning_rate": 0.00019987546699875468, "loss": 1.0607, "step": 153 }, { "epoch": 0.09296709930576516, "grad_norm": 0.0868469625711441, "learning_rate": 0.00019983395599833956, "loss": 0.9998, "step": 154 }, { "epoch": 0.09357078176878962, "grad_norm": 0.07780348509550095, "learning_rate": 0.00019979244499792446, "loss": 0.911, "step": 155 }, { "epoch": 0.09417446423181407, "grad_norm": 0.09089189767837524, "learning_rate": 0.00019975093399750936, "loss": 0.9344, "step": 156 }, { "epoch": 0.09477814669483851, "grad_norm": 0.07558704167604446, "learning_rate": 0.00019970942299709423, "loss": 0.8435, "step": 157 }, { "epoch": 0.09538182915786296, "grad_norm": 0.1009376272559166, "learning_rate": 0.00019966791199667913, "loss": 0.8571, "step": 158 }, { "epoch": 0.0959855116208874, "grad_norm": 0.07474809885025024, "learning_rate": 0.00019962640099626403, "loss": 0.9764, "step": 159 }, { "epoch": 0.09658919408391187, "grad_norm": 0.0775795429944992, "learning_rate": 0.0001995848899958489, "loss": 1.0164, "step": 160 }, { "epoch": 0.09719287654693631, "grad_norm": 0.07155506312847137, "learning_rate": 0.0001995433789954338, "loss": 0.9886, "step": 161 }, { "epoch": 0.09779655900996076, "grad_norm": 0.0788518413901329, "learning_rate": 0.00019950186799501867, "loss": 0.8248, "step": 162 }, { "epoch": 0.0984002414729852, "grad_norm": 0.06630228459835052, "learning_rate": 0.00019946035699460357, "loss": 0.9792, "step": 163 }, { "epoch": 0.09900392393600965, "grad_norm": 0.20900733768939972, "learning_rate": 0.00019941884599418847, "loss": 0.9877, "step": 164 }, { "epoch": 0.09960760639903411, "grad_norm": 0.06860389560461044, "learning_rate": 0.00019937733499377335, "loss": 0.9082, "step": 165 }, { "epoch": 0.10021128886205856, "grad_norm": 0.06557682156562805, "learning_rate": 0.00019933582399335825, "loss": 0.8677, "step": 166 }, { "epoch": 0.100814971325083, "grad_norm": 0.07390300929546356, "learning_rate": 0.00019929431299294315, "loss": 1.1377, "step": 167 }, { "epoch": 0.10141865378810745, "grad_norm": 0.07005932927131653, "learning_rate": 0.00019925280199252802, "loss": 1.0111, "step": 168 }, { "epoch": 0.1020223362511319, "grad_norm": 0.06733494251966476, "learning_rate": 0.00019921129099211292, "loss": 0.9273, "step": 169 }, { "epoch": 0.10262601871415636, "grad_norm": 0.4364582300186157, "learning_rate": 0.00019916977999169782, "loss": 0.863, "step": 170 }, { "epoch": 0.1032297011771808, "grad_norm": 0.06318307667970657, "learning_rate": 0.0001991282689912827, "loss": 1.0073, "step": 171 }, { "epoch": 0.10383338364020525, "grad_norm": 492.8619689941406, "learning_rate": 0.00019908675799086757, "loss": 0.9392, "step": 172 }, { "epoch": 0.1044370661032297, "grad_norm": 14.513906478881836, "learning_rate": 0.0001990452469904525, "loss": 0.8524, "step": 173 }, { "epoch": 0.10504074856625414, "grad_norm": 0.08823119848966599, "learning_rate": 0.00019900373599003737, "loss": 0.8712, "step": 174 }, { "epoch": 0.1056444310292786, "grad_norm": 0.0739288404583931, "learning_rate": 0.00019896222498962227, "loss": 1.0265, "step": 175 }, { "epoch": 0.10624811349230305, "grad_norm": 6.7402520179748535, "learning_rate": 0.00019892071398920714, "loss": 0.832, "step": 176 }, { "epoch": 0.1068517959553275, "grad_norm": 0.20480509102344513, "learning_rate": 0.00019887920298879204, "loss": 0.962, "step": 177 }, { "epoch": 0.10745547841835194, "grad_norm": 2.3439409732818604, "learning_rate": 0.00019883769198837694, "loss": 0.8044, "step": 178 }, { "epoch": 0.10805916088137639, "grad_norm": 10.576101303100586, "learning_rate": 0.0001987961809879618, "loss": 1.1324, "step": 179 }, { "epoch": 0.10866284334440085, "grad_norm": 5.063470840454102, "learning_rate": 0.0001987546699875467, "loss": 1.0371, "step": 180 }, { "epoch": 0.1092665258074253, "grad_norm": 0.47612178325653076, "learning_rate": 0.0001987131589871316, "loss": 1.0325, "step": 181 }, { "epoch": 0.10987020827044974, "grad_norm": 0.5040671229362488, "learning_rate": 0.00019867164798671649, "loss": 0.8587, "step": 182 }, { "epoch": 0.11047389073347419, "grad_norm": 0.49563658237457275, "learning_rate": 0.00019863013698630139, "loss": 0.9701, "step": 183 }, { "epoch": 0.11107757319649864, "grad_norm": 0.5639724731445312, "learning_rate": 0.00019858862598588629, "loss": 0.8009, "step": 184 }, { "epoch": 0.1116812556595231, "grad_norm": 0.10233303904533386, "learning_rate": 0.00019854711498547116, "loss": 0.8951, "step": 185 }, { "epoch": 0.11228493812254754, "grad_norm": 0.1553623080253601, "learning_rate": 0.00019850560398505603, "loss": 0.7539, "step": 186 }, { "epoch": 0.11288862058557199, "grad_norm": 0.15513239800930023, "learning_rate": 0.00019846409298464096, "loss": 0.8451, "step": 187 }, { "epoch": 0.11349230304859644, "grad_norm": 0.15111730992794037, "learning_rate": 0.00019842258198422583, "loss": 0.8565, "step": 188 }, { "epoch": 0.11409598551162088, "grad_norm": 0.17460127174854279, "learning_rate": 0.0001983810709838107, "loss": 0.8017, "step": 189 }, { "epoch": 0.11469966797464534, "grad_norm": 0.19736827909946442, "learning_rate": 0.0001983395599833956, "loss": 0.7927, "step": 190 }, { "epoch": 0.11530335043766979, "grad_norm": 0.15492677688598633, "learning_rate": 0.0001982980489829805, "loss": 0.8179, "step": 191 }, { "epoch": 0.11590703290069423, "grad_norm": 0.33444538712501526, "learning_rate": 0.00019825653798256538, "loss": 0.8264, "step": 192 }, { "epoch": 0.11651071536371868, "grad_norm": 0.15822017192840576, "learning_rate": 0.00019821502698215028, "loss": 0.7953, "step": 193 }, { "epoch": 0.11711439782674313, "grad_norm": 0.19879087805747986, "learning_rate": 0.00019817351598173518, "loss": 0.8023, "step": 194 }, { "epoch": 0.11771808028976759, "grad_norm": 0.155591681599617, "learning_rate": 0.00019813200498132005, "loss": 0.6876, "step": 195 }, { "epoch": 0.11832176275279203, "grad_norm": 0.19856096804141998, "learning_rate": 0.00019809049398090495, "loss": 0.7227, "step": 196 }, { "epoch": 0.11892544521581648, "grad_norm": 0.14677810668945312, "learning_rate": 0.00019804898298048985, "loss": 0.729, "step": 197 }, { "epoch": 0.11952912767884093, "grad_norm": 3.824247121810913, "learning_rate": 0.00019800747198007472, "loss": 0.7045, "step": 198 }, { "epoch": 0.12013281014186537, "grad_norm": 1.2843711376190186, "learning_rate": 0.00019796596097965962, "loss": 0.681, "step": 199 }, { "epoch": 0.12073649260488983, "grad_norm": 0.15802475810050964, "learning_rate": 0.0001979244499792445, "loss": 0.54, "step": 200 }, { "epoch": 0.12134017506791428, "grad_norm": 0.08058023452758789, "learning_rate": 0.0001978829389788294, "loss": 1.2773, "step": 201 }, { "epoch": 0.12194385753093873, "grad_norm": 0.09442666172981262, "learning_rate": 0.0001978414279784143, "loss": 1.0519, "step": 202 }, { "epoch": 0.12254753999396317, "grad_norm": 0.11098414659500122, "learning_rate": 0.00019779991697799917, "loss": 0.9809, "step": 203 }, { "epoch": 0.12315122245698762, "grad_norm": 0.08391015231609344, "learning_rate": 0.00019775840597758407, "loss": 0.7483, "step": 204 }, { "epoch": 0.12375490492001208, "grad_norm": 0.09060636162757874, "learning_rate": 0.00019771689497716897, "loss": 0.9001, "step": 205 }, { "epoch": 0.12435858738303653, "grad_norm": 0.08621667325496674, "learning_rate": 0.00019767538397675384, "loss": 1.0983, "step": 206 }, { "epoch": 0.12496226984606097, "grad_norm": 0.07971488684415817, "learning_rate": 0.00019763387297633874, "loss": 0.8414, "step": 207 }, { "epoch": 0.12556595230908543, "grad_norm": 0.07815425097942352, "learning_rate": 0.00019759236197592364, "loss": 0.8872, "step": 208 }, { "epoch": 0.12616963477210988, "grad_norm": 0.0847320556640625, "learning_rate": 0.00019755085097550852, "loss": 0.8398, "step": 209 }, { "epoch": 0.12677331723513433, "grad_norm": 0.08897458761930466, "learning_rate": 0.0001975093399750934, "loss": 0.7865, "step": 210 }, { "epoch": 0.12737699969815877, "grad_norm": 0.0703200101852417, "learning_rate": 0.00019746782897467832, "loss": 0.8256, "step": 211 }, { "epoch": 0.12798068216118322, "grad_norm": 0.07879285514354706, "learning_rate": 0.0001974263179742632, "loss": 0.9032, "step": 212 }, { "epoch": 0.12858436462420766, "grad_norm": 0.07276301831007004, "learning_rate": 0.00019738480697384806, "loss": 0.9857, "step": 213 }, { "epoch": 0.1291880470872321, "grad_norm": 0.09440230578184128, "learning_rate": 0.00019734329597343296, "loss": 0.9156, "step": 214 }, { "epoch": 0.12979172955025656, "grad_norm": 0.1108328178524971, "learning_rate": 0.00019730178497301786, "loss": 0.9671, "step": 215 }, { "epoch": 0.130395412013281, "grad_norm": 0.07690040022134781, "learning_rate": 0.00019726027397260273, "loss": 0.9945, "step": 216 }, { "epoch": 0.13099909447630545, "grad_norm": 0.06875025480985641, "learning_rate": 0.00019721876297218763, "loss": 0.911, "step": 217 }, { "epoch": 0.13160277693932992, "grad_norm": 0.09118712693452835, "learning_rate": 0.00019717725197177253, "loss": 1.1363, "step": 218 }, { "epoch": 0.13220645940235437, "grad_norm": 0.07356920093297958, "learning_rate": 0.00019713574097135743, "loss": 1.14, "step": 219 }, { "epoch": 0.13281014186537882, "grad_norm": 0.06750470399856567, "learning_rate": 0.0001970942299709423, "loss": 0.9591, "step": 220 }, { "epoch": 0.13341382432840326, "grad_norm": 0.11778965592384338, "learning_rate": 0.0001970527189705272, "loss": 0.9048, "step": 221 }, { "epoch": 0.1340175067914277, "grad_norm": 0.07331351190805435, "learning_rate": 0.0001970112079701121, "loss": 0.8792, "step": 222 }, { "epoch": 0.13462118925445216, "grad_norm": 0.08289069682359695, "learning_rate": 0.00019696969696969698, "loss": 1.1292, "step": 223 }, { "epoch": 0.1352248717174766, "grad_norm": 0.07485181093215942, "learning_rate": 0.00019692818596928185, "loss": 1.013, "step": 224 }, { "epoch": 0.13582855418050105, "grad_norm": 0.08499836176633835, "learning_rate": 0.00019688667496886678, "loss": 0.9177, "step": 225 }, { "epoch": 0.1364322366435255, "grad_norm": 0.06567627936601639, "learning_rate": 0.00019684516396845165, "loss": 0.9016, "step": 226 }, { "epoch": 0.13703591910654994, "grad_norm": 0.07324650138616562, "learning_rate": 0.00019680365296803653, "loss": 0.9649, "step": 227 }, { "epoch": 0.13763960156957442, "grad_norm": 0.06484885513782501, "learning_rate": 0.00019676214196762143, "loss": 0.824, "step": 228 }, { "epoch": 0.13824328403259886, "grad_norm": 0.06534811854362488, "learning_rate": 0.00019672063096720633, "loss": 0.7969, "step": 229 }, { "epoch": 0.1388469664956233, "grad_norm": 0.06635700166225433, "learning_rate": 0.0001966791199667912, "loss": 0.9041, "step": 230 }, { "epoch": 0.13945064895864775, "grad_norm": 0.06364124268293381, "learning_rate": 0.0001966376089663761, "loss": 0.8935, "step": 231 }, { "epoch": 0.1400543314216722, "grad_norm": 0.0640692487359047, "learning_rate": 0.000196596097965961, "loss": 0.9025, "step": 232 }, { "epoch": 0.14065801388469665, "grad_norm": 0.07015056163072586, "learning_rate": 0.00019655458696554587, "loss": 0.8065, "step": 233 }, { "epoch": 0.1412616963477211, "grad_norm": 0.07654477655887604, "learning_rate": 0.00019651307596513077, "loss": 0.8408, "step": 234 }, { "epoch": 0.14186537881074554, "grad_norm": 0.06782008707523346, "learning_rate": 0.00019647156496471567, "loss": 0.7275, "step": 235 }, { "epoch": 0.14246906127377, "grad_norm": 0.07276429980993271, "learning_rate": 0.00019643005396430055, "loss": 0.7955, "step": 236 }, { "epoch": 0.14307274373679443, "grad_norm": 0.08035031706094742, "learning_rate": 0.00019638854296388545, "loss": 0.7074, "step": 237 }, { "epoch": 0.1436764261998189, "grad_norm": 0.10624828189611435, "learning_rate": 0.00019634703196347032, "loss": 0.8799, "step": 238 }, { "epoch": 0.14428010866284335, "grad_norm": 0.0777585506439209, "learning_rate": 0.00019630552096305522, "loss": 0.7528, "step": 239 }, { "epoch": 0.1448837911258678, "grad_norm": 0.0803024098277092, "learning_rate": 0.00019626400996264012, "loss": 0.7702, "step": 240 }, { "epoch": 0.14548747358889225, "grad_norm": 0.0814545750617981, "learning_rate": 0.000196222498962225, "loss": 0.78, "step": 241 }, { "epoch": 0.1460911560519167, "grad_norm": 0.0940927267074585, "learning_rate": 0.0001961809879618099, "loss": 0.7294, "step": 242 }, { "epoch": 0.14669483851494114, "grad_norm": 0.08877824991941452, "learning_rate": 0.0001961394769613948, "loss": 0.7663, "step": 243 }, { "epoch": 0.14729852097796559, "grad_norm": 0.09321099519729614, "learning_rate": 0.00019609796596097966, "loss": 0.7524, "step": 244 }, { "epoch": 0.14790220344099003, "grad_norm": 0.09710489213466644, "learning_rate": 0.00019605645496056454, "loss": 0.8147, "step": 245 }, { "epoch": 0.14850588590401448, "grad_norm": 0.10399672389030457, "learning_rate": 0.00019601494396014946, "loss": 0.7327, "step": 246 }, { "epoch": 0.14910956836703892, "grad_norm": 0.11608180403709412, "learning_rate": 0.00019597343295973434, "loss": 0.7464, "step": 247 }, { "epoch": 0.1497132508300634, "grad_norm": 0.10949523001909256, "learning_rate": 0.0001959319219593192, "loss": 0.6295, "step": 248 }, { "epoch": 0.15031693329308785, "grad_norm": 0.10902351886034012, "learning_rate": 0.00019589041095890414, "loss": 0.6048, "step": 249 }, { "epoch": 0.1509206157561123, "grad_norm": 0.11830253154039383, "learning_rate": 0.000195848899958489, "loss": 0.4881, "step": 250 }, { "epoch": 0.15152429821913674, "grad_norm": 0.06550572067499161, "learning_rate": 0.00019580738895807388, "loss": 0.8051, "step": 251 }, { "epoch": 0.15212798068216118, "grad_norm": 0.08449156582355499, "learning_rate": 0.00019576587795765878, "loss": 1.0096, "step": 252 }, { "epoch": 0.15273166314518563, "grad_norm": 0.07215554267168045, "learning_rate": 0.00019572436695724368, "loss": 0.7212, "step": 253 }, { "epoch": 0.15333534560821008, "grad_norm": 0.0707031637430191, "learning_rate": 0.00019568285595682856, "loss": 0.8708, "step": 254 }, { "epoch": 0.15393902807123452, "grad_norm": 0.22556833922863007, "learning_rate": 0.00019564134495641346, "loss": 0.9448, "step": 255 }, { "epoch": 0.15454271053425897, "grad_norm": 0.06614769995212555, "learning_rate": 0.00019559983395599836, "loss": 1.2733, "step": 256 }, { "epoch": 0.15514639299728342, "grad_norm": 0.14129576086997986, "learning_rate": 0.00019555832295558323, "loss": 0.9814, "step": 257 }, { "epoch": 0.1557500754603079, "grad_norm": 0.061934180557727814, "learning_rate": 0.00019551681195516813, "loss": 0.8052, "step": 258 }, { "epoch": 0.15635375792333234, "grad_norm": 0.09368140995502472, "learning_rate": 0.000195475300954753, "loss": 0.9107, "step": 259 }, { "epoch": 0.15695744038635678, "grad_norm": 0.066087506711483, "learning_rate": 0.0001954337899543379, "loss": 0.7128, "step": 260 }, { "epoch": 0.15756112284938123, "grad_norm": 0.06857267022132874, "learning_rate": 0.0001953922789539228, "loss": 0.8081, "step": 261 }, { "epoch": 0.15816480531240568, "grad_norm": 0.06954636424779892, "learning_rate": 0.00019535076795350768, "loss": 0.8834, "step": 262 }, { "epoch": 0.15876848777543012, "grad_norm": 0.07501845061779022, "learning_rate": 0.0001953092569530926, "loss": 0.7618, "step": 263 }, { "epoch": 0.15937217023845457, "grad_norm": 0.06725212186574936, "learning_rate": 0.00019526774595267748, "loss": 0.852, "step": 264 }, { "epoch": 0.15997585270147902, "grad_norm": 0.06601674854755402, "learning_rate": 0.00019522623495226235, "loss": 0.8241, "step": 265 }, { "epoch": 0.16057953516450346, "grad_norm": 0.0759337842464447, "learning_rate": 0.00019518472395184725, "loss": 1.1918, "step": 266 }, { "epoch": 0.1611832176275279, "grad_norm": 0.0703561007976532, "learning_rate": 0.00019514321295143215, "loss": 0.8613, "step": 267 }, { "epoch": 0.16178690009055238, "grad_norm": 0.07018351554870605, "learning_rate": 0.00019510170195101702, "loss": 1.0508, "step": 268 }, { "epoch": 0.16239058255357683, "grad_norm": 0.06850980967283249, "learning_rate": 0.00019506019095060192, "loss": 0.9527, "step": 269 }, { "epoch": 0.16299426501660128, "grad_norm": 0.06927502900362015, "learning_rate": 0.00019501867995018682, "loss": 0.9985, "step": 270 }, { "epoch": 0.16359794747962572, "grad_norm": 0.07486286759376526, "learning_rate": 0.0001949771689497717, "loss": 0.9889, "step": 271 }, { "epoch": 0.16420162994265017, "grad_norm": 0.06840648502111435, "learning_rate": 0.0001949356579493566, "loss": 0.902, "step": 272 }, { "epoch": 0.16480531240567461, "grad_norm": 0.06660640239715576, "learning_rate": 0.00019489414694894147, "loss": 0.9774, "step": 273 }, { "epoch": 0.16540899486869906, "grad_norm": 0.06089378520846367, "learning_rate": 0.00019485263594852637, "loss": 1.0334, "step": 274 }, { "epoch": 0.1660126773317235, "grad_norm": 0.06978226453065872, "learning_rate": 0.00019481112494811127, "loss": 0.8932, "step": 275 }, { "epoch": 0.16661635979474795, "grad_norm": 0.06877858191728592, "learning_rate": 0.00019476961394769614, "loss": 0.91, "step": 276 }, { "epoch": 0.1672200422577724, "grad_norm": 0.06825589388608932, "learning_rate": 0.00019472810294728104, "loss": 0.9188, "step": 277 }, { "epoch": 0.16782372472079687, "grad_norm": 0.06272375583648682, "learning_rate": 0.00019468659194686594, "loss": 0.8665, "step": 278 }, { "epoch": 0.16842740718382132, "grad_norm": 0.07434337586164474, "learning_rate": 0.0001946450809464508, "loss": 0.9386, "step": 279 }, { "epoch": 0.16903108964684577, "grad_norm": 0.062427766621112823, "learning_rate": 0.0001946035699460357, "loss": 0.8038, "step": 280 }, { "epoch": 0.1696347721098702, "grad_norm": 0.07670903950929642, "learning_rate": 0.0001945620589456206, "loss": 0.8494, "step": 281 }, { "epoch": 0.17023845457289466, "grad_norm": 0.06715535372495651, "learning_rate": 0.00019452054794520549, "loss": 1.0171, "step": 282 }, { "epoch": 0.1708421370359191, "grad_norm": 0.0629987046122551, "learning_rate": 0.00019447903694479036, "loss": 0.8715, "step": 283 }, { "epoch": 0.17144581949894355, "grad_norm": 0.0751439705491066, "learning_rate": 0.00019443752594437529, "loss": 0.8137, "step": 284 }, { "epoch": 0.172049501961968, "grad_norm": 0.07195259630680084, "learning_rate": 0.00019439601494396016, "loss": 0.8276, "step": 285 }, { "epoch": 0.17265318442499245, "grad_norm": 0.06936553120613098, "learning_rate": 0.00019435450394354503, "loss": 0.7662, "step": 286 }, { "epoch": 0.1732568668880169, "grad_norm": 0.07795591652393341, "learning_rate": 0.00019431299294312993, "loss": 0.7729, "step": 287 }, { "epoch": 0.17386054935104137, "grad_norm": 0.07397985458374023, "learning_rate": 0.00019427148194271483, "loss": 0.7924, "step": 288 }, { "epoch": 0.1744642318140658, "grad_norm": 0.08324669301509857, "learning_rate": 0.0001942299709422997, "loss": 0.8163, "step": 289 }, { "epoch": 0.17506791427709026, "grad_norm": 0.08435431867837906, "learning_rate": 0.0001941884599418846, "loss": 0.7395, "step": 290 }, { "epoch": 0.1756715967401147, "grad_norm": 0.08156009763479233, "learning_rate": 0.0001941469489414695, "loss": 0.7299, "step": 291 }, { "epoch": 0.17627527920313915, "grad_norm": 0.08714080601930618, "learning_rate": 0.00019410543794105438, "loss": 0.7918, "step": 292 }, { "epoch": 0.1768789616661636, "grad_norm": 0.09345049411058426, "learning_rate": 0.00019406392694063928, "loss": 0.7841, "step": 293 }, { "epoch": 0.17748264412918804, "grad_norm": 0.09327434748411179, "learning_rate": 0.00019402241594022418, "loss": 0.7073, "step": 294 }, { "epoch": 0.1780863265922125, "grad_norm": 0.09284470230340958, "learning_rate": 0.00019398090493980905, "loss": 0.6427, "step": 295 }, { "epoch": 0.17869000905523694, "grad_norm": 0.09828530997037888, "learning_rate": 0.00019393939393939395, "loss": 0.6953, "step": 296 }, { "epoch": 0.17929369151826138, "grad_norm": 0.10501807928085327, "learning_rate": 0.00019389788293897882, "loss": 0.7286, "step": 297 }, { "epoch": 0.17989737398128586, "grad_norm": 0.14845532178878784, "learning_rate": 0.00019385637193856372, "loss": 0.6602, "step": 298 }, { "epoch": 0.1805010564443103, "grad_norm": 0.1208406463265419, "learning_rate": 0.00019381486093814862, "loss": 0.5862, "step": 299 }, { "epoch": 0.18110473890733475, "grad_norm": 0.127839133143425, "learning_rate": 0.0001937733499377335, "loss": 0.4804, "step": 300 }, { "epoch": 0.1817084213703592, "grad_norm": 0.07252588123083115, "learning_rate": 0.0001937318389373184, "loss": 0.8918, "step": 301 }, { "epoch": 0.18231210383338364, "grad_norm": 0.07854969054460526, "learning_rate": 0.0001936903279369033, "loss": 0.9295, "step": 302 }, { "epoch": 0.1829157862964081, "grad_norm": 0.07926355302333832, "learning_rate": 0.00019364881693648817, "loss": 1.0768, "step": 303 }, { "epoch": 0.18351946875943254, "grad_norm": 0.1117272675037384, "learning_rate": 0.00019360730593607307, "loss": 0.89, "step": 304 }, { "epoch": 0.18412315122245698, "grad_norm": 0.09249380975961685, "learning_rate": 0.00019356579493565797, "loss": 0.9915, "step": 305 }, { "epoch": 0.18472683368548143, "grad_norm": 0.0759844034910202, "learning_rate": 0.00019352428393524284, "loss": 0.9705, "step": 306 }, { "epoch": 0.18533051614850587, "grad_norm": 0.06819566339254379, "learning_rate": 0.00019348277293482774, "loss": 0.9801, "step": 307 }, { "epoch": 0.18593419861153032, "grad_norm": 0.07146774232387543, "learning_rate": 0.00019344126193441264, "loss": 0.8923, "step": 308 }, { "epoch": 0.1865378810745548, "grad_norm": 0.06504259258508682, "learning_rate": 0.00019339975093399752, "loss": 0.8941, "step": 309 }, { "epoch": 0.18714156353757924, "grad_norm": 0.06857950240373611, "learning_rate": 0.00019335823993358242, "loss": 0.9558, "step": 310 }, { "epoch": 0.1877452460006037, "grad_norm": 0.06270836293697357, "learning_rate": 0.0001933167289331673, "loss": 0.9532, "step": 311 }, { "epoch": 0.18834892846362813, "grad_norm": 0.13885007798671722, "learning_rate": 0.0001932752179327522, "loss": 0.8591, "step": 312 }, { "epoch": 0.18895261092665258, "grad_norm": 0.07498139888048172, "learning_rate": 0.0001932337069323371, "loss": 0.9812, "step": 313 }, { "epoch": 0.18955629338967703, "grad_norm": 0.0773036777973175, "learning_rate": 0.00019319219593192196, "loss": 0.8807, "step": 314 }, { "epoch": 0.19015997585270147, "grad_norm": 0.07549306750297546, "learning_rate": 0.00019315068493150686, "loss": 1.0929, "step": 315 }, { "epoch": 0.19076365831572592, "grad_norm": 0.06931062042713165, "learning_rate": 0.00019310917393109176, "loss": 0.9293, "step": 316 }, { "epoch": 0.19136734077875037, "grad_norm": 0.06612774729728699, "learning_rate": 0.00019306766293067664, "loss": 1.1528, "step": 317 }, { "epoch": 0.1919710232417748, "grad_norm": 0.07355131208896637, "learning_rate": 0.00019302615193026154, "loss": 1.4277, "step": 318 }, { "epoch": 0.1925747057047993, "grad_norm": 0.07465571165084839, "learning_rate": 0.00019298464092984644, "loss": 0.8799, "step": 319 }, { "epoch": 0.19317838816782373, "grad_norm": 0.07097172737121582, "learning_rate": 0.0001929431299294313, "loss": 0.7634, "step": 320 }, { "epoch": 0.19378207063084818, "grad_norm": 0.07077208161354065, "learning_rate": 0.00019290161892901618, "loss": 0.9721, "step": 321 }, { "epoch": 0.19438575309387263, "grad_norm": 0.06071571260690689, "learning_rate": 0.0001928601079286011, "loss": 0.8238, "step": 322 }, { "epoch": 0.19498943555689707, "grad_norm": 0.07852339744567871, "learning_rate": 0.00019281859692818598, "loss": 0.7897, "step": 323 }, { "epoch": 0.19559311801992152, "grad_norm": 0.06762294471263885, "learning_rate": 0.00019277708592777085, "loss": 0.9692, "step": 324 }, { "epoch": 0.19619680048294597, "grad_norm": 0.06589538604021072, "learning_rate": 0.00019273557492735575, "loss": 0.8701, "step": 325 }, { "epoch": 0.1968004829459704, "grad_norm": 0.07512197643518448, "learning_rate": 0.00019269406392694065, "loss": 0.8921, "step": 326 }, { "epoch": 0.19740416540899486, "grad_norm": 0.06501419097185135, "learning_rate": 0.00019265255292652553, "loss": 0.8559, "step": 327 }, { "epoch": 0.1980078478720193, "grad_norm": 0.06205839663743973, "learning_rate": 0.00019261104192611043, "loss": 0.9534, "step": 328 }, { "epoch": 0.19861153033504378, "grad_norm": 0.06113965064287186, "learning_rate": 0.00019256953092569533, "loss": 0.7965, "step": 329 }, { "epoch": 0.19921521279806823, "grad_norm": 0.06127781420946121, "learning_rate": 0.0001925280199252802, "loss": 0.8429, "step": 330 }, { "epoch": 0.19981889526109267, "grad_norm": 0.07560814172029495, "learning_rate": 0.0001924865089248651, "loss": 0.8128, "step": 331 }, { "epoch": 0.20042257772411712, "grad_norm": 0.06620073318481445, "learning_rate": 0.00019244499792445, "loss": 0.958, "step": 332 }, { "epoch": 0.20102626018714156, "grad_norm": 0.0671965479850769, "learning_rate": 0.00019240348692403487, "loss": 0.8134, "step": 333 }, { "epoch": 0.201629942650166, "grad_norm": 0.0731448382139206, "learning_rate": 0.00019236197592361977, "loss": 0.7264, "step": 334 }, { "epoch": 0.20223362511319046, "grad_norm": 0.06810946017503738, "learning_rate": 0.00019232046492320465, "loss": 0.7509, "step": 335 }, { "epoch": 0.2028373075762149, "grad_norm": 0.08012348413467407, "learning_rate": 0.00019227895392278955, "loss": 0.8237, "step": 336 }, { "epoch": 0.20344099003923935, "grad_norm": 0.07328704744577408, "learning_rate": 0.00019223744292237445, "loss": 0.7719, "step": 337 }, { "epoch": 0.2040446725022638, "grad_norm": 0.07838919013738632, "learning_rate": 0.00019219593192195932, "loss": 0.8404, "step": 338 }, { "epoch": 0.20464835496528827, "grad_norm": 0.07977066934108734, "learning_rate": 0.00019215442092154422, "loss": 0.7614, "step": 339 }, { "epoch": 0.20525203742831272, "grad_norm": 0.07425908744335175, "learning_rate": 0.00019211290992112912, "loss": 0.7439, "step": 340 }, { "epoch": 0.20585571989133716, "grad_norm": 0.08555571734905243, "learning_rate": 0.000192071398920714, "loss": 0.7711, "step": 341 }, { "epoch": 0.2064594023543616, "grad_norm": 0.0798811987042427, "learning_rate": 0.0001920298879202989, "loss": 0.7246, "step": 342 }, { "epoch": 0.20706308481738606, "grad_norm": 0.09403866529464722, "learning_rate": 0.0001919883769198838, "loss": 0.7171, "step": 343 }, { "epoch": 0.2076667672804105, "grad_norm": 0.09114021807909012, "learning_rate": 0.00019194686591946866, "loss": 0.7605, "step": 344 }, { "epoch": 0.20827044974343495, "grad_norm": 0.1013076901435852, "learning_rate": 0.00019190535491905354, "loss": 0.7583, "step": 345 }, { "epoch": 0.2088741322064594, "grad_norm": 0.10122162848711014, "learning_rate": 0.00019186384391863847, "loss": 0.7477, "step": 346 }, { "epoch": 0.20947781466948384, "grad_norm": 0.09927545487880707, "learning_rate": 0.00019182233291822334, "loss": 0.6728, "step": 347 }, { "epoch": 0.2100814971325083, "grad_norm": 0.12509341537952423, "learning_rate": 0.0001917808219178082, "loss": 0.6543, "step": 348 }, { "epoch": 0.21068517959553276, "grad_norm": 0.11812290549278259, "learning_rate": 0.0001917393109173931, "loss": 0.6701, "step": 349 }, { "epoch": 0.2112888620585572, "grad_norm": 0.12166016548871994, "learning_rate": 0.000191697799916978, "loss": 0.5261, "step": 350 }, { "epoch": 0.21189254452158165, "grad_norm": 0.0700993686914444, "learning_rate": 0.0001916562889165629, "loss": 1.2022, "step": 351 }, { "epoch": 0.2124962269846061, "grad_norm": 0.08300930261611938, "learning_rate": 0.00019161477791614778, "loss": 0.9249, "step": 352 }, { "epoch": 0.21309990944763055, "grad_norm": 0.06463124603033066, "learning_rate": 0.00019157326691573268, "loss": 0.8147, "step": 353 }, { "epoch": 0.213703591910655, "grad_norm": 0.06934931129217148, "learning_rate": 0.00019153175591531758, "loss": 0.9904, "step": 354 }, { "epoch": 0.21430727437367944, "grad_norm": 0.06555134057998657, "learning_rate": 0.00019149024491490246, "loss": 0.8825, "step": 355 }, { "epoch": 0.2149109568367039, "grad_norm": 0.07276731729507446, "learning_rate": 0.00019144873391448736, "loss": 1.0806, "step": 356 }, { "epoch": 0.21551463929972833, "grad_norm": 0.06886199861764908, "learning_rate": 0.00019140722291407226, "loss": 0.9756, "step": 357 }, { "epoch": 0.21611832176275278, "grad_norm": 0.07671529054641724, "learning_rate": 0.00019136571191365713, "loss": 0.9871, "step": 358 }, { "epoch": 0.21672200422577725, "grad_norm": 0.08558958023786545, "learning_rate": 0.000191324200913242, "loss": 0.9553, "step": 359 }, { "epoch": 0.2173256866888017, "grad_norm": 0.07093027234077454, "learning_rate": 0.00019128268991282693, "loss": 0.8247, "step": 360 }, { "epoch": 0.21792936915182615, "grad_norm": 0.07704441994428635, "learning_rate": 0.0001912411789124118, "loss": 1.0793, "step": 361 }, { "epoch": 0.2185330516148506, "grad_norm": 0.06594787538051605, "learning_rate": 0.00019119966791199668, "loss": 0.866, "step": 362 }, { "epoch": 0.21913673407787504, "grad_norm": 0.07074205577373505, "learning_rate": 0.00019115815691158158, "loss": 0.9088, "step": 363 }, { "epoch": 0.21974041654089949, "grad_norm": 0.06734868884086609, "learning_rate": 0.00019111664591116648, "loss": 0.8861, "step": 364 }, { "epoch": 0.22034409900392393, "grad_norm": 0.07831276208162308, "learning_rate": 0.00019107513491075135, "loss": 0.939, "step": 365 }, { "epoch": 0.22094778146694838, "grad_norm": 0.06146768108010292, "learning_rate": 0.00019103362391033625, "loss": 0.9476, "step": 366 }, { "epoch": 0.22155146392997283, "grad_norm": 0.062476493418216705, "learning_rate": 0.00019099211290992115, "loss": 0.821, "step": 367 }, { "epoch": 0.22215514639299727, "grad_norm": 0.07490051537752151, "learning_rate": 0.00019095060190950602, "loss": 0.9016, "step": 368 }, { "epoch": 0.22275882885602175, "grad_norm": 0.07467661798000336, "learning_rate": 0.00019090909090909092, "loss": 0.786, "step": 369 }, { "epoch": 0.2233625113190462, "grad_norm": 0.0667489618062973, "learning_rate": 0.00019086757990867582, "loss": 0.7985, "step": 370 }, { "epoch": 0.22396619378207064, "grad_norm": 0.06847196817398071, "learning_rate": 0.0001908260689082607, "loss": 0.7912, "step": 371 }, { "epoch": 0.22456987624509508, "grad_norm": 0.06785493344068527, "learning_rate": 0.0001907845579078456, "loss": 0.9787, "step": 372 }, { "epoch": 0.22517355870811953, "grad_norm": 0.06378093361854553, "learning_rate": 0.00019074304690743047, "loss": 1.0737, "step": 373 }, { "epoch": 0.22577724117114398, "grad_norm": 0.07684706896543503, "learning_rate": 0.00019070153590701537, "loss": 0.8541, "step": 374 }, { "epoch": 0.22638092363416842, "grad_norm": 0.06430470943450928, "learning_rate": 0.00019066002490660027, "loss": 0.8404, "step": 375 }, { "epoch": 0.22698460609719287, "grad_norm": 0.062282588332891464, "learning_rate": 0.00019061851390618514, "loss": 0.8361, "step": 376 }, { "epoch": 0.22758828856021732, "grad_norm": 0.06882327795028687, "learning_rate": 0.00019057700290577004, "loss": 0.7329, "step": 377 }, { "epoch": 0.22819197102324176, "grad_norm": 0.06568577885627747, "learning_rate": 0.00019053549190535494, "loss": 0.8607, "step": 378 }, { "epoch": 0.22879565348626624, "grad_norm": 0.07336780428886414, "learning_rate": 0.00019049398090493981, "loss": 0.8122, "step": 379 }, { "epoch": 0.22939933594929068, "grad_norm": 0.06682398915290833, "learning_rate": 0.0001904524699045247, "loss": 0.8005, "step": 380 }, { "epoch": 0.23000301841231513, "grad_norm": 0.07804804295301437, "learning_rate": 0.00019041095890410961, "loss": 0.8991, "step": 381 }, { "epoch": 0.23060670087533958, "grad_norm": 0.07914794981479645, "learning_rate": 0.0001903694479036945, "loss": 0.7798, "step": 382 }, { "epoch": 0.23121038333836402, "grad_norm": 0.10398049652576447, "learning_rate": 0.00019032793690327936, "loss": 0.7993, "step": 383 }, { "epoch": 0.23181406580138847, "grad_norm": 0.0717068761587143, "learning_rate": 0.0001902864259028643, "loss": 0.7845, "step": 384 }, { "epoch": 0.23241774826441292, "grad_norm": 0.07129625976085663, "learning_rate": 0.00019024491490244916, "loss": 0.7409, "step": 385 }, { "epoch": 0.23302143072743736, "grad_norm": 0.07239419221878052, "learning_rate": 0.00019020340390203403, "loss": 0.7352, "step": 386 }, { "epoch": 0.2336251131904618, "grad_norm": 0.07583223283290863, "learning_rate": 0.00019016189290161893, "loss": 0.8076, "step": 387 }, { "epoch": 0.23422879565348625, "grad_norm": 0.07693106681108475, "learning_rate": 0.00019012038190120383, "loss": 0.7828, "step": 388 }, { "epoch": 0.23483247811651073, "grad_norm": 0.08271142840385437, "learning_rate": 0.0001900788709007887, "loss": 0.7281, "step": 389 }, { "epoch": 0.23543616057953518, "grad_norm": 0.08440615981817245, "learning_rate": 0.0001900373599003736, "loss": 0.7776, "step": 390 }, { "epoch": 0.23603984304255962, "grad_norm": 0.08483248949050903, "learning_rate": 0.0001899958488999585, "loss": 0.7142, "step": 391 }, { "epoch": 0.23664352550558407, "grad_norm": 0.10398207604885101, "learning_rate": 0.00018995433789954338, "loss": 0.7971, "step": 392 }, { "epoch": 0.23724720796860851, "grad_norm": 0.09310369938611984, "learning_rate": 0.00018991282689912828, "loss": 0.7611, "step": 393 }, { "epoch": 0.23785089043163296, "grad_norm": 0.09080198407173157, "learning_rate": 0.00018987131589871315, "loss": 0.7963, "step": 394 }, { "epoch": 0.2384545728946574, "grad_norm": 0.09882273524999619, "learning_rate": 0.00018982980489829805, "loss": 0.7826, "step": 395 }, { "epoch": 0.23905825535768185, "grad_norm": 0.11651689559221268, "learning_rate": 0.00018978829389788295, "loss": 0.7177, "step": 396 }, { "epoch": 0.2396619378207063, "grad_norm": 0.10827039927244186, "learning_rate": 0.00018974678289746782, "loss": 0.6913, "step": 397 }, { "epoch": 0.24026562028373075, "grad_norm": 0.10637848824262619, "learning_rate": 0.00018970527189705275, "loss": 0.5721, "step": 398 }, { "epoch": 0.24086930274675522, "grad_norm": 0.11688965559005737, "learning_rate": 0.00018966376089663762, "loss": 0.5603, "step": 399 }, { "epoch": 0.24147298520977967, "grad_norm": 0.12776370346546173, "learning_rate": 0.0001896222498962225, "loss": 0.4937, "step": 400 }, { "epoch": 0.2420766676728041, "grad_norm": 0.06769760698080063, "learning_rate": 0.0001895807388958074, "loss": 0.8702, "step": 401 }, { "epoch": 0.24268035013582856, "grad_norm": 0.07302338629961014, "learning_rate": 0.0001895392278953923, "loss": 0.8243, "step": 402 }, { "epoch": 0.243284032598853, "grad_norm": 0.07331572473049164, "learning_rate": 0.00018949771689497717, "loss": 1.1457, "step": 403 }, { "epoch": 0.24388771506187745, "grad_norm": 0.07224598526954651, "learning_rate": 0.00018945620589456207, "loss": 1.0838, "step": 404 }, { "epoch": 0.2444913975249019, "grad_norm": 0.06487125158309937, "learning_rate": 0.00018941469489414697, "loss": 0.8927, "step": 405 }, { "epoch": 0.24509507998792635, "grad_norm": 0.06399485468864441, "learning_rate": 0.00018937318389373184, "loss": 0.7973, "step": 406 }, { "epoch": 0.2456987624509508, "grad_norm": 0.08675524592399597, "learning_rate": 0.00018933167289331674, "loss": 0.9525, "step": 407 }, { "epoch": 0.24630244491397524, "grad_norm": 0.0663706362247467, "learning_rate": 0.00018929016189290162, "loss": 0.7668, "step": 408 }, { "epoch": 0.24690612737699968, "grad_norm": 0.09714970737695694, "learning_rate": 0.00018924865089248652, "loss": 0.783, "step": 409 }, { "epoch": 0.24750980984002416, "grad_norm": 0.06401702761650085, "learning_rate": 0.00018920713989207142, "loss": 1.0896, "step": 410 }, { "epoch": 0.2481134923030486, "grad_norm": 0.07394849509000778, "learning_rate": 0.0001891656288916563, "loss": 0.79, "step": 411 }, { "epoch": 0.24871717476607305, "grad_norm": 0.1234453096985817, "learning_rate": 0.0001891241178912412, "loss": 0.841, "step": 412 }, { "epoch": 0.2493208572290975, "grad_norm": 0.06933286041021347, "learning_rate": 0.0001890826068908261, "loss": 1.1143, "step": 413 }, { "epoch": 0.24992453969212194, "grad_norm": 0.07103940844535828, "learning_rate": 0.00018904109589041096, "loss": 0.8918, "step": 414 }, { "epoch": 0.2505282221551464, "grad_norm": 0.11104138940572739, "learning_rate": 0.00018899958488999586, "loss": 0.922, "step": 415 }, { "epoch": 0.25113190461817086, "grad_norm": 0.06877300888299942, "learning_rate": 0.00018895807388958076, "loss": 0.9959, "step": 416 }, { "epoch": 0.2517355870811953, "grad_norm": 0.07087550312280655, "learning_rate": 0.00018891656288916564, "loss": 0.8376, "step": 417 }, { "epoch": 0.25233926954421976, "grad_norm": 0.08183194696903229, "learning_rate": 0.0001888750518887505, "loss": 0.8378, "step": 418 }, { "epoch": 0.2529429520072442, "grad_norm": 0.06973780691623688, "learning_rate": 0.00018883354088833544, "loss": 0.8195, "step": 419 }, { "epoch": 0.25354663447026865, "grad_norm": 0.07654432952404022, "learning_rate": 0.0001887920298879203, "loss": 0.8851, "step": 420 }, { "epoch": 0.2541503169332931, "grad_norm": 0.0699959322810173, "learning_rate": 0.00018875051888750518, "loss": 1.0005, "step": 421 }, { "epoch": 0.25475399939631754, "grad_norm": 0.06692960858345032, "learning_rate": 0.00018870900788709008, "loss": 0.8751, "step": 422 }, { "epoch": 0.255357681859342, "grad_norm": 0.06952139735221863, "learning_rate": 0.00018866749688667498, "loss": 0.968, "step": 423 }, { "epoch": 0.25596136432236644, "grad_norm": 0.0676087811589241, "learning_rate": 0.00018862598588625985, "loss": 0.9608, "step": 424 }, { "epoch": 0.2565650467853909, "grad_norm": 0.07644575089216232, "learning_rate": 0.00018858447488584475, "loss": 0.8708, "step": 425 }, { "epoch": 0.25716872924841533, "grad_norm": 0.0646698921918869, "learning_rate": 0.00018854296388542965, "loss": 0.7413, "step": 426 }, { "epoch": 0.2577724117114398, "grad_norm": 0.06822269409894943, "learning_rate": 0.00018850145288501453, "loss": 0.8585, "step": 427 }, { "epoch": 0.2583760941744642, "grad_norm": 0.23585358262062073, "learning_rate": 0.00018845994188459943, "loss": 0.8879, "step": 428 }, { "epoch": 0.25897977663748867, "grad_norm": 0.07148776948451996, "learning_rate": 0.00018841843088418433, "loss": 0.9396, "step": 429 }, { "epoch": 0.2595834591005131, "grad_norm": 0.07239923626184464, "learning_rate": 0.0001883769198837692, "loss": 0.9303, "step": 430 }, { "epoch": 0.26018714156353756, "grad_norm": 0.06573071330785751, "learning_rate": 0.0001883354088833541, "loss": 0.866, "step": 431 }, { "epoch": 0.260790824026562, "grad_norm": 0.06852304190397263, "learning_rate": 0.00018829389788293897, "loss": 1.0774, "step": 432 }, { "epoch": 0.26139450648958645, "grad_norm": 0.06725315749645233, "learning_rate": 0.00018825238688252387, "loss": 0.9174, "step": 433 }, { "epoch": 0.2619981889526109, "grad_norm": 0.07200594246387482, "learning_rate": 0.00018821087588210877, "loss": 0.7667, "step": 434 }, { "epoch": 0.2626018714156354, "grad_norm": 0.06961148232221603, "learning_rate": 0.00018816936488169365, "loss": 0.7284, "step": 435 }, { "epoch": 0.26320555387865985, "grad_norm": 0.07602632790803909, "learning_rate": 0.00018812785388127855, "loss": 0.8159, "step": 436 }, { "epoch": 0.2638092363416843, "grad_norm": 0.0729806199669838, "learning_rate": 0.00018808634288086345, "loss": 0.7094, "step": 437 }, { "epoch": 0.26441291880470874, "grad_norm": 0.07881097495555878, "learning_rate": 0.00018804483188044832, "loss": 0.7488, "step": 438 }, { "epoch": 0.2650166012677332, "grad_norm": 0.08327119052410126, "learning_rate": 0.00018800332088003322, "loss": 0.7672, "step": 439 }, { "epoch": 0.26562028373075763, "grad_norm": 0.08234728127717972, "learning_rate": 0.00018796180987961812, "loss": 0.7644, "step": 440 }, { "epoch": 0.2662239661937821, "grad_norm": 0.09000684320926666, "learning_rate": 0.000187920298879203, "loss": 0.793, "step": 441 }, { "epoch": 0.2668276486568065, "grad_norm": 0.07901246100664139, "learning_rate": 0.0001878787878787879, "loss": 0.7346, "step": 442 }, { "epoch": 0.267431331119831, "grad_norm": 0.09163561463356018, "learning_rate": 0.0001878372768783728, "loss": 0.762, "step": 443 }, { "epoch": 0.2680350135828554, "grad_norm": 0.09499283879995346, "learning_rate": 0.00018779576587795767, "loss": 0.7669, "step": 444 }, { "epoch": 0.26863869604587987, "grad_norm": 0.1023356169462204, "learning_rate": 0.00018775425487754257, "loss": 0.7431, "step": 445 }, { "epoch": 0.2692423785089043, "grad_norm": 0.10197634249925613, "learning_rate": 0.00018771274387712744, "loss": 0.7059, "step": 446 }, { "epoch": 0.26984606097192876, "grad_norm": 0.09975861012935638, "learning_rate": 0.00018767123287671234, "loss": 0.658, "step": 447 }, { "epoch": 0.2704497434349532, "grad_norm": 0.11030542105436325, "learning_rate": 0.00018762972187629724, "loss": 0.6281, "step": 448 }, { "epoch": 0.27105342589797765, "grad_norm": 0.11011619865894318, "learning_rate": 0.0001875882108758821, "loss": 0.5196, "step": 449 }, { "epoch": 0.2716571083610021, "grad_norm": 0.12410090863704681, "learning_rate": 0.000187546699875467, "loss": 0.4871, "step": 450 }, { "epoch": 0.27226079082402654, "grad_norm": 0.06774574518203735, "learning_rate": 0.0001875051888750519, "loss": 0.7875, "step": 451 }, { "epoch": 0.272864473287051, "grad_norm": 0.06862092763185501, "learning_rate": 0.00018746367787463678, "loss": 0.8497, "step": 452 }, { "epoch": 0.27346815575007544, "grad_norm": 0.07752003520727158, "learning_rate": 0.00018742216687422168, "loss": 0.7635, "step": 453 }, { "epoch": 0.2740718382130999, "grad_norm": 0.138156920671463, "learning_rate": 0.00018738065587380658, "loss": 0.8682, "step": 454 }, { "epoch": 0.2746755206761244, "grad_norm": 0.07247649133205414, "learning_rate": 0.00018733914487339146, "loss": 0.9772, "step": 455 }, { "epoch": 0.27527920313914883, "grad_norm": 0.06809130311012268, "learning_rate": 0.00018729763387297633, "loss": 0.9594, "step": 456 }, { "epoch": 0.2758828856021733, "grad_norm": 0.08479005098342896, "learning_rate": 0.00018725612287256126, "loss": 1.0778, "step": 457 }, { "epoch": 0.2764865680651977, "grad_norm": 0.06665640324354172, "learning_rate": 0.00018721461187214613, "loss": 0.9377, "step": 458 }, { "epoch": 0.27709025052822217, "grad_norm": 0.06898784637451172, "learning_rate": 0.000187173100871731, "loss": 0.8999, "step": 459 }, { "epoch": 0.2776939329912466, "grad_norm": 0.06468936055898666, "learning_rate": 0.0001871315898713159, "loss": 0.9559, "step": 460 }, { "epoch": 0.27829761545427106, "grad_norm": 0.0695393905043602, "learning_rate": 0.0001870900788709008, "loss": 0.8692, "step": 461 }, { "epoch": 0.2789012979172955, "grad_norm": 0.10467389971017838, "learning_rate": 0.00018704856787048568, "loss": 0.8858, "step": 462 }, { "epoch": 0.27950498038031996, "grad_norm": 0.06943607330322266, "learning_rate": 0.00018700705687007058, "loss": 1.0079, "step": 463 }, { "epoch": 0.2801086628433444, "grad_norm": 0.07984504848718643, "learning_rate": 0.00018696554586965548, "loss": 0.8257, "step": 464 }, { "epoch": 0.28071234530636885, "grad_norm": 0.06335430592298508, "learning_rate": 0.00018692403486924035, "loss": 1.1628, "step": 465 }, { "epoch": 0.2813160277693933, "grad_norm": 0.06936845183372498, "learning_rate": 0.00018688252386882525, "loss": 0.8205, "step": 466 }, { "epoch": 0.28191971023241774, "grad_norm": 0.0703270360827446, "learning_rate": 0.00018684101286841015, "loss": 1.053, "step": 467 }, { "epoch": 0.2825233926954422, "grad_norm": 0.06945156306028366, "learning_rate": 0.00018679950186799502, "loss": 0.8785, "step": 468 }, { "epoch": 0.28312707515846663, "grad_norm": 0.08337710797786713, "learning_rate": 0.00018675799086757992, "loss": 1.0398, "step": 469 }, { "epoch": 0.2837307576214911, "grad_norm": 0.07238437235355377, "learning_rate": 0.0001867164798671648, "loss": 0.9231, "step": 470 }, { "epoch": 0.2843344400845155, "grad_norm": 0.0756809413433075, "learning_rate": 0.0001866749688667497, "loss": 1.1661, "step": 471 }, { "epoch": 0.28493812254754, "grad_norm": 0.06537111848592758, "learning_rate": 0.0001866334578663346, "loss": 0.7645, "step": 472 }, { "epoch": 0.2855418050105644, "grad_norm": 0.07265693694353104, "learning_rate": 0.00018659194686591947, "loss": 1.1106, "step": 473 }, { "epoch": 0.28614548747358887, "grad_norm": 0.06632302701473236, "learning_rate": 0.00018655043586550437, "loss": 0.8943, "step": 474 }, { "epoch": 0.28674916993661337, "grad_norm": 0.06948743760585785, "learning_rate": 0.00018650892486508927, "loss": 1.0051, "step": 475 }, { "epoch": 0.2873528523996378, "grad_norm": 0.07059241831302643, "learning_rate": 0.00018646741386467414, "loss": 1.0785, "step": 476 }, { "epoch": 0.28795653486266226, "grad_norm": 0.06578975915908813, "learning_rate": 0.00018642590286425901, "loss": 0.8156, "step": 477 }, { "epoch": 0.2885602173256867, "grad_norm": 0.06178348883986473, "learning_rate": 0.00018638439186384394, "loss": 0.8512, "step": 478 }, { "epoch": 0.28916389978871115, "grad_norm": 0.06867639720439911, "learning_rate": 0.00018634288086342881, "loss": 0.8549, "step": 479 }, { "epoch": 0.2897675822517356, "grad_norm": 0.07352021336555481, "learning_rate": 0.0001863013698630137, "loss": 0.7585, "step": 480 }, { "epoch": 0.29037126471476005, "grad_norm": 0.07471666485071182, "learning_rate": 0.00018625985886259861, "loss": 0.9468, "step": 481 }, { "epoch": 0.2909749471777845, "grad_norm": 0.06977469474077225, "learning_rate": 0.0001862183478621835, "loss": 0.7439, "step": 482 }, { "epoch": 0.29157862964080894, "grad_norm": 0.07040760666131973, "learning_rate": 0.00018617683686176836, "loss": 0.7331, "step": 483 }, { "epoch": 0.2921823121038334, "grad_norm": 0.07527907937765121, "learning_rate": 0.00018613532586135326, "loss": 0.8511, "step": 484 }, { "epoch": 0.29278599456685783, "grad_norm": 0.07468552142381668, "learning_rate": 0.00018609381486093816, "loss": 0.8146, "step": 485 }, { "epoch": 0.2933896770298823, "grad_norm": 0.0728183388710022, "learning_rate": 0.00018605230386052306, "loss": 0.8189, "step": 486 }, { "epoch": 0.2939933594929067, "grad_norm": 0.07946665585041046, "learning_rate": 0.00018601079286010793, "loss": 0.8002, "step": 487 }, { "epoch": 0.29459704195593117, "grad_norm": 0.08301442861557007, "learning_rate": 0.00018596928185969283, "loss": 0.76, "step": 488 }, { "epoch": 0.2952007244189556, "grad_norm": 0.07893984019756317, "learning_rate": 0.00018592777085927773, "loss": 0.7711, "step": 489 }, { "epoch": 0.29580440688198006, "grad_norm": 0.08230159431695938, "learning_rate": 0.0001858862598588626, "loss": 0.7041, "step": 490 }, { "epoch": 0.2964080893450045, "grad_norm": 0.09270324558019638, "learning_rate": 0.00018584474885844748, "loss": 0.7666, "step": 491 }, { "epoch": 0.29701177180802896, "grad_norm": 0.09311135858297348, "learning_rate": 0.0001858032378580324, "loss": 0.7878, "step": 492 }, { "epoch": 0.2976154542710534, "grad_norm": 0.09099919348955154, "learning_rate": 0.00018576172685761728, "loss": 0.7074, "step": 493 }, { "epoch": 0.29821913673407785, "grad_norm": 0.09507730603218079, "learning_rate": 0.00018572021585720215, "loss": 0.6735, "step": 494 }, { "epoch": 0.29882281919710235, "grad_norm": 0.09427333623170853, "learning_rate": 0.00018567870485678708, "loss": 0.7592, "step": 495 }, { "epoch": 0.2994265016601268, "grad_norm": 0.1038491502404213, "learning_rate": 0.00018563719385637195, "loss": 0.7356, "step": 496 }, { "epoch": 0.30003018412315124, "grad_norm": 0.1097760945558548, "learning_rate": 0.00018559568285595683, "loss": 0.6825, "step": 497 }, { "epoch": 0.3006338665861757, "grad_norm": 0.10912280529737473, "learning_rate": 0.00018555417185554173, "loss": 0.6431, "step": 498 }, { "epoch": 0.30123754904920014, "grad_norm": 0.1107010692358017, "learning_rate": 0.00018551266085512663, "loss": 0.5303, "step": 499 }, { "epoch": 0.3018412315122246, "grad_norm": 0.12602561712265015, "learning_rate": 0.0001854711498547115, "loss": 0.4541, "step": 500 }, { "epoch": 0.3018412315122246, "eval_loss": 0.8565130233764648, "eval_runtime": 1219.0924, "eval_samples_per_second": 2.289, "eval_steps_per_second": 0.286, "step": 500 }, { "epoch": 0.30244491397524903, "grad_norm": 0.11287853866815567, "learning_rate": 0.0001854296388542964, "loss": 0.8697, "step": 501 }, { "epoch": 0.3030485964382735, "grad_norm": 0.08270236849784851, "learning_rate": 0.0001853881278538813, "loss": 0.8881, "step": 502 }, { "epoch": 0.3036522789012979, "grad_norm": 0.07861533761024475, "learning_rate": 0.00018534661685346617, "loss": 0.8053, "step": 503 }, { "epoch": 0.30425596136432237, "grad_norm": 0.07589036971330643, "learning_rate": 0.00018530510585305107, "loss": 1.0165, "step": 504 }, { "epoch": 0.3048596438273468, "grad_norm": 0.07618770003318787, "learning_rate": 0.00018526359485263594, "loss": 1.0332, "step": 505 }, { "epoch": 0.30546332629037126, "grad_norm": 0.07502375543117523, "learning_rate": 0.00018522208385222084, "loss": 0.976, "step": 506 }, { "epoch": 0.3060670087533957, "grad_norm": 0.07355045527219772, "learning_rate": 0.00018518057285180574, "loss": 0.7887, "step": 507 }, { "epoch": 0.30667069121642015, "grad_norm": 0.08850781619548798, "learning_rate": 0.00018513906185139062, "loss": 1.0632, "step": 508 }, { "epoch": 0.3072743736794446, "grad_norm": 0.06971225887537003, "learning_rate": 0.00018509755085097552, "loss": 0.9073, "step": 509 }, { "epoch": 0.30787805614246905, "grad_norm": 0.16970930993556976, "learning_rate": 0.00018505603985056042, "loss": 0.9196, "step": 510 }, { "epoch": 0.3084817386054935, "grad_norm": 0.07109450548887253, "learning_rate": 0.0001850145288501453, "loss": 0.8324, "step": 511 }, { "epoch": 0.30908542106851794, "grad_norm": 0.06761594116687775, "learning_rate": 0.0001849730178497302, "loss": 0.8924, "step": 512 }, { "epoch": 0.3096891035315424, "grad_norm": 0.06660860776901245, "learning_rate": 0.0001849315068493151, "loss": 0.8244, "step": 513 }, { "epoch": 0.31029278599456683, "grad_norm": 0.08837427198886871, "learning_rate": 0.00018488999584889996, "loss": 0.8228, "step": 514 }, { "epoch": 0.3108964684575913, "grad_norm": 0.09773479402065277, "learning_rate": 0.00018484848484848484, "loss": 0.8686, "step": 515 }, { "epoch": 0.3115001509206158, "grad_norm": 0.07643985003232956, "learning_rate": 0.00018480697384806976, "loss": 0.8456, "step": 516 }, { "epoch": 0.31210383338364023, "grad_norm": 0.06785845011472702, "learning_rate": 0.00018476546284765464, "loss": 0.8381, "step": 517 }, { "epoch": 0.3127075158466647, "grad_norm": 0.0758255124092102, "learning_rate": 0.0001847239518472395, "loss": 0.8728, "step": 518 }, { "epoch": 0.3133111983096891, "grad_norm": 0.0750250369310379, "learning_rate": 0.0001846824408468244, "loss": 0.7163, "step": 519 }, { "epoch": 0.31391488077271357, "grad_norm": 0.06817881017923355, "learning_rate": 0.0001846409298464093, "loss": 0.7658, "step": 520 }, { "epoch": 0.314518563235738, "grad_norm": 0.06935089081525803, "learning_rate": 0.00018459941884599418, "loss": 0.8448, "step": 521 }, { "epoch": 0.31512224569876246, "grad_norm": 0.0738990381360054, "learning_rate": 0.00018455790784557908, "loss": 0.8026, "step": 522 }, { "epoch": 0.3157259281617869, "grad_norm": 0.07354303449392319, "learning_rate": 0.00018451639684516398, "loss": 0.8194, "step": 523 }, { "epoch": 0.31632961062481135, "grad_norm": 0.07702893018722534, "learning_rate": 0.00018447488584474886, "loss": 0.8923, "step": 524 }, { "epoch": 0.3169332930878358, "grad_norm": 0.09026607125997543, "learning_rate": 0.00018443337484433376, "loss": 0.9738, "step": 525 }, { "epoch": 0.31753697555086025, "grad_norm": 0.08617381006479263, "learning_rate": 0.00018439186384391866, "loss": 0.9765, "step": 526 }, { "epoch": 0.3181406580138847, "grad_norm": 0.07624173164367676, "learning_rate": 0.00018435035284350353, "loss": 1.0301, "step": 527 }, { "epoch": 0.31874434047690914, "grad_norm": 0.06759200245141983, "learning_rate": 0.00018430884184308843, "loss": 0.8237, "step": 528 }, { "epoch": 0.3193480229399336, "grad_norm": 0.07457785308361053, "learning_rate": 0.0001842673308426733, "loss": 0.7363, "step": 529 }, { "epoch": 0.31995170540295803, "grad_norm": 0.08332669734954834, "learning_rate": 0.00018422581984225823, "loss": 0.7471, "step": 530 }, { "epoch": 0.3205553878659825, "grad_norm": 0.0689413771033287, "learning_rate": 0.0001841843088418431, "loss": 0.8585, "step": 531 }, { "epoch": 0.3211590703290069, "grad_norm": 0.06793367117643356, "learning_rate": 0.00018414279784142797, "loss": 1.033, "step": 532 }, { "epoch": 0.32176275279203137, "grad_norm": 0.07640314847230911, "learning_rate": 0.00018410128684101287, "loss": 1.0033, "step": 533 }, { "epoch": 0.3223664352550558, "grad_norm": 0.07172773033380508, "learning_rate": 0.00018405977584059777, "loss": 0.7355, "step": 534 }, { "epoch": 0.32297011771808026, "grad_norm": 0.07412228733301163, "learning_rate": 0.00018401826484018265, "loss": 0.7548, "step": 535 }, { "epoch": 0.32357380018110476, "grad_norm": 0.07675584405660629, "learning_rate": 0.00018397675383976755, "loss": 0.7105, "step": 536 }, { "epoch": 0.3241774826441292, "grad_norm": 0.07256907969713211, "learning_rate": 0.00018393524283935245, "loss": 0.7383, "step": 537 }, { "epoch": 0.32478116510715366, "grad_norm": 0.08796676248311996, "learning_rate": 0.00018389373183893732, "loss": 0.7606, "step": 538 }, { "epoch": 0.3253848475701781, "grad_norm": 0.07517845928668976, "learning_rate": 0.00018385222083852222, "loss": 0.8343, "step": 539 }, { "epoch": 0.32598853003320255, "grad_norm": 0.08098675310611725, "learning_rate": 0.00018381070983810712, "loss": 0.7752, "step": 540 }, { "epoch": 0.326592212496227, "grad_norm": 0.08791480958461761, "learning_rate": 0.000183769198837692, "loss": 0.6954, "step": 541 }, { "epoch": 0.32719589495925144, "grad_norm": 0.09234142303466797, "learning_rate": 0.0001837276878372769, "loss": 0.7773, "step": 542 }, { "epoch": 0.3277995774222759, "grad_norm": 0.09258239716291428, "learning_rate": 0.00018368617683686177, "loss": 0.7787, "step": 543 }, { "epoch": 0.32840325988530034, "grad_norm": 0.09329955279827118, "learning_rate": 0.00018364466583644667, "loss": 0.6927, "step": 544 }, { "epoch": 0.3290069423483248, "grad_norm": 0.1046968549489975, "learning_rate": 0.00018360315483603157, "loss": 0.742, "step": 545 }, { "epoch": 0.32961062481134923, "grad_norm": 0.11193925142288208, "learning_rate": 0.00018356164383561644, "loss": 0.7062, "step": 546 }, { "epoch": 0.3302143072743737, "grad_norm": 0.10192117840051651, "learning_rate": 0.00018352013283520134, "loss": 0.6246, "step": 547 }, { "epoch": 0.3308179897373981, "grad_norm": 0.11968918889760971, "learning_rate": 0.00018347862183478624, "loss": 0.6263, "step": 548 }, { "epoch": 0.33142167220042257, "grad_norm": 0.11495087295770645, "learning_rate": 0.0001834371108343711, "loss": 0.5845, "step": 549 }, { "epoch": 0.332025354663447, "grad_norm": 0.1315176784992218, "learning_rate": 0.000183395599833956, "loss": 0.5137, "step": 550 }, { "epoch": 0.33262903712647146, "grad_norm": 0.07526993751525879, "learning_rate": 0.0001833540888335409, "loss": 0.8475, "step": 551 }, { "epoch": 0.3332327195894959, "grad_norm": 0.08781188726425171, "learning_rate": 0.00018331257783312579, "loss": 1.0003, "step": 552 }, { "epoch": 0.33383640205252035, "grad_norm": 0.07417803257703781, "learning_rate": 0.00018327106683271066, "loss": 0.9112, "step": 553 }, { "epoch": 0.3344400845155448, "grad_norm": 0.08806382119655609, "learning_rate": 0.00018322955583229559, "loss": 0.9675, "step": 554 }, { "epoch": 0.33504376697856925, "grad_norm": 0.08356613665819168, "learning_rate": 0.00018318804483188046, "loss": 0.808, "step": 555 }, { "epoch": 0.33564744944159375, "grad_norm": 0.08373204618692398, "learning_rate": 0.00018314653383146533, "loss": 1.1798, "step": 556 }, { "epoch": 0.3362511319046182, "grad_norm": 0.07093155384063721, "learning_rate": 0.00018310502283105023, "loss": 1.0393, "step": 557 }, { "epoch": 0.33685481436764264, "grad_norm": 0.11354014277458191, "learning_rate": 0.00018306351183063513, "loss": 0.8586, "step": 558 }, { "epoch": 0.3374584968306671, "grad_norm": 0.07332495599985123, "learning_rate": 0.00018302200083022, "loss": 0.6829, "step": 559 }, { "epoch": 0.33806217929369153, "grad_norm": 0.07165679335594177, "learning_rate": 0.0001829804898298049, "loss": 0.8619, "step": 560 }, { "epoch": 0.338665861756716, "grad_norm": 0.07097924500703812, "learning_rate": 0.0001829389788293898, "loss": 0.7529, "step": 561 }, { "epoch": 0.3392695442197404, "grad_norm": 0.06817946583032608, "learning_rate": 0.00018289746782897468, "loss": 0.9555, "step": 562 }, { "epoch": 0.3398732266827649, "grad_norm": 0.07513990998268127, "learning_rate": 0.00018285595682855958, "loss": 0.874, "step": 563 }, { "epoch": 0.3404769091457893, "grad_norm": 0.07172597199678421, "learning_rate": 0.00018281444582814448, "loss": 0.8608, "step": 564 }, { "epoch": 0.34108059160881377, "grad_norm": 0.08141127973794937, "learning_rate": 0.00018277293482772935, "loss": 1.1752, "step": 565 }, { "epoch": 0.3416842740718382, "grad_norm": 0.06964030116796494, "learning_rate": 0.00018273142382731425, "loss": 0.8235, "step": 566 }, { "epoch": 0.34228795653486266, "grad_norm": 0.07054075598716736, "learning_rate": 0.00018268991282689912, "loss": 0.9064, "step": 567 }, { "epoch": 0.3428916389978871, "grad_norm": 0.08551137149333954, "learning_rate": 0.00018264840182648402, "loss": 1.1779, "step": 568 }, { "epoch": 0.34349532146091155, "grad_norm": 0.06879527121782303, "learning_rate": 0.00018260689082606892, "loss": 0.8866, "step": 569 }, { "epoch": 0.344099003923936, "grad_norm": 0.07618753612041473, "learning_rate": 0.0001825653798256538, "loss": 0.9287, "step": 570 }, { "epoch": 0.34470268638696044, "grad_norm": 0.0731632187962532, "learning_rate": 0.0001825238688252387, "loss": 1.2391, "step": 571 }, { "epoch": 0.3453063688499849, "grad_norm": 0.06929552555084229, "learning_rate": 0.0001824823578248236, "loss": 0.8374, "step": 572 }, { "epoch": 0.34591005131300934, "grad_norm": 0.08287245035171509, "learning_rate": 0.00018244084682440847, "loss": 0.9951, "step": 573 }, { "epoch": 0.3465137337760338, "grad_norm": 0.08636925369501114, "learning_rate": 0.00018239933582399337, "loss": 0.7663, "step": 574 }, { "epoch": 0.34711741623905823, "grad_norm": 0.07229752093553543, "learning_rate": 0.00018235782482357827, "loss": 0.9063, "step": 575 }, { "epoch": 0.34772109870208273, "grad_norm": 0.08006583899259567, "learning_rate": 0.00018231631382316314, "loss": 0.8608, "step": 576 }, { "epoch": 0.3483247811651072, "grad_norm": 0.07500839233398438, "learning_rate": 0.00018227480282274804, "loss": 1.0615, "step": 577 }, { "epoch": 0.3489284636281316, "grad_norm": 0.07585518062114716, "learning_rate": 0.00018223329182233294, "loss": 1.3967, "step": 578 }, { "epoch": 0.34953214609115607, "grad_norm": 0.06830618530511856, "learning_rate": 0.00018219178082191782, "loss": 1.0998, "step": 579 }, { "epoch": 0.3501358285541805, "grad_norm": 0.0734933540225029, "learning_rate": 0.00018215026982150272, "loss": 0.8375, "step": 580 }, { "epoch": 0.35073951101720496, "grad_norm": 0.07250437140464783, "learning_rate": 0.0001821087588210876, "loss": 1.0154, "step": 581 }, { "epoch": 0.3513431934802294, "grad_norm": 0.07496879994869232, "learning_rate": 0.0001820672478206725, "loss": 0.8673, "step": 582 }, { "epoch": 0.35194687594325386, "grad_norm": 0.07836568355560303, "learning_rate": 0.0001820257368202574, "loss": 0.8566, "step": 583 }, { "epoch": 0.3525505584062783, "grad_norm": 0.07284071296453476, "learning_rate": 0.00018198422581984226, "loss": 0.7801, "step": 584 }, { "epoch": 0.35315424086930275, "grad_norm": 0.07430974394083023, "learning_rate": 0.00018194271481942716, "loss": 0.7227, "step": 585 }, { "epoch": 0.3537579233323272, "grad_norm": 0.0871773213148117, "learning_rate": 0.00018190120381901206, "loss": 0.6884, "step": 586 }, { "epoch": 0.35436160579535164, "grad_norm": 0.08565318584442139, "learning_rate": 0.00018185969281859693, "loss": 0.7762, "step": 587 }, { "epoch": 0.3549652882583761, "grad_norm": 0.07946131378412247, "learning_rate": 0.00018181818181818183, "loss": 1.008, "step": 588 }, { "epoch": 0.35556897072140053, "grad_norm": 0.07984606176614761, "learning_rate": 0.00018177667081776673, "loss": 0.8231, "step": 589 }, { "epoch": 0.356172653184425, "grad_norm": 0.08642979711294174, "learning_rate": 0.0001817351598173516, "loss": 0.6995, "step": 590 }, { "epoch": 0.3567763356474494, "grad_norm": 0.0884912982583046, "learning_rate": 0.00018169364881693648, "loss": 0.7622, "step": 591 }, { "epoch": 0.3573800181104739, "grad_norm": 0.08437898755073547, "learning_rate": 0.0001816521378165214, "loss": 0.6997, "step": 592 }, { "epoch": 0.3579837005734983, "grad_norm": 0.1158236563205719, "learning_rate": 0.00018161062681610628, "loss": 0.7354, "step": 593 }, { "epoch": 0.35858738303652277, "grad_norm": 0.09563788026571274, "learning_rate": 0.00018156911581569115, "loss": 0.8278, "step": 594 }, { "epoch": 0.3591910654995472, "grad_norm": 0.10473807901144028, "learning_rate": 0.00018152760481527605, "loss": 0.7105, "step": 595 }, { "epoch": 0.3597947479625717, "grad_norm": 0.09579318761825562, "learning_rate": 0.00018148609381486095, "loss": 0.6555, "step": 596 }, { "epoch": 0.36039843042559616, "grad_norm": 0.10552844405174255, "learning_rate": 0.00018144458281444583, "loss": 0.6662, "step": 597 }, { "epoch": 0.3610021128886206, "grad_norm": 0.10778263211250305, "learning_rate": 0.00018140307181403073, "loss": 0.6501, "step": 598 }, { "epoch": 0.36160579535164505, "grad_norm": 0.11160840839147568, "learning_rate": 0.00018136156081361563, "loss": 0.5741, "step": 599 }, { "epoch": 0.3622094778146695, "grad_norm": 0.11790695786476135, "learning_rate": 0.0001813200498132005, "loss": 0.507, "step": 600 }, { "epoch": 0.36281316027769395, "grad_norm": 0.09489453583955765, "learning_rate": 0.0001812785388127854, "loss": 0.905, "step": 601 }, { "epoch": 0.3634168427407184, "grad_norm": 0.07241594046354294, "learning_rate": 0.0001812370278123703, "loss": 0.9352, "step": 602 }, { "epoch": 0.36402052520374284, "grad_norm": 0.07990288734436035, "learning_rate": 0.00018119551681195517, "loss": 0.9155, "step": 603 }, { "epoch": 0.3646242076667673, "grad_norm": 0.07297532260417938, "learning_rate": 0.00018115400581154007, "loss": 0.9952, "step": 604 }, { "epoch": 0.36522789012979173, "grad_norm": 0.10625027120113373, "learning_rate": 0.00018111249481112495, "loss": 1.0103, "step": 605 }, { "epoch": 0.3658315725928162, "grad_norm": 0.0755513533949852, "learning_rate": 0.00018107098381070985, "loss": 0.9527, "step": 606 }, { "epoch": 0.3664352550558406, "grad_norm": 0.0824398621916771, "learning_rate": 0.00018102947281029475, "loss": 1.032, "step": 607 }, { "epoch": 0.36703893751886507, "grad_norm": 0.07464707642793655, "learning_rate": 0.00018098796180987962, "loss": 0.9073, "step": 608 }, { "epoch": 0.3676426199818895, "grad_norm": 0.07036054134368896, "learning_rate": 0.00018094645080946452, "loss": 0.8517, "step": 609 }, { "epoch": 0.36824630244491396, "grad_norm": 0.10687808692455292, "learning_rate": 0.00018090493980904942, "loss": 0.7676, "step": 610 }, { "epoch": 0.3688499849079384, "grad_norm": 0.07665056735277176, "learning_rate": 0.0001808634288086343, "loss": 0.9801, "step": 611 }, { "epoch": 0.36945366737096286, "grad_norm": 0.06765671819448471, "learning_rate": 0.00018082191780821916, "loss": 0.7543, "step": 612 }, { "epoch": 0.3700573498339873, "grad_norm": 0.07217054814100266, "learning_rate": 0.0001807804068078041, "loss": 1.0022, "step": 613 }, { "epoch": 0.37066103229701175, "grad_norm": 0.14387869834899902, "learning_rate": 0.00018073889580738896, "loss": 0.7746, "step": 614 }, { "epoch": 0.3712647147600362, "grad_norm": 0.07988094538450241, "learning_rate": 0.00018069738480697384, "loss": 1.327, "step": 615 }, { "epoch": 0.37186839722306064, "grad_norm": 0.06843268126249313, "learning_rate": 0.00018065587380655876, "loss": 0.7978, "step": 616 }, { "epoch": 0.37247207968608514, "grad_norm": 0.0766974464058876, "learning_rate": 0.00018061436280614364, "loss": 0.7766, "step": 617 }, { "epoch": 0.3730757621491096, "grad_norm": 0.08933945000171661, "learning_rate": 0.0001805728518057285, "loss": 0.7874, "step": 618 }, { "epoch": 0.37367944461213404, "grad_norm": 0.07440406084060669, "learning_rate": 0.0001805313408053134, "loss": 0.722, "step": 619 }, { "epoch": 0.3742831270751585, "grad_norm": 0.0785699188709259, "learning_rate": 0.0001804898298048983, "loss": 1.0223, "step": 620 }, { "epoch": 0.37488680953818293, "grad_norm": 0.07815410941839218, "learning_rate": 0.0001804483188044832, "loss": 0.8886, "step": 621 }, { "epoch": 0.3754904920012074, "grad_norm": 0.06365236639976501, "learning_rate": 0.00018040680780406808, "loss": 0.7909, "step": 622 }, { "epoch": 0.3760941744642318, "grad_norm": 0.07768423855304718, "learning_rate": 0.00018036529680365298, "loss": 0.8831, "step": 623 }, { "epoch": 0.37669785692725627, "grad_norm": 0.07100118696689606, "learning_rate": 0.00018032378580323788, "loss": 0.8649, "step": 624 }, { "epoch": 0.3773015393902807, "grad_norm": 0.08485390990972519, "learning_rate": 0.00018028227480282276, "loss": 0.731, "step": 625 }, { "epoch": 0.37790522185330516, "grad_norm": 0.07133664935827255, "learning_rate": 0.00018024076380240763, "loss": 0.7825, "step": 626 }, { "epoch": 0.3785089043163296, "grad_norm": 0.06745673716068268, "learning_rate": 0.00018019925280199256, "loss": 0.7547, "step": 627 }, { "epoch": 0.37911258677935405, "grad_norm": 0.08499856293201447, "learning_rate": 0.00018015774180157743, "loss": 0.7289, "step": 628 }, { "epoch": 0.3797162692423785, "grad_norm": 0.07550381869077682, "learning_rate": 0.0001801162308011623, "loss": 1.0874, "step": 629 }, { "epoch": 0.38031995170540295, "grad_norm": 0.07187377661466599, "learning_rate": 0.00018007471980074723, "loss": 0.7885, "step": 630 }, { "epoch": 0.3809236341684274, "grad_norm": 0.07693067193031311, "learning_rate": 0.0001800332088003321, "loss": 0.9349, "step": 631 }, { "epoch": 0.38152731663145184, "grad_norm": 0.0718834400177002, "learning_rate": 0.00017999169779991698, "loss": 0.8609, "step": 632 }, { "epoch": 0.3821309990944763, "grad_norm": 0.07122069597244263, "learning_rate": 0.00017995018679950188, "loss": 0.8418, "step": 633 }, { "epoch": 0.38273468155750073, "grad_norm": 0.0743737518787384, "learning_rate": 0.00017990867579908678, "loss": 0.7195, "step": 634 }, { "epoch": 0.3833383640205252, "grad_norm": 0.07467867434024811, "learning_rate": 0.00017986716479867165, "loss": 0.7802, "step": 635 }, { "epoch": 0.3839420464835496, "grad_norm": 0.07535380125045776, "learning_rate": 0.00017982565379825655, "loss": 0.7726, "step": 636 }, { "epoch": 0.38454572894657413, "grad_norm": 0.07515112310647964, "learning_rate": 0.00017978414279784145, "loss": 0.7936, "step": 637 }, { "epoch": 0.3851494114095986, "grad_norm": 0.08222845941781998, "learning_rate": 0.00017974263179742632, "loss": 0.7473, "step": 638 }, { "epoch": 0.385753093872623, "grad_norm": 0.09565534442663193, "learning_rate": 0.00017970112079701122, "loss": 0.7899, "step": 639 }, { "epoch": 0.38635677633564747, "grad_norm": 0.0862174928188324, "learning_rate": 0.0001796596097965961, "loss": 0.7633, "step": 640 }, { "epoch": 0.3869604587986719, "grad_norm": 0.08321114629507065, "learning_rate": 0.000179618098796181, "loss": 0.6477, "step": 641 }, { "epoch": 0.38756414126169636, "grad_norm": 0.09201504290103912, "learning_rate": 0.0001795765877957659, "loss": 0.7508, "step": 642 }, { "epoch": 0.3881678237247208, "grad_norm": 0.09954454004764557, "learning_rate": 0.00017953507679535077, "loss": 0.7636, "step": 643 }, { "epoch": 0.38877150618774525, "grad_norm": 0.10669595003128052, "learning_rate": 0.00017949356579493567, "loss": 0.7761, "step": 644 }, { "epoch": 0.3893751886507697, "grad_norm": 0.09882800281047821, "learning_rate": 0.00017945205479452057, "loss": 0.7333, "step": 645 }, { "epoch": 0.38997887111379415, "grad_norm": 0.10011550784111023, "learning_rate": 0.00017941054379410544, "loss": 0.6698, "step": 646 }, { "epoch": 0.3905825535768186, "grad_norm": 0.1063833087682724, "learning_rate": 0.00017936903279369034, "loss": 0.6371, "step": 647 }, { "epoch": 0.39118623603984304, "grad_norm": 0.11087442189455032, "learning_rate": 0.00017932752179327524, "loss": 0.6137, "step": 648 }, { "epoch": 0.3917899185028675, "grad_norm": 0.1278572678565979, "learning_rate": 0.0001792860107928601, "loss": 0.6264, "step": 649 }, { "epoch": 0.39239360096589193, "grad_norm": 0.13009613752365112, "learning_rate": 0.00017924449979244499, "loss": 0.5163, "step": 650 }, { "epoch": 0.3929972834289164, "grad_norm": 0.08168315887451172, "learning_rate": 0.0001792029887920299, "loss": 0.8955, "step": 651 }, { "epoch": 0.3936009658919408, "grad_norm": 0.08920527994632721, "learning_rate": 0.00017916147779161479, "loss": 1.0973, "step": 652 }, { "epoch": 0.39420464835496527, "grad_norm": 0.0853213369846344, "learning_rate": 0.00017911996679119966, "loss": 0.8951, "step": 653 }, { "epoch": 0.3948083308179897, "grad_norm": 0.07048202306032181, "learning_rate": 0.00017907845579078456, "loss": 0.6935, "step": 654 }, { "epoch": 0.39541201328101416, "grad_norm": 0.08051397651433945, "learning_rate": 0.00017903694479036946, "loss": 1.1476, "step": 655 }, { "epoch": 0.3960156957440386, "grad_norm": 0.07060935348272324, "learning_rate": 0.00017899543378995433, "loss": 0.7518, "step": 656 }, { "epoch": 0.3966193782070631, "grad_norm": 0.0792861208319664, "learning_rate": 0.00017895392278953923, "loss": 0.843, "step": 657 }, { "epoch": 0.39722306067008756, "grad_norm": 0.07856278121471405, "learning_rate": 0.00017891241178912413, "loss": 0.949, "step": 658 }, { "epoch": 0.397826743133112, "grad_norm": 0.07348600775003433, "learning_rate": 0.000178870900788709, "loss": 0.902, "step": 659 }, { "epoch": 0.39843042559613645, "grad_norm": 0.06807007640600204, "learning_rate": 0.0001788293897882939, "loss": 0.8443, "step": 660 }, { "epoch": 0.3990341080591609, "grad_norm": 0.07370761781930923, "learning_rate": 0.0001787878787878788, "loss": 0.8161, "step": 661 }, { "epoch": 0.39963779052218534, "grad_norm": 0.08829770982265472, "learning_rate": 0.00017874636778746368, "loss": 0.7679, "step": 662 }, { "epoch": 0.4002414729852098, "grad_norm": 0.07680663466453552, "learning_rate": 0.00017870485678704858, "loss": 0.8106, "step": 663 }, { "epoch": 0.40084515544823424, "grad_norm": 0.07080180197954178, "learning_rate": 0.00017866334578663345, "loss": 0.8348, "step": 664 }, { "epoch": 0.4014488379112587, "grad_norm": 0.0829470232129097, "learning_rate": 0.00017862183478621838, "loss": 0.8589, "step": 665 }, { "epoch": 0.40205252037428313, "grad_norm": 0.065013587474823, "learning_rate": 0.00017858032378580325, "loss": 0.736, "step": 666 }, { "epoch": 0.4026562028373076, "grad_norm": 0.07887616008520126, "learning_rate": 0.00017853881278538812, "loss": 0.8547, "step": 667 }, { "epoch": 0.403259885300332, "grad_norm": 0.07337549328804016, "learning_rate": 0.00017849730178497302, "loss": 0.8766, "step": 668 }, { "epoch": 0.40386356776335647, "grad_norm": 0.06863244622945786, "learning_rate": 0.00017845579078455792, "loss": 0.7484, "step": 669 }, { "epoch": 0.4044672502263809, "grad_norm": 0.07943541556596756, "learning_rate": 0.0001784142797841428, "loss": 0.8951, "step": 670 }, { "epoch": 0.40507093268940536, "grad_norm": 0.07468891143798828, "learning_rate": 0.0001783727687837277, "loss": 0.8805, "step": 671 }, { "epoch": 0.4056746151524298, "grad_norm": 0.06805651634931564, "learning_rate": 0.0001783312577833126, "loss": 0.9149, "step": 672 }, { "epoch": 0.40627829761545425, "grad_norm": 0.08728659898042679, "learning_rate": 0.00017828974678289747, "loss": 1.1018, "step": 673 }, { "epoch": 0.4068819800784787, "grad_norm": 0.07230555266141891, "learning_rate": 0.00017824823578248237, "loss": 0.8377, "step": 674 }, { "epoch": 0.40748566254150315, "grad_norm": 0.07561583071947098, "learning_rate": 0.00017820672478206727, "loss": 0.8613, "step": 675 }, { "epoch": 0.4080893450045276, "grad_norm": 0.07437833398580551, "learning_rate": 0.00017816521378165214, "loss": 0.9668, "step": 676 }, { "epoch": 0.4086930274675521, "grad_norm": 0.09480807930231094, "learning_rate": 0.00017812370278123704, "loss": 0.9128, "step": 677 }, { "epoch": 0.40929670993057654, "grad_norm": 0.07243344187736511, "learning_rate": 0.00017808219178082192, "loss": 1.0159, "step": 678 }, { "epoch": 0.409900392393601, "grad_norm": 0.07010567933320999, "learning_rate": 0.00017804068078040682, "loss": 0.7627, "step": 679 }, { "epoch": 0.41050407485662543, "grad_norm": 0.07201089709997177, "learning_rate": 0.00017799916977999172, "loss": 1.0637, "step": 680 }, { "epoch": 0.4111077573196499, "grad_norm": 0.0743207260966301, "learning_rate": 0.0001779576587795766, "loss": 0.8637, "step": 681 }, { "epoch": 0.4117114397826743, "grad_norm": 0.06971647590398788, "learning_rate": 0.0001779161477791615, "loss": 0.8119, "step": 682 }, { "epoch": 0.4123151222456988, "grad_norm": 0.07709638774394989, "learning_rate": 0.0001778746367787464, "loss": 0.8523, "step": 683 }, { "epoch": 0.4129188047087232, "grad_norm": 0.07961713522672653, "learning_rate": 0.00017783312577833126, "loss": 0.8088, "step": 684 }, { "epoch": 0.41352248717174767, "grad_norm": 0.0781940221786499, "learning_rate": 0.00017779161477791616, "loss": 0.7029, "step": 685 }, { "epoch": 0.4141261696347721, "grad_norm": 0.07617820799350739, "learning_rate": 0.00017775010377750106, "loss": 0.7593, "step": 686 }, { "epoch": 0.41472985209779656, "grad_norm": 0.08119190484285355, "learning_rate": 0.00017770859277708593, "loss": 0.8067, "step": 687 }, { "epoch": 0.415333534560821, "grad_norm": 0.07956133782863617, "learning_rate": 0.0001776670817766708, "loss": 0.7693, "step": 688 }, { "epoch": 0.41593721702384545, "grad_norm": 0.08153844624757767, "learning_rate": 0.00017762557077625574, "loss": 0.7713, "step": 689 }, { "epoch": 0.4165408994868699, "grad_norm": 0.08829324692487717, "learning_rate": 0.0001775840597758406, "loss": 0.7574, "step": 690 }, { "epoch": 0.41714458194989434, "grad_norm": 0.09734578430652618, "learning_rate": 0.00017754254877542548, "loss": 0.7962, "step": 691 }, { "epoch": 0.4177482644129188, "grad_norm": 0.08630529791116714, "learning_rate": 0.00017750103777501038, "loss": 0.7482, "step": 692 }, { "epoch": 0.41835194687594324, "grad_norm": 0.09447003155946732, "learning_rate": 0.00017745952677459528, "loss": 0.716, "step": 693 }, { "epoch": 0.4189556293389677, "grad_norm": 0.09416269510984421, "learning_rate": 0.00017741801577418015, "loss": 0.7481, "step": 694 }, { "epoch": 0.41955931180199213, "grad_norm": 0.10847954452037811, "learning_rate": 0.00017737650477376505, "loss": 0.668, "step": 695 }, { "epoch": 0.4201629942650166, "grad_norm": 0.10643991082906723, "learning_rate": 0.00017733499377334995, "loss": 0.6973, "step": 696 }, { "epoch": 0.4207666767280411, "grad_norm": 0.10840442776679993, "learning_rate": 0.00017729348277293483, "loss": 0.6755, "step": 697 }, { "epoch": 0.4213703591910655, "grad_norm": 0.10803359001874924, "learning_rate": 0.00017725197177251973, "loss": 0.6234, "step": 698 }, { "epoch": 0.42197404165408997, "grad_norm": 0.12027863413095474, "learning_rate": 0.00017721046077210463, "loss": 0.6337, "step": 699 }, { "epoch": 0.4225777241171144, "grad_norm": 0.1258537471294403, "learning_rate": 0.0001771689497716895, "loss": 0.447, "step": 700 }, { "epoch": 0.42318140658013886, "grad_norm": 0.07856849581003189, "learning_rate": 0.0001771274387712744, "loss": 1.0295, "step": 701 }, { "epoch": 0.4237850890431633, "grad_norm": 0.08135174959897995, "learning_rate": 0.00017708592777085927, "loss": 0.8548, "step": 702 }, { "epoch": 0.42438877150618776, "grad_norm": 0.08222603797912598, "learning_rate": 0.00017704441677044417, "loss": 0.8463, "step": 703 }, { "epoch": 0.4249924539692122, "grad_norm": 0.07562818378210068, "learning_rate": 0.00017700290577002907, "loss": 0.7474, "step": 704 }, { "epoch": 0.42559613643223665, "grad_norm": 0.11416960507631302, "learning_rate": 0.00017696139476961395, "loss": 0.9287, "step": 705 }, { "epoch": 0.4261998188952611, "grad_norm": 0.07070968300104141, "learning_rate": 0.00017691988376919885, "loss": 0.7686, "step": 706 }, { "epoch": 0.42680350135828554, "grad_norm": 0.10440545529127121, "learning_rate": 0.00017687837276878375, "loss": 0.8466, "step": 707 }, { "epoch": 0.42740718382131, "grad_norm": 0.10694804787635803, "learning_rate": 0.00017683686176836862, "loss": 0.7819, "step": 708 }, { "epoch": 0.42801086628433443, "grad_norm": 0.07630432397127151, "learning_rate": 0.00017679535076795352, "loss": 0.8324, "step": 709 }, { "epoch": 0.4286145487473589, "grad_norm": 0.07171212136745453, "learning_rate": 0.00017675383976753842, "loss": 0.819, "step": 710 }, { "epoch": 0.4292182312103833, "grad_norm": 0.06875535845756531, "learning_rate": 0.0001767123287671233, "loss": 0.7489, "step": 711 }, { "epoch": 0.4298219136734078, "grad_norm": 0.07789517194032669, "learning_rate": 0.0001766708177667082, "loss": 0.8193, "step": 712 }, { "epoch": 0.4304255961364322, "grad_norm": 0.07542435824871063, "learning_rate": 0.0001766293067662931, "loss": 1.0486, "step": 713 }, { "epoch": 0.43102927859945667, "grad_norm": 0.07148928195238113, "learning_rate": 0.00017658779576587796, "loss": 1.1104, "step": 714 }, { "epoch": 0.4316329610624811, "grad_norm": 0.0701369047164917, "learning_rate": 0.00017654628476546286, "loss": 0.8409, "step": 715 }, { "epoch": 0.43223664352550556, "grad_norm": 0.07325640320777893, "learning_rate": 0.00017650477376504774, "loss": 0.8249, "step": 716 }, { "epoch": 0.43284032598853, "grad_norm": 0.07507720589637756, "learning_rate": 0.00017646326276463264, "loss": 0.8903, "step": 717 }, { "epoch": 0.4334440084515545, "grad_norm": 0.07389452308416367, "learning_rate": 0.00017642175176421754, "loss": 0.9039, "step": 718 }, { "epoch": 0.43404769091457895, "grad_norm": 0.07989213615655899, "learning_rate": 0.0001763802407638024, "loss": 0.8314, "step": 719 }, { "epoch": 0.4346513733776034, "grad_norm": 0.0715138241648674, "learning_rate": 0.0001763387297633873, "loss": 0.8579, "step": 720 }, { "epoch": 0.43525505584062785, "grad_norm": 0.09497915208339691, "learning_rate": 0.0001762972187629722, "loss": 0.8342, "step": 721 }, { "epoch": 0.4358587383036523, "grad_norm": 0.07308909296989441, "learning_rate": 0.00017625570776255708, "loss": 0.87, "step": 722 }, { "epoch": 0.43646242076667674, "grad_norm": 0.08746104687452316, "learning_rate": 0.00017621419676214196, "loss": 1.1309, "step": 723 }, { "epoch": 0.4370661032297012, "grad_norm": 0.09257882833480835, "learning_rate": 0.00017617268576172688, "loss": 0.7745, "step": 724 }, { "epoch": 0.43766978569272563, "grad_norm": 0.08195153623819351, "learning_rate": 0.00017613117476131176, "loss": 0.7545, "step": 725 }, { "epoch": 0.4382734681557501, "grad_norm": 0.07948730885982513, "learning_rate": 0.00017608966376089663, "loss": 0.8637, "step": 726 }, { "epoch": 0.4388771506187745, "grad_norm": 0.08171603083610535, "learning_rate": 0.00017604815276048156, "loss": 0.8005, "step": 727 }, { "epoch": 0.43948083308179897, "grad_norm": 0.0735010951757431, "learning_rate": 0.00017600664176006643, "loss": 0.8667, "step": 728 }, { "epoch": 0.4400845155448234, "grad_norm": 0.07493755221366882, "learning_rate": 0.0001759651307596513, "loss": 0.8406, "step": 729 }, { "epoch": 0.44068819800784786, "grad_norm": 0.07382465153932571, "learning_rate": 0.0001759236197592362, "loss": 0.9226, "step": 730 }, { "epoch": 0.4412918804708723, "grad_norm": 0.10297898948192596, "learning_rate": 0.0001758821087588211, "loss": 0.8471, "step": 731 }, { "epoch": 0.44189556293389676, "grad_norm": 0.08343175053596497, "learning_rate": 0.00017584059775840598, "loss": 0.8474, "step": 732 }, { "epoch": 0.4424992453969212, "grad_norm": 0.07784884423017502, "learning_rate": 0.00017579908675799088, "loss": 0.7984, "step": 733 }, { "epoch": 0.44310292785994565, "grad_norm": 0.07451200485229492, "learning_rate": 0.00017575757575757578, "loss": 0.8766, "step": 734 }, { "epoch": 0.4437066103229701, "grad_norm": 0.08384682983160019, "learning_rate": 0.00017571606475716065, "loss": 0.806, "step": 735 }, { "epoch": 0.44431029278599454, "grad_norm": 0.0747980996966362, "learning_rate": 0.00017567455375674555, "loss": 0.6971, "step": 736 }, { "epoch": 0.444913975249019, "grad_norm": 0.07966098934412003, "learning_rate": 0.00017563304275633042, "loss": 0.8149, "step": 737 }, { "epoch": 0.4455176577120435, "grad_norm": 0.08828859031200409, "learning_rate": 0.00017559153175591532, "loss": 0.7579, "step": 738 }, { "epoch": 0.44612134017506794, "grad_norm": 0.08065015077590942, "learning_rate": 0.00017555002075550022, "loss": 0.7766, "step": 739 }, { "epoch": 0.4467250226380924, "grad_norm": 0.09102361649274826, "learning_rate": 0.0001755085097550851, "loss": 0.8112, "step": 740 }, { "epoch": 0.44732870510111683, "grad_norm": 0.09253661334514618, "learning_rate": 0.00017546699875467, "loss": 0.7208, "step": 741 }, { "epoch": 0.4479323875641413, "grad_norm": 0.11388830095529556, "learning_rate": 0.0001754254877542549, "loss": 0.759, "step": 742 }, { "epoch": 0.4485360700271657, "grad_norm": 0.09276237338781357, "learning_rate": 0.00017538397675383977, "loss": 0.7928, "step": 743 }, { "epoch": 0.44913975249019017, "grad_norm": 0.09758086502552032, "learning_rate": 0.00017534246575342467, "loss": 0.7086, "step": 744 }, { "epoch": 0.4497434349532146, "grad_norm": 0.1056419312953949, "learning_rate": 0.00017530095475300957, "loss": 0.7533, "step": 745 }, { "epoch": 0.45034711741623906, "grad_norm": 0.10619323700666428, "learning_rate": 0.00017525944375259444, "loss": 0.7371, "step": 746 }, { "epoch": 0.4509507998792635, "grad_norm": 0.13212867081165314, "learning_rate": 0.00017521793275217931, "loss": 0.6729, "step": 747 }, { "epoch": 0.45155448234228796, "grad_norm": 0.12233047932386398, "learning_rate": 0.00017517642175176424, "loss": 0.6855, "step": 748 }, { "epoch": 0.4521581648053124, "grad_norm": 0.11795574426651001, "learning_rate": 0.00017513491075134911, "loss": 0.522, "step": 749 }, { "epoch": 0.45276184726833685, "grad_norm": 0.12232359498739243, "learning_rate": 0.000175093399750934, "loss": 0.5003, "step": 750 }, { "epoch": 0.4533655297313613, "grad_norm": 0.11266329884529114, "learning_rate": 0.0001750518887505189, "loss": 0.7975, "step": 751 }, { "epoch": 0.45396921219438574, "grad_norm": 0.12558221817016602, "learning_rate": 0.0001750103777501038, "loss": 1.0336, "step": 752 }, { "epoch": 0.4545728946574102, "grad_norm": 0.08540918678045273, "learning_rate": 0.0001749688667496887, "loss": 0.8133, "step": 753 }, { "epoch": 0.45517657712043463, "grad_norm": 0.07365544140338898, "learning_rate": 0.00017492735574927356, "loss": 0.9823, "step": 754 }, { "epoch": 0.4557802595834591, "grad_norm": 0.07597804814577103, "learning_rate": 0.00017488584474885846, "loss": 0.8333, "step": 755 }, { "epoch": 0.4563839420464835, "grad_norm": 0.12274730950593948, "learning_rate": 0.00017484433374844336, "loss": 0.8916, "step": 756 }, { "epoch": 0.456987624509508, "grad_norm": 0.07644704729318619, "learning_rate": 0.00017480282274802823, "loss": 1.2536, "step": 757 }, { "epoch": 0.4575913069725325, "grad_norm": 0.0743841826915741, "learning_rate": 0.00017476131174761313, "loss": 0.8892, "step": 758 }, { "epoch": 0.4581949894355569, "grad_norm": 0.07549133896827698, "learning_rate": 0.00017471980074719803, "loss": 0.9356, "step": 759 }, { "epoch": 0.45879867189858137, "grad_norm": 0.10283336043357849, "learning_rate": 0.0001746782897467829, "loss": 0.8478, "step": 760 }, { "epoch": 0.4594023543616058, "grad_norm": 0.0773346945643425, "learning_rate": 0.00017463677874636778, "loss": 0.8264, "step": 761 }, { "epoch": 0.46000603682463026, "grad_norm": 0.08246394246816635, "learning_rate": 0.0001745952677459527, "loss": 0.8789, "step": 762 }, { "epoch": 0.4606097192876547, "grad_norm": 0.0820598155260086, "learning_rate": 0.00017455375674553758, "loss": 0.8355, "step": 763 }, { "epoch": 0.46121340175067915, "grad_norm": 0.08660144358873367, "learning_rate": 0.00017451224574512245, "loss": 0.9191, "step": 764 }, { "epoch": 0.4618170842137036, "grad_norm": 0.07887571305036545, "learning_rate": 0.00017447073474470735, "loss": 0.8801, "step": 765 }, { "epoch": 0.46242076667672805, "grad_norm": 0.07366171479225159, "learning_rate": 0.00017442922374429225, "loss": 0.9114, "step": 766 }, { "epoch": 0.4630244491397525, "grad_norm": 0.07119464874267578, "learning_rate": 0.00017438771274387712, "loss": 0.7584, "step": 767 }, { "epoch": 0.46362813160277694, "grad_norm": 0.07550892978906631, "learning_rate": 0.00017434620174346202, "loss": 0.9126, "step": 768 }, { "epoch": 0.4642318140658014, "grad_norm": 0.07863172888755798, "learning_rate": 0.00017430469074304692, "loss": 0.8234, "step": 769 }, { "epoch": 0.46483549652882583, "grad_norm": 0.08048754185438156, "learning_rate": 0.0001742631797426318, "loss": 0.7634, "step": 770 }, { "epoch": 0.4654391789918503, "grad_norm": 0.0739179328083992, "learning_rate": 0.0001742216687422167, "loss": 0.782, "step": 771 }, { "epoch": 0.4660428614548747, "grad_norm": 0.07369952648878098, "learning_rate": 0.0001741801577418016, "loss": 0.9965, "step": 772 }, { "epoch": 0.46664654391789917, "grad_norm": 0.0785854384303093, "learning_rate": 0.00017413864674138647, "loss": 0.9077, "step": 773 }, { "epoch": 0.4672502263809236, "grad_norm": 0.07083092629909515, "learning_rate": 0.00017409713574097137, "loss": 0.8157, "step": 774 }, { "epoch": 0.46785390884394806, "grad_norm": 0.07597315311431885, "learning_rate": 0.00017405562474055624, "loss": 1.1588, "step": 775 }, { "epoch": 0.4684575913069725, "grad_norm": 0.07124720513820648, "learning_rate": 0.00017401411374014114, "loss": 0.8374, "step": 776 }, { "epoch": 0.46906127376999696, "grad_norm": 0.0762951672077179, "learning_rate": 0.00017397260273972604, "loss": 0.7713, "step": 777 }, { "epoch": 0.46966495623302146, "grad_norm": 0.09306569397449493, "learning_rate": 0.00017393109173931092, "loss": 0.7561, "step": 778 }, { "epoch": 0.4702686386960459, "grad_norm": 0.09182307124137878, "learning_rate": 0.00017388958073889582, "loss": 0.816, "step": 779 }, { "epoch": 0.47087232115907035, "grad_norm": 0.08006258308887482, "learning_rate": 0.00017384806973848072, "loss": 0.7516, "step": 780 }, { "epoch": 0.4714760036220948, "grad_norm": 0.07114402204751968, "learning_rate": 0.0001738065587380656, "loss": 0.8256, "step": 781 }, { "epoch": 0.47207968608511924, "grad_norm": 0.08175129443407059, "learning_rate": 0.0001737650477376505, "loss": 0.9031, "step": 782 }, { "epoch": 0.4726833685481437, "grad_norm": 0.06829746812582016, "learning_rate": 0.0001737235367372354, "loss": 0.7038, "step": 783 }, { "epoch": 0.47328705101116814, "grad_norm": 0.07961759716272354, "learning_rate": 0.00017368202573682026, "loss": 0.8112, "step": 784 }, { "epoch": 0.4738907334741926, "grad_norm": 0.08282366394996643, "learning_rate": 0.00017364051473640514, "loss": 0.7348, "step": 785 }, { "epoch": 0.47449441593721703, "grad_norm": 0.07623188942670822, "learning_rate": 0.00017359900373599006, "loss": 0.7162, "step": 786 }, { "epoch": 0.4750980984002415, "grad_norm": 0.07791578769683838, "learning_rate": 0.00017355749273557494, "loss": 0.7827, "step": 787 }, { "epoch": 0.4757017808632659, "grad_norm": 0.07546679675579071, "learning_rate": 0.0001735159817351598, "loss": 0.733, "step": 788 }, { "epoch": 0.47630546332629037, "grad_norm": 0.08498945832252502, "learning_rate": 0.0001734744707347447, "loss": 0.8845, "step": 789 }, { "epoch": 0.4769091457893148, "grad_norm": 0.08883391320705414, "learning_rate": 0.0001734329597343296, "loss": 0.802, "step": 790 }, { "epoch": 0.47751282825233926, "grad_norm": 0.08220940828323364, "learning_rate": 0.00017339144873391448, "loss": 0.7421, "step": 791 }, { "epoch": 0.4781165107153637, "grad_norm": 0.0887073203921318, "learning_rate": 0.00017334993773349938, "loss": 0.7697, "step": 792 }, { "epoch": 0.47872019317838815, "grad_norm": 0.09844100475311279, "learning_rate": 0.00017330842673308428, "loss": 0.7502, "step": 793 }, { "epoch": 0.4793238756414126, "grad_norm": 0.10373784601688385, "learning_rate": 0.00017326691573266915, "loss": 0.7348, "step": 794 }, { "epoch": 0.47992755810443705, "grad_norm": 0.09787629544734955, "learning_rate": 0.00017322540473225405, "loss": 0.7995, "step": 795 }, { "epoch": 0.4805312405674615, "grad_norm": 0.10174565017223358, "learning_rate": 0.00017318389373183895, "loss": 0.6651, "step": 796 }, { "epoch": 0.48113492303048594, "grad_norm": 0.10913451015949249, "learning_rate": 0.00017314238273142385, "loss": 0.6945, "step": 797 }, { "epoch": 0.48173860549351044, "grad_norm": 0.109565369784832, "learning_rate": 0.00017310087173100873, "loss": 0.6027, "step": 798 }, { "epoch": 0.4823422879565349, "grad_norm": 0.12080711126327515, "learning_rate": 0.0001730593607305936, "loss": 0.533, "step": 799 }, { "epoch": 0.48294597041955933, "grad_norm": 0.11455202102661133, "learning_rate": 0.00017301784973017853, "loss": 0.4213, "step": 800 }, { "epoch": 0.4835496528825838, "grad_norm": 0.08588794618844986, "learning_rate": 0.0001729763387297634, "loss": 0.8639, "step": 801 }, { "epoch": 0.4841533353456082, "grad_norm": 0.08318472653627396, "learning_rate": 0.00017293482772934827, "loss": 0.9486, "step": 802 }, { "epoch": 0.4847570178086327, "grad_norm": 0.08030420541763306, "learning_rate": 0.00017289331672893317, "loss": 0.8172, "step": 803 }, { "epoch": 0.4853607002716571, "grad_norm": 0.08063428103923798, "learning_rate": 0.00017285180572851807, "loss": 0.7898, "step": 804 }, { "epoch": 0.48596438273468157, "grad_norm": 0.07770851999521255, "learning_rate": 0.00017281029472810295, "loss": 1.0558, "step": 805 }, { "epoch": 0.486568065197706, "grad_norm": 0.08149967342615128, "learning_rate": 0.00017276878372768785, "loss": 0.8185, "step": 806 }, { "epoch": 0.48717174766073046, "grad_norm": 0.08028626441955566, "learning_rate": 0.00017272727272727275, "loss": 0.858, "step": 807 }, { "epoch": 0.4877754301237549, "grad_norm": 0.07975967228412628, "learning_rate": 0.00017268576172685762, "loss": 0.861, "step": 808 }, { "epoch": 0.48837911258677935, "grad_norm": 0.17568281292915344, "learning_rate": 0.00017264425072644252, "loss": 0.8738, "step": 809 }, { "epoch": 0.4889827950498038, "grad_norm": 0.0747470036149025, "learning_rate": 0.00017260273972602742, "loss": 1.0316, "step": 810 }, { "epoch": 0.48958647751282824, "grad_norm": 0.0755024403333664, "learning_rate": 0.0001725612287256123, "loss": 0.8248, "step": 811 }, { "epoch": 0.4901901599758527, "grad_norm": 0.0713796466588974, "learning_rate": 0.0001725197177251972, "loss": 0.764, "step": 812 }, { "epoch": 0.49079384243887714, "grad_norm": 0.07347027212381363, "learning_rate": 0.00017247820672478207, "loss": 0.8605, "step": 813 }, { "epoch": 0.4913975249019016, "grad_norm": 0.07424753159284592, "learning_rate": 0.00017243669572436697, "loss": 0.7838, "step": 814 }, { "epoch": 0.49200120736492603, "grad_norm": 0.08433883637189865, "learning_rate": 0.00017239518472395187, "loss": 0.8561, "step": 815 }, { "epoch": 0.4926048898279505, "grad_norm": 0.07728437334299088, "learning_rate": 0.00017235367372353674, "loss": 0.8175, "step": 816 }, { "epoch": 0.4932085722909749, "grad_norm": 0.0686701089143753, "learning_rate": 0.00017231216272312164, "loss": 0.9421, "step": 817 }, { "epoch": 0.49381225475399937, "grad_norm": 0.08988066762685776, "learning_rate": 0.00017227065172270654, "loss": 0.8417, "step": 818 }, { "epoch": 0.49441593721702387, "grad_norm": 0.07794896513223648, "learning_rate": 0.0001722291407222914, "loss": 0.9656, "step": 819 }, { "epoch": 0.4950196196800483, "grad_norm": 0.0833602100610733, "learning_rate": 0.00017218762972187628, "loss": 0.8488, "step": 820 }, { "epoch": 0.49562330214307276, "grad_norm": 0.08479736745357513, "learning_rate": 0.0001721461187214612, "loss": 0.8215, "step": 821 }, { "epoch": 0.4962269846060972, "grad_norm": 0.07611726969480515, "learning_rate": 0.00017210460772104608, "loss": 0.7847, "step": 822 }, { "epoch": 0.49683066706912166, "grad_norm": 0.07969488203525543, "learning_rate": 0.00017206309672063096, "loss": 0.7988, "step": 823 }, { "epoch": 0.4974343495321461, "grad_norm": 0.07162702083587646, "learning_rate": 0.00017202158572021588, "loss": 0.8497, "step": 824 }, { "epoch": 0.49803803199517055, "grad_norm": 0.0738188773393631, "learning_rate": 0.00017198007471980076, "loss": 0.9446, "step": 825 }, { "epoch": 0.498641714458195, "grad_norm": 0.36350712180137634, "learning_rate": 0.00017193856371938563, "loss": 0.7566, "step": 826 }, { "epoch": 0.49924539692121944, "grad_norm": 0.0755830928683281, "learning_rate": 0.00017189705271897053, "loss": 1.0117, "step": 827 }, { "epoch": 0.4998490793842439, "grad_norm": 0.07795432209968567, "learning_rate": 0.00017185554171855543, "loss": 0.8203, "step": 828 }, { "epoch": 0.5004527618472684, "grad_norm": 0.08904866874217987, "learning_rate": 0.0001718140307181403, "loss": 0.7696, "step": 829 }, { "epoch": 0.5010564443102928, "grad_norm": 0.07873449474573135, "learning_rate": 0.0001717725197177252, "loss": 0.8633, "step": 830 }, { "epoch": 0.5016601267733173, "grad_norm": 0.0743393748998642, "learning_rate": 0.0001717310087173101, "loss": 0.8497, "step": 831 }, { "epoch": 0.5022638092363417, "grad_norm": 0.07956568151712418, "learning_rate": 0.00017168949771689498, "loss": 0.7725, "step": 832 }, { "epoch": 0.5028674916993662, "grad_norm": 0.07717841863632202, "learning_rate": 0.00017164798671647988, "loss": 0.8644, "step": 833 }, { "epoch": 0.5034711741623906, "grad_norm": 0.07158881425857544, "learning_rate": 0.00017160647571606475, "loss": 0.7757, "step": 834 }, { "epoch": 0.5040748566254151, "grad_norm": 0.0767531618475914, "learning_rate": 0.00017156496471564965, "loss": 0.7311, "step": 835 }, { "epoch": 0.5046785390884395, "grad_norm": 0.07811619341373444, "learning_rate": 0.00017152345371523455, "loss": 0.7659, "step": 836 }, { "epoch": 0.505282221551464, "grad_norm": 0.07905134558677673, "learning_rate": 0.00017148194271481942, "loss": 0.7226, "step": 837 }, { "epoch": 0.5058859040144884, "grad_norm": 0.08344464004039764, "learning_rate": 0.00017144043171440432, "loss": 0.7939, "step": 838 }, { "epoch": 0.5064895864775129, "grad_norm": 0.07940410822629929, "learning_rate": 0.00017139892071398922, "loss": 0.7231, "step": 839 }, { "epoch": 0.5070932689405373, "grad_norm": 0.08555116504430771, "learning_rate": 0.0001713574097135741, "loss": 0.7344, "step": 840 }, { "epoch": 0.5076969514035617, "grad_norm": 0.09922269731760025, "learning_rate": 0.000171315898713159, "loss": 0.6382, "step": 841 }, { "epoch": 0.5083006338665862, "grad_norm": 0.08947370946407318, "learning_rate": 0.0001712743877127439, "loss": 0.7337, "step": 842 }, { "epoch": 0.5089043163296106, "grad_norm": 0.11347758769989014, "learning_rate": 0.00017123287671232877, "loss": 0.7492, "step": 843 }, { "epoch": 0.5095079987926351, "grad_norm": 0.09824637323617935, "learning_rate": 0.00017119136571191367, "loss": 0.6762, "step": 844 }, { "epoch": 0.5101116812556595, "grad_norm": 0.1099235787987709, "learning_rate": 0.00017114985471149857, "loss": 0.7085, "step": 845 }, { "epoch": 0.510715363718684, "grad_norm": 0.11087027192115784, "learning_rate": 0.00017110834371108344, "loss": 0.7089, "step": 846 }, { "epoch": 0.5113190461817084, "grad_norm": 0.13687917590141296, "learning_rate": 0.00017106683271066834, "loss": 0.6154, "step": 847 }, { "epoch": 0.5119227286447329, "grad_norm": 0.11547640711069107, "learning_rate": 0.00017102532171025321, "loss": 0.5773, "step": 848 }, { "epoch": 0.5125264111077573, "grad_norm": 0.12279754877090454, "learning_rate": 0.00017098381070983811, "loss": 0.5398, "step": 849 }, { "epoch": 0.5131300935707818, "grad_norm": 0.12324254214763641, "learning_rate": 0.00017094229970942301, "loss": 0.4505, "step": 850 }, { "epoch": 0.5137337760338062, "grad_norm": 0.07736624777317047, "learning_rate": 0.0001709007887090079, "loss": 0.9161, "step": 851 }, { "epoch": 0.5143374584968307, "grad_norm": 0.08110132068395615, "learning_rate": 0.0001708592777085928, "loss": 0.7757, "step": 852 }, { "epoch": 0.5149411409598551, "grad_norm": 0.08207719773054123, "learning_rate": 0.0001708177667081777, "loss": 0.9293, "step": 853 }, { "epoch": 0.5155448234228796, "grad_norm": 0.07672082632780075, "learning_rate": 0.00017077625570776256, "loss": 0.8568, "step": 854 }, { "epoch": 0.516148505885904, "grad_norm": 0.07514925301074982, "learning_rate": 0.00017073474470734746, "loss": 0.9212, "step": 855 }, { "epoch": 0.5167521883489284, "grad_norm": 0.07040940970182419, "learning_rate": 0.00017069323370693236, "loss": 0.7008, "step": 856 }, { "epoch": 0.5173558708119529, "grad_norm": 0.07006466388702393, "learning_rate": 0.00017065172270651723, "loss": 0.8403, "step": 857 }, { "epoch": 0.5179595532749773, "grad_norm": 0.0859081968665123, "learning_rate": 0.0001706102117061021, "loss": 0.8658, "step": 858 }, { "epoch": 0.5185632357380018, "grad_norm": 0.08266257494688034, "learning_rate": 0.00017056870070568703, "loss": 1.0393, "step": 859 }, { "epoch": 0.5191669182010262, "grad_norm": 0.06764436513185501, "learning_rate": 0.0001705271897052719, "loss": 0.7812, "step": 860 }, { "epoch": 0.5197706006640507, "grad_norm": 0.0848420113325119, "learning_rate": 0.00017048567870485678, "loss": 0.8968, "step": 861 }, { "epoch": 0.5203742831270751, "grad_norm": 0.11803008615970612, "learning_rate": 0.00017044416770444168, "loss": 0.8015, "step": 862 }, { "epoch": 0.5209779655900996, "grad_norm": 0.07445228844881058, "learning_rate": 0.00017040265670402658, "loss": 0.9567, "step": 863 }, { "epoch": 0.521581648053124, "grad_norm": 0.0766722708940506, "learning_rate": 0.00017036114570361145, "loss": 1.2654, "step": 864 }, { "epoch": 0.5221853305161485, "grad_norm": 0.0825088769197464, "learning_rate": 0.00017031963470319635, "loss": 1.1549, "step": 865 }, { "epoch": 0.5227890129791729, "grad_norm": 0.0870489627122879, "learning_rate": 0.00017027812370278125, "loss": 0.945, "step": 866 }, { "epoch": 0.5233926954421974, "grad_norm": 0.07320686429738998, "learning_rate": 0.00017023661270236613, "loss": 0.7858, "step": 867 }, { "epoch": 0.5239963779052218, "grad_norm": 0.16626937687397003, "learning_rate": 0.00017019510170195103, "loss": 0.8496, "step": 868 }, { "epoch": 0.5246000603682464, "grad_norm": 0.07288725674152374, "learning_rate": 0.00017015359070153593, "loss": 1.0189, "step": 869 }, { "epoch": 0.5252037428312708, "grad_norm": 0.07230374217033386, "learning_rate": 0.0001701120797011208, "loss": 0.7273, "step": 870 }, { "epoch": 0.5258074252942952, "grad_norm": 0.0818004459142685, "learning_rate": 0.0001700705687007057, "loss": 1.0229, "step": 871 }, { "epoch": 0.5264111077573197, "grad_norm": 0.08718733489513397, "learning_rate": 0.00017002905770029057, "loss": 0.9639, "step": 872 }, { "epoch": 0.5270147902203441, "grad_norm": 0.07159387320280075, "learning_rate": 0.00016998754669987547, "loss": 0.796, "step": 873 }, { "epoch": 0.5276184726833686, "grad_norm": 0.08361268043518066, "learning_rate": 0.00016994603569946037, "loss": 0.8615, "step": 874 }, { "epoch": 0.528222155146393, "grad_norm": 0.07266636192798615, "learning_rate": 0.00016990452469904524, "loss": 0.7398, "step": 875 }, { "epoch": 0.5288258376094175, "grad_norm": 0.08292002975940704, "learning_rate": 0.00016986301369863014, "loss": 0.773, "step": 876 }, { "epoch": 0.5294295200724419, "grad_norm": 0.08378379046916962, "learning_rate": 0.00016982150269821504, "loss": 0.882, "step": 877 }, { "epoch": 0.5300332025354664, "grad_norm": 0.07400650531053543, "learning_rate": 0.00016977999169779992, "loss": 0.8248, "step": 878 }, { "epoch": 0.5306368849984908, "grad_norm": 0.07494434714317322, "learning_rate": 0.00016973848069738482, "loss": 1.0832, "step": 879 }, { "epoch": 0.5312405674615153, "grad_norm": 0.07507789880037308, "learning_rate": 0.00016969696969696972, "loss": 1.004, "step": 880 }, { "epoch": 0.5318442499245397, "grad_norm": 0.08284460753202438, "learning_rate": 0.0001696554586965546, "loss": 0.7898, "step": 881 }, { "epoch": 0.5324479323875642, "grad_norm": 0.07790551334619522, "learning_rate": 0.00016961394769613946, "loss": 1.0397, "step": 882 }, { "epoch": 0.5330516148505886, "grad_norm": 0.07756227999925613, "learning_rate": 0.0001695724366957244, "loss": 0.7647, "step": 883 }, { "epoch": 0.533655297313613, "grad_norm": 0.0776314064860344, "learning_rate": 0.00016953092569530926, "loss": 0.7354, "step": 884 }, { "epoch": 0.5342589797766375, "grad_norm": 0.09555763006210327, "learning_rate": 0.00016948941469489414, "loss": 0.7396, "step": 885 }, { "epoch": 0.534862662239662, "grad_norm": 0.08198418468236923, "learning_rate": 0.00016944790369447904, "loss": 0.8173, "step": 886 }, { "epoch": 0.5354663447026864, "grad_norm": 0.0823531225323677, "learning_rate": 0.00016940639269406394, "loss": 0.7131, "step": 887 }, { "epoch": 0.5360700271657108, "grad_norm": 0.08781224489212036, "learning_rate": 0.00016936488169364884, "loss": 0.6971, "step": 888 }, { "epoch": 0.5366737096287353, "grad_norm": 0.0982750728726387, "learning_rate": 0.0001693233706932337, "loss": 0.8086, "step": 889 }, { "epoch": 0.5372773920917597, "grad_norm": 0.08869129419326782, "learning_rate": 0.0001692818596928186, "loss": 0.7577, "step": 890 }, { "epoch": 0.5378810745547842, "grad_norm": 0.08565336465835571, "learning_rate": 0.0001692403486924035, "loss": 0.7097, "step": 891 }, { "epoch": 0.5384847570178086, "grad_norm": 0.08914349973201752, "learning_rate": 0.00016919883769198838, "loss": 0.6995, "step": 892 }, { "epoch": 0.5390884394808331, "grad_norm": 0.10026069730520248, "learning_rate": 0.00016915732669157328, "loss": 0.7999, "step": 893 }, { "epoch": 0.5396921219438575, "grad_norm": 0.1060984879732132, "learning_rate": 0.00016911581569115818, "loss": 0.7441, "step": 894 }, { "epoch": 0.540295804406882, "grad_norm": 0.10654401034116745, "learning_rate": 0.00016907430469074306, "loss": 0.7129, "step": 895 }, { "epoch": 0.5408994868699064, "grad_norm": 0.1091652438044548, "learning_rate": 0.00016903279369032793, "loss": 0.6522, "step": 896 }, { "epoch": 0.5415031693329309, "grad_norm": 0.11495837569236755, "learning_rate": 0.00016899128268991286, "loss": 0.6387, "step": 897 }, { "epoch": 0.5421068517959553, "grad_norm": 0.1222093477845192, "learning_rate": 0.00016894977168949773, "loss": 0.6086, "step": 898 }, { "epoch": 0.5427105342589797, "grad_norm": 0.11351132392883301, "learning_rate": 0.0001689082606890826, "loss": 0.5575, "step": 899 }, { "epoch": 0.5433142167220042, "grad_norm": 0.12081324309110641, "learning_rate": 0.0001688667496886675, "loss": 0.484, "step": 900 }, { "epoch": 0.5439178991850286, "grad_norm": 0.10098535567522049, "learning_rate": 0.0001688252386882524, "loss": 0.8019, "step": 901 }, { "epoch": 0.5445215816480531, "grad_norm": 0.07991302758455276, "learning_rate": 0.00016878372768783727, "loss": 0.8477, "step": 902 }, { "epoch": 0.5451252641110775, "grad_norm": 0.07697251439094543, "learning_rate": 0.00016874221668742217, "loss": 0.7631, "step": 903 }, { "epoch": 0.545728946574102, "grad_norm": 0.10373490303754807, "learning_rate": 0.00016870070568700707, "loss": 0.9223, "step": 904 }, { "epoch": 0.5463326290371264, "grad_norm": 0.0777481198310852, "learning_rate": 0.00016865919468659195, "loss": 0.882, "step": 905 }, { "epoch": 0.5469363115001509, "grad_norm": 0.08426318317651749, "learning_rate": 0.00016861768368617685, "loss": 1.0619, "step": 906 }, { "epoch": 0.5475399939631753, "grad_norm": 0.07732190936803818, "learning_rate": 0.00016857617268576175, "loss": 0.7125, "step": 907 }, { "epoch": 0.5481436764261998, "grad_norm": 0.07979828119277954, "learning_rate": 0.00016853466168534662, "loss": 0.8596, "step": 908 }, { "epoch": 0.5487473588892242, "grad_norm": 0.10036496073007584, "learning_rate": 0.00016849315068493152, "loss": 0.8396, "step": 909 }, { "epoch": 0.5493510413522488, "grad_norm": 0.07785385847091675, "learning_rate": 0.0001684516396845164, "loss": 0.7962, "step": 910 }, { "epoch": 0.5499547238152732, "grad_norm": 0.07757363468408585, "learning_rate": 0.0001684101286841013, "loss": 0.9085, "step": 911 }, { "epoch": 0.5505584062782977, "grad_norm": 0.07866904139518738, "learning_rate": 0.0001683686176836862, "loss": 0.7676, "step": 912 }, { "epoch": 0.5511620887413221, "grad_norm": 0.08171574026346207, "learning_rate": 0.00016832710668327107, "loss": 0.8823, "step": 913 }, { "epoch": 0.5517657712043466, "grad_norm": 0.09291453659534454, "learning_rate": 0.00016828559568285597, "loss": 0.8212, "step": 914 }, { "epoch": 0.552369453667371, "grad_norm": 0.06771212071180344, "learning_rate": 0.00016824408468244087, "loss": 0.8255, "step": 915 }, { "epoch": 0.5529731361303954, "grad_norm": 0.07624577730894089, "learning_rate": 0.00016820257368202574, "loss": 1.1116, "step": 916 }, { "epoch": 0.5535768185934199, "grad_norm": 0.06806713342666626, "learning_rate": 0.00016816106268161064, "loss": 0.7341, "step": 917 }, { "epoch": 0.5541805010564443, "grad_norm": 0.1294786036014557, "learning_rate": 0.00016811955168119554, "loss": 0.8429, "step": 918 }, { "epoch": 0.5547841835194688, "grad_norm": 0.08023308962583542, "learning_rate": 0.0001680780406807804, "loss": 1.1129, "step": 919 }, { "epoch": 0.5553878659824932, "grad_norm": 0.09215756505727768, "learning_rate": 0.00016803652968036529, "loss": 0.935, "step": 920 }, { "epoch": 0.5559915484455177, "grad_norm": 0.11087724566459656, "learning_rate": 0.0001679950186799502, "loss": 0.764, "step": 921 }, { "epoch": 0.5565952309085421, "grad_norm": 0.07170010358095169, "learning_rate": 0.00016795350767953509, "loss": 0.71, "step": 922 }, { "epoch": 0.5571989133715666, "grad_norm": 0.07650398463010788, "learning_rate": 0.00016791199667911996, "loss": 0.7885, "step": 923 }, { "epoch": 0.557802595834591, "grad_norm": 0.08096078783273697, "learning_rate": 0.00016787048567870486, "loss": 0.7443, "step": 924 }, { "epoch": 0.5584062782976155, "grad_norm": 0.07324434071779251, "learning_rate": 0.00016782897467828976, "loss": 0.8045, "step": 925 }, { "epoch": 0.5590099607606399, "grad_norm": 0.11291930824518204, "learning_rate": 0.00016778746367787463, "loss": 0.8388, "step": 926 }, { "epoch": 0.5596136432236644, "grad_norm": 0.07567749172449112, "learning_rate": 0.00016774595267745953, "loss": 0.8203, "step": 927 }, { "epoch": 0.5602173256866888, "grad_norm": 0.07367605715990067, "learning_rate": 0.00016770444167704443, "loss": 0.7428, "step": 928 }, { "epoch": 0.5608210081497133, "grad_norm": 0.11504574120044708, "learning_rate": 0.0001676629306766293, "loss": 1.0331, "step": 929 }, { "epoch": 0.5614246906127377, "grad_norm": 0.06944482028484344, "learning_rate": 0.0001676214196762142, "loss": 1.0409, "step": 930 }, { "epoch": 0.5620283730757621, "grad_norm": 0.07487515360116959, "learning_rate": 0.0001675799086757991, "loss": 0.747, "step": 931 }, { "epoch": 0.5626320555387866, "grad_norm": 0.09043581038713455, "learning_rate": 0.000167538397675384, "loss": 0.8308, "step": 932 }, { "epoch": 0.563235738001811, "grad_norm": 0.07596288621425629, "learning_rate": 0.00016749688667496888, "loss": 0.6969, "step": 933 }, { "epoch": 0.5638394204648355, "grad_norm": 0.08029479533433914, "learning_rate": 0.00016745537567455375, "loss": 0.7532, "step": 934 }, { "epoch": 0.5644431029278599, "grad_norm": 0.08100748807191849, "learning_rate": 0.00016741386467413868, "loss": 0.7859, "step": 935 }, { "epoch": 0.5650467853908844, "grad_norm": 0.08451302349567413, "learning_rate": 0.00016737235367372355, "loss": 0.7942, "step": 936 }, { "epoch": 0.5656504678539088, "grad_norm": 0.0821182057261467, "learning_rate": 0.00016733084267330842, "loss": 0.7002, "step": 937 }, { "epoch": 0.5662541503169333, "grad_norm": 0.08984285593032837, "learning_rate": 0.00016728933167289332, "loss": 0.7721, "step": 938 }, { "epoch": 0.5668578327799577, "grad_norm": 0.09170673787593842, "learning_rate": 0.00016724782067247822, "loss": 0.7705, "step": 939 }, { "epoch": 0.5674615152429822, "grad_norm": 0.09316025674343109, "learning_rate": 0.0001672063096720631, "loss": 0.7349, "step": 940 }, { "epoch": 0.5680651977060066, "grad_norm": 0.09121593087911606, "learning_rate": 0.000167164798671648, "loss": 0.7074, "step": 941 }, { "epoch": 0.568668880169031, "grad_norm": 0.099585622549057, "learning_rate": 0.0001671232876712329, "loss": 0.772, "step": 942 }, { "epoch": 0.5692725626320555, "grad_norm": 0.09845862537622452, "learning_rate": 0.00016708177667081777, "loss": 0.6837, "step": 943 }, { "epoch": 0.56987624509508, "grad_norm": 0.10467004030942917, "learning_rate": 0.00016704026567040267, "loss": 0.7469, "step": 944 }, { "epoch": 0.5704799275581044, "grad_norm": 0.10466741770505905, "learning_rate": 0.00016699875466998757, "loss": 0.709, "step": 945 }, { "epoch": 0.5710836100211288, "grad_norm": 0.11000916361808777, "learning_rate": 0.00016695724366957244, "loss": 0.6697, "step": 946 }, { "epoch": 0.5716872924841533, "grad_norm": 0.1157151609659195, "learning_rate": 0.00016691573266915734, "loss": 0.6396, "step": 947 }, { "epoch": 0.5722909749471777, "grad_norm": 0.11827955394983292, "learning_rate": 0.00016687422166874222, "loss": 0.6224, "step": 948 }, { "epoch": 0.5728946574102022, "grad_norm": 0.12374337762594223, "learning_rate": 0.00016683271066832712, "loss": 0.5617, "step": 949 }, { "epoch": 0.5734983398732267, "grad_norm": 0.12392137944698334, "learning_rate": 0.00016679119966791202, "loss": 0.4256, "step": 950 }, { "epoch": 0.5741020223362512, "grad_norm": 0.09002061933279037, "learning_rate": 0.0001667496886674969, "loss": 0.8641, "step": 951 }, { "epoch": 0.5747057047992756, "grad_norm": 0.0768359899520874, "learning_rate": 0.0001667081776670818, "loss": 0.6958, "step": 952 }, { "epoch": 0.5753093872623001, "grad_norm": 0.07474057376384735, "learning_rate": 0.0001666666666666667, "loss": 0.9245, "step": 953 }, { "epoch": 0.5759130697253245, "grad_norm": 0.0858532041311264, "learning_rate": 0.00016662515566625156, "loss": 0.801, "step": 954 }, { "epoch": 0.576516752188349, "grad_norm": 0.08939801156520844, "learning_rate": 0.00016658364466583643, "loss": 0.8461, "step": 955 }, { "epoch": 0.5771204346513734, "grad_norm": 0.07809244841337204, "learning_rate": 0.00016654213366542136, "loss": 0.9988, "step": 956 }, { "epoch": 0.5777241171143979, "grad_norm": 0.08366213738918304, "learning_rate": 0.00016650062266500623, "loss": 0.8169, "step": 957 }, { "epoch": 0.5783277995774223, "grad_norm": 0.08782380819320679, "learning_rate": 0.0001664591116645911, "loss": 0.8715, "step": 958 }, { "epoch": 0.5789314820404468, "grad_norm": 0.0959751307964325, "learning_rate": 0.00016641760066417603, "loss": 0.9998, "step": 959 }, { "epoch": 0.5795351645034712, "grad_norm": 0.07628454267978668, "learning_rate": 0.0001663760896637609, "loss": 0.8369, "step": 960 }, { "epoch": 0.5801388469664956, "grad_norm": 0.0932275727391243, "learning_rate": 0.00016633457866334578, "loss": 1.0541, "step": 961 }, { "epoch": 0.5807425294295201, "grad_norm": 0.09154736250638962, "learning_rate": 0.00016629306766293068, "loss": 0.7972, "step": 962 }, { "epoch": 0.5813462118925445, "grad_norm": 0.09588005393743515, "learning_rate": 0.00016625155666251558, "loss": 0.9873, "step": 963 }, { "epoch": 0.581949894355569, "grad_norm": 0.06718328595161438, "learning_rate": 0.00016621004566210045, "loss": 0.839, "step": 964 }, { "epoch": 0.5825535768185934, "grad_norm": 0.09526249766349792, "learning_rate": 0.00016616853466168535, "loss": 0.8999, "step": 965 }, { "epoch": 0.5831572592816179, "grad_norm": 0.0714835673570633, "learning_rate": 0.00016612702366127025, "loss": 0.7127, "step": 966 }, { "epoch": 0.5837609417446423, "grad_norm": 0.07606557011604309, "learning_rate": 0.00016608551266085513, "loss": 0.8162, "step": 967 }, { "epoch": 0.5843646242076668, "grad_norm": 0.07503029704093933, "learning_rate": 0.00016604400166044003, "loss": 0.9174, "step": 968 }, { "epoch": 0.5849683066706912, "grad_norm": 0.08567387610673904, "learning_rate": 0.0001660024906600249, "loss": 0.9408, "step": 969 }, { "epoch": 0.5855719891337157, "grad_norm": 0.08126292377710342, "learning_rate": 0.0001659609796596098, "loss": 0.9288, "step": 970 }, { "epoch": 0.5861756715967401, "grad_norm": 0.07483746111392975, "learning_rate": 0.0001659194686591947, "loss": 0.8598, "step": 971 }, { "epoch": 0.5867793540597646, "grad_norm": 0.07251130044460297, "learning_rate": 0.00016587795765877957, "loss": 0.9461, "step": 972 }, { "epoch": 0.587383036522789, "grad_norm": 0.08908132463693619, "learning_rate": 0.00016583644665836447, "loss": 0.9473, "step": 973 }, { "epoch": 0.5879867189858135, "grad_norm": 0.08767648786306381, "learning_rate": 0.00016579493565794937, "loss": 0.757, "step": 974 }, { "epoch": 0.5885904014488379, "grad_norm": 0.07573854178190231, "learning_rate": 0.00016575342465753425, "loss": 0.7972, "step": 975 }, { "epoch": 0.5891940839118623, "grad_norm": 0.074210524559021, "learning_rate": 0.00016571191365711915, "loss": 0.8437, "step": 976 }, { "epoch": 0.5897977663748868, "grad_norm": 0.08098217099905014, "learning_rate": 0.00016567040265670405, "loss": 0.9021, "step": 977 }, { "epoch": 0.5904014488379112, "grad_norm": 0.07945941388607025, "learning_rate": 0.00016562889165628892, "loss": 0.8371, "step": 978 }, { "epoch": 0.5910051313009357, "grad_norm": 0.10039619356393814, "learning_rate": 0.00016558738065587382, "loss": 0.7602, "step": 979 }, { "epoch": 0.5916088137639601, "grad_norm": 0.08295184373855591, "learning_rate": 0.00016554586965545872, "loss": 0.761, "step": 980 }, { "epoch": 0.5922124962269846, "grad_norm": 0.08036410063505173, "learning_rate": 0.0001655043586550436, "loss": 0.9951, "step": 981 }, { "epoch": 0.592816178690009, "grad_norm": 0.07915826141834259, "learning_rate": 0.0001654628476546285, "loss": 0.82, "step": 982 }, { "epoch": 0.5934198611530335, "grad_norm": 0.07575780898332596, "learning_rate": 0.00016542133665421336, "loss": 0.7732, "step": 983 }, { "epoch": 0.5940235436160579, "grad_norm": 0.08057866990566254, "learning_rate": 0.00016537982565379826, "loss": 0.7388, "step": 984 }, { "epoch": 0.5946272260790824, "grad_norm": 0.07897903025150299, "learning_rate": 0.00016533831465338316, "loss": 0.733, "step": 985 }, { "epoch": 0.5952309085421068, "grad_norm": 0.1108403429389, "learning_rate": 0.00016529680365296804, "loss": 0.7831, "step": 986 }, { "epoch": 0.5958345910051313, "grad_norm": 0.08219526708126068, "learning_rate": 0.00016525529265255294, "loss": 0.754, "step": 987 }, { "epoch": 0.5964382734681557, "grad_norm": 0.09057065844535828, "learning_rate": 0.00016521378165213784, "loss": 0.7478, "step": 988 }, { "epoch": 0.5970419559311801, "grad_norm": 0.09317967295646667, "learning_rate": 0.0001651722706517227, "loss": 0.7583, "step": 989 }, { "epoch": 0.5976456383942047, "grad_norm": 0.09358835965394974, "learning_rate": 0.0001651307596513076, "loss": 0.6958, "step": 990 }, { "epoch": 0.5982493208572291, "grad_norm": 0.100360207259655, "learning_rate": 0.0001650892486508925, "loss": 0.7544, "step": 991 }, { "epoch": 0.5988530033202536, "grad_norm": 0.09013186395168304, "learning_rate": 0.00016504773765047738, "loss": 0.7112, "step": 992 }, { "epoch": 0.599456685783278, "grad_norm": 0.1008266881108284, "learning_rate": 0.00016500622665006226, "loss": 0.767, "step": 993 }, { "epoch": 0.6000603682463025, "grad_norm": 0.09665394574403763, "learning_rate": 0.00016496471564964718, "loss": 0.6782, "step": 994 }, { "epoch": 0.6006640507093269, "grad_norm": 0.10552255064249039, "learning_rate": 0.00016492320464923206, "loss": 0.6667, "step": 995 }, { "epoch": 0.6012677331723514, "grad_norm": 0.11099070310592651, "learning_rate": 0.00016488169364881693, "loss": 0.7023, "step": 996 }, { "epoch": 0.6018714156353758, "grad_norm": 0.10965953767299652, "learning_rate": 0.00016484018264840183, "loss": 0.6285, "step": 997 }, { "epoch": 0.6024750980984003, "grad_norm": 0.11270109564065933, "learning_rate": 0.00016479867164798673, "loss": 0.612, "step": 998 }, { "epoch": 0.6030787805614247, "grad_norm": 0.11348733305931091, "learning_rate": 0.0001647571606475716, "loss": 0.5591, "step": 999 }, { "epoch": 0.6036824630244492, "grad_norm": 0.1182008683681488, "learning_rate": 0.0001647156496471565, "loss": 0.4379, "step": 1000 }, { "epoch": 0.6036824630244492, "eval_loss": 0.8275489211082458, "eval_runtime": 1218.7348, "eval_samples_per_second": 2.289, "eval_steps_per_second": 0.286, "step": 1000 }, { "epoch": 0.6042861454874736, "grad_norm": 0.0789058580994606, "learning_rate": 0.0001646741386467414, "loss": 0.7819, "step": 1001 }, { "epoch": 0.6048898279504981, "grad_norm": 0.0920637771487236, "learning_rate": 0.00016463262764632627, "loss": 1.4056, "step": 1002 }, { "epoch": 0.6054935104135225, "grad_norm": 0.07369931042194366, "learning_rate": 0.00016459111664591118, "loss": 0.9731, "step": 1003 }, { "epoch": 0.606097192876547, "grad_norm": 0.08552141487598419, "learning_rate": 0.00016454960564549608, "loss": 0.7628, "step": 1004 }, { "epoch": 0.6067008753395714, "grad_norm": 0.07296184450387955, "learning_rate": 0.00016450809464508095, "loss": 0.7961, "step": 1005 }, { "epoch": 0.6073045578025958, "grad_norm": 0.08106445521116257, "learning_rate": 0.00016446658364466585, "loss": 0.9692, "step": 1006 }, { "epoch": 0.6079082402656203, "grad_norm": 0.08443611115217209, "learning_rate": 0.00016442507264425072, "loss": 1.3388, "step": 1007 }, { "epoch": 0.6085119227286447, "grad_norm": 0.08953455090522766, "learning_rate": 0.00016438356164383562, "loss": 0.9006, "step": 1008 }, { "epoch": 0.6091156051916692, "grad_norm": 0.09113611280918121, "learning_rate": 0.00016434205064342052, "loss": 0.9404, "step": 1009 }, { "epoch": 0.6097192876546936, "grad_norm": 0.08092870563268661, "learning_rate": 0.0001643005396430054, "loss": 0.8747, "step": 1010 }, { "epoch": 0.6103229701177181, "grad_norm": 0.07962828874588013, "learning_rate": 0.0001642590286425903, "loss": 1.0154, "step": 1011 }, { "epoch": 0.6109266525807425, "grad_norm": 0.07279300689697266, "learning_rate": 0.0001642175176421752, "loss": 0.8086, "step": 1012 }, { "epoch": 0.611530335043767, "grad_norm": 0.08411548286676407, "learning_rate": 0.00016417600664176007, "loss": 0.8582, "step": 1013 }, { "epoch": 0.6121340175067914, "grad_norm": 0.0790749341249466, "learning_rate": 0.00016413449564134497, "loss": 0.7265, "step": 1014 }, { "epoch": 0.6127376999698159, "grad_norm": 0.0777488648891449, "learning_rate": 0.00016409298464092987, "loss": 0.8086, "step": 1015 }, { "epoch": 0.6133413824328403, "grad_norm": 0.09185469895601273, "learning_rate": 0.00016405147364051474, "loss": 0.8752, "step": 1016 }, { "epoch": 0.6139450648958648, "grad_norm": 0.09490189701318741, "learning_rate": 0.0001640099626400996, "loss": 0.9276, "step": 1017 }, { "epoch": 0.6145487473588892, "grad_norm": 0.10677061229944229, "learning_rate": 0.00016396845163968454, "loss": 1.1395, "step": 1018 }, { "epoch": 0.6151524298219136, "grad_norm": 0.08552467823028564, "learning_rate": 0.0001639269406392694, "loss": 1.0333, "step": 1019 }, { "epoch": 0.6157561122849381, "grad_norm": 0.10448099672794342, "learning_rate": 0.0001638854296388543, "loss": 1.003, "step": 1020 }, { "epoch": 0.6163597947479625, "grad_norm": 0.08592557907104492, "learning_rate": 0.00016384391863843919, "loss": 0.7925, "step": 1021 }, { "epoch": 0.616963477210987, "grad_norm": 0.082502081990242, "learning_rate": 0.00016380240763802409, "loss": 0.8321, "step": 1022 }, { "epoch": 0.6175671596740114, "grad_norm": 0.08332712948322296, "learning_rate": 0.00016376089663760899, "loss": 0.9372, "step": 1023 }, { "epoch": 0.6181708421370359, "grad_norm": 0.09017034620046616, "learning_rate": 0.00016371938563719386, "loss": 0.8795, "step": 1024 }, { "epoch": 0.6187745246000603, "grad_norm": 0.08691825717687607, "learning_rate": 0.00016367787463677876, "loss": 0.7948, "step": 1025 }, { "epoch": 0.6193782070630848, "grad_norm": 0.07839145511388779, "learning_rate": 0.00016363636363636366, "loss": 0.7879, "step": 1026 }, { "epoch": 0.6199818895261092, "grad_norm": 0.09319109469652176, "learning_rate": 0.00016359485263594853, "loss": 0.8614, "step": 1027 }, { "epoch": 0.6205855719891337, "grad_norm": 0.0755649283528328, "learning_rate": 0.00016355334163553343, "loss": 0.7723, "step": 1028 }, { "epoch": 0.6211892544521581, "grad_norm": 0.08007936924695969, "learning_rate": 0.00016351183063511833, "loss": 0.7863, "step": 1029 }, { "epoch": 0.6217929369151826, "grad_norm": 0.11180046945810318, "learning_rate": 0.0001634703196347032, "loss": 1.0614, "step": 1030 }, { "epoch": 0.6223966193782071, "grad_norm": 0.09498503804206848, "learning_rate": 0.00016342880863428808, "loss": 0.8302, "step": 1031 }, { "epoch": 0.6230003018412316, "grad_norm": 0.08199071139097214, "learning_rate": 0.000163387297633873, "loss": 1.0062, "step": 1032 }, { "epoch": 0.623603984304256, "grad_norm": 0.08216526359319687, "learning_rate": 0.00016334578663345788, "loss": 0.6999, "step": 1033 }, { "epoch": 0.6242076667672805, "grad_norm": 0.07529989629983902, "learning_rate": 0.00016330427563304275, "loss": 0.8036, "step": 1034 }, { "epoch": 0.6248113492303049, "grad_norm": 0.08288833498954773, "learning_rate": 0.00016326276463262765, "loss": 0.7338, "step": 1035 }, { "epoch": 0.6254150316933293, "grad_norm": 0.08039028942584991, "learning_rate": 0.00016322125363221255, "loss": 0.7554, "step": 1036 }, { "epoch": 0.6260187141563538, "grad_norm": 0.08144210278987885, "learning_rate": 0.00016317974263179742, "loss": 0.7421, "step": 1037 }, { "epoch": 0.6266223966193782, "grad_norm": 0.08654642850160599, "learning_rate": 0.00016313823163138232, "loss": 0.7317, "step": 1038 }, { "epoch": 0.6272260790824027, "grad_norm": 0.08834250271320343, "learning_rate": 0.00016309672063096722, "loss": 0.7208, "step": 1039 }, { "epoch": 0.6278297615454271, "grad_norm": 0.08896996825933456, "learning_rate": 0.0001630552096305521, "loss": 0.7637, "step": 1040 }, { "epoch": 0.6284334440084516, "grad_norm": 0.09299125522375107, "learning_rate": 0.000163013698630137, "loss": 0.6993, "step": 1041 }, { "epoch": 0.629037126471476, "grad_norm": 0.0911908969283104, "learning_rate": 0.0001629721876297219, "loss": 0.7455, "step": 1042 }, { "epoch": 0.6296408089345005, "grad_norm": 0.0976100042462349, "learning_rate": 0.00016293067662930677, "loss": 0.6683, "step": 1043 }, { "epoch": 0.6302444913975249, "grad_norm": 0.10183601081371307, "learning_rate": 0.00016288916562889167, "loss": 0.7513, "step": 1044 }, { "epoch": 0.6308481738605494, "grad_norm": 0.10491038113832474, "learning_rate": 0.00016284765462847654, "loss": 0.7886, "step": 1045 }, { "epoch": 0.6314518563235738, "grad_norm": 0.1056092381477356, "learning_rate": 0.00016280614362806144, "loss": 0.6803, "step": 1046 }, { "epoch": 0.6320555387865983, "grad_norm": 0.1121145635843277, "learning_rate": 0.00016276463262764634, "loss": 0.6256, "step": 1047 }, { "epoch": 0.6326592212496227, "grad_norm": 0.11745337396860123, "learning_rate": 0.00016272312162723122, "loss": 0.6546, "step": 1048 }, { "epoch": 0.6332629037126472, "grad_norm": 0.12515030801296234, "learning_rate": 0.00016268161062681612, "loss": 0.5382, "step": 1049 }, { "epoch": 0.6338665861756716, "grad_norm": 0.13032299280166626, "learning_rate": 0.00016264009962640102, "loss": 0.4321, "step": 1050 }, { "epoch": 0.634470268638696, "grad_norm": 0.08480330556631088, "learning_rate": 0.0001625985886259859, "loss": 0.8055, "step": 1051 }, { "epoch": 0.6350739511017205, "grad_norm": 0.08330792188644409, "learning_rate": 0.00016255707762557076, "loss": 1.1714, "step": 1052 }, { "epoch": 0.6356776335647449, "grad_norm": 0.09432794153690338, "learning_rate": 0.0001625155666251557, "loss": 1.0071, "step": 1053 }, { "epoch": 0.6362813160277694, "grad_norm": 0.084816575050354, "learning_rate": 0.00016247405562474056, "loss": 0.7843, "step": 1054 }, { "epoch": 0.6368849984907938, "grad_norm": 0.09600014239549637, "learning_rate": 0.00016243254462432543, "loss": 0.8295, "step": 1055 }, { "epoch": 0.6374886809538183, "grad_norm": 0.10093321651220322, "learning_rate": 0.00016239103362391036, "loss": 0.8685, "step": 1056 }, { "epoch": 0.6380923634168427, "grad_norm": 0.07668054848909378, "learning_rate": 0.00016234952262349523, "loss": 1.042, "step": 1057 }, { "epoch": 0.6386960458798672, "grad_norm": 0.12126658856868744, "learning_rate": 0.0001623080116230801, "loss": 0.8384, "step": 1058 }, { "epoch": 0.6392997283428916, "grad_norm": 0.07691983133554459, "learning_rate": 0.000162266500622665, "loss": 0.9138, "step": 1059 }, { "epoch": 0.6399034108059161, "grad_norm": 0.07935553044080734, "learning_rate": 0.0001622249896222499, "loss": 0.7028, "step": 1060 }, { "epoch": 0.6405070932689405, "grad_norm": 0.0814066082239151, "learning_rate": 0.00016218347862183478, "loss": 0.8901, "step": 1061 }, { "epoch": 0.641110775731965, "grad_norm": 0.09054214507341385, "learning_rate": 0.00016214196762141968, "loss": 0.9142, "step": 1062 }, { "epoch": 0.6417144581949894, "grad_norm": 0.0810336098074913, "learning_rate": 0.00016210045662100458, "loss": 0.7975, "step": 1063 }, { "epoch": 0.6423181406580138, "grad_norm": 0.07674489170312881, "learning_rate": 0.00016205894562058945, "loss": 0.9827, "step": 1064 }, { "epoch": 0.6429218231210383, "grad_norm": 0.07564074546098709, "learning_rate": 0.00016201743462017435, "loss": 0.9164, "step": 1065 }, { "epoch": 0.6435255055840627, "grad_norm": 0.082376629114151, "learning_rate": 0.00016197592361975923, "loss": 1.0141, "step": 1066 }, { "epoch": 0.6441291880470872, "grad_norm": 0.09569506347179413, "learning_rate": 0.00016193441261934415, "loss": 1.031, "step": 1067 }, { "epoch": 0.6447328705101116, "grad_norm": 0.07831252366304398, "learning_rate": 0.00016189290161892903, "loss": 0.7855, "step": 1068 }, { "epoch": 0.6453365529731361, "grad_norm": 0.12247852981090546, "learning_rate": 0.0001618513906185139, "loss": 0.9584, "step": 1069 }, { "epoch": 0.6459402354361605, "grad_norm": 0.07706090807914734, "learning_rate": 0.00016180987961809883, "loss": 0.9001, "step": 1070 }, { "epoch": 0.6465439178991851, "grad_norm": 0.07961270213127136, "learning_rate": 0.0001617683686176837, "loss": 0.725, "step": 1071 }, { "epoch": 0.6471476003622095, "grad_norm": 0.08146402984857559, "learning_rate": 0.00016172685761726857, "loss": 0.8213, "step": 1072 }, { "epoch": 0.647751282825234, "grad_norm": 0.1310458779335022, "learning_rate": 0.00016168534661685347, "loss": 0.8039, "step": 1073 }, { "epoch": 0.6483549652882584, "grad_norm": 0.09053052216768265, "learning_rate": 0.00016164383561643837, "loss": 0.8154, "step": 1074 }, { "epoch": 0.6489586477512829, "grad_norm": 0.07922950387001038, "learning_rate": 0.00016160232461602325, "loss": 0.8074, "step": 1075 }, { "epoch": 0.6495623302143073, "grad_norm": 0.07535174489021301, "learning_rate": 0.00016156081361560815, "loss": 0.7056, "step": 1076 }, { "epoch": 0.6501660126773318, "grad_norm": 0.08822023123502731, "learning_rate": 0.00016151930261519305, "loss": 0.8269, "step": 1077 }, { "epoch": 0.6507696951403562, "grad_norm": 0.12088391184806824, "learning_rate": 0.00016147779161477792, "loss": 1.0016, "step": 1078 }, { "epoch": 0.6513733776033807, "grad_norm": 0.08213537931442261, "learning_rate": 0.00016143628061436282, "loss": 0.7239, "step": 1079 }, { "epoch": 0.6519770600664051, "grad_norm": 0.08252882957458496, "learning_rate": 0.0001613947696139477, "loss": 0.8806, "step": 1080 }, { "epoch": 0.6525807425294295, "grad_norm": 0.06710948050022125, "learning_rate": 0.0001613532586135326, "loss": 0.9193, "step": 1081 }, { "epoch": 0.653184424992454, "grad_norm": 0.14059540629386902, "learning_rate": 0.0001613117476131175, "loss": 0.8201, "step": 1082 }, { "epoch": 0.6537881074554784, "grad_norm": 0.08148171007633209, "learning_rate": 0.00016127023661270236, "loss": 0.7983, "step": 1083 }, { "epoch": 0.6543917899185029, "grad_norm": 0.07753555476665497, "learning_rate": 0.00016122872561228726, "loss": 0.6901, "step": 1084 }, { "epoch": 0.6549954723815273, "grad_norm": 0.08410129696130753, "learning_rate": 0.00016118721461187216, "loss": 0.7755, "step": 1085 }, { "epoch": 0.6555991548445518, "grad_norm": 0.08018611371517181, "learning_rate": 0.00016114570361145704, "loss": 0.7569, "step": 1086 }, { "epoch": 0.6562028373075762, "grad_norm": 0.08290990442037582, "learning_rate": 0.00016110419261104194, "loss": 0.7694, "step": 1087 }, { "epoch": 0.6568065197706007, "grad_norm": 0.08168758451938629, "learning_rate": 0.00016106268161062684, "loss": 0.7458, "step": 1088 }, { "epoch": 0.6574102022336251, "grad_norm": 0.09103231132030487, "learning_rate": 0.0001610211706102117, "loss": 0.7203, "step": 1089 }, { "epoch": 0.6580138846966496, "grad_norm": 0.0930967926979065, "learning_rate": 0.00016097965960979658, "loss": 0.764, "step": 1090 }, { "epoch": 0.658617567159674, "grad_norm": 0.0999932810664177, "learning_rate": 0.0001609381486093815, "loss": 0.7885, "step": 1091 }, { "epoch": 0.6592212496226985, "grad_norm": 0.09345666319131851, "learning_rate": 0.00016089663760896638, "loss": 0.7567, "step": 1092 }, { "epoch": 0.6598249320857229, "grad_norm": 0.09461873769760132, "learning_rate": 0.00016085512660855126, "loss": 0.6643, "step": 1093 }, { "epoch": 0.6604286145487474, "grad_norm": 0.10697057098150253, "learning_rate": 0.00016081361560813616, "loss": 0.7219, "step": 1094 }, { "epoch": 0.6610322970117718, "grad_norm": 0.11102855950593948, "learning_rate": 0.00016077210460772106, "loss": 0.6777, "step": 1095 }, { "epoch": 0.6616359794747962, "grad_norm": 0.11301318556070328, "learning_rate": 0.00016073059360730593, "loss": 0.7065, "step": 1096 }, { "epoch": 0.6622396619378207, "grad_norm": 0.11578387022018433, "learning_rate": 0.00016068908260689083, "loss": 0.731, "step": 1097 }, { "epoch": 0.6628433444008451, "grad_norm": 0.11346578598022461, "learning_rate": 0.00016064757160647573, "loss": 0.579, "step": 1098 }, { "epoch": 0.6634470268638696, "grad_norm": 0.1209607645869255, "learning_rate": 0.0001606060606060606, "loss": 0.5572, "step": 1099 }, { "epoch": 0.664050709326894, "grad_norm": 0.11950193345546722, "learning_rate": 0.0001605645496056455, "loss": 0.3567, "step": 1100 }, { "epoch": 0.6646543917899185, "grad_norm": 0.07838748395442963, "learning_rate": 0.0001605230386052304, "loss": 1.0775, "step": 1101 }, { "epoch": 0.6652580742529429, "grad_norm": 0.1073046401143074, "learning_rate": 0.00016048152760481528, "loss": 0.693, "step": 1102 }, { "epoch": 0.6658617567159674, "grad_norm": 0.0886329859495163, "learning_rate": 0.00016044001660440018, "loss": 0.7818, "step": 1103 }, { "epoch": 0.6664654391789918, "grad_norm": 0.07882886379957199, "learning_rate": 0.00016039850560398505, "loss": 0.7407, "step": 1104 }, { "epoch": 0.6670691216420163, "grad_norm": 0.09764014929533005, "learning_rate": 0.00016035699460356995, "loss": 0.8944, "step": 1105 }, { "epoch": 0.6676728041050407, "grad_norm": 0.08643536269664764, "learning_rate": 0.00016031548360315485, "loss": 0.8511, "step": 1106 }, { "epoch": 0.6682764865680652, "grad_norm": 0.08230648189783096, "learning_rate": 0.00016027397260273972, "loss": 0.9837, "step": 1107 }, { "epoch": 0.6688801690310896, "grad_norm": 0.08523669838905334, "learning_rate": 0.00016023246160232462, "loss": 0.7902, "step": 1108 }, { "epoch": 0.669483851494114, "grad_norm": 0.08347252011299133, "learning_rate": 0.00016019095060190952, "loss": 0.7791, "step": 1109 }, { "epoch": 0.6700875339571385, "grad_norm": 0.08182717114686966, "learning_rate": 0.0001601494396014944, "loss": 0.8484, "step": 1110 }, { "epoch": 0.6706912164201629, "grad_norm": 0.0879136249423027, "learning_rate": 0.0001601079286010793, "loss": 0.8469, "step": 1111 }, { "epoch": 0.6712948988831875, "grad_norm": 0.08389199525117874, "learning_rate": 0.0001600664176006642, "loss": 0.9403, "step": 1112 }, { "epoch": 0.6718985813462119, "grad_norm": 0.07789173722267151, "learning_rate": 0.00016002490660024907, "loss": 0.8443, "step": 1113 }, { "epoch": 0.6725022638092364, "grad_norm": 0.07873908430337906, "learning_rate": 0.00015998339559983397, "loss": 0.8464, "step": 1114 }, { "epoch": 0.6731059462722608, "grad_norm": 0.10451683402061462, "learning_rate": 0.00015994188459941887, "loss": 0.8507, "step": 1115 }, { "epoch": 0.6737096287352853, "grad_norm": 0.10225295275449753, "learning_rate": 0.00015990037359900374, "loss": 1.1292, "step": 1116 }, { "epoch": 0.6743133111983097, "grad_norm": 0.08053860068321228, "learning_rate": 0.00015985886259858864, "loss": 0.8511, "step": 1117 }, { "epoch": 0.6749169936613342, "grad_norm": 0.15555590391159058, "learning_rate": 0.00015981735159817351, "loss": 0.6958, "step": 1118 }, { "epoch": 0.6755206761243586, "grad_norm": 0.09447921067476273, "learning_rate": 0.00015977584059775841, "loss": 0.8228, "step": 1119 }, { "epoch": 0.6761243585873831, "grad_norm": 0.11645516008138657, "learning_rate": 0.00015973432959734331, "loss": 0.8909, "step": 1120 }, { "epoch": 0.6767280410504075, "grad_norm": 0.09540687501430511, "learning_rate": 0.0001596928185969282, "loss": 0.7483, "step": 1121 }, { "epoch": 0.677331723513432, "grad_norm": 0.08288270980119705, "learning_rate": 0.0001596513075965131, "loss": 0.837, "step": 1122 }, { "epoch": 0.6779354059764564, "grad_norm": 0.0898602157831192, "learning_rate": 0.000159609796596098, "loss": 0.7893, "step": 1123 }, { "epoch": 0.6785390884394809, "grad_norm": 0.08173252642154694, "learning_rate": 0.00015956828559568286, "loss": 0.7965, "step": 1124 }, { "epoch": 0.6791427709025053, "grad_norm": 0.08245470374822617, "learning_rate": 0.00015952677459526776, "loss": 0.9573, "step": 1125 }, { "epoch": 0.6797464533655297, "grad_norm": 0.07909370958805084, "learning_rate": 0.00015948526359485266, "loss": 0.7852, "step": 1126 }, { "epoch": 0.6803501358285542, "grad_norm": 0.07963291555643082, "learning_rate": 0.00015944375259443753, "loss": 0.8806, "step": 1127 }, { "epoch": 0.6809538182915786, "grad_norm": 0.07449831813573837, "learning_rate": 0.0001594022415940224, "loss": 0.8094, "step": 1128 }, { "epoch": 0.6815575007546031, "grad_norm": 0.07845521718263626, "learning_rate": 0.00015936073059360733, "loss": 0.8813, "step": 1129 }, { "epoch": 0.6821611832176275, "grad_norm": 0.0760309100151062, "learning_rate": 0.0001593192195931922, "loss": 0.9867, "step": 1130 }, { "epoch": 0.682764865680652, "grad_norm": 0.07355882227420807, "learning_rate": 0.00015927770859277708, "loss": 0.8448, "step": 1131 }, { "epoch": 0.6833685481436764, "grad_norm": 0.07743940502405167, "learning_rate": 0.00015923619759236198, "loss": 1.0274, "step": 1132 }, { "epoch": 0.6839722306067009, "grad_norm": 0.08236207067966461, "learning_rate": 0.00015919468659194688, "loss": 0.7691, "step": 1133 }, { "epoch": 0.6845759130697253, "grad_norm": 0.07984744012355804, "learning_rate": 0.00015915317559153175, "loss": 0.7605, "step": 1134 }, { "epoch": 0.6851795955327498, "grad_norm": 0.08415329456329346, "learning_rate": 0.00015911166459111665, "loss": 0.7576, "step": 1135 }, { "epoch": 0.6857832779957742, "grad_norm": 0.08308225870132446, "learning_rate": 0.00015907015359070155, "loss": 0.6959, "step": 1136 }, { "epoch": 0.6863869604587987, "grad_norm": 0.08338841050863266, "learning_rate": 0.00015902864259028642, "loss": 0.6998, "step": 1137 }, { "epoch": 0.6869906429218231, "grad_norm": 0.08652734011411667, "learning_rate": 0.00015898713158987132, "loss": 0.7469, "step": 1138 }, { "epoch": 0.6875943253848475, "grad_norm": 0.11346086114645004, "learning_rate": 0.00015894562058945622, "loss": 0.6995, "step": 1139 }, { "epoch": 0.688198007847872, "grad_norm": 0.08581293374300003, "learning_rate": 0.0001589041095890411, "loss": 0.7138, "step": 1140 }, { "epoch": 0.6888016903108964, "grad_norm": 0.0915643498301506, "learning_rate": 0.000158862598588626, "loss": 0.7836, "step": 1141 }, { "epoch": 0.6894053727739209, "grad_norm": 0.09856559336185455, "learning_rate": 0.00015882108758821087, "loss": 0.7151, "step": 1142 }, { "epoch": 0.6900090552369453, "grad_norm": 0.09949114173650742, "learning_rate": 0.00015877957658779577, "loss": 0.7292, "step": 1143 }, { "epoch": 0.6906127376999698, "grad_norm": 0.10931304842233658, "learning_rate": 0.00015873806558738067, "loss": 0.777, "step": 1144 }, { "epoch": 0.6912164201629942, "grad_norm": 0.10665369033813477, "learning_rate": 0.00015869655458696554, "loss": 0.6729, "step": 1145 }, { "epoch": 0.6918201026260187, "grad_norm": 0.1097274124622345, "learning_rate": 0.00015865504358655044, "loss": 0.6532, "step": 1146 }, { "epoch": 0.6924237850890431, "grad_norm": 0.12797416746616364, "learning_rate": 0.00015861353258613534, "loss": 0.6375, "step": 1147 }, { "epoch": 0.6930274675520676, "grad_norm": 0.11834930628538132, "learning_rate": 0.00015857202158572022, "loss": 0.6374, "step": 1148 }, { "epoch": 0.693631150015092, "grad_norm": 0.11373134702444077, "learning_rate": 0.00015853051058530512, "loss": 0.5455, "step": 1149 }, { "epoch": 0.6942348324781165, "grad_norm": 0.13020096719264984, "learning_rate": 0.00015848899958489002, "loss": 0.4624, "step": 1150 }, { "epoch": 0.6948385149411409, "grad_norm": 0.09528189897537231, "learning_rate": 0.0001584474885844749, "loss": 0.8045, "step": 1151 }, { "epoch": 0.6954421974041655, "grad_norm": 0.09046769887208939, "learning_rate": 0.00015840597758405976, "loss": 0.7992, "step": 1152 }, { "epoch": 0.6960458798671899, "grad_norm": 0.10954567044973373, "learning_rate": 0.0001583644665836447, "loss": 0.7242, "step": 1153 }, { "epoch": 0.6966495623302144, "grad_norm": 0.08049467206001282, "learning_rate": 0.00015832295558322956, "loss": 0.9732, "step": 1154 }, { "epoch": 0.6972532447932388, "grad_norm": 0.08102589845657349, "learning_rate": 0.00015828144458281446, "loss": 1.0654, "step": 1155 }, { "epoch": 0.6978569272562632, "grad_norm": 0.07799682021141052, "learning_rate": 0.00015823993358239934, "loss": 0.879, "step": 1156 }, { "epoch": 0.6984606097192877, "grad_norm": 0.08080107718706131, "learning_rate": 0.00015819842258198424, "loss": 0.8515, "step": 1157 }, { "epoch": 0.6990642921823121, "grad_norm": 0.07894068211317062, "learning_rate": 0.00015815691158156914, "loss": 0.8024, "step": 1158 }, { "epoch": 0.6996679746453366, "grad_norm": 0.0848923772573471, "learning_rate": 0.000158115400581154, "loss": 0.7698, "step": 1159 }, { "epoch": 0.700271657108361, "grad_norm": 0.08654095232486725, "learning_rate": 0.0001580738895807389, "loss": 0.8809, "step": 1160 }, { "epoch": 0.7008753395713855, "grad_norm": 0.07776283472776413, "learning_rate": 0.0001580323785803238, "loss": 0.7659, "step": 1161 }, { "epoch": 0.7014790220344099, "grad_norm": 0.09321299195289612, "learning_rate": 0.00015799086757990868, "loss": 0.9147, "step": 1162 }, { "epoch": 0.7020827044974344, "grad_norm": 0.10445485264062881, "learning_rate": 0.00015794935657949358, "loss": 0.7182, "step": 1163 }, { "epoch": 0.7026863869604588, "grad_norm": 0.08144387602806091, "learning_rate": 0.00015790784557907848, "loss": 0.9075, "step": 1164 }, { "epoch": 0.7032900694234833, "grad_norm": 0.08363913744688034, "learning_rate": 0.00015786633457866335, "loss": 0.7935, "step": 1165 }, { "epoch": 0.7038937518865077, "grad_norm": 0.07295918464660645, "learning_rate": 0.00015782482357824823, "loss": 0.99, "step": 1166 }, { "epoch": 0.7044974343495322, "grad_norm": 0.0843958631157875, "learning_rate": 0.00015778331257783315, "loss": 0.8588, "step": 1167 }, { "epoch": 0.7051011168125566, "grad_norm": 0.08498518168926239, "learning_rate": 0.00015774180157741803, "loss": 0.7837, "step": 1168 }, { "epoch": 0.705704799275581, "grad_norm": 0.08214520663022995, "learning_rate": 0.0001577002905770029, "loss": 0.8411, "step": 1169 }, { "epoch": 0.7063084817386055, "grad_norm": 0.08001639693975449, "learning_rate": 0.0001576587795765878, "loss": 0.7665, "step": 1170 }, { "epoch": 0.70691216420163, "grad_norm": 0.08271230757236481, "learning_rate": 0.0001576172685761727, "loss": 0.8341, "step": 1171 }, { "epoch": 0.7075158466646544, "grad_norm": 0.08699033409357071, "learning_rate": 0.00015757575757575757, "loss": 0.7784, "step": 1172 }, { "epoch": 0.7081195291276788, "grad_norm": 0.09423791617155075, "learning_rate": 0.00015753424657534247, "loss": 0.9262, "step": 1173 }, { "epoch": 0.7087232115907033, "grad_norm": 0.07842440158128738, "learning_rate": 0.00015749273557492737, "loss": 0.9249, "step": 1174 }, { "epoch": 0.7093268940537277, "grad_norm": 0.08183681219816208, "learning_rate": 0.00015745122457451225, "loss": 0.7941, "step": 1175 }, { "epoch": 0.7099305765167522, "grad_norm": 0.08315775543451309, "learning_rate": 0.00015740971357409715, "loss": 0.8225, "step": 1176 }, { "epoch": 0.7105342589797766, "grad_norm": 0.08027949929237366, "learning_rate": 0.00015736820257368205, "loss": 0.8441, "step": 1177 }, { "epoch": 0.7111379414428011, "grad_norm": 0.08332298696041107, "learning_rate": 0.00015732669157326692, "loss": 0.9679, "step": 1178 }, { "epoch": 0.7117416239058255, "grad_norm": 0.09133647382259369, "learning_rate": 0.00015728518057285182, "loss": 0.9468, "step": 1179 }, { "epoch": 0.71234530636885, "grad_norm": 0.08058296889066696, "learning_rate": 0.0001572436695724367, "loss": 0.8374, "step": 1180 }, { "epoch": 0.7129489888318744, "grad_norm": 0.08813148736953735, "learning_rate": 0.0001572021585720216, "loss": 0.9568, "step": 1181 }, { "epoch": 0.7135526712948989, "grad_norm": 0.07533196359872818, "learning_rate": 0.0001571606475716065, "loss": 0.8627, "step": 1182 }, { "epoch": 0.7141563537579233, "grad_norm": 0.08658763766288757, "learning_rate": 0.00015711913657119137, "loss": 0.7429, "step": 1183 }, { "epoch": 0.7147600362209477, "grad_norm": 0.0844459980726242, "learning_rate": 0.00015707762557077627, "loss": 0.7966, "step": 1184 }, { "epoch": 0.7153637186839722, "grad_norm": 0.08175187557935715, "learning_rate": 0.00015703611457036117, "loss": 0.7226, "step": 1185 }, { "epoch": 0.7159674011469966, "grad_norm": 0.08582815527915955, "learning_rate": 0.00015699460356994604, "loss": 0.717, "step": 1186 }, { "epoch": 0.7165710836100211, "grad_norm": 0.08503274619579315, "learning_rate": 0.0001569530925695309, "loss": 0.7526, "step": 1187 }, { "epoch": 0.7171747660730455, "grad_norm": 0.08774889260530472, "learning_rate": 0.00015691158156911584, "loss": 0.7016, "step": 1188 }, { "epoch": 0.71777844853607, "grad_norm": 0.10124623775482178, "learning_rate": 0.0001568700705687007, "loss": 0.6455, "step": 1189 }, { "epoch": 0.7183821309990944, "grad_norm": 0.10203047096729279, "learning_rate": 0.00015682855956828558, "loss": 0.6779, "step": 1190 }, { "epoch": 0.7189858134621189, "grad_norm": 0.09329473972320557, "learning_rate": 0.0001567870485678705, "loss": 0.7313, "step": 1191 }, { "epoch": 0.7195894959251434, "grad_norm": 0.09363296627998352, "learning_rate": 0.00015674553756745538, "loss": 0.6968, "step": 1192 }, { "epoch": 0.7201931783881679, "grad_norm": 0.10082225501537323, "learning_rate": 0.00015670402656704026, "loss": 0.737, "step": 1193 }, { "epoch": 0.7207968608511923, "grad_norm": 0.10495683550834656, "learning_rate": 0.00015666251556662516, "loss": 0.7186, "step": 1194 }, { "epoch": 0.7214005433142168, "grad_norm": 0.10491074621677399, "learning_rate": 0.00015662100456621006, "loss": 0.6949, "step": 1195 }, { "epoch": 0.7220042257772412, "grad_norm": 0.12182420492172241, "learning_rate": 0.00015657949356579493, "loss": 0.667, "step": 1196 }, { "epoch": 0.7226079082402657, "grad_norm": 0.12086670100688934, "learning_rate": 0.00015653798256537983, "loss": 0.5885, "step": 1197 }, { "epoch": 0.7232115907032901, "grad_norm": 0.11433517187833786, "learning_rate": 0.00015649647156496473, "loss": 0.6147, "step": 1198 }, { "epoch": 0.7238152731663146, "grad_norm": 0.12221046537160873, "learning_rate": 0.00015645496056454963, "loss": 0.5129, "step": 1199 }, { "epoch": 0.724418955629339, "grad_norm": 0.12147875875234604, "learning_rate": 0.0001564134495641345, "loss": 0.4205, "step": 1200 }, { "epoch": 0.7250226380923634, "grad_norm": 0.08808968216180801, "learning_rate": 0.00015637193856371938, "loss": 0.8195, "step": 1201 }, { "epoch": 0.7256263205553879, "grad_norm": 0.08320089429616928, "learning_rate": 0.0001563304275633043, "loss": 0.8505, "step": 1202 }, { "epoch": 0.7262300030184123, "grad_norm": 0.0852508470416069, "learning_rate": 0.00015628891656288918, "loss": 1.0964, "step": 1203 }, { "epoch": 0.7268336854814368, "grad_norm": 0.09474988281726837, "learning_rate": 0.00015624740556247405, "loss": 0.8924, "step": 1204 }, { "epoch": 0.7274373679444612, "grad_norm": 0.092880479991436, "learning_rate": 0.00015620589456205898, "loss": 0.8223, "step": 1205 }, { "epoch": 0.7280410504074857, "grad_norm": 0.10932306200265884, "learning_rate": 0.00015616438356164385, "loss": 0.8786, "step": 1206 }, { "epoch": 0.7286447328705101, "grad_norm": 0.0782761201262474, "learning_rate": 0.00015612287256122872, "loss": 0.7801, "step": 1207 }, { "epoch": 0.7292484153335346, "grad_norm": 0.09566731005907059, "learning_rate": 0.00015608136156081362, "loss": 0.9474, "step": 1208 }, { "epoch": 0.729852097796559, "grad_norm": 0.08776892721652985, "learning_rate": 0.00015603985056039852, "loss": 0.7888, "step": 1209 }, { "epoch": 0.7304557802595835, "grad_norm": 0.07871197909116745, "learning_rate": 0.0001559983395599834, "loss": 0.7624, "step": 1210 }, { "epoch": 0.7310594627226079, "grad_norm": 0.19957715272903442, "learning_rate": 0.0001559568285595683, "loss": 0.8708, "step": 1211 }, { "epoch": 0.7316631451856324, "grad_norm": 0.0865522176027298, "learning_rate": 0.0001559153175591532, "loss": 0.9895, "step": 1212 }, { "epoch": 0.7322668276486568, "grad_norm": 0.08585958927869797, "learning_rate": 0.00015587380655873807, "loss": 0.8575, "step": 1213 }, { "epoch": 0.7328705101116813, "grad_norm": 0.07868848741054535, "learning_rate": 0.00015583229555832297, "loss": 0.768, "step": 1214 }, { "epoch": 0.7334741925747057, "grad_norm": 0.08140038698911667, "learning_rate": 0.00015579078455790784, "loss": 0.7544, "step": 1215 }, { "epoch": 0.7340778750377301, "grad_norm": 0.08099745959043503, "learning_rate": 0.00015574927355749274, "loss": 0.7234, "step": 1216 }, { "epoch": 0.7346815575007546, "grad_norm": 0.08431069552898407, "learning_rate": 0.00015570776255707764, "loss": 0.8079, "step": 1217 }, { "epoch": 0.735285239963779, "grad_norm": 0.08134233951568604, "learning_rate": 0.00015566625155666251, "loss": 1.0639, "step": 1218 }, { "epoch": 0.7358889224268035, "grad_norm": 0.08087371289730072, "learning_rate": 0.00015562474055624741, "loss": 0.7534, "step": 1219 }, { "epoch": 0.7364926048898279, "grad_norm": 0.08062789589166641, "learning_rate": 0.00015558322955583231, "loss": 0.8352, "step": 1220 }, { "epoch": 0.7370962873528524, "grad_norm": 0.08649458736181259, "learning_rate": 0.0001555417185554172, "loss": 0.7991, "step": 1221 }, { "epoch": 0.7376999698158768, "grad_norm": 0.0848165899515152, "learning_rate": 0.0001555002075550021, "loss": 0.8912, "step": 1222 }, { "epoch": 0.7383036522789013, "grad_norm": 0.07689525932073593, "learning_rate": 0.000155458696554587, "loss": 0.8181, "step": 1223 }, { "epoch": 0.7389073347419257, "grad_norm": 0.07876717299222946, "learning_rate": 0.00015541718555417186, "loss": 0.7244, "step": 1224 }, { "epoch": 0.7395110172049502, "grad_norm": 0.07784243673086166, "learning_rate": 0.00015537567455375673, "loss": 1.539, "step": 1225 }, { "epoch": 0.7401146996679746, "grad_norm": 0.08357193320989609, "learning_rate": 0.00015533416355334166, "loss": 0.8915, "step": 1226 }, { "epoch": 0.740718382130999, "grad_norm": 0.07806842029094696, "learning_rate": 0.00015529265255292653, "loss": 0.801, "step": 1227 }, { "epoch": 0.7413220645940235, "grad_norm": 0.08752121776342392, "learning_rate": 0.0001552511415525114, "loss": 0.7526, "step": 1228 }, { "epoch": 0.741925747057048, "grad_norm": 0.0805983915925026, "learning_rate": 0.0001552096305520963, "loss": 0.9307, "step": 1229 }, { "epoch": 0.7425294295200724, "grad_norm": 0.08306834846735, "learning_rate": 0.0001551681195516812, "loss": 0.7313, "step": 1230 }, { "epoch": 0.7431331119830968, "grad_norm": 0.08837762475013733, "learning_rate": 0.00015512660855126608, "loss": 0.9053, "step": 1231 }, { "epoch": 0.7437367944461213, "grad_norm": 0.07785294204950333, "learning_rate": 0.00015508509755085098, "loss": 0.7965, "step": 1232 }, { "epoch": 0.7443404769091458, "grad_norm": 0.0776657834649086, "learning_rate": 0.00015504358655043588, "loss": 0.8275, "step": 1233 }, { "epoch": 0.7449441593721703, "grad_norm": 0.08299911767244339, "learning_rate": 0.00015500207555002075, "loss": 0.7684, "step": 1234 }, { "epoch": 0.7455478418351947, "grad_norm": 0.09403832256793976, "learning_rate": 0.00015496056454960565, "loss": 0.8317, "step": 1235 }, { "epoch": 0.7461515242982192, "grad_norm": 0.08008915185928345, "learning_rate": 0.00015491905354919055, "loss": 0.6594, "step": 1236 }, { "epoch": 0.7467552067612436, "grad_norm": 0.08133239299058914, "learning_rate": 0.00015487754254877543, "loss": 0.6942, "step": 1237 }, { "epoch": 0.7473588892242681, "grad_norm": 0.08288057893514633, "learning_rate": 0.00015483603154836033, "loss": 0.7323, "step": 1238 }, { "epoch": 0.7479625716872925, "grad_norm": 0.08818881958723068, "learning_rate": 0.0001547945205479452, "loss": 0.6966, "step": 1239 }, { "epoch": 0.748566254150317, "grad_norm": 0.09948495030403137, "learning_rate": 0.0001547530095475301, "loss": 0.7126, "step": 1240 }, { "epoch": 0.7491699366133414, "grad_norm": 0.12626221776008606, "learning_rate": 0.000154711498547115, "loss": 0.7647, "step": 1241 }, { "epoch": 0.7497736190763659, "grad_norm": 0.10528393089771271, "learning_rate": 0.00015466998754669987, "loss": 0.8261, "step": 1242 }, { "epoch": 0.7503773015393903, "grad_norm": 0.10515029728412628, "learning_rate": 0.00015462847654628477, "loss": 0.6662, "step": 1243 }, { "epoch": 0.7509809840024148, "grad_norm": 0.10484447330236435, "learning_rate": 0.00015458696554586967, "loss": 0.7143, "step": 1244 }, { "epoch": 0.7515846664654392, "grad_norm": 0.10932087153196335, "learning_rate": 0.00015454545454545454, "loss": 0.6561, "step": 1245 }, { "epoch": 0.7521883489284636, "grad_norm": 0.10715696215629578, "learning_rate": 0.00015450394354503944, "loss": 0.6272, "step": 1246 }, { "epoch": 0.7527920313914881, "grad_norm": 0.11597730964422226, "learning_rate": 0.00015446243254462434, "loss": 0.6093, "step": 1247 }, { "epoch": 0.7533957138545125, "grad_norm": 0.11862548440694809, "learning_rate": 0.00015442092154420922, "loss": 0.6208, "step": 1248 }, { "epoch": 0.753999396317537, "grad_norm": 0.12388130277395248, "learning_rate": 0.00015437941054379412, "loss": 0.5587, "step": 1249 }, { "epoch": 0.7546030787805614, "grad_norm": 0.12081338465213776, "learning_rate": 0.00015433789954337902, "loss": 0.432, "step": 1250 }, { "epoch": 0.7552067612435859, "grad_norm": 0.08639662712812424, "learning_rate": 0.0001542963885429639, "loss": 0.9072, "step": 1251 }, { "epoch": 0.7558104437066103, "grad_norm": 0.0837317556142807, "learning_rate": 0.0001542548775425488, "loss": 1.1732, "step": 1252 }, { "epoch": 0.7564141261696348, "grad_norm": 0.09014665335416794, "learning_rate": 0.00015421336654213366, "loss": 0.8012, "step": 1253 }, { "epoch": 0.7570178086326592, "grad_norm": 0.09888771176338196, "learning_rate": 0.00015417185554171856, "loss": 0.8288, "step": 1254 }, { "epoch": 0.7576214910956837, "grad_norm": 0.09830440580844879, "learning_rate": 0.00015413034454130346, "loss": 0.9115, "step": 1255 }, { "epoch": 0.7582251735587081, "grad_norm": 0.08318249881267548, "learning_rate": 0.00015408883354088834, "loss": 0.8073, "step": 1256 }, { "epoch": 0.7588288560217326, "grad_norm": 0.09623551368713379, "learning_rate": 0.00015404732254047324, "loss": 0.8798, "step": 1257 }, { "epoch": 0.759432538484757, "grad_norm": 0.18773970007896423, "learning_rate": 0.00015400581154005814, "loss": 0.8237, "step": 1258 }, { "epoch": 0.7600362209477814, "grad_norm": 0.07493336498737335, "learning_rate": 0.000153964300539643, "loss": 0.9224, "step": 1259 }, { "epoch": 0.7606399034108059, "grad_norm": 0.08891238272190094, "learning_rate": 0.0001539227895392279, "loss": 0.8604, "step": 1260 }, { "epoch": 0.7612435858738303, "grad_norm": 0.0810379758477211, "learning_rate": 0.0001538812785388128, "loss": 0.7966, "step": 1261 }, { "epoch": 0.7618472683368548, "grad_norm": 0.08036971092224121, "learning_rate": 0.00015383976753839768, "loss": 0.7795, "step": 1262 }, { "epoch": 0.7624509507998792, "grad_norm": 0.08294712007045746, "learning_rate": 0.00015379825653798256, "loss": 1.0023, "step": 1263 }, { "epoch": 0.7630546332629037, "grad_norm": 0.0813121646642685, "learning_rate": 0.00015375674553756748, "loss": 1.0195, "step": 1264 }, { "epoch": 0.7636583157259281, "grad_norm": 0.08313670009374619, "learning_rate": 0.00015371523453715236, "loss": 0.9885, "step": 1265 }, { "epoch": 0.7642619981889526, "grad_norm": 0.07355551421642303, "learning_rate": 0.00015367372353673723, "loss": 0.8474, "step": 1266 }, { "epoch": 0.764865680651977, "grad_norm": 0.13657139241695404, "learning_rate": 0.00015363221253632213, "loss": 0.9142, "step": 1267 }, { "epoch": 0.7654693631150015, "grad_norm": 0.09950345009565353, "learning_rate": 0.00015359070153590703, "loss": 1.0635, "step": 1268 }, { "epoch": 0.7660730455780259, "grad_norm": 0.07820272445678711, "learning_rate": 0.0001535491905354919, "loss": 1.1326, "step": 1269 }, { "epoch": 0.7666767280410504, "grad_norm": 0.07532312721014023, "learning_rate": 0.0001535076795350768, "loss": 0.9895, "step": 1270 }, { "epoch": 0.7672804105040748, "grad_norm": 0.09391959011554718, "learning_rate": 0.0001534661685346617, "loss": 0.7384, "step": 1271 }, { "epoch": 0.7678840929670993, "grad_norm": 0.07820689678192139, "learning_rate": 0.00015342465753424657, "loss": 0.734, "step": 1272 }, { "epoch": 0.7684877754301238, "grad_norm": 0.08966294676065445, "learning_rate": 0.00015338314653383147, "loss": 0.7925, "step": 1273 }, { "epoch": 0.7690914578931483, "grad_norm": 0.08040351420640945, "learning_rate": 0.00015334163553341637, "loss": 0.7811, "step": 1274 }, { "epoch": 0.7696951403561727, "grad_norm": 0.08416478335857391, "learning_rate": 0.00015330012453300125, "loss": 0.9939, "step": 1275 }, { "epoch": 0.7702988228191971, "grad_norm": 0.08706527948379517, "learning_rate": 0.00015325861353258615, "loss": 0.8099, "step": 1276 }, { "epoch": 0.7709025052822216, "grad_norm": 0.10363662242889404, "learning_rate": 0.00015321710253217102, "loss": 0.7835, "step": 1277 }, { "epoch": 0.771506187745246, "grad_norm": 0.080405592918396, "learning_rate": 0.00015317559153175592, "loss": 0.7816, "step": 1278 }, { "epoch": 0.7721098702082705, "grad_norm": 0.0954577773809433, "learning_rate": 0.00015313408053134082, "loss": 1.0492, "step": 1279 }, { "epoch": 0.7727135526712949, "grad_norm": 0.08522694557905197, "learning_rate": 0.0001530925695309257, "loss": 0.7938, "step": 1280 }, { "epoch": 0.7733172351343194, "grad_norm": 0.0870218351483345, "learning_rate": 0.0001530510585305106, "loss": 1.0509, "step": 1281 }, { "epoch": 0.7739209175973438, "grad_norm": 0.07454058527946472, "learning_rate": 0.0001530095475300955, "loss": 0.8381, "step": 1282 }, { "epoch": 0.7745246000603683, "grad_norm": 0.0701836496591568, "learning_rate": 0.00015296803652968037, "loss": 0.6133, "step": 1283 }, { "epoch": 0.7751282825233927, "grad_norm": 0.08319991081953049, "learning_rate": 0.00015292652552926524, "loss": 0.7434, "step": 1284 }, { "epoch": 0.7757319649864172, "grad_norm": 0.07945888489484787, "learning_rate": 0.00015288501452885017, "loss": 0.7266, "step": 1285 }, { "epoch": 0.7763356474494416, "grad_norm": 0.0800202265381813, "learning_rate": 0.00015284350352843504, "loss": 0.7203, "step": 1286 }, { "epoch": 0.7769393299124661, "grad_norm": 0.08976299315690994, "learning_rate": 0.0001528019925280199, "loss": 0.7075, "step": 1287 }, { "epoch": 0.7775430123754905, "grad_norm": 0.08427184820175171, "learning_rate": 0.00015276048152760484, "loss": 0.6855, "step": 1288 }, { "epoch": 0.778146694838515, "grad_norm": 0.09182146191596985, "learning_rate": 0.0001527189705271897, "loss": 0.7375, "step": 1289 }, { "epoch": 0.7787503773015394, "grad_norm": 0.0932517945766449, "learning_rate": 0.0001526774595267746, "loss": 0.7168, "step": 1290 }, { "epoch": 0.7793540597645638, "grad_norm": 0.11614131927490234, "learning_rate": 0.00015263594852635949, "loss": 0.7134, "step": 1291 }, { "epoch": 0.7799577422275883, "grad_norm": 0.10372986644506454, "learning_rate": 0.00015259443752594439, "loss": 0.7539, "step": 1292 }, { "epoch": 0.7805614246906127, "grad_norm": 0.10486335307359695, "learning_rate": 0.00015255292652552929, "loss": 0.7198, "step": 1293 }, { "epoch": 0.7811651071536372, "grad_norm": 0.11475211381912231, "learning_rate": 0.00015251141552511416, "loss": 0.7086, "step": 1294 }, { "epoch": 0.7817687896166616, "grad_norm": 0.10555735975503922, "learning_rate": 0.00015246990452469906, "loss": 0.6697, "step": 1295 }, { "epoch": 0.7823724720796861, "grad_norm": 0.10920953750610352, "learning_rate": 0.00015242839352428396, "loss": 0.6179, "step": 1296 }, { "epoch": 0.7829761545427105, "grad_norm": 0.13240636885166168, "learning_rate": 0.00015238688252386883, "loss": 0.6766, "step": 1297 }, { "epoch": 0.783579837005735, "grad_norm": 0.1218361034989357, "learning_rate": 0.0001523453715234537, "loss": 0.6364, "step": 1298 }, { "epoch": 0.7841835194687594, "grad_norm": 0.1165798008441925, "learning_rate": 0.00015230386052303863, "loss": 0.5109, "step": 1299 }, { "epoch": 0.7847872019317839, "grad_norm": 0.12739436328411102, "learning_rate": 0.0001522623495226235, "loss": 0.4305, "step": 1300 }, { "epoch": 0.7853908843948083, "grad_norm": 0.10032903403043747, "learning_rate": 0.00015222083852220838, "loss": 0.8067, "step": 1301 }, { "epoch": 0.7859945668578328, "grad_norm": 0.08724936097860336, "learning_rate": 0.0001521793275217933, "loss": 0.9813, "step": 1302 }, { "epoch": 0.7865982493208572, "grad_norm": 0.08223757147789001, "learning_rate": 0.00015213781652137818, "loss": 0.756, "step": 1303 }, { "epoch": 0.7872019317838816, "grad_norm": 0.09227702766656876, "learning_rate": 0.00015209630552096305, "loss": 0.8074, "step": 1304 }, { "epoch": 0.7878056142469061, "grad_norm": 0.08399201929569244, "learning_rate": 0.00015205479452054795, "loss": 1.0582, "step": 1305 }, { "epoch": 0.7884092967099305, "grad_norm": 0.0827503651380539, "learning_rate": 0.00015201328352013285, "loss": 0.7878, "step": 1306 }, { "epoch": 0.789012979172955, "grad_norm": 0.08233575522899628, "learning_rate": 0.00015197177251971772, "loss": 0.8512, "step": 1307 }, { "epoch": 0.7896166616359794, "grad_norm": 0.09783722460269928, "learning_rate": 0.00015193026151930262, "loss": 0.8324, "step": 1308 }, { "epoch": 0.7902203440990039, "grad_norm": 0.09064453095197678, "learning_rate": 0.00015188875051888752, "loss": 0.8361, "step": 1309 }, { "epoch": 0.7908240265620283, "grad_norm": 0.09642300009727478, "learning_rate": 0.0001518472395184724, "loss": 0.7578, "step": 1310 }, { "epoch": 0.7914277090250528, "grad_norm": 0.08258376270532608, "learning_rate": 0.0001518057285180573, "loss": 0.9612, "step": 1311 }, { "epoch": 0.7920313914880772, "grad_norm": 0.08849532902240753, "learning_rate": 0.00015176421751764217, "loss": 0.798, "step": 1312 }, { "epoch": 0.7926350739511017, "grad_norm": 0.08029762655496597, "learning_rate": 0.00015172270651722707, "loss": 0.8082, "step": 1313 }, { "epoch": 0.7932387564141262, "grad_norm": 0.0844390019774437, "learning_rate": 0.00015168119551681197, "loss": 0.7453, "step": 1314 }, { "epoch": 0.7938424388771507, "grad_norm": 0.09358351677656174, "learning_rate": 0.00015163968451639684, "loss": 0.8036, "step": 1315 }, { "epoch": 0.7944461213401751, "grad_norm": 0.08466824889183044, "learning_rate": 0.00015159817351598174, "loss": 0.777, "step": 1316 }, { "epoch": 0.7950498038031996, "grad_norm": 0.08159632235765457, "learning_rate": 0.00015155666251556664, "loss": 0.8004, "step": 1317 }, { "epoch": 0.795653486266224, "grad_norm": 0.1605248749256134, "learning_rate": 0.00015151515151515152, "loss": 0.8841, "step": 1318 }, { "epoch": 0.7962571687292485, "grad_norm": 0.08063754439353943, "learning_rate": 0.00015147364051473642, "loss": 0.9292, "step": 1319 }, { "epoch": 0.7968608511922729, "grad_norm": 0.08406137675046921, "learning_rate": 0.00015143212951432132, "loss": 0.7878, "step": 1320 }, { "epoch": 0.7974645336552973, "grad_norm": 0.07940443605184555, "learning_rate": 0.0001513906185139062, "loss": 0.8981, "step": 1321 }, { "epoch": 0.7980682161183218, "grad_norm": 0.07914227992296219, "learning_rate": 0.00015134910751349106, "loss": 1.1571, "step": 1322 }, { "epoch": 0.7986718985813462, "grad_norm": 0.07836727797985077, "learning_rate": 0.000151307596513076, "loss": 0.8384, "step": 1323 }, { "epoch": 0.7992755810443707, "grad_norm": 0.08515416085720062, "learning_rate": 0.00015126608551266086, "loss": 0.7468, "step": 1324 }, { "epoch": 0.7998792635073951, "grad_norm": 0.0839102566242218, "learning_rate": 0.00015122457451224573, "loss": 0.9835, "step": 1325 }, { "epoch": 0.8004829459704196, "grad_norm": 0.2878398299217224, "learning_rate": 0.00015118306351183063, "loss": 1.232, "step": 1326 }, { "epoch": 0.801086628433444, "grad_norm": 0.0905466228723526, "learning_rate": 0.00015114155251141553, "loss": 0.8908, "step": 1327 }, { "epoch": 0.8016903108964685, "grad_norm": 0.08527706563472748, "learning_rate": 0.0001511000415110004, "loss": 0.8004, "step": 1328 }, { "epoch": 0.8022939933594929, "grad_norm": 0.08341808617115021, "learning_rate": 0.0001510585305105853, "loss": 0.7974, "step": 1329 }, { "epoch": 0.8028976758225174, "grad_norm": 0.07585755735635757, "learning_rate": 0.0001510170195101702, "loss": 0.7528, "step": 1330 }, { "epoch": 0.8035013582855418, "grad_norm": 0.08576681464910507, "learning_rate": 0.00015097550850975508, "loss": 0.7634, "step": 1331 }, { "epoch": 0.8041050407485663, "grad_norm": 0.10057312995195389, "learning_rate": 0.00015093399750933998, "loss": 0.6505, "step": 1332 }, { "epoch": 0.8047087232115907, "grad_norm": 0.08735567331314087, "learning_rate": 0.00015089248650892488, "loss": 0.6856, "step": 1333 }, { "epoch": 0.8053124056746152, "grad_norm": 0.09076007455587387, "learning_rate": 0.00015085097550850978, "loss": 0.7697, "step": 1334 }, { "epoch": 0.8059160881376396, "grad_norm": 0.09162318706512451, "learning_rate": 0.00015080946450809465, "loss": 0.7425, "step": 1335 }, { "epoch": 0.806519770600664, "grad_norm": 0.08863736689090729, "learning_rate": 0.00015076795350767953, "loss": 0.7748, "step": 1336 }, { "epoch": 0.8071234530636885, "grad_norm": 0.10001536458730698, "learning_rate": 0.00015072644250726445, "loss": 0.7162, "step": 1337 }, { "epoch": 0.8077271355267129, "grad_norm": 0.09326426684856415, "learning_rate": 0.00015068493150684933, "loss": 0.82, "step": 1338 }, { "epoch": 0.8083308179897374, "grad_norm": 0.09727218747138977, "learning_rate": 0.0001506434205064342, "loss": 0.7534, "step": 1339 }, { "epoch": 0.8089345004527618, "grad_norm": 0.09535179287195206, "learning_rate": 0.0001506019095060191, "loss": 0.7623, "step": 1340 }, { "epoch": 0.8095381829157863, "grad_norm": 0.09758217632770538, "learning_rate": 0.000150560398505604, "loss": 0.7533, "step": 1341 }, { "epoch": 0.8101418653788107, "grad_norm": 0.11768268793821335, "learning_rate": 0.00015051888750518887, "loss": 0.7481, "step": 1342 }, { "epoch": 0.8107455478418352, "grad_norm": 0.09633111953735352, "learning_rate": 0.00015047737650477377, "loss": 0.7191, "step": 1343 }, { "epoch": 0.8113492303048596, "grad_norm": 0.10011658817529678, "learning_rate": 0.00015043586550435867, "loss": 0.6665, "step": 1344 }, { "epoch": 0.8119529127678841, "grad_norm": 0.11359147727489471, "learning_rate": 0.00015039435450394354, "loss": 0.7549, "step": 1345 }, { "epoch": 0.8125565952309085, "grad_norm": 0.11092590540647507, "learning_rate": 0.00015035284350352844, "loss": 0.8078, "step": 1346 }, { "epoch": 0.813160277693933, "grad_norm": 0.1197749450802803, "learning_rate": 0.00015031133250311335, "loss": 0.6463, "step": 1347 }, { "epoch": 0.8137639601569574, "grad_norm": 0.12124069780111313, "learning_rate": 0.00015026982150269822, "loss": 0.6384, "step": 1348 }, { "epoch": 0.8143676426199818, "grad_norm": 0.14109613001346588, "learning_rate": 0.00015022831050228312, "loss": 0.5564, "step": 1349 }, { "epoch": 0.8149713250830063, "grad_norm": 0.12870782613754272, "learning_rate": 0.000150186799501868, "loss": 0.4514, "step": 1350 }, { "epoch": 0.8155750075460307, "grad_norm": 0.14718082547187805, "learning_rate": 0.0001501452885014529, "loss": 0.8223, "step": 1351 }, { "epoch": 0.8161786900090552, "grad_norm": 0.0827641561627388, "learning_rate": 0.0001501037775010378, "loss": 0.7125, "step": 1352 }, { "epoch": 0.8167823724720796, "grad_norm": 0.0898633524775505, "learning_rate": 0.00015006226650062266, "loss": 1.0915, "step": 1353 }, { "epoch": 0.8173860549351042, "grad_norm": 0.09281725436449051, "learning_rate": 0.00015002075550020756, "loss": 0.854, "step": 1354 }, { "epoch": 0.8179897373981286, "grad_norm": 0.08282145857810974, "learning_rate": 0.00014997924449979246, "loss": 1.0761, "step": 1355 }, { "epoch": 0.8185934198611531, "grad_norm": 0.08490285277366638, "learning_rate": 0.00014993773349937734, "loss": 1.1083, "step": 1356 }, { "epoch": 0.8191971023241775, "grad_norm": 0.09798530489206314, "learning_rate": 0.00014989622249896224, "loss": 1.0771, "step": 1357 }, { "epoch": 0.819800784787202, "grad_norm": 0.08119674026966095, "learning_rate": 0.00014985471149854714, "loss": 0.8048, "step": 1358 }, { "epoch": 0.8204044672502264, "grad_norm": 0.08817379921674728, "learning_rate": 0.000149813200498132, "loss": 0.8062, "step": 1359 }, { "epoch": 0.8210081497132509, "grad_norm": 0.08717963844537735, "learning_rate": 0.00014977168949771688, "loss": 0.8923, "step": 1360 }, { "epoch": 0.8216118321762753, "grad_norm": 0.08484237641096115, "learning_rate": 0.0001497301784973018, "loss": 0.882, "step": 1361 }, { "epoch": 0.8222155146392998, "grad_norm": 0.0800343006849289, "learning_rate": 0.00014968866749688668, "loss": 0.8224, "step": 1362 }, { "epoch": 0.8228191971023242, "grad_norm": 0.08844833076000214, "learning_rate": 0.00014964715649647156, "loss": 0.7582, "step": 1363 }, { "epoch": 0.8234228795653487, "grad_norm": 0.08437283337116241, "learning_rate": 0.00014960564549605646, "loss": 0.847, "step": 1364 }, { "epoch": 0.8240265620283731, "grad_norm": 0.08416473865509033, "learning_rate": 0.00014956413449564136, "loss": 0.8066, "step": 1365 }, { "epoch": 0.8246302444913975, "grad_norm": 0.0806473046541214, "learning_rate": 0.00014952262349522623, "loss": 0.797, "step": 1366 }, { "epoch": 0.825233926954422, "grad_norm": 0.08505918830633163, "learning_rate": 0.00014948111249481113, "loss": 0.826, "step": 1367 }, { "epoch": 0.8258376094174464, "grad_norm": 0.08299072831869125, "learning_rate": 0.00014943960149439603, "loss": 0.7974, "step": 1368 }, { "epoch": 0.8264412918804709, "grad_norm": 0.0950680747628212, "learning_rate": 0.0001493980904939809, "loss": 0.8961, "step": 1369 }, { "epoch": 0.8270449743434953, "grad_norm": 0.08349784463644028, "learning_rate": 0.0001493565794935658, "loss": 0.8791, "step": 1370 }, { "epoch": 0.8276486568065198, "grad_norm": 0.08619910478591919, "learning_rate": 0.0001493150684931507, "loss": 0.8367, "step": 1371 }, { "epoch": 0.8282523392695442, "grad_norm": 0.09480497241020203, "learning_rate": 0.00014927355749273557, "loss": 0.8101, "step": 1372 }, { "epoch": 0.8288560217325687, "grad_norm": 0.0833049863576889, "learning_rate": 0.00014923204649232047, "loss": 0.8616, "step": 1373 }, { "epoch": 0.8294597041955931, "grad_norm": 0.08111268281936646, "learning_rate": 0.00014919053549190535, "loss": 0.7388, "step": 1374 }, { "epoch": 0.8300633866586176, "grad_norm": 0.08618942648172379, "learning_rate": 0.00014914902449149025, "loss": 0.8678, "step": 1375 }, { "epoch": 0.830667069121642, "grad_norm": 0.08400508016347885, "learning_rate": 0.00014910751349107515, "loss": 0.7766, "step": 1376 }, { "epoch": 0.8312707515846665, "grad_norm": 0.0798826590180397, "learning_rate": 0.00014906600249066002, "loss": 0.8156, "step": 1377 }, { "epoch": 0.8318744340476909, "grad_norm": 0.08481590449810028, "learning_rate": 0.00014902449149024492, "loss": 1.117, "step": 1378 }, { "epoch": 0.8324781165107153, "grad_norm": 0.09439216554164886, "learning_rate": 0.00014898298048982982, "loss": 0.8822, "step": 1379 }, { "epoch": 0.8330817989737398, "grad_norm": 0.09862873703241348, "learning_rate": 0.0001489414694894147, "loss": 0.8087, "step": 1380 }, { "epoch": 0.8336854814367642, "grad_norm": 0.08466385304927826, "learning_rate": 0.0001488999584889996, "loss": 0.8568, "step": 1381 }, { "epoch": 0.8342891638997887, "grad_norm": 0.08554793149232864, "learning_rate": 0.0001488584474885845, "loss": 0.95, "step": 1382 }, { "epoch": 0.8348928463628131, "grad_norm": 0.07983773201704025, "learning_rate": 0.00014881693648816937, "loss": 0.7137, "step": 1383 }, { "epoch": 0.8354965288258376, "grad_norm": 0.07740309834480286, "learning_rate": 0.00014877542548775427, "loss": 0.7139, "step": 1384 }, { "epoch": 0.836100211288862, "grad_norm": 0.08271116763353348, "learning_rate": 0.00014873391448733917, "loss": 0.7001, "step": 1385 }, { "epoch": 0.8367038937518865, "grad_norm": 0.08564220368862152, "learning_rate": 0.00014869240348692404, "loss": 0.7624, "step": 1386 }, { "epoch": 0.8373075762149109, "grad_norm": 0.08788985759019852, "learning_rate": 0.00014865089248650894, "loss": 0.7563, "step": 1387 }, { "epoch": 0.8379112586779354, "grad_norm": 0.09162076562643051, "learning_rate": 0.0001486093814860938, "loss": 0.7008, "step": 1388 }, { "epoch": 0.8385149411409598, "grad_norm": 0.08919322490692139, "learning_rate": 0.0001485678704856787, "loss": 0.7408, "step": 1389 }, { "epoch": 0.8391186236039843, "grad_norm": 0.4382922053337097, "learning_rate": 0.0001485263594852636, "loss": 0.748, "step": 1390 }, { "epoch": 0.8397223060670087, "grad_norm": 0.09150999784469604, "learning_rate": 0.00014848484848484849, "loss": 0.7801, "step": 1391 }, { "epoch": 0.8403259885300332, "grad_norm": 0.09508758038282394, "learning_rate": 0.00014844333748443339, "loss": 0.6773, "step": 1392 }, { "epoch": 0.8409296709930576, "grad_norm": 0.10179366171360016, "learning_rate": 0.00014840182648401829, "loss": 0.7492, "step": 1393 }, { "epoch": 0.8415333534560822, "grad_norm": 0.10912247747182846, "learning_rate": 0.00014836031548360316, "loss": 0.7271, "step": 1394 }, { "epoch": 0.8421370359191066, "grad_norm": 0.10175374895334244, "learning_rate": 0.00014831880448318803, "loss": 0.6465, "step": 1395 }, { "epoch": 0.842740718382131, "grad_norm": 0.10953323543071747, "learning_rate": 0.00014827729348277296, "loss": 0.6639, "step": 1396 }, { "epoch": 0.8433444008451555, "grad_norm": 0.11571014672517776, "learning_rate": 0.00014823578248235783, "loss": 0.6389, "step": 1397 }, { "epoch": 0.8439480833081799, "grad_norm": 0.1150619387626648, "learning_rate": 0.0001481942714819427, "loss": 0.5918, "step": 1398 }, { "epoch": 0.8445517657712044, "grad_norm": 0.12481655180454254, "learning_rate": 0.00014815276048152763, "loss": 0.5285, "step": 1399 }, { "epoch": 0.8451554482342288, "grad_norm": 0.12378332763910294, "learning_rate": 0.0001481112494811125, "loss": 0.4092, "step": 1400 }, { "epoch": 0.8457591306972533, "grad_norm": 0.0824832022190094, "learning_rate": 0.00014806973848069738, "loss": 1.0004, "step": 1401 }, { "epoch": 0.8463628131602777, "grad_norm": 0.0863024890422821, "learning_rate": 0.00014802822748028228, "loss": 0.8013, "step": 1402 }, { "epoch": 0.8469664956233022, "grad_norm": 0.08444255590438843, "learning_rate": 0.00014798671647986718, "loss": 0.8442, "step": 1403 }, { "epoch": 0.8475701780863266, "grad_norm": 0.10533567517995834, "learning_rate": 0.00014794520547945205, "loss": 0.987, "step": 1404 }, { "epoch": 0.8481738605493511, "grad_norm": 0.09298814088106155, "learning_rate": 0.00014790369447903695, "loss": 0.9405, "step": 1405 }, { "epoch": 0.8487775430123755, "grad_norm": 0.09649625420570374, "learning_rate": 0.00014786218347862185, "loss": 0.9325, "step": 1406 }, { "epoch": 0.8493812254754, "grad_norm": 0.10233578085899353, "learning_rate": 0.00014782067247820672, "loss": 0.9722, "step": 1407 }, { "epoch": 0.8499849079384244, "grad_norm": 0.07888230681419373, "learning_rate": 0.00014777916147779162, "loss": 0.6847, "step": 1408 }, { "epoch": 0.8505885904014489, "grad_norm": 0.1061205044388771, "learning_rate": 0.0001477376504773765, "loss": 0.7159, "step": 1409 }, { "epoch": 0.8511922728644733, "grad_norm": 0.09067028015851974, "learning_rate": 0.0001476961394769614, "loss": 1.0638, "step": 1410 }, { "epoch": 0.8517959553274977, "grad_norm": 0.07997080683708191, "learning_rate": 0.0001476546284765463, "loss": 1.0731, "step": 1411 }, { "epoch": 0.8523996377905222, "grad_norm": 0.08588366955518723, "learning_rate": 0.00014761311747613117, "loss": 0.8226, "step": 1412 }, { "epoch": 0.8530033202535466, "grad_norm": 0.08147002011537552, "learning_rate": 0.00014757160647571607, "loss": 0.8404, "step": 1413 }, { "epoch": 0.8536070027165711, "grad_norm": 0.08258868753910065, "learning_rate": 0.00014753009547530097, "loss": 0.8363, "step": 1414 }, { "epoch": 0.8542106851795955, "grad_norm": 0.11236279457807541, "learning_rate": 0.00014748858447488584, "loss": 0.8376, "step": 1415 }, { "epoch": 0.85481436764262, "grad_norm": 0.07888762652873993, "learning_rate": 0.00014744707347447074, "loss": 0.8395, "step": 1416 }, { "epoch": 0.8554180501056444, "grad_norm": 0.08469484001398087, "learning_rate": 0.00014740556247405564, "loss": 1.0579, "step": 1417 }, { "epoch": 0.8560217325686689, "grad_norm": 0.07748138904571533, "learning_rate": 0.00014736405147364052, "loss": 0.7358, "step": 1418 }, { "epoch": 0.8566254150316933, "grad_norm": 0.08709147572517395, "learning_rate": 0.0001473225404732254, "loss": 0.9408, "step": 1419 }, { "epoch": 0.8572290974947178, "grad_norm": 0.08583850413560867, "learning_rate": 0.00014728102947281032, "loss": 0.822, "step": 1420 }, { "epoch": 0.8578327799577422, "grad_norm": 0.09203623980283737, "learning_rate": 0.0001472395184723952, "loss": 0.8535, "step": 1421 }, { "epoch": 0.8584364624207667, "grad_norm": 0.08235791325569153, "learning_rate": 0.0001471980074719801, "loss": 0.8012, "step": 1422 }, { "epoch": 0.8590401448837911, "grad_norm": 0.09391656517982483, "learning_rate": 0.00014715649647156496, "loss": 0.888, "step": 1423 }, { "epoch": 0.8596438273468155, "grad_norm": 0.0862961858510971, "learning_rate": 0.00014711498547114986, "loss": 0.8323, "step": 1424 }, { "epoch": 0.86024750980984, "grad_norm": 0.08743558079004288, "learning_rate": 0.00014707347447073476, "loss": 0.7275, "step": 1425 }, { "epoch": 0.8608511922728644, "grad_norm": 0.07890239357948303, "learning_rate": 0.00014703196347031963, "loss": 0.6607, "step": 1426 }, { "epoch": 0.8614548747358889, "grad_norm": 0.07826490700244904, "learning_rate": 0.00014699045246990453, "loss": 0.7179, "step": 1427 }, { "epoch": 0.8620585571989133, "grad_norm": 0.08540347218513489, "learning_rate": 0.00014694894146948943, "loss": 0.7166, "step": 1428 }, { "epoch": 0.8626622396619378, "grad_norm": 0.08672580122947693, "learning_rate": 0.0001469074304690743, "loss": 1.0812, "step": 1429 }, { "epoch": 0.8632659221249622, "grad_norm": 0.08532918244600296, "learning_rate": 0.0001468659194686592, "loss": 0.7676, "step": 1430 }, { "epoch": 0.8638696045879867, "grad_norm": 0.08678078651428223, "learning_rate": 0.0001468244084682441, "loss": 0.8672, "step": 1431 }, { "epoch": 0.8644732870510111, "grad_norm": 0.08257097750902176, "learning_rate": 0.00014678289746782898, "loss": 0.7108, "step": 1432 }, { "epoch": 0.8650769695140356, "grad_norm": 0.09253834187984467, "learning_rate": 0.00014674138646741385, "loss": 0.8333, "step": 1433 }, { "epoch": 0.86568065197706, "grad_norm": 0.08253934979438782, "learning_rate": 0.00014669987546699878, "loss": 0.7099, "step": 1434 }, { "epoch": 0.8662843344400846, "grad_norm": 0.08953447639942169, "learning_rate": 0.00014665836446658365, "loss": 0.7593, "step": 1435 }, { "epoch": 0.866888016903109, "grad_norm": 0.09555850178003311, "learning_rate": 0.00014661685346616853, "loss": 0.6711, "step": 1436 }, { "epoch": 0.8674916993661335, "grad_norm": 0.08465403318405151, "learning_rate": 0.00014657534246575343, "loss": 0.7276, "step": 1437 }, { "epoch": 0.8680953818291579, "grad_norm": 0.08792877197265625, "learning_rate": 0.00014653383146533833, "loss": 0.7075, "step": 1438 }, { "epoch": 0.8686990642921824, "grad_norm": 0.09290292859077454, "learning_rate": 0.0001464923204649232, "loss": 0.7923, "step": 1439 }, { "epoch": 0.8693027467552068, "grad_norm": 0.09828725457191467, "learning_rate": 0.0001464508094645081, "loss": 0.7282, "step": 1440 }, { "epoch": 0.8699064292182312, "grad_norm": 0.1370926946401596, "learning_rate": 0.000146409298464093, "loss": 0.7228, "step": 1441 }, { "epoch": 0.8705101116812557, "grad_norm": 0.09725096821784973, "learning_rate": 0.00014636778746367787, "loss": 0.7643, "step": 1442 }, { "epoch": 0.8711137941442801, "grad_norm": 0.09183010458946228, "learning_rate": 0.00014632627646326277, "loss": 0.7243, "step": 1443 }, { "epoch": 0.8717174766073046, "grad_norm": 0.0972151830792427, "learning_rate": 0.00014628476546284767, "loss": 0.6837, "step": 1444 }, { "epoch": 0.872321159070329, "grad_norm": 0.11110269278287888, "learning_rate": 0.00014624325446243255, "loss": 0.7487, "step": 1445 }, { "epoch": 0.8729248415333535, "grad_norm": 0.10543739795684814, "learning_rate": 0.00014620174346201745, "loss": 0.6177, "step": 1446 }, { "epoch": 0.8735285239963779, "grad_norm": 0.13775797188282013, "learning_rate": 0.00014616023246160232, "loss": 0.6354, "step": 1447 }, { "epoch": 0.8741322064594024, "grad_norm": 0.11157894879579544, "learning_rate": 0.00014611872146118722, "loss": 0.5814, "step": 1448 }, { "epoch": 0.8747358889224268, "grad_norm": 0.11887330561876297, "learning_rate": 0.00014607721046077212, "loss": 0.5033, "step": 1449 }, { "epoch": 0.8753395713854513, "grad_norm": 0.1345004439353943, "learning_rate": 0.000146035699460357, "loss": 0.4091, "step": 1450 }, { "epoch": 0.8759432538484757, "grad_norm": 0.1368480920791626, "learning_rate": 0.0001459941884599419, "loss": 0.7745, "step": 1451 }, { "epoch": 0.8765469363115002, "grad_norm": 0.09147506207227707, "learning_rate": 0.0001459526774595268, "loss": 1.1362, "step": 1452 }, { "epoch": 0.8771506187745246, "grad_norm": 0.07865995168685913, "learning_rate": 0.00014591116645911166, "loss": 0.7771, "step": 1453 }, { "epoch": 0.877754301237549, "grad_norm": 0.0899992287158966, "learning_rate": 0.00014586965545869656, "loss": 0.779, "step": 1454 }, { "epoch": 0.8783579837005735, "grad_norm": 0.0916723981499672, "learning_rate": 0.00014582814445828146, "loss": 1.2033, "step": 1455 }, { "epoch": 0.8789616661635979, "grad_norm": 0.09812314063310623, "learning_rate": 0.00014578663345786634, "loss": 0.9668, "step": 1456 }, { "epoch": 0.8795653486266224, "grad_norm": 0.09160758554935455, "learning_rate": 0.0001457451224574512, "loss": 0.8744, "step": 1457 }, { "epoch": 0.8801690310896468, "grad_norm": 0.0805237665772438, "learning_rate": 0.00014570361145703614, "loss": 0.9906, "step": 1458 }, { "epoch": 0.8807727135526713, "grad_norm": 0.09051381051540375, "learning_rate": 0.000145662100456621, "loss": 1.0932, "step": 1459 }, { "epoch": 0.8813763960156957, "grad_norm": 0.08794981986284256, "learning_rate": 0.00014562058945620588, "loss": 0.7806, "step": 1460 }, { "epoch": 0.8819800784787202, "grad_norm": 0.0870475247502327, "learning_rate": 0.00014557907845579078, "loss": 1.0775, "step": 1461 }, { "epoch": 0.8825837609417446, "grad_norm": 0.0890730544924736, "learning_rate": 0.00014553756745537568, "loss": 0.7489, "step": 1462 }, { "epoch": 0.8831874434047691, "grad_norm": 0.08005908131599426, "learning_rate": 0.00014549605645496056, "loss": 1.0614, "step": 1463 }, { "epoch": 0.8837911258677935, "grad_norm": 0.10012607276439667, "learning_rate": 0.00014545454545454546, "loss": 0.8, "step": 1464 }, { "epoch": 0.884394808330818, "grad_norm": 0.09003414958715439, "learning_rate": 0.00014541303445413036, "loss": 0.7296, "step": 1465 }, { "epoch": 0.8849984907938424, "grad_norm": 0.09428632259368896, "learning_rate": 0.00014537152345371523, "loss": 0.7256, "step": 1466 }, { "epoch": 0.8856021732568669, "grad_norm": 0.08243349939584732, "learning_rate": 0.00014533001245330013, "loss": 0.7458, "step": 1467 }, { "epoch": 0.8862058557198913, "grad_norm": 0.07649943977594376, "learning_rate": 0.00014528850145288503, "loss": 0.7372, "step": 1468 }, { "epoch": 0.8868095381829157, "grad_norm": 0.08396594226360321, "learning_rate": 0.00014524699045246993, "loss": 0.8017, "step": 1469 }, { "epoch": 0.8874132206459402, "grad_norm": 0.10132227838039398, "learning_rate": 0.0001452054794520548, "loss": 1.2413, "step": 1470 }, { "epoch": 0.8880169031089646, "grad_norm": 0.0780363529920578, "learning_rate": 0.00014516396845163968, "loss": 0.8076, "step": 1471 }, { "epoch": 0.8886205855719891, "grad_norm": 0.09692296385765076, "learning_rate": 0.0001451224574512246, "loss": 0.9112, "step": 1472 }, { "epoch": 0.8892242680350135, "grad_norm": 0.08603715151548386, "learning_rate": 0.00014508094645080948, "loss": 0.8524, "step": 1473 }, { "epoch": 0.889827950498038, "grad_norm": 0.0896432101726532, "learning_rate": 0.00014503943545039435, "loss": 1.0479, "step": 1474 }, { "epoch": 0.8904316329610625, "grad_norm": 0.0839807540178299, "learning_rate": 0.00014499792444997925, "loss": 0.794, "step": 1475 }, { "epoch": 0.891035315424087, "grad_norm": 0.08278023451566696, "learning_rate": 0.00014495641344956415, "loss": 0.8375, "step": 1476 }, { "epoch": 0.8916389978871114, "grad_norm": 0.07943416386842728, "learning_rate": 0.00014491490244914902, "loss": 1.0645, "step": 1477 }, { "epoch": 0.8922426803501359, "grad_norm": 0.08287323266267776, "learning_rate": 0.00014487339144873392, "loss": 0.7824, "step": 1478 }, { "epoch": 0.8928463628131603, "grad_norm": 0.09486839175224304, "learning_rate": 0.00014483188044831882, "loss": 0.9189, "step": 1479 }, { "epoch": 0.8934500452761848, "grad_norm": 0.09007449448108673, "learning_rate": 0.0001447903694479037, "loss": 0.7276, "step": 1480 }, { "epoch": 0.8940537277392092, "grad_norm": 0.08944438397884369, "learning_rate": 0.0001447488584474886, "loss": 0.8821, "step": 1481 }, { "epoch": 0.8946574102022337, "grad_norm": 0.08388552069664001, "learning_rate": 0.0001447073474470735, "loss": 0.7537, "step": 1482 }, { "epoch": 0.8952610926652581, "grad_norm": 0.09146512299776077, "learning_rate": 0.00014466583644665837, "loss": 0.7041, "step": 1483 }, { "epoch": 0.8958647751282826, "grad_norm": 0.09139760583639145, "learning_rate": 0.00014462432544624327, "loss": 0.7686, "step": 1484 }, { "epoch": 0.896468457591307, "grad_norm": 0.08465917408466339, "learning_rate": 0.00014458281444582814, "loss": 0.7442, "step": 1485 }, { "epoch": 0.8970721400543314, "grad_norm": 0.08978710323572159, "learning_rate": 0.00014454130344541304, "loss": 0.7162, "step": 1486 }, { "epoch": 0.8976758225173559, "grad_norm": 0.09368009865283966, "learning_rate": 0.00014449979244499794, "loss": 0.6723, "step": 1487 }, { "epoch": 0.8982795049803803, "grad_norm": 0.09958908706903458, "learning_rate": 0.0001444582814445828, "loss": 0.7469, "step": 1488 }, { "epoch": 0.8988831874434048, "grad_norm": 0.09646685421466827, "learning_rate": 0.0001444167704441677, "loss": 0.7735, "step": 1489 }, { "epoch": 0.8994868699064292, "grad_norm": 0.09356389194726944, "learning_rate": 0.00014437525944375261, "loss": 0.7194, "step": 1490 }, { "epoch": 0.9000905523694537, "grad_norm": 0.09738507866859436, "learning_rate": 0.0001443337484433375, "loss": 0.6941, "step": 1491 }, { "epoch": 0.9006942348324781, "grad_norm": 0.10621945559978485, "learning_rate": 0.0001442922374429224, "loss": 0.636, "step": 1492 }, { "epoch": 0.9012979172955026, "grad_norm": 0.10333646088838577, "learning_rate": 0.0001442507264425073, "loss": 0.6574, "step": 1493 }, { "epoch": 0.901901599758527, "grad_norm": 0.1097961962223053, "learning_rate": 0.00014420921544209216, "loss": 0.7928, "step": 1494 }, { "epoch": 0.9025052822215515, "grad_norm": 0.11894567310810089, "learning_rate": 0.00014416770444167703, "loss": 0.6904, "step": 1495 }, { "epoch": 0.9031089646845759, "grad_norm": 0.12034012377262115, "learning_rate": 0.00014412619344126196, "loss": 0.6091, "step": 1496 }, { "epoch": 0.9037126471476004, "grad_norm": 0.11718066781759262, "learning_rate": 0.00014408468244084683, "loss": 0.6336, "step": 1497 }, { "epoch": 0.9043163296106248, "grad_norm": 0.130398690700531, "learning_rate": 0.0001440431714404317, "loss": 0.5771, "step": 1498 }, { "epoch": 0.9049200120736492, "grad_norm": 0.12476742267608643, "learning_rate": 0.0001440016604400166, "loss": 0.5312, "step": 1499 }, { "epoch": 0.9055236945366737, "grad_norm": 0.12364601343870163, "learning_rate": 0.0001439601494396015, "loss": 0.3942, "step": 1500 }, { "epoch": 0.9055236945366737, "eval_loss": 0.8090887069702148, "eval_runtime": 1219.063, "eval_samples_per_second": 2.289, "eval_steps_per_second": 0.286, "step": 1500 }, { "epoch": 0.9061273769996981, "grad_norm": 0.0810943990945816, "learning_rate": 0.00014391863843918638, "loss": 1.0772, "step": 1501 }, { "epoch": 0.9067310594627226, "grad_norm": 0.08426682651042938, "learning_rate": 0.00014387712743877128, "loss": 0.7147, "step": 1502 }, { "epoch": 0.907334741925747, "grad_norm": 0.09331586956977844, "learning_rate": 0.00014383561643835618, "loss": 0.8483, "step": 1503 }, { "epoch": 0.9079384243887715, "grad_norm": 0.09169352054595947, "learning_rate": 0.00014379410543794105, "loss": 0.763, "step": 1504 }, { "epoch": 0.9085421068517959, "grad_norm": 0.08313553780317307, "learning_rate": 0.00014375259443752595, "loss": 0.7842, "step": 1505 }, { "epoch": 0.9091457893148204, "grad_norm": 0.08828964829444885, "learning_rate": 0.00014371108343711085, "loss": 0.9434, "step": 1506 }, { "epoch": 0.9097494717778448, "grad_norm": 0.08548730611801147, "learning_rate": 0.00014366957243669572, "loss": 0.7584, "step": 1507 }, { "epoch": 0.9103531542408693, "grad_norm": 0.09586931765079498, "learning_rate": 0.00014362806143628062, "loss": 1.016, "step": 1508 }, { "epoch": 0.9109568367038937, "grad_norm": 0.08847955614328384, "learning_rate": 0.0001435865504358655, "loss": 0.886, "step": 1509 }, { "epoch": 0.9115605191669182, "grad_norm": 0.14398765563964844, "learning_rate": 0.0001435450394354504, "loss": 0.7924, "step": 1510 }, { "epoch": 0.9121642016299426, "grad_norm": 0.08921834826469421, "learning_rate": 0.0001435035284350353, "loss": 0.8978, "step": 1511 }, { "epoch": 0.912767884092967, "grad_norm": 0.07638692855834961, "learning_rate": 0.00014346201743462017, "loss": 0.6995, "step": 1512 }, { "epoch": 0.9133715665559915, "grad_norm": 0.08616790175437927, "learning_rate": 0.00014342050643420507, "loss": 0.9325, "step": 1513 }, { "epoch": 0.913975249019016, "grad_norm": 0.08327416330575943, "learning_rate": 0.00014337899543378997, "loss": 1.0005, "step": 1514 }, { "epoch": 0.9145789314820404, "grad_norm": 0.07625047117471695, "learning_rate": 0.00014333748443337484, "loss": 0.9611, "step": 1515 }, { "epoch": 0.915182613945065, "grad_norm": 0.08822344988584518, "learning_rate": 0.00014329597343295974, "loss": 0.8797, "step": 1516 }, { "epoch": 0.9157862964080894, "grad_norm": 0.08142781257629395, "learning_rate": 0.00014325446243254464, "loss": 0.8249, "step": 1517 }, { "epoch": 0.9163899788711138, "grad_norm": 0.07951053231954575, "learning_rate": 0.00014321295143212952, "loss": 0.8624, "step": 1518 }, { "epoch": 0.9169936613341383, "grad_norm": 0.08235177397727966, "learning_rate": 0.00014317144043171442, "loss": 0.8208, "step": 1519 }, { "epoch": 0.9175973437971627, "grad_norm": 0.09271861612796783, "learning_rate": 0.00014312992943129932, "loss": 0.789, "step": 1520 }, { "epoch": 0.9182010262601872, "grad_norm": 0.09429887682199478, "learning_rate": 0.0001430884184308842, "loss": 1.1317, "step": 1521 }, { "epoch": 0.9188047087232116, "grad_norm": 0.08084212988615036, "learning_rate": 0.0001430469074304691, "loss": 0.7235, "step": 1522 }, { "epoch": 0.9194083911862361, "grad_norm": 0.08684766292572021, "learning_rate": 0.00014300539643005396, "loss": 0.9076, "step": 1523 }, { "epoch": 0.9200120736492605, "grad_norm": 0.07625840604305267, "learning_rate": 0.00014296388542963886, "loss": 0.743, "step": 1524 }, { "epoch": 0.920615756112285, "grad_norm": 0.0968519002199173, "learning_rate": 0.00014292237442922376, "loss": 0.8984, "step": 1525 }, { "epoch": 0.9212194385753094, "grad_norm": 0.08472032099962234, "learning_rate": 0.00014288086342880864, "loss": 1.0295, "step": 1526 }, { "epoch": 0.9218231210383339, "grad_norm": 0.0939970538020134, "learning_rate": 0.00014283935242839354, "loss": 0.9342, "step": 1527 }, { "epoch": 0.9224268035013583, "grad_norm": 0.14263185858726501, "learning_rate": 0.00014279784142797844, "loss": 0.8136, "step": 1528 }, { "epoch": 0.9230304859643828, "grad_norm": 0.08136036992073059, "learning_rate": 0.0001427563304275633, "loss": 0.8533, "step": 1529 }, { "epoch": 0.9236341684274072, "grad_norm": 0.0846790298819542, "learning_rate": 0.00014271481942714818, "loss": 0.857, "step": 1530 }, { "epoch": 0.9242378508904316, "grad_norm": 0.08513128757476807, "learning_rate": 0.0001426733084267331, "loss": 0.8139, "step": 1531 }, { "epoch": 0.9248415333534561, "grad_norm": 0.07980025559663773, "learning_rate": 0.00014263179742631798, "loss": 0.745, "step": 1532 }, { "epoch": 0.9254452158164805, "grad_norm": 0.0855269581079483, "learning_rate": 0.00014259028642590285, "loss": 0.7256, "step": 1533 }, { "epoch": 0.926048898279505, "grad_norm": 0.08665366470813751, "learning_rate": 0.00014254877542548778, "loss": 0.7378, "step": 1534 }, { "epoch": 0.9266525807425294, "grad_norm": 0.098544642329216, "learning_rate": 0.00014250726442507265, "loss": 0.7067, "step": 1535 }, { "epoch": 0.9272562632055539, "grad_norm": 0.08651740849018097, "learning_rate": 0.00014246575342465753, "loss": 0.7731, "step": 1536 }, { "epoch": 0.9278599456685783, "grad_norm": 0.09180111438035965, "learning_rate": 0.00014242424242424243, "loss": 0.7735, "step": 1537 }, { "epoch": 0.9284636281316028, "grad_norm": 0.09831391274929047, "learning_rate": 0.00014238273142382733, "loss": 0.7889, "step": 1538 }, { "epoch": 0.9290673105946272, "grad_norm": 0.09092969447374344, "learning_rate": 0.0001423412204234122, "loss": 0.8703, "step": 1539 }, { "epoch": 0.9296709930576517, "grad_norm": 0.08774839341640472, "learning_rate": 0.0001422997094229971, "loss": 0.6438, "step": 1540 }, { "epoch": 0.9302746755206761, "grad_norm": 0.10328055173158646, "learning_rate": 0.000142258198422582, "loss": 0.7747, "step": 1541 }, { "epoch": 0.9308783579837006, "grad_norm": 0.0955677479505539, "learning_rate": 0.00014221668742216687, "loss": 0.7044, "step": 1542 }, { "epoch": 0.931482040446725, "grad_norm": 0.12321915477514267, "learning_rate": 0.00014217517642175177, "loss": 0.7503, "step": 1543 }, { "epoch": 0.9320857229097494, "grad_norm": 0.10311946272850037, "learning_rate": 0.00014213366542133665, "loss": 0.7091, "step": 1544 }, { "epoch": 0.9326894053727739, "grad_norm": 0.10908188670873642, "learning_rate": 0.00014209215442092155, "loss": 0.6301, "step": 1545 }, { "epoch": 0.9332930878357983, "grad_norm": 0.10754991322755814, "learning_rate": 0.00014205064342050645, "loss": 0.6588, "step": 1546 }, { "epoch": 0.9338967702988228, "grad_norm": 0.11673349887132645, "learning_rate": 0.00014200913242009132, "loss": 0.6268, "step": 1547 }, { "epoch": 0.9345004527618472, "grad_norm": 0.12704375386238098, "learning_rate": 0.00014196762141967622, "loss": 0.6035, "step": 1548 }, { "epoch": 0.9351041352248717, "grad_norm": 0.1293480098247528, "learning_rate": 0.00014192611041926112, "loss": 0.5336, "step": 1549 }, { "epoch": 0.9357078176878961, "grad_norm": 0.13309147953987122, "learning_rate": 0.000141884599418846, "loss": 0.4451, "step": 1550 }, { "epoch": 0.9363115001509206, "grad_norm": 0.1289544254541397, "learning_rate": 0.0001418430884184309, "loss": 0.818, "step": 1551 }, { "epoch": 0.936915182613945, "grad_norm": 0.10981032997369766, "learning_rate": 0.0001418015774180158, "loss": 0.8971, "step": 1552 }, { "epoch": 0.9375188650769695, "grad_norm": 0.07926082611083984, "learning_rate": 0.00014176006641760067, "loss": 0.7756, "step": 1553 }, { "epoch": 0.9381225475399939, "grad_norm": 0.08216461539268494, "learning_rate": 0.00014171855541718554, "loss": 0.7566, "step": 1554 }, { "epoch": 0.9387262300030184, "grad_norm": 0.08312779664993286, "learning_rate": 0.00014167704441677047, "loss": 0.7627, "step": 1555 }, { "epoch": 0.9393299124660429, "grad_norm": 0.08494763821363449, "learning_rate": 0.00014163553341635534, "loss": 1.0068, "step": 1556 }, { "epoch": 0.9399335949290674, "grad_norm": 0.10769182443618774, "learning_rate": 0.00014159402241594024, "loss": 0.8372, "step": 1557 }, { "epoch": 0.9405372773920918, "grad_norm": 0.08497872948646545, "learning_rate": 0.0001415525114155251, "loss": 0.8532, "step": 1558 }, { "epoch": 0.9411409598551163, "grad_norm": 0.07787720859050751, "learning_rate": 0.00014151100041511, "loss": 0.7097, "step": 1559 }, { "epoch": 0.9417446423181407, "grad_norm": 0.10002875328063965, "learning_rate": 0.0001414694894146949, "loss": 0.8718, "step": 1560 }, { "epoch": 0.9423483247811651, "grad_norm": 0.09802395850419998, "learning_rate": 0.00014142797841427978, "loss": 0.8319, "step": 1561 }, { "epoch": 0.9429520072441896, "grad_norm": 0.08751332759857178, "learning_rate": 0.00014138646741386468, "loss": 0.7035, "step": 1562 }, { "epoch": 0.943555689707214, "grad_norm": 0.08736349642276764, "learning_rate": 0.00014134495641344958, "loss": 0.8215, "step": 1563 }, { "epoch": 0.9441593721702385, "grad_norm": 0.09416454285383224, "learning_rate": 0.00014130344541303446, "loss": 0.8634, "step": 1564 }, { "epoch": 0.9447630546332629, "grad_norm": 0.0884031280875206, "learning_rate": 0.00014126193441261936, "loss": 0.848, "step": 1565 }, { "epoch": 0.9453667370962874, "grad_norm": 0.08755529671907425, "learning_rate": 0.00014122042341220426, "loss": 0.8167, "step": 1566 }, { "epoch": 0.9459704195593118, "grad_norm": 0.10254927724599838, "learning_rate": 0.00014117891241178913, "loss": 0.8119, "step": 1567 }, { "epoch": 0.9465741020223363, "grad_norm": 0.0907716229557991, "learning_rate": 0.000141137401411374, "loss": 1.0695, "step": 1568 }, { "epoch": 0.9471777844853607, "grad_norm": 0.08859268575906754, "learning_rate": 0.00014109589041095893, "loss": 0.8692, "step": 1569 }, { "epoch": 0.9477814669483852, "grad_norm": 0.0878458023071289, "learning_rate": 0.0001410543794105438, "loss": 0.758, "step": 1570 }, { "epoch": 0.9483851494114096, "grad_norm": 0.10177980363368988, "learning_rate": 0.00014101286841012868, "loss": 0.7831, "step": 1571 }, { "epoch": 0.9489888318744341, "grad_norm": 0.08606535941362381, "learning_rate": 0.00014097135740971358, "loss": 1.0743, "step": 1572 }, { "epoch": 0.9495925143374585, "grad_norm": 0.09055022895336151, "learning_rate": 0.00014092984640929848, "loss": 0.9108, "step": 1573 }, { "epoch": 0.950196196800483, "grad_norm": 0.08120472729206085, "learning_rate": 0.00014088833540888335, "loss": 0.7569, "step": 1574 }, { "epoch": 0.9507998792635074, "grad_norm": 0.08704983443021774, "learning_rate": 0.00014084682440846825, "loss": 0.8356, "step": 1575 }, { "epoch": 0.9514035617265318, "grad_norm": 0.08406449109315872, "learning_rate": 0.00014080531340805315, "loss": 0.7055, "step": 1576 }, { "epoch": 0.9520072441895563, "grad_norm": 0.08247263729572296, "learning_rate": 0.00014076380240763802, "loss": 0.8572, "step": 1577 }, { "epoch": 0.9526109266525807, "grad_norm": 0.13809391856193542, "learning_rate": 0.00014072229140722292, "loss": 0.7313, "step": 1578 }, { "epoch": 0.9532146091156052, "grad_norm": 0.09925238788127899, "learning_rate": 0.00014068078040680782, "loss": 0.7794, "step": 1579 }, { "epoch": 0.9538182915786296, "grad_norm": 0.08268136531114578, "learning_rate": 0.0001406392694063927, "loss": 0.7268, "step": 1580 }, { "epoch": 0.9544219740416541, "grad_norm": 0.08466971665620804, "learning_rate": 0.0001405977584059776, "loss": 0.8175, "step": 1581 }, { "epoch": 0.9550256565046785, "grad_norm": 0.08671456575393677, "learning_rate": 0.00014055624740556247, "loss": 0.7747, "step": 1582 }, { "epoch": 0.955629338967703, "grad_norm": 0.2081962525844574, "learning_rate": 0.00014051473640514737, "loss": 0.8782, "step": 1583 }, { "epoch": 0.9562330214307274, "grad_norm": 0.11799836158752441, "learning_rate": 0.00014047322540473227, "loss": 0.8098, "step": 1584 }, { "epoch": 0.9568367038937519, "grad_norm": 0.08406732231378555, "learning_rate": 0.00014043171440431714, "loss": 0.7716, "step": 1585 }, { "epoch": 0.9574403863567763, "grad_norm": 0.08422478288412094, "learning_rate": 0.00014039020340390204, "loss": 0.7237, "step": 1586 }, { "epoch": 0.9580440688198008, "grad_norm": 0.09907133132219315, "learning_rate": 0.00014034869240348694, "loss": 0.7298, "step": 1587 }, { "epoch": 0.9586477512828252, "grad_norm": 0.08633650094270706, "learning_rate": 0.00014030718140307181, "loss": 0.7262, "step": 1588 }, { "epoch": 0.9592514337458496, "grad_norm": 0.0880611315369606, "learning_rate": 0.00014026567040265671, "loss": 0.6691, "step": 1589 }, { "epoch": 0.9598551162088741, "grad_norm": 0.0872715413570404, "learning_rate": 0.00014022415940224161, "loss": 0.6593, "step": 1590 }, { "epoch": 0.9604587986718985, "grad_norm": 0.09152337163686752, "learning_rate": 0.0001401826484018265, "loss": 0.7065, "step": 1591 }, { "epoch": 0.961062481134923, "grad_norm": 0.1341097503900528, "learning_rate": 0.00014014113740141136, "loss": 0.723, "step": 1592 }, { "epoch": 0.9616661635979474, "grad_norm": 0.09733890742063522, "learning_rate": 0.0001400996264009963, "loss": 0.7194, "step": 1593 }, { "epoch": 0.9622698460609719, "grad_norm": 0.10491717606782913, "learning_rate": 0.00014005811540058116, "loss": 0.6986, "step": 1594 }, { "epoch": 0.9628735285239963, "grad_norm": 0.10709907114505768, "learning_rate": 0.00014001660440016603, "loss": 0.7316, "step": 1595 }, { "epoch": 0.9634772109870209, "grad_norm": 0.1096893846988678, "learning_rate": 0.00013997509339975093, "loss": 0.6352, "step": 1596 }, { "epoch": 0.9640808934500453, "grad_norm": 0.11475121229887009, "learning_rate": 0.00013993358239933583, "loss": 0.6632, "step": 1597 }, { "epoch": 0.9646845759130698, "grad_norm": 0.12020900100469589, "learning_rate": 0.0001398920713989207, "loss": 0.6824, "step": 1598 }, { "epoch": 0.9652882583760942, "grad_norm": 0.12162651866674423, "learning_rate": 0.0001398505603985056, "loss": 0.5713, "step": 1599 }, { "epoch": 0.9658919408391187, "grad_norm": 0.13266035914421082, "learning_rate": 0.0001398090493980905, "loss": 0.4436, "step": 1600 }, { "epoch": 0.9664956233021431, "grad_norm": 0.0847954973578453, "learning_rate": 0.0001397675383976754, "loss": 0.8099, "step": 1601 }, { "epoch": 0.9670993057651676, "grad_norm": 0.08815553784370422, "learning_rate": 0.00013972602739726028, "loss": 0.8847, "step": 1602 }, { "epoch": 0.967702988228192, "grad_norm": 0.0948781669139862, "learning_rate": 0.00013968451639684518, "loss": 1.0048, "step": 1603 }, { "epoch": 0.9683066706912165, "grad_norm": 0.08522479981184006, "learning_rate": 0.00013964300539643008, "loss": 1.0174, "step": 1604 }, { "epoch": 0.9689103531542409, "grad_norm": 0.07847335189580917, "learning_rate": 0.00013960149439601495, "loss": 0.7368, "step": 1605 }, { "epoch": 0.9695140356172653, "grad_norm": 0.10585256665945053, "learning_rate": 0.00013955998339559983, "loss": 0.9218, "step": 1606 }, { "epoch": 0.9701177180802898, "grad_norm": 0.08924368768930435, "learning_rate": 0.00013951847239518475, "loss": 0.8461, "step": 1607 }, { "epoch": 0.9707214005433142, "grad_norm": 0.07704899460077286, "learning_rate": 0.00013947696139476963, "loss": 0.7675, "step": 1608 }, { "epoch": 0.9713250830063387, "grad_norm": 0.08050254732370377, "learning_rate": 0.0001394354503943545, "loss": 0.845, "step": 1609 }, { "epoch": 0.9719287654693631, "grad_norm": 0.08915068209171295, "learning_rate": 0.0001393939393939394, "loss": 0.8523, "step": 1610 }, { "epoch": 0.9725324479323876, "grad_norm": 0.07807064801454544, "learning_rate": 0.0001393524283935243, "loss": 0.6999, "step": 1611 }, { "epoch": 0.973136130395412, "grad_norm": 0.1043185442686081, "learning_rate": 0.00013931091739310917, "loss": 0.8859, "step": 1612 }, { "epoch": 0.9737398128584365, "grad_norm": 0.1474182903766632, "learning_rate": 0.00013926940639269407, "loss": 1.1711, "step": 1613 }, { "epoch": 0.9743434953214609, "grad_norm": 0.08791965246200562, "learning_rate": 0.00013922789539227897, "loss": 0.9619, "step": 1614 }, { "epoch": 0.9749471777844854, "grad_norm": 0.08670973777770996, "learning_rate": 0.00013918638439186384, "loss": 1.1479, "step": 1615 }, { "epoch": 0.9755508602475098, "grad_norm": 0.09029529243707657, "learning_rate": 0.00013914487339144874, "loss": 0.7685, "step": 1616 }, { "epoch": 0.9761545427105343, "grad_norm": 0.15448486804962158, "learning_rate": 0.00013910336239103364, "loss": 0.9595, "step": 1617 }, { "epoch": 0.9767582251735587, "grad_norm": 0.08685984462499619, "learning_rate": 0.00013906185139061852, "loss": 0.7231, "step": 1618 }, { "epoch": 0.9773619076365831, "grad_norm": 0.09811729937791824, "learning_rate": 0.00013902034039020342, "loss": 0.9077, "step": 1619 }, { "epoch": 0.9779655900996076, "grad_norm": 0.10184766352176666, "learning_rate": 0.0001389788293897883, "loss": 0.8866, "step": 1620 }, { "epoch": 0.978569272562632, "grad_norm": 0.13190968334674835, "learning_rate": 0.0001389373183893732, "loss": 0.8538, "step": 1621 }, { "epoch": 0.9791729550256565, "grad_norm": 0.0916040688753128, "learning_rate": 0.0001388958073889581, "loss": 0.8823, "step": 1622 }, { "epoch": 0.9797766374886809, "grad_norm": 0.08365904539823532, "learning_rate": 0.00013885429638854296, "loss": 0.7825, "step": 1623 }, { "epoch": 0.9803803199517054, "grad_norm": 0.09449176490306854, "learning_rate": 0.00013881278538812786, "loss": 0.9603, "step": 1624 }, { "epoch": 0.9809840024147298, "grad_norm": 0.08852092921733856, "learning_rate": 0.00013877127438771276, "loss": 0.7159, "step": 1625 }, { "epoch": 0.9815876848777543, "grad_norm": 0.10536504536867142, "learning_rate": 0.00013872976338729764, "loss": 1.0687, "step": 1626 }, { "epoch": 0.9821913673407787, "grad_norm": 0.07926955074071884, "learning_rate": 0.0001386882523868825, "loss": 1.2306, "step": 1627 }, { "epoch": 0.9827950498038032, "grad_norm": 0.08607269078493118, "learning_rate": 0.00013864674138646744, "loss": 0.8071, "step": 1628 }, { "epoch": 0.9833987322668276, "grad_norm": 0.08337133377790451, "learning_rate": 0.0001386052303860523, "loss": 0.9241, "step": 1629 }, { "epoch": 0.9840024147298521, "grad_norm": 0.0874747484922409, "learning_rate": 0.00013856371938563718, "loss": 0.7326, "step": 1630 }, { "epoch": 0.9846060971928765, "grad_norm": 0.0751953125, "learning_rate": 0.0001385222083852221, "loss": 0.7567, "step": 1631 }, { "epoch": 0.985209779655901, "grad_norm": 0.09537570923566818, "learning_rate": 0.00013848069738480698, "loss": 1.0592, "step": 1632 }, { "epoch": 0.9858134621189254, "grad_norm": 0.0916895717382431, "learning_rate": 0.00013843918638439186, "loss": 0.7127, "step": 1633 }, { "epoch": 0.9864171445819498, "grad_norm": 0.08885890990495682, "learning_rate": 0.00013839767538397676, "loss": 0.7207, "step": 1634 }, { "epoch": 0.9870208270449743, "grad_norm": 0.08471481502056122, "learning_rate": 0.00013835616438356166, "loss": 0.7205, "step": 1635 }, { "epoch": 0.9876245095079987, "grad_norm": 0.0972660556435585, "learning_rate": 0.00013831465338314653, "loss": 0.793, "step": 1636 }, { "epoch": 0.9882281919710233, "grad_norm": 0.09207335114479065, "learning_rate": 0.00013827314238273143, "loss": 0.7134, "step": 1637 }, { "epoch": 0.9888318744340477, "grad_norm": 0.0968717560172081, "learning_rate": 0.00013823163138231633, "loss": 0.6872, "step": 1638 }, { "epoch": 0.9894355568970722, "grad_norm": 0.09645961970090866, "learning_rate": 0.0001381901203819012, "loss": 0.7423, "step": 1639 }, { "epoch": 0.9900392393600966, "grad_norm": 0.09619560837745667, "learning_rate": 0.0001381486093814861, "loss": 0.7063, "step": 1640 }, { "epoch": 0.9906429218231211, "grad_norm": 0.09428079426288605, "learning_rate": 0.00013810709838107097, "loss": 0.7305, "step": 1641 }, { "epoch": 0.9912466042861455, "grad_norm": 0.10160063952207565, "learning_rate": 0.00013806558738065587, "loss": 0.7281, "step": 1642 }, { "epoch": 0.99185028674917, "grad_norm": 0.10067980736494064, "learning_rate": 0.00013802407638024077, "loss": 0.7407, "step": 1643 }, { "epoch": 0.9924539692121944, "grad_norm": 0.18993158638477325, "learning_rate": 0.00013798256537982565, "loss": 0.6791, "step": 1644 }, { "epoch": 0.9930576516752189, "grad_norm": 0.10360643267631531, "learning_rate": 0.00013794105437941057, "loss": 0.7026, "step": 1645 }, { "epoch": 0.9936613341382433, "grad_norm": 0.11464804410934448, "learning_rate": 0.00013789954337899545, "loss": 0.6614, "step": 1646 }, { "epoch": 0.9942650166012678, "grad_norm": 0.11702211946249008, "learning_rate": 0.00013785803237858032, "loss": 0.6386, "step": 1647 }, { "epoch": 0.9948686990642922, "grad_norm": 0.12298808991909027, "learning_rate": 0.00013781652137816522, "loss": 0.574, "step": 1648 }, { "epoch": 0.9954723815273167, "grad_norm": 0.12374621629714966, "learning_rate": 0.00013777501037775012, "loss": 0.5813, "step": 1649 }, { "epoch": 0.9960760639903411, "grad_norm": 0.12134044617414474, "learning_rate": 0.000137733499377335, "loss": 0.4442, "step": 1650 }, { "epoch": 0.9966797464533655, "grad_norm": 0.08615315705537796, "learning_rate": 0.0001376919883769199, "loss": 0.839, "step": 1651 }, { "epoch": 0.99728342891639, "grad_norm": 0.08225715905427933, "learning_rate": 0.0001376504773765048, "loss": 0.8809, "step": 1652 }, { "epoch": 0.9978871113794144, "grad_norm": 0.0893816277384758, "learning_rate": 0.00013760896637608967, "loss": 0.8767, "step": 1653 }, { "epoch": 0.9984907938424389, "grad_norm": 0.09268541634082794, "learning_rate": 0.00013756745537567457, "loss": 0.8083, "step": 1654 }, { "epoch": 0.9990944763054633, "grad_norm": 0.10186032205820084, "learning_rate": 0.00013752594437525944, "loss": 0.783, "step": 1655 }, { "epoch": 0.9996981587684878, "grad_norm": 0.11257860064506531, "learning_rate": 0.00013748443337484434, "loss": 0.6633, "step": 1656 }, { "epoch": 1.0, "grad_norm": 0.1917642205953598, "learning_rate": 0.00013744292237442924, "loss": 0.476, "step": 1657 }, { "epoch": 1.0006036824630244, "grad_norm": 0.09141898155212402, "learning_rate": 0.0001374014113740141, "loss": 0.8386, "step": 1658 }, { "epoch": 1.001207364926049, "grad_norm": 0.09631163626909256, "learning_rate": 0.000137359900373599, "loss": 0.6534, "step": 1659 }, { "epoch": 1.0018110473890733, "grad_norm": 0.09060285240411758, "learning_rate": 0.0001373183893731839, "loss": 0.9631, "step": 1660 }, { "epoch": 1.0024147298520978, "grad_norm": 0.10698094218969345, "learning_rate": 0.00013727687837276878, "loss": 0.7317, "step": 1661 }, { "epoch": 1.0030184123151222, "grad_norm": 0.08738933503627777, "learning_rate": 0.00013723536737235369, "loss": 0.689, "step": 1662 }, { "epoch": 1.0036220947781467, "grad_norm": 0.09262979030609131, "learning_rate": 0.00013719385637193859, "loss": 0.7741, "step": 1663 }, { "epoch": 1.0042257772411711, "grad_norm": 0.08418719470500946, "learning_rate": 0.00013715234537152346, "loss": 0.7623, "step": 1664 }, { "epoch": 1.0048294597041956, "grad_norm": 0.09748540818691254, "learning_rate": 0.00013711083437110833, "loss": 0.8195, "step": 1665 }, { "epoch": 1.00543314216722, "grad_norm": 0.08836773782968521, "learning_rate": 0.00013706932337069326, "loss": 0.6536, "step": 1666 }, { "epoch": 1.0060368246302445, "grad_norm": 0.0891902819275856, "learning_rate": 0.00013702781237027813, "loss": 0.647, "step": 1667 }, { "epoch": 1.006640507093269, "grad_norm": 0.08402203768491745, "learning_rate": 0.000136986301369863, "loss": 0.9145, "step": 1668 }, { "epoch": 1.0072441895562934, "grad_norm": 0.08421420305967331, "learning_rate": 0.0001369447903694479, "loss": 0.6114, "step": 1669 }, { "epoch": 1.0078478720193178, "grad_norm": 0.09319291263818741, "learning_rate": 0.0001369032793690328, "loss": 0.6923, "step": 1670 }, { "epoch": 1.0084515544823422, "grad_norm": 0.0897219106554985, "learning_rate": 0.00013686176836861768, "loss": 0.7721, "step": 1671 }, { "epoch": 1.0090552369453667, "grad_norm": 0.08546704798936844, "learning_rate": 0.00013682025736820258, "loss": 0.7558, "step": 1672 }, { "epoch": 1.0096589194083911, "grad_norm": 0.10542399436235428, "learning_rate": 0.00013677874636778748, "loss": 0.7204, "step": 1673 }, { "epoch": 1.0102626018714156, "grad_norm": 0.0866287350654602, "learning_rate": 0.00013673723536737235, "loss": 0.6816, "step": 1674 }, { "epoch": 1.01086628433444, "grad_norm": 0.0957920253276825, "learning_rate": 0.00013669572436695725, "loss": 0.7554, "step": 1675 }, { "epoch": 1.0114699667974645, "grad_norm": 0.07835828512907028, "learning_rate": 0.00013665421336654215, "loss": 0.8906, "step": 1676 }, { "epoch": 1.012073649260489, "grad_norm": 0.09102047979831696, "learning_rate": 0.00013661270236612702, "loss": 0.707, "step": 1677 }, { "epoch": 1.0126773317235134, "grad_norm": 0.10025649517774582, "learning_rate": 0.00013657119136571192, "loss": 1.0865, "step": 1678 }, { "epoch": 1.0132810141865378, "grad_norm": 0.08916833996772766, "learning_rate": 0.0001365296803652968, "loss": 0.7058, "step": 1679 }, { "epoch": 1.0138846966495623, "grad_norm": 0.0850253701210022, "learning_rate": 0.0001364881693648817, "loss": 0.9324, "step": 1680 }, { "epoch": 1.0144883791125867, "grad_norm": 0.09609053283929825, "learning_rate": 0.0001364466583644666, "loss": 0.7521, "step": 1681 }, { "epoch": 1.0150920615756112, "grad_norm": 0.0949166864156723, "learning_rate": 0.00013640514736405147, "loss": 0.7548, "step": 1682 }, { "epoch": 1.0156957440386356, "grad_norm": 0.08850863575935364, "learning_rate": 0.00013636363636363637, "loss": 0.7067, "step": 1683 }, { "epoch": 1.01629942650166, "grad_norm": 0.09187474101781845, "learning_rate": 0.00013632212536322127, "loss": 0.7852, "step": 1684 }, { "epoch": 1.0169031089646845, "grad_norm": 0.09210950136184692, "learning_rate": 0.00013628061436280614, "loss": 0.8244, "step": 1685 }, { "epoch": 1.017506791427709, "grad_norm": 0.08923452347517014, "learning_rate": 0.00013623910336239104, "loss": 0.9229, "step": 1686 }, { "epoch": 1.0181104738907334, "grad_norm": 0.08911499381065369, "learning_rate": 0.00013619759236197594, "loss": 0.7053, "step": 1687 }, { "epoch": 1.0187141563537578, "grad_norm": 0.09007196873426437, "learning_rate": 0.00013615608136156081, "loss": 0.8376, "step": 1688 }, { "epoch": 1.0193178388167823, "grad_norm": 0.08651362359523773, "learning_rate": 0.0001361145703611457, "loss": 0.6759, "step": 1689 }, { "epoch": 1.0199215212798067, "grad_norm": 0.08241482824087143, "learning_rate": 0.00013607305936073061, "loss": 0.6885, "step": 1690 }, { "epoch": 1.0205252037428312, "grad_norm": 0.10755082219839096, "learning_rate": 0.0001360315483603155, "loss": 0.6961, "step": 1691 }, { "epoch": 1.0211288862058556, "grad_norm": 0.09004219621419907, "learning_rate": 0.0001359900373599004, "loss": 0.6243, "step": 1692 }, { "epoch": 1.02173256866888, "grad_norm": 0.0957816019654274, "learning_rate": 0.00013594852635948526, "loss": 0.6462, "step": 1693 }, { "epoch": 1.0223362511319045, "grad_norm": 0.1017783135175705, "learning_rate": 0.00013590701535907016, "loss": 0.6332, "step": 1694 }, { "epoch": 1.022939933594929, "grad_norm": 0.09232212603092194, "learning_rate": 0.00013586550435865506, "loss": 0.6461, "step": 1695 }, { "epoch": 1.0235436160579534, "grad_norm": 0.09460336714982986, "learning_rate": 0.00013582399335823993, "loss": 0.6641, "step": 1696 }, { "epoch": 1.0241472985209779, "grad_norm": 0.09913098812103271, "learning_rate": 0.00013578248235782483, "loss": 0.6559, "step": 1697 }, { "epoch": 1.0247509809840025, "grad_norm": 0.10010208189487457, "learning_rate": 0.00013574097135740973, "loss": 0.5921, "step": 1698 }, { "epoch": 1.025354663447027, "grad_norm": 0.10247639566659927, "learning_rate": 0.0001356994603569946, "loss": 0.6158, "step": 1699 }, { "epoch": 1.0259583459100514, "grad_norm": 0.10905825346708298, "learning_rate": 0.0001356579493565795, "loss": 0.66, "step": 1700 }, { "epoch": 1.0265620283730759, "grad_norm": 0.11569482833147049, "learning_rate": 0.0001356164383561644, "loss": 0.6431, "step": 1701 }, { "epoch": 1.0271657108361003, "grad_norm": 0.14849194884300232, "learning_rate": 0.00013557492735574928, "loss": 0.6045, "step": 1702 }, { "epoch": 1.0277693932991248, "grad_norm": 0.11826399713754654, "learning_rate": 0.00013553341635533415, "loss": 0.6559, "step": 1703 }, { "epoch": 1.0283730757621492, "grad_norm": 0.1232730820775032, "learning_rate": 0.00013549190535491908, "loss": 0.5124, "step": 1704 }, { "epoch": 1.0289767582251736, "grad_norm": 0.121260866522789, "learning_rate": 0.00013545039435450395, "loss": 0.4503, "step": 1705 }, { "epoch": 1.029580440688198, "grad_norm": 0.1328078657388687, "learning_rate": 0.00013540888335408883, "loss": 0.4254, "step": 1706 }, { "epoch": 1.0301841231512225, "grad_norm": 0.13635770976543427, "learning_rate": 0.00013536737235367373, "loss": 0.3007, "step": 1707 }, { "epoch": 1.030787805614247, "grad_norm": 0.10616832226514816, "learning_rate": 0.00013532586135325863, "loss": 0.7256, "step": 1708 }, { "epoch": 1.0313914880772714, "grad_norm": 0.10195182263851166, "learning_rate": 0.0001352843503528435, "loss": 0.7402, "step": 1709 }, { "epoch": 1.0319951705402959, "grad_norm": 0.11944833397865295, "learning_rate": 0.0001352428393524284, "loss": 0.757, "step": 1710 }, { "epoch": 1.0325988530033203, "grad_norm": 0.12061698734760284, "learning_rate": 0.0001352013283520133, "loss": 1.0738, "step": 1711 }, { "epoch": 1.0332025354663448, "grad_norm": 0.10394060611724854, "learning_rate": 0.00013515981735159817, "loss": 0.7147, "step": 1712 }, { "epoch": 1.0338062179293692, "grad_norm": 0.10730050504207611, "learning_rate": 0.00013511830635118307, "loss": 0.9586, "step": 1713 }, { "epoch": 1.0344099003923937, "grad_norm": 0.09942850470542908, "learning_rate": 0.00013507679535076797, "loss": 1.0808, "step": 1714 }, { "epoch": 1.0350135828554181, "grad_norm": 0.10033493489027023, "learning_rate": 0.00013503528435035284, "loss": 0.7759, "step": 1715 }, { "epoch": 1.0356172653184426, "grad_norm": 0.08915812522172928, "learning_rate": 0.00013499377334993774, "loss": 0.7716, "step": 1716 }, { "epoch": 1.036220947781467, "grad_norm": 0.09855514019727707, "learning_rate": 0.00013495226234952262, "loss": 0.8139, "step": 1717 }, { "epoch": 1.0368246302444915, "grad_norm": 0.0998401790857315, "learning_rate": 0.00013491075134910752, "loss": 0.8308, "step": 1718 }, { "epoch": 1.037428312707516, "grad_norm": 0.09024068713188171, "learning_rate": 0.00013486924034869242, "loss": 0.7503, "step": 1719 }, { "epoch": 1.0380319951705403, "grad_norm": 0.09240993112325668, "learning_rate": 0.0001348277293482773, "loss": 0.9344, "step": 1720 }, { "epoch": 1.0386356776335648, "grad_norm": 0.09110064059495926, "learning_rate": 0.0001347862183478622, "loss": 0.7452, "step": 1721 }, { "epoch": 1.0392393600965892, "grad_norm": 0.09389975666999817, "learning_rate": 0.0001347447073474471, "loss": 1.061, "step": 1722 }, { "epoch": 1.0398430425596137, "grad_norm": 0.10916675627231598, "learning_rate": 0.00013470319634703196, "loss": 0.7749, "step": 1723 }, { "epoch": 1.0404467250226381, "grad_norm": 0.08076412975788116, "learning_rate": 0.00013466168534661686, "loss": 0.5778, "step": 1724 }, { "epoch": 1.0410504074856626, "grad_norm": 0.09830320626497269, "learning_rate": 0.00013462017434620176, "loss": 0.7115, "step": 1725 }, { "epoch": 1.041654089948687, "grad_norm": 0.09751685708761215, "learning_rate": 0.00013457866334578664, "loss": 0.7439, "step": 1726 }, { "epoch": 1.0422577724117115, "grad_norm": 0.09098626673221588, "learning_rate": 0.0001345371523453715, "loss": 0.7062, "step": 1727 }, { "epoch": 1.042861454874736, "grad_norm": 0.10907137393951416, "learning_rate": 0.00013449564134495644, "loss": 0.888, "step": 1728 }, { "epoch": 1.0434651373377604, "grad_norm": 0.09241855889558792, "learning_rate": 0.0001344541303445413, "loss": 0.6564, "step": 1729 }, { "epoch": 1.0440688198007848, "grad_norm": 0.092192642390728, "learning_rate": 0.00013441261934412618, "loss": 0.6394, "step": 1730 }, { "epoch": 1.0446725022638093, "grad_norm": 0.08987250179052353, "learning_rate": 0.00013437110834371108, "loss": 0.8383, "step": 1731 }, { "epoch": 1.0452761847268337, "grad_norm": 0.09562289714813232, "learning_rate": 0.00013432959734329598, "loss": 0.6214, "step": 1732 }, { "epoch": 1.0458798671898581, "grad_norm": 0.0972490981221199, "learning_rate": 0.00013428808634288086, "loss": 0.7911, "step": 1733 }, { "epoch": 1.0464835496528826, "grad_norm": 0.10719991475343704, "learning_rate": 0.00013424657534246576, "loss": 0.8041, "step": 1734 }, { "epoch": 1.047087232115907, "grad_norm": 0.09392597526311874, "learning_rate": 0.00013420506434205066, "loss": 0.9184, "step": 1735 }, { "epoch": 1.0476909145789315, "grad_norm": 0.09040381014347076, "learning_rate": 0.00013416355334163556, "loss": 0.8159, "step": 1736 }, { "epoch": 1.048294597041956, "grad_norm": 0.09064770489931107, "learning_rate": 0.00013412204234122043, "loss": 0.768, "step": 1737 }, { "epoch": 1.0488982795049804, "grad_norm": 0.0895996019244194, "learning_rate": 0.00013408053134080533, "loss": 0.6926, "step": 1738 }, { "epoch": 1.0495019619680048, "grad_norm": 0.10096043348312378, "learning_rate": 0.00013403902034039023, "loss": 0.8543, "step": 1739 }, { "epoch": 1.0501056444310293, "grad_norm": 0.09393858164548874, "learning_rate": 0.0001339975093399751, "loss": 0.6274, "step": 1740 }, { "epoch": 1.0507093268940537, "grad_norm": 0.0904972106218338, "learning_rate": 0.00013395599833955997, "loss": 0.636, "step": 1741 }, { "epoch": 1.0513130093570782, "grad_norm": 0.09535270184278488, "learning_rate": 0.0001339144873391449, "loss": 0.6623, "step": 1742 }, { "epoch": 1.0519166918201026, "grad_norm": 0.09259801357984543, "learning_rate": 0.00013387297633872977, "loss": 0.659, "step": 1743 }, { "epoch": 1.052520374283127, "grad_norm": 0.09762239456176758, "learning_rate": 0.00013383146533831465, "loss": 0.6494, "step": 1744 }, { "epoch": 1.0531240567461515, "grad_norm": 0.104673370718956, "learning_rate": 0.00013378995433789955, "loss": 0.6844, "step": 1745 }, { "epoch": 1.053727739209176, "grad_norm": 0.108694888651371, "learning_rate": 0.00013374844333748445, "loss": 0.6817, "step": 1746 }, { "epoch": 1.0543314216722004, "grad_norm": 0.10462528467178345, "learning_rate": 0.00013370693233706932, "loss": 0.6151, "step": 1747 }, { "epoch": 1.0549351041352248, "grad_norm": 0.11397890001535416, "learning_rate": 0.00013366542133665422, "loss": 0.6114, "step": 1748 }, { "epoch": 1.0555387865982493, "grad_norm": 0.10102736949920654, "learning_rate": 0.00013362391033623912, "loss": 0.6314, "step": 1749 }, { "epoch": 1.0561424690612737, "grad_norm": 0.12406179308891296, "learning_rate": 0.000133582399335824, "loss": 0.6278, "step": 1750 }, { "epoch": 1.0567461515242982, "grad_norm": 0.11435554176568985, "learning_rate": 0.0001335408883354089, "loss": 0.5789, "step": 1751 }, { "epoch": 1.0573498339873226, "grad_norm": 0.12788087129592896, "learning_rate": 0.0001334993773349938, "loss": 0.5205, "step": 1752 }, { "epoch": 1.057953516450347, "grad_norm": 0.1364794224500656, "learning_rate": 0.00013345786633457867, "loss": 0.5034, "step": 1753 }, { "epoch": 1.0585571989133715, "grad_norm": 0.13836418092250824, "learning_rate": 0.00013341635533416357, "loss": 0.5269, "step": 1754 }, { "epoch": 1.059160881376396, "grad_norm": 0.14710794389247894, "learning_rate": 0.00013337484433374844, "loss": 0.4156, "step": 1755 }, { "epoch": 1.0597645638394204, "grad_norm": 0.13123385608196259, "learning_rate": 0.00013333333333333334, "loss": 0.32, "step": 1756 }, { "epoch": 1.0603682463024449, "grad_norm": 0.15206588804721832, "learning_rate": 0.00013329182233291824, "loss": 0.2739, "step": 1757 }, { "epoch": 1.0609719287654693, "grad_norm": 0.13307802379131317, "learning_rate": 0.0001332503113325031, "loss": 0.8787, "step": 1758 }, { "epoch": 1.0615756112284938, "grad_norm": 0.11052402853965759, "learning_rate": 0.000133208800332088, "loss": 0.7224, "step": 1759 }, { "epoch": 1.0621792936915182, "grad_norm": 0.17755720019340515, "learning_rate": 0.0001331672893316729, "loss": 0.9626, "step": 1760 }, { "epoch": 1.0627829761545426, "grad_norm": 0.10982788354158401, "learning_rate": 0.00013312577833125779, "loss": 0.789, "step": 1761 }, { "epoch": 1.063386658617567, "grad_norm": 0.10073138773441315, "learning_rate": 0.00013308426733084266, "loss": 0.7448, "step": 1762 }, { "epoch": 1.0639903410805915, "grad_norm": 0.09374914318323135, "learning_rate": 0.00013304275633042759, "loss": 0.7442, "step": 1763 }, { "epoch": 1.064594023543616, "grad_norm": 0.10434511303901672, "learning_rate": 0.00013300124533001246, "loss": 0.8159, "step": 1764 }, { "epoch": 1.0651977060066404, "grad_norm": 0.11345966905355453, "learning_rate": 0.00013295973432959733, "loss": 0.804, "step": 1765 }, { "epoch": 1.0658013884696649, "grad_norm": 0.10115978866815567, "learning_rate": 0.00013291822332918226, "loss": 0.802, "step": 1766 }, { "epoch": 1.0664050709326893, "grad_norm": 0.10045889765024185, "learning_rate": 0.00013287671232876713, "loss": 0.7314, "step": 1767 }, { "epoch": 1.0670087533957138, "grad_norm": 0.11507212370634079, "learning_rate": 0.000132835201328352, "loss": 1.0092, "step": 1768 }, { "epoch": 1.0676124358587382, "grad_norm": 0.09701373428106308, "learning_rate": 0.0001327936903279369, "loss": 0.9924, "step": 1769 }, { "epoch": 1.0682161183217627, "grad_norm": 0.10776803642511368, "learning_rate": 0.0001327521793275218, "loss": 0.7889, "step": 1770 }, { "epoch": 1.068819800784787, "grad_norm": 0.09671414643526077, "learning_rate": 0.00013271066832710668, "loss": 0.7996, "step": 1771 }, { "epoch": 1.0694234832478116, "grad_norm": 0.10002996027469635, "learning_rate": 0.00013266915732669158, "loss": 0.7149, "step": 1772 }, { "epoch": 1.070027165710836, "grad_norm": 0.09639476984739304, "learning_rate": 0.00013262764632627648, "loss": 0.9136, "step": 1773 }, { "epoch": 1.0706308481738604, "grad_norm": 0.11277900636196136, "learning_rate": 0.00013258613532586135, "loss": 0.7176, "step": 1774 }, { "epoch": 1.071234530636885, "grad_norm": 0.09514438360929489, "learning_rate": 0.00013254462432544625, "loss": 0.7988, "step": 1775 }, { "epoch": 1.0718382130999093, "grad_norm": 0.09252556413412094, "learning_rate": 0.00013250311332503112, "loss": 0.7631, "step": 1776 }, { "epoch": 1.072441895562934, "grad_norm": 0.10669627785682678, "learning_rate": 0.00013246160232461602, "loss": 0.8116, "step": 1777 }, { "epoch": 1.0730455780259582, "grad_norm": 0.10841196030378342, "learning_rate": 0.00013242009132420092, "loss": 0.7423, "step": 1778 }, { "epoch": 1.073649260488983, "grad_norm": 0.10168831795454025, "learning_rate": 0.0001323785803237858, "loss": 0.7702, "step": 1779 }, { "epoch": 1.0742529429520071, "grad_norm": 0.10742917656898499, "learning_rate": 0.00013233706932337072, "loss": 0.8964, "step": 1780 }, { "epoch": 1.0748566254150318, "grad_norm": 0.09869187325239182, "learning_rate": 0.0001322955583229556, "loss": 0.7048, "step": 1781 }, { "epoch": 1.0754603078780562, "grad_norm": 0.1356370896100998, "learning_rate": 0.00013225404732254047, "loss": 0.9056, "step": 1782 }, { "epoch": 1.0760639903410807, "grad_norm": 0.0955684557557106, "learning_rate": 0.00013221253632212537, "loss": 0.715, "step": 1783 }, { "epoch": 1.0766676728041051, "grad_norm": 0.09964524954557419, "learning_rate": 0.00013217102532171027, "loss": 0.7039, "step": 1784 }, { "epoch": 1.0772713552671296, "grad_norm": 0.10292885452508926, "learning_rate": 0.00013212951432129514, "loss": 0.9838, "step": 1785 }, { "epoch": 1.077875037730154, "grad_norm": 0.101532481610775, "learning_rate": 0.00013208800332088004, "loss": 0.6629, "step": 1786 }, { "epoch": 1.0784787201931785, "grad_norm": 0.09781339764595032, "learning_rate": 0.00013204649232046494, "loss": 0.789, "step": 1787 }, { "epoch": 1.079082402656203, "grad_norm": 0.10302070528268814, "learning_rate": 0.00013200498132004982, "loss": 0.7772, "step": 1788 }, { "epoch": 1.0796860851192274, "grad_norm": 0.10221746563911438, "learning_rate": 0.00013196347031963472, "loss": 0.8008, "step": 1789 }, { "epoch": 1.0802897675822518, "grad_norm": 0.09660208970308304, "learning_rate": 0.0001319219593192196, "loss": 0.8582, "step": 1790 }, { "epoch": 1.0808934500452763, "grad_norm": 0.09637465327978134, "learning_rate": 0.0001318804483188045, "loss": 0.7061, "step": 1791 }, { "epoch": 1.0814971325083007, "grad_norm": 0.09803172200918198, "learning_rate": 0.0001318389373183894, "loss": 0.6043, "step": 1792 }, { "epoch": 1.0821008149713252, "grad_norm": 0.10259698331356049, "learning_rate": 0.00013179742631797426, "loss": 0.6007, "step": 1793 }, { "epoch": 1.0827044974343496, "grad_norm": 0.10633745789527893, "learning_rate": 0.00013175591531755916, "loss": 0.6679, "step": 1794 }, { "epoch": 1.083308179897374, "grad_norm": 0.10431138426065445, "learning_rate": 0.00013171440431714406, "loss": 0.6999, "step": 1795 }, { "epoch": 1.0839118623603985, "grad_norm": 0.10492059588432312, "learning_rate": 0.00013167289331672893, "loss": 0.7314, "step": 1796 }, { "epoch": 1.084515544823423, "grad_norm": 0.1053004339337349, "learning_rate": 0.00013163138231631383, "loss": 0.5873, "step": 1797 }, { "epoch": 1.0851192272864474, "grad_norm": 0.10796685516834259, "learning_rate": 0.00013158987131589873, "loss": 0.6122, "step": 1798 }, { "epoch": 1.0857229097494718, "grad_norm": 0.11339765042066574, "learning_rate": 0.0001315483603154836, "loss": 0.6486, "step": 1799 }, { "epoch": 1.0863265922124963, "grad_norm": 0.11134713143110275, "learning_rate": 0.00013150684931506848, "loss": 0.5925, "step": 1800 }, { "epoch": 1.0869302746755207, "grad_norm": 0.12589313089847565, "learning_rate": 0.0001314653383146534, "loss": 0.6238, "step": 1801 }, { "epoch": 1.0875339571385452, "grad_norm": 0.1222606673836708, "learning_rate": 0.00013142382731423828, "loss": 0.5478, "step": 1802 }, { "epoch": 1.0881376396015696, "grad_norm": 0.1365615576505661, "learning_rate": 0.00013138231631382315, "loss": 0.5323, "step": 1803 }, { "epoch": 1.088741322064594, "grad_norm": 0.1305444985628128, "learning_rate": 0.00013134080531340805, "loss": 0.5246, "step": 1804 }, { "epoch": 1.0893450045276185, "grad_norm": 0.1343401074409485, "learning_rate": 0.00013129929431299295, "loss": 0.4212, "step": 1805 }, { "epoch": 1.089948686990643, "grad_norm": 0.14103184640407562, "learning_rate": 0.00013125778331257783, "loss": 0.4255, "step": 1806 }, { "epoch": 1.0905523694536674, "grad_norm": 0.15704935789108276, "learning_rate": 0.00013121627231216273, "loss": 0.3559, "step": 1807 }, { "epoch": 1.0911560519166918, "grad_norm": 0.4550192058086395, "learning_rate": 0.00013117476131174763, "loss": 0.7013, "step": 1808 }, { "epoch": 1.0917597343797163, "grad_norm": 0.12131811678409576, "learning_rate": 0.0001311332503113325, "loss": 0.7005, "step": 1809 }, { "epoch": 1.0923634168427407, "grad_norm": 0.11735139787197113, "learning_rate": 0.0001310917393109174, "loss": 0.6871, "step": 1810 }, { "epoch": 1.0929670993057652, "grad_norm": 0.11844879388809204, "learning_rate": 0.0001310502283105023, "loss": 1.2869, "step": 1811 }, { "epoch": 1.0935707817687896, "grad_norm": 0.11029795557260513, "learning_rate": 0.00013100871731008717, "loss": 0.7609, "step": 1812 }, { "epoch": 1.094174464231814, "grad_norm": 0.11960664391517639, "learning_rate": 0.00013096720630967207, "loss": 0.6971, "step": 1813 }, { "epoch": 1.0947781466948385, "grad_norm": 0.09530165046453476, "learning_rate": 0.00013092569530925695, "loss": 0.7224, "step": 1814 }, { "epoch": 1.095381829157863, "grad_norm": 0.1267603486776352, "learning_rate": 0.00013088418430884185, "loss": 0.815, "step": 1815 }, { "epoch": 1.0959855116208874, "grad_norm": 0.11272140592336655, "learning_rate": 0.00013084267330842675, "loss": 0.78, "step": 1816 }, { "epoch": 1.0965891940839119, "grad_norm": 0.10290253907442093, "learning_rate": 0.00013080116230801162, "loss": 0.7218, "step": 1817 }, { "epoch": 1.0971928765469363, "grad_norm": 0.09555505961179733, "learning_rate": 0.00013075965130759652, "loss": 0.7541, "step": 1818 }, { "epoch": 1.0977965590099608, "grad_norm": 0.10414687544107437, "learning_rate": 0.00013071814030718142, "loss": 0.9603, "step": 1819 }, { "epoch": 1.0984002414729852, "grad_norm": 0.09657555818557739, "learning_rate": 0.0001306766293067663, "loss": 0.7862, "step": 1820 }, { "epoch": 1.0990039239360097, "grad_norm": 0.09514836966991425, "learning_rate": 0.0001306351183063512, "loss": 0.7408, "step": 1821 }, { "epoch": 1.099607606399034, "grad_norm": 0.09766734391450882, "learning_rate": 0.0001305936073059361, "loss": 0.7459, "step": 1822 }, { "epoch": 1.1002112888620585, "grad_norm": 0.08910214155912399, "learning_rate": 0.00013055209630552096, "loss": 1.0229, "step": 1823 }, { "epoch": 1.100814971325083, "grad_norm": 0.10597441345453262, "learning_rate": 0.00013051058530510586, "loss": 0.7877, "step": 1824 }, { "epoch": 1.1014186537881074, "grad_norm": 0.10066650807857513, "learning_rate": 0.00013046907430469076, "loss": 0.6925, "step": 1825 }, { "epoch": 1.1020223362511319, "grad_norm": 0.11692957580089569, "learning_rate": 0.00013042756330427564, "loss": 0.8301, "step": 1826 }, { "epoch": 1.1026260187141563, "grad_norm": 0.10520220547914505, "learning_rate": 0.00013038605230386054, "loss": 0.8868, "step": 1827 }, { "epoch": 1.1032297011771808, "grad_norm": 0.10756828635931015, "learning_rate": 0.0001303445413034454, "loss": 0.7164, "step": 1828 }, { "epoch": 1.1038333836402052, "grad_norm": 0.10687938332557678, "learning_rate": 0.0001303030303030303, "loss": 0.8075, "step": 1829 }, { "epoch": 1.1044370661032297, "grad_norm": 0.09681583940982819, "learning_rate": 0.0001302615193026152, "loss": 0.7073, "step": 1830 }, { "epoch": 1.1050407485662541, "grad_norm": 0.0960078164935112, "learning_rate": 0.00013022000830220008, "loss": 0.7665, "step": 1831 }, { "epoch": 1.1056444310292786, "grad_norm": 0.10860790312290192, "learning_rate": 0.00013017849730178498, "loss": 0.7155, "step": 1832 }, { "epoch": 1.106248113492303, "grad_norm": 0.10167111456394196, "learning_rate": 0.00013013698630136988, "loss": 0.7887, "step": 1833 }, { "epoch": 1.1068517959553275, "grad_norm": 0.09937427192926407, "learning_rate": 0.00013009547530095476, "loss": 0.7305, "step": 1834 }, { "epoch": 1.107455478418352, "grad_norm": 0.09677271544933319, "learning_rate": 0.00013005396430053966, "loss": 0.7064, "step": 1835 }, { "epoch": 1.1080591608813763, "grad_norm": 0.10012287646532059, "learning_rate": 0.00013001245330012456, "loss": 0.6711, "step": 1836 }, { "epoch": 1.1086628433444008, "grad_norm": 0.10501682013273239, "learning_rate": 0.00012997094229970943, "loss": 0.8093, "step": 1837 }, { "epoch": 1.1092665258074252, "grad_norm": 0.09496523439884186, "learning_rate": 0.0001299294312992943, "loss": 0.7329, "step": 1838 }, { "epoch": 1.1098702082704497, "grad_norm": 0.09581634402275085, "learning_rate": 0.00012988792029887923, "loss": 0.6305, "step": 1839 }, { "epoch": 1.1104738907334741, "grad_norm": 0.09801549464464188, "learning_rate": 0.0001298464092984641, "loss": 0.5904, "step": 1840 }, { "epoch": 1.1110775731964986, "grad_norm": 0.0966857448220253, "learning_rate": 0.00012980489829804898, "loss": 0.7596, "step": 1841 }, { "epoch": 1.111681255659523, "grad_norm": 0.10019635409116745, "learning_rate": 0.00012976338729763388, "loss": 0.6856, "step": 1842 }, { "epoch": 1.1122849381225475, "grad_norm": 0.09953883290290833, "learning_rate": 0.00012972187629721878, "loss": 0.7132, "step": 1843 }, { "epoch": 1.112888620585572, "grad_norm": 0.09991202503442764, "learning_rate": 0.00012968036529680365, "loss": 0.613, "step": 1844 }, { "epoch": 1.1134923030485964, "grad_norm": 0.09988027811050415, "learning_rate": 0.00012963885429638855, "loss": 0.6142, "step": 1845 }, { "epoch": 1.1140959855116208, "grad_norm": 0.10957928746938705, "learning_rate": 0.00012959734329597345, "loss": 0.6573, "step": 1846 }, { "epoch": 1.1146996679746453, "grad_norm": 0.11322573572397232, "learning_rate": 0.00012955583229555832, "loss": 0.6891, "step": 1847 }, { "epoch": 1.1153033504376697, "grad_norm": 0.11455134302377701, "learning_rate": 0.00012951432129514322, "loss": 0.6676, "step": 1848 }, { "epoch": 1.1159070329006942, "grad_norm": 0.14914073050022125, "learning_rate": 0.00012947281029472812, "loss": 0.6206, "step": 1849 }, { "epoch": 1.1165107153637186, "grad_norm": 0.12033107876777649, "learning_rate": 0.000129431299294313, "loss": 0.5269, "step": 1850 }, { "epoch": 1.117114397826743, "grad_norm": 0.13541612029075623, "learning_rate": 0.0001293897882938979, "loss": 0.6263, "step": 1851 }, { "epoch": 1.1177180802897675, "grad_norm": 0.12948794662952423, "learning_rate": 0.00012934827729348277, "loss": 0.6229, "step": 1852 }, { "epoch": 1.118321762752792, "grad_norm": 0.13281860947608948, "learning_rate": 0.00012930676629306767, "loss": 0.6001, "step": 1853 }, { "epoch": 1.1189254452158164, "grad_norm": 0.14206023514270782, "learning_rate": 0.00012926525529265257, "loss": 0.5866, "step": 1854 }, { "epoch": 1.1195291276788408, "grad_norm": 0.15054769814014435, "learning_rate": 0.00012922374429223744, "loss": 0.4866, "step": 1855 }, { "epoch": 1.1201328101418653, "grad_norm": 0.15093187987804413, "learning_rate": 0.00012918223329182234, "loss": 0.4194, "step": 1856 }, { "epoch": 1.1207364926048897, "grad_norm": 0.14579585194587708, "learning_rate": 0.00012914072229140724, "loss": 0.3209, "step": 1857 }, { "epoch": 1.1213401750679144, "grad_norm": 0.11571165919303894, "learning_rate": 0.0001290992112909921, "loss": 0.8016, "step": 1858 }, { "epoch": 1.1219438575309386, "grad_norm": 0.13167442381381989, "learning_rate": 0.00012905770029057699, "loss": 0.7073, "step": 1859 }, { "epoch": 1.1225475399939633, "grad_norm": 0.09710317105054855, "learning_rate": 0.0001290161892901619, "loss": 0.6618, "step": 1860 }, { "epoch": 1.1231512224569875, "grad_norm": 0.1127190887928009, "learning_rate": 0.00012897467828974679, "loss": 0.7702, "step": 1861 }, { "epoch": 1.1237549049200122, "grad_norm": 0.09531667083501816, "learning_rate": 0.00012893316728933166, "loss": 0.7331, "step": 1862 }, { "epoch": 1.1243585873830366, "grad_norm": 0.10616014897823334, "learning_rate": 0.0001288916562889166, "loss": 0.6404, "step": 1863 }, { "epoch": 1.124962269846061, "grad_norm": 0.09906172007322311, "learning_rate": 0.00012885014528850146, "loss": 0.7295, "step": 1864 }, { "epoch": 1.1255659523090855, "grad_norm": 0.10826185345649719, "learning_rate": 0.00012880863428808633, "loss": 0.7036, "step": 1865 }, { "epoch": 1.12616963477211, "grad_norm": 0.09856102615594864, "learning_rate": 0.00012876712328767123, "loss": 0.6587, "step": 1866 }, { "epoch": 1.1267733172351344, "grad_norm": 0.11088103801012039, "learning_rate": 0.00012872561228725613, "loss": 0.8501, "step": 1867 }, { "epoch": 1.1273769996981589, "grad_norm": 0.10369934886693954, "learning_rate": 0.00012868410128684103, "loss": 0.9292, "step": 1868 }, { "epoch": 1.1279806821611833, "grad_norm": 0.10459262132644653, "learning_rate": 0.0001286425902864259, "loss": 0.8489, "step": 1869 }, { "epoch": 1.1285843646242077, "grad_norm": 0.10528787970542908, "learning_rate": 0.0001286010792860108, "loss": 0.7793, "step": 1870 }, { "epoch": 1.1291880470872322, "grad_norm": 0.10074903815984726, "learning_rate": 0.0001285595682855957, "loss": 0.7929, "step": 1871 }, { "epoch": 1.1297917295502566, "grad_norm": 0.09653479605913162, "learning_rate": 0.00012851805728518058, "loss": 0.8025, "step": 1872 }, { "epoch": 1.130395412013281, "grad_norm": 0.10775607079267502, "learning_rate": 0.00012847654628476545, "loss": 0.9679, "step": 1873 }, { "epoch": 1.1309990944763055, "grad_norm": 0.10731203109025955, "learning_rate": 0.00012843503528435038, "loss": 0.7715, "step": 1874 }, { "epoch": 1.13160277693933, "grad_norm": 0.1124483272433281, "learning_rate": 0.00012839352428393525, "loss": 0.8295, "step": 1875 }, { "epoch": 1.1322064594023544, "grad_norm": 0.21262244880199432, "learning_rate": 0.00012835201328352012, "loss": 1.0585, "step": 1876 }, { "epoch": 1.1328101418653789, "grad_norm": 0.10024654865264893, "learning_rate": 0.00012831050228310505, "loss": 0.7567, "step": 1877 }, { "epoch": 1.1334138243284033, "grad_norm": 0.11772340536117554, "learning_rate": 0.00012826899128268992, "loss": 0.8871, "step": 1878 }, { "epoch": 1.1340175067914278, "grad_norm": 0.10074224323034286, "learning_rate": 0.0001282274802822748, "loss": 0.9096, "step": 1879 }, { "epoch": 1.1346211892544522, "grad_norm": 0.1045253723859787, "learning_rate": 0.0001281859692818597, "loss": 0.6523, "step": 1880 }, { "epoch": 1.1352248717174767, "grad_norm": 0.10628263652324677, "learning_rate": 0.0001281444582814446, "loss": 0.6945, "step": 1881 }, { "epoch": 1.135828554180501, "grad_norm": 0.09413371235132217, "learning_rate": 0.00012810294728102947, "loss": 0.6471, "step": 1882 }, { "epoch": 1.1364322366435256, "grad_norm": 0.10187766700983047, "learning_rate": 0.00012806143628061437, "loss": 0.7088, "step": 1883 }, { "epoch": 1.13703591910655, "grad_norm": 0.10155726969242096, "learning_rate": 0.00012801992528019927, "loss": 0.8189, "step": 1884 }, { "epoch": 1.1376396015695744, "grad_norm": 0.101055808365345, "learning_rate": 0.00012797841427978414, "loss": 0.753, "step": 1885 }, { "epoch": 1.138243284032599, "grad_norm": 0.10111107677221298, "learning_rate": 0.00012793690327936904, "loss": 0.8097, "step": 1886 }, { "epoch": 1.1388469664956233, "grad_norm": 0.10023274272680283, "learning_rate": 0.00012789539227895392, "loss": 0.8272, "step": 1887 }, { "epoch": 1.1394506489586478, "grad_norm": 0.09850174188613892, "learning_rate": 0.00012785388127853882, "loss": 0.9205, "step": 1888 }, { "epoch": 1.1400543314216722, "grad_norm": 0.10670360922813416, "learning_rate": 0.00012781237027812372, "loss": 0.7138, "step": 1889 }, { "epoch": 1.1406580138846967, "grad_norm": 0.10679405182600021, "learning_rate": 0.0001277708592777086, "loss": 0.8036, "step": 1890 }, { "epoch": 1.1412616963477211, "grad_norm": 0.10706860572099686, "learning_rate": 0.0001277293482772935, "loss": 0.6126, "step": 1891 }, { "epoch": 1.1418653788107456, "grad_norm": 0.10065101832151413, "learning_rate": 0.0001276878372768784, "loss": 0.6491, "step": 1892 }, { "epoch": 1.14246906127377, "grad_norm": 0.10702386498451233, "learning_rate": 0.00012764632627646326, "loss": 0.7233, "step": 1893 }, { "epoch": 1.1430727437367945, "grad_norm": 0.09997487813234329, "learning_rate": 0.00012760481527604816, "loss": 0.6635, "step": 1894 }, { "epoch": 1.143676426199819, "grad_norm": 0.09972809255123138, "learning_rate": 0.00012756330427563306, "loss": 0.5934, "step": 1895 }, { "epoch": 1.1442801086628434, "grad_norm": 0.11297734826803207, "learning_rate": 0.00012752179327521794, "loss": 0.6151, "step": 1896 }, { "epoch": 1.1448837911258678, "grad_norm": 0.15789483487606049, "learning_rate": 0.0001274802822748028, "loss": 0.6286, "step": 1897 }, { "epoch": 1.1454874735888922, "grad_norm": 0.10323326289653778, "learning_rate": 0.00012743877127438774, "loss": 0.5922, "step": 1898 }, { "epoch": 1.1460911560519167, "grad_norm": 0.11414547264575958, "learning_rate": 0.0001273972602739726, "loss": 0.6048, "step": 1899 }, { "epoch": 1.1466948385149411, "grad_norm": 0.11642369627952576, "learning_rate": 0.00012735574927355748, "loss": 0.6391, "step": 1900 }, { "epoch": 1.1472985209779656, "grad_norm": 0.12589019536972046, "learning_rate": 0.00012731423827314238, "loss": 0.6153, "step": 1901 }, { "epoch": 1.14790220344099, "grad_norm": 0.12789148092269897, "learning_rate": 0.00012727272727272728, "loss": 0.6229, "step": 1902 }, { "epoch": 1.1485058859040145, "grad_norm": 0.14213663339614868, "learning_rate": 0.00012723121627231215, "loss": 0.579, "step": 1903 }, { "epoch": 1.149109568367039, "grad_norm": 0.14148949086666107, "learning_rate": 0.00012718970527189705, "loss": 0.5596, "step": 1904 }, { "epoch": 1.1497132508300634, "grad_norm": 0.14953409135341644, "learning_rate": 0.00012714819427148195, "loss": 0.4723, "step": 1905 }, { "epoch": 1.1503169332930878, "grad_norm": 0.1479395627975464, "learning_rate": 0.00012710668327106683, "loss": 0.4351, "step": 1906 }, { "epoch": 1.1509206157561123, "grad_norm": 0.13544803857803345, "learning_rate": 0.00012706517227065173, "loss": 0.3325, "step": 1907 }, { "epoch": 1.1515242982191367, "grad_norm": 0.1175408810377121, "learning_rate": 0.00012702366127023663, "loss": 0.6767, "step": 1908 }, { "epoch": 1.1521279806821612, "grad_norm": 0.209543377161026, "learning_rate": 0.0001269821502698215, "loss": 0.9705, "step": 1909 }, { "epoch": 1.1527316631451856, "grad_norm": 0.10693584382534027, "learning_rate": 0.0001269406392694064, "loss": 0.7096, "step": 1910 }, { "epoch": 1.15333534560821, "grad_norm": 0.10303391516208649, "learning_rate": 0.00012689912826899127, "loss": 0.7388, "step": 1911 }, { "epoch": 1.1539390280712345, "grad_norm": 0.1123664528131485, "learning_rate": 0.00012685761726857617, "loss": 0.7372, "step": 1912 }, { "epoch": 1.154542710534259, "grad_norm": 0.10753045976161957, "learning_rate": 0.00012681610626816107, "loss": 0.7714, "step": 1913 }, { "epoch": 1.1551463929972834, "grad_norm": 0.10717937350273132, "learning_rate": 0.00012677459526774595, "loss": 0.6893, "step": 1914 }, { "epoch": 1.1557500754603078, "grad_norm": 0.09875158965587616, "learning_rate": 0.00012673308426733085, "loss": 0.7437, "step": 1915 }, { "epoch": 1.1563537579233323, "grad_norm": 0.12408756464719772, "learning_rate": 0.00012669157326691575, "loss": 0.7512, "step": 1916 }, { "epoch": 1.1569574403863567, "grad_norm": 0.10806753486394882, "learning_rate": 0.00012665006226650062, "loss": 0.6687, "step": 1917 }, { "epoch": 1.1575611228493812, "grad_norm": 0.10482674092054367, "learning_rate": 0.00012660855126608552, "loss": 0.6417, "step": 1918 }, { "epoch": 1.1581648053124056, "grad_norm": 0.10658420622348785, "learning_rate": 0.00012656704026567042, "loss": 0.876, "step": 1919 }, { "epoch": 1.15876848777543, "grad_norm": 0.10012143105268478, "learning_rate": 0.0001265255292652553, "loss": 0.6473, "step": 1920 }, { "epoch": 1.1593721702384545, "grad_norm": 0.12289892137050629, "learning_rate": 0.0001264840182648402, "loss": 0.7602, "step": 1921 }, { "epoch": 1.159975852701479, "grad_norm": 0.0990784764289856, "learning_rate": 0.0001264425072644251, "loss": 0.6372, "step": 1922 }, { "epoch": 1.1605795351645034, "grad_norm": 0.0976581797003746, "learning_rate": 0.00012640099626400997, "loss": 0.7036, "step": 1923 }, { "epoch": 1.1611832176275279, "grad_norm": 0.11021608114242554, "learning_rate": 0.00012635948526359487, "loss": 0.7666, "step": 1924 }, { "epoch": 1.1617869000905523, "grad_norm": 0.2441258430480957, "learning_rate": 0.00012631797426317974, "loss": 0.7026, "step": 1925 }, { "epoch": 1.1623905825535767, "grad_norm": 0.10685604065656662, "learning_rate": 0.00012627646326276464, "loss": 0.8768, "step": 1926 }, { "epoch": 1.1629942650166012, "grad_norm": 0.10796701908111572, "learning_rate": 0.00012623495226234954, "loss": 0.9038, "step": 1927 }, { "epoch": 1.1635979474796256, "grad_norm": 0.1107090562582016, "learning_rate": 0.0001261934412619344, "loss": 0.7874, "step": 1928 }, { "epoch": 1.16420162994265, "grad_norm": 0.10373709350824356, "learning_rate": 0.0001261519302615193, "loss": 0.7946, "step": 1929 }, { "epoch": 1.1648053124056745, "grad_norm": 0.09496215730905533, "learning_rate": 0.0001261104192611042, "loss": 0.688, "step": 1930 }, { "epoch": 1.165408994868699, "grad_norm": 0.10575946420431137, "learning_rate": 0.00012606890826068908, "loss": 0.7762, "step": 1931 }, { "epoch": 1.1660126773317234, "grad_norm": 0.0949690192937851, "learning_rate": 0.00012602739726027398, "loss": 0.6066, "step": 1932 }, { "epoch": 1.1666163597947479, "grad_norm": 0.12045732140541077, "learning_rate": 0.00012598588625985888, "loss": 0.9106, "step": 1933 }, { "epoch": 1.1672200422577723, "grad_norm": 0.10367399454116821, "learning_rate": 0.00012594437525944376, "loss": 0.7499, "step": 1934 }, { "epoch": 1.167823724720797, "grad_norm": 0.09977395832538605, "learning_rate": 0.00012590286425902863, "loss": 0.6043, "step": 1935 }, { "epoch": 1.1684274071838212, "grad_norm": 0.10659980773925781, "learning_rate": 0.00012586135325861356, "loss": 0.8781, "step": 1936 }, { "epoch": 1.1690310896468459, "grad_norm": 0.0970248430967331, "learning_rate": 0.00012581984225819843, "loss": 0.6159, "step": 1937 }, { "epoch": 1.16963477210987, "grad_norm": 0.10996770858764648, "learning_rate": 0.0001257783312577833, "loss": 0.7535, "step": 1938 }, { "epoch": 1.1702384545728948, "grad_norm": 0.10588741302490234, "learning_rate": 0.0001257368202573682, "loss": 0.6802, "step": 1939 }, { "epoch": 1.170842137035919, "grad_norm": 0.1041964441537857, "learning_rate": 0.0001256953092569531, "loss": 0.731, "step": 1940 }, { "epoch": 1.1714458194989437, "grad_norm": 0.10040424019098282, "learning_rate": 0.00012565379825653798, "loss": 0.645, "step": 1941 }, { "epoch": 1.1720495019619679, "grad_norm": 0.11066184937953949, "learning_rate": 0.00012561228725612288, "loss": 0.6855, "step": 1942 }, { "epoch": 1.1726531844249926, "grad_norm": 0.10693058371543884, "learning_rate": 0.00012557077625570778, "loss": 0.6097, "step": 1943 }, { "epoch": 1.1732568668880168, "grad_norm": 0.10633829981088638, "learning_rate": 0.00012552926525529265, "loss": 0.6503, "step": 1944 }, { "epoch": 1.1738605493510414, "grad_norm": 0.10232945531606674, "learning_rate": 0.00012548775425487755, "loss": 0.6123, "step": 1945 }, { "epoch": 1.174464231814066, "grad_norm": 0.11123668402433395, "learning_rate": 0.00012544624325446245, "loss": 0.574, "step": 1946 }, { "epoch": 1.1750679142770903, "grad_norm": 0.11907467991113663, "learning_rate": 0.00012540473225404732, "loss": 0.6328, "step": 1947 }, { "epoch": 1.1756715967401148, "grad_norm": 0.11772100627422333, "learning_rate": 0.00012536322125363222, "loss": 0.5881, "step": 1948 }, { "epoch": 1.1762752792031392, "grad_norm": 0.11829902976751328, "learning_rate": 0.0001253217102532171, "loss": 0.6201, "step": 1949 }, { "epoch": 1.1768789616661637, "grad_norm": 0.13115333020687103, "learning_rate": 0.000125280199252802, "loss": 0.5381, "step": 1950 }, { "epoch": 1.1774826441291881, "grad_norm": 0.1208442747592926, "learning_rate": 0.0001252386882523869, "loss": 0.5937, "step": 1951 }, { "epoch": 1.1780863265922126, "grad_norm": 0.13914504647254944, "learning_rate": 0.00012519717725197177, "loss": 0.5468, "step": 1952 }, { "epoch": 1.178690009055237, "grad_norm": 0.14542445540428162, "learning_rate": 0.00012515566625155667, "loss": 0.5401, "step": 1953 }, { "epoch": 1.1792936915182615, "grad_norm": 0.14165444672107697, "learning_rate": 0.00012511415525114157, "loss": 0.4952, "step": 1954 }, { "epoch": 1.179897373981286, "grad_norm": 0.14939822256565094, "learning_rate": 0.00012507264425072644, "loss": 0.4343, "step": 1955 }, { "epoch": 1.1805010564443104, "grad_norm": 0.14716513454914093, "learning_rate": 0.00012503113325031131, "loss": 0.3985, "step": 1956 }, { "epoch": 1.1811047389073348, "grad_norm": 0.157403826713562, "learning_rate": 0.00012498962224989624, "loss": 0.3312, "step": 1957 }, { "epoch": 1.1817084213703593, "grad_norm": 0.12419522553682327, "learning_rate": 0.00012494811124948111, "loss": 0.9177, "step": 1958 }, { "epoch": 1.1823121038333837, "grad_norm": 0.10016956180334091, "learning_rate": 0.00012490660024906601, "loss": 0.8362, "step": 1959 }, { "epoch": 1.1829157862964081, "grad_norm": 0.12086569517850876, "learning_rate": 0.00012486508924865091, "loss": 0.7991, "step": 1960 }, { "epoch": 1.1835194687594326, "grad_norm": 0.11325091868638992, "learning_rate": 0.0001248235782482358, "loss": 0.7633, "step": 1961 }, { "epoch": 1.184123151222457, "grad_norm": 0.11552873998880386, "learning_rate": 0.0001247820672478207, "loss": 0.7515, "step": 1962 }, { "epoch": 1.1847268336854815, "grad_norm": 0.09600497037172318, "learning_rate": 0.00012474055624740556, "loss": 0.698, "step": 1963 }, { "epoch": 1.185330516148506, "grad_norm": 0.0974835678935051, "learning_rate": 0.00012469904524699046, "loss": 0.6602, "step": 1964 }, { "epoch": 1.1859341986115304, "grad_norm": 0.23812483251094818, "learning_rate": 0.00012465753424657536, "loss": 0.7079, "step": 1965 }, { "epoch": 1.1865378810745548, "grad_norm": 0.09787070006132126, "learning_rate": 0.00012461602324616023, "loss": 0.752, "step": 1966 }, { "epoch": 1.1871415635375793, "grad_norm": 0.10244850814342499, "learning_rate": 0.00012457451224574513, "loss": 0.8414, "step": 1967 }, { "epoch": 1.1877452460006037, "grad_norm": 0.11027634143829346, "learning_rate": 0.00012453300124533003, "loss": 0.8229, "step": 1968 }, { "epoch": 1.1883489284636282, "grad_norm": 0.1009831577539444, "learning_rate": 0.0001244914902449149, "loss": 0.9911, "step": 1969 }, { "epoch": 1.1889526109266526, "grad_norm": 0.10704313963651657, "learning_rate": 0.00012444997924449978, "loss": 0.7212, "step": 1970 }, { "epoch": 1.189556293389677, "grad_norm": 0.10770639777183533, "learning_rate": 0.0001244084682440847, "loss": 0.7613, "step": 1971 }, { "epoch": 1.1901599758527015, "grad_norm": 0.10859589278697968, "learning_rate": 0.00012436695724366958, "loss": 0.8025, "step": 1972 }, { "epoch": 1.190763658315726, "grad_norm": 0.09854567050933838, "learning_rate": 0.00012432544624325445, "loss": 0.6613, "step": 1973 }, { "epoch": 1.1913673407787504, "grad_norm": 0.10922684520483017, "learning_rate": 0.00012428393524283938, "loss": 0.6675, "step": 1974 }, { "epoch": 1.1919710232417748, "grad_norm": 0.11279135942459106, "learning_rate": 0.00012424242424242425, "loss": 0.7562, "step": 1975 }, { "epoch": 1.1925747057047993, "grad_norm": 0.10514727234840393, "learning_rate": 0.00012420091324200913, "loss": 0.6954, "step": 1976 }, { "epoch": 1.1931783881678237, "grad_norm": 0.0970945954322815, "learning_rate": 0.00012415940224159403, "loss": 0.8071, "step": 1977 }, { "epoch": 1.1937820706308482, "grad_norm": 0.11609134823083878, "learning_rate": 0.00012411789124117893, "loss": 0.8732, "step": 1978 }, { "epoch": 1.1943857530938726, "grad_norm": 0.10586526989936829, "learning_rate": 0.0001240763802407638, "loss": 0.6324, "step": 1979 }, { "epoch": 1.194989435556897, "grad_norm": 0.11197761446237564, "learning_rate": 0.0001240348692403487, "loss": 0.7699, "step": 1980 }, { "epoch": 1.1955931180199215, "grad_norm": 0.11370430141687393, "learning_rate": 0.0001239933582399336, "loss": 0.8104, "step": 1981 }, { "epoch": 1.196196800482946, "grad_norm": 0.0997292697429657, "learning_rate": 0.00012395184723951847, "loss": 0.7494, "step": 1982 }, { "epoch": 1.1968004829459704, "grad_norm": 0.10209706425666809, "learning_rate": 0.00012391033623910337, "loss": 0.6899, "step": 1983 }, { "epoch": 1.1974041654089949, "grad_norm": 0.09956377744674683, "learning_rate": 0.00012386882523868824, "loss": 0.7469, "step": 1984 }, { "epoch": 1.1980078478720193, "grad_norm": 0.12185216695070267, "learning_rate": 0.00012382731423827314, "loss": 1.0278, "step": 1985 }, { "epoch": 1.1986115303350438, "grad_norm": 0.16837285459041595, "learning_rate": 0.00012378580323785804, "loss": 0.8619, "step": 1986 }, { "epoch": 1.1992152127980682, "grad_norm": 0.10831692814826965, "learning_rate": 0.00012374429223744292, "loss": 0.6662, "step": 1987 }, { "epoch": 1.1998188952610926, "grad_norm": 0.09968318045139313, "learning_rate": 0.00012370278123702782, "loss": 0.7561, "step": 1988 }, { "epoch": 1.200422577724117, "grad_norm": 0.09389069676399231, "learning_rate": 0.00012366127023661272, "loss": 0.6967, "step": 1989 }, { "epoch": 1.2010262601871415, "grad_norm": 0.10120224207639694, "learning_rate": 0.0001236197592361976, "loss": 0.7008, "step": 1990 }, { "epoch": 1.201629942650166, "grad_norm": 0.1032261773943901, "learning_rate": 0.0001235782482357825, "loss": 0.6327, "step": 1991 }, { "epoch": 1.2022336251131904, "grad_norm": 0.10337409377098083, "learning_rate": 0.0001235367372353674, "loss": 0.6639, "step": 1992 }, { "epoch": 1.2028373075762149, "grad_norm": 0.1101367250084877, "learning_rate": 0.00012349522623495226, "loss": 0.6232, "step": 1993 }, { "epoch": 1.2034409900392393, "grad_norm": 0.1051281988620758, "learning_rate": 0.00012345371523453714, "loss": 0.6032, "step": 1994 }, { "epoch": 1.2040446725022638, "grad_norm": 0.10944084078073502, "learning_rate": 0.00012341220423412206, "loss": 0.635, "step": 1995 }, { "epoch": 1.2046483549652882, "grad_norm": 0.10979338735342026, "learning_rate": 0.00012337069323370694, "loss": 0.5944, "step": 1996 }, { "epoch": 1.2052520374283127, "grad_norm": 0.11819171160459518, "learning_rate": 0.0001233291822332918, "loss": 0.6785, "step": 1997 }, { "epoch": 1.205855719891337, "grad_norm": 0.12030988931655884, "learning_rate": 0.0001232876712328767, "loss": 0.6699, "step": 1998 }, { "epoch": 1.2064594023543616, "grad_norm": 0.12456244975328445, "learning_rate": 0.0001232461602324616, "loss": 0.6065, "step": 1999 }, { "epoch": 1.207063084817386, "grad_norm": 0.12571309506893158, "learning_rate": 0.00012320464923204648, "loss": 0.5504, "step": 2000 }, { "epoch": 1.207063084817386, "eval_loss": 0.8049178123474121, "eval_runtime": 1222.2521, "eval_samples_per_second": 2.283, "eval_steps_per_second": 0.286, "step": 2000 }, { "epoch": 1.2076667672804104, "grad_norm": 0.12835955619812012, "learning_rate": 0.00012316313823163138, "loss": 0.6229, "step": 2001 }, { "epoch": 1.208270449743435, "grad_norm": 0.12921744585037231, "learning_rate": 0.00012312162723121628, "loss": 0.5465, "step": 2002 }, { "epoch": 1.2088741322064593, "grad_norm": 0.139405757188797, "learning_rate": 0.00012308011623080118, "loss": 0.5361, "step": 2003 }, { "epoch": 1.2094778146694838, "grad_norm": 0.14658528566360474, "learning_rate": 0.00012303860523038605, "loss": 0.4847, "step": 2004 }, { "epoch": 1.2100814971325082, "grad_norm": 0.15193478763103485, "learning_rate": 0.00012299709422997095, "loss": 0.4933, "step": 2005 }, { "epoch": 1.2106851795955327, "grad_norm": 0.15982936322689056, "learning_rate": 0.00012295558322955586, "loss": 0.4008, "step": 2006 }, { "epoch": 1.2112888620585571, "grad_norm": 0.15444207191467285, "learning_rate": 0.00012291407222914073, "loss": 0.3094, "step": 2007 }, { "epoch": 1.2118925445215816, "grad_norm": 0.11019429564476013, "learning_rate": 0.0001228725612287256, "loss": 0.808, "step": 2008 }, { "epoch": 1.212496226984606, "grad_norm": 0.11978921294212341, "learning_rate": 0.00012283105022831053, "loss": 0.7044, "step": 2009 }, { "epoch": 1.2130999094476305, "grad_norm": 0.10817541182041168, "learning_rate": 0.0001227895392278954, "loss": 0.7133, "step": 2010 }, { "epoch": 1.213703591910655, "grad_norm": 0.10823164880275726, "learning_rate": 0.00012274802822748027, "loss": 0.7497, "step": 2011 }, { "epoch": 1.2143072743736794, "grad_norm": 0.10769373923540115, "learning_rate": 0.0001227065172270652, "loss": 0.9813, "step": 2012 }, { "epoch": 1.2149109568367038, "grad_norm": 0.11639753729104996, "learning_rate": 0.00012266500622665007, "loss": 0.8259, "step": 2013 }, { "epoch": 1.2155146392997283, "grad_norm": 0.10166247934103012, "learning_rate": 0.00012262349522623495, "loss": 0.6783, "step": 2014 }, { "epoch": 1.2161183217627527, "grad_norm": 0.10975832492113113, "learning_rate": 0.00012258198422581985, "loss": 0.8338, "step": 2015 }, { "epoch": 1.2167220042257774, "grad_norm": 0.12426348030567169, "learning_rate": 0.00012254047322540475, "loss": 0.7086, "step": 2016 }, { "epoch": 1.2173256866888016, "grad_norm": 0.09932038187980652, "learning_rate": 0.00012249896222498962, "loss": 0.7193, "step": 2017 }, { "epoch": 1.2179293691518263, "grad_norm": 0.11152757704257965, "learning_rate": 0.00012245745122457452, "loss": 0.7668, "step": 2018 }, { "epoch": 1.2185330516148505, "grad_norm": 0.1197136715054512, "learning_rate": 0.00012241594022415942, "loss": 1.1513, "step": 2019 }, { "epoch": 1.2191367340778752, "grad_norm": 0.12218881398439407, "learning_rate": 0.0001223744292237443, "loss": 0.5988, "step": 2020 }, { "epoch": 1.2197404165408994, "grad_norm": 0.10389691591262817, "learning_rate": 0.0001223329182233292, "loss": 0.7079, "step": 2021 }, { "epoch": 1.220344099003924, "grad_norm": 0.10646260529756546, "learning_rate": 0.00012229140722291407, "loss": 0.6731, "step": 2022 }, { "epoch": 1.2209477814669483, "grad_norm": 0.11292260885238647, "learning_rate": 0.00012224989622249897, "loss": 0.7529, "step": 2023 }, { "epoch": 1.221551463929973, "grad_norm": 0.11751693487167358, "learning_rate": 0.00012220838522208387, "loss": 0.873, "step": 2024 }, { "epoch": 1.2221551463929972, "grad_norm": 0.10794904083013535, "learning_rate": 0.00012216687422166874, "loss": 0.8017, "step": 2025 }, { "epoch": 1.2227588288560218, "grad_norm": 0.10329126566648483, "learning_rate": 0.00012212536322125364, "loss": 0.8863, "step": 2026 }, { "epoch": 1.2233625113190463, "grad_norm": 0.10396264493465424, "learning_rate": 0.00012208385222083854, "loss": 0.6399, "step": 2027 }, { "epoch": 1.2239661937820707, "grad_norm": 0.12621258199214935, "learning_rate": 0.00012204234122042341, "loss": 0.7813, "step": 2028 }, { "epoch": 1.2245698762450952, "grad_norm": 0.11045660823583603, "learning_rate": 0.0001220008302200083, "loss": 0.7743, "step": 2029 }, { "epoch": 1.2251735587081196, "grad_norm": 0.11149439960718155, "learning_rate": 0.0001219593192195932, "loss": 0.788, "step": 2030 }, { "epoch": 1.225777241171144, "grad_norm": 0.11239415407180786, "learning_rate": 0.00012191780821917808, "loss": 1.0028, "step": 2031 }, { "epoch": 1.2263809236341685, "grad_norm": 0.11321971565485, "learning_rate": 0.00012187629721876297, "loss": 0.7841, "step": 2032 }, { "epoch": 1.226984606097193, "grad_norm": 0.10027159005403519, "learning_rate": 0.00012183478621834787, "loss": 0.728, "step": 2033 }, { "epoch": 1.2275882885602174, "grad_norm": 0.09943227469921112, "learning_rate": 0.00012179327521793276, "loss": 0.7397, "step": 2034 }, { "epoch": 1.2281919710232418, "grad_norm": 0.09892190247774124, "learning_rate": 0.00012175176421751764, "loss": 0.806, "step": 2035 }, { "epoch": 1.2287956534862663, "grad_norm": 0.10850216448307037, "learning_rate": 0.00012171025321710254, "loss": 0.8292, "step": 2036 }, { "epoch": 1.2293993359492907, "grad_norm": 0.102396160364151, "learning_rate": 0.00012166874221668743, "loss": 0.7506, "step": 2037 }, { "epoch": 1.2300030184123152, "grad_norm": 0.12359751015901566, "learning_rate": 0.0001216272312162723, "loss": 0.6433, "step": 2038 }, { "epoch": 1.2306067008753396, "grad_norm": 0.10474561899900436, "learning_rate": 0.00012158572021585722, "loss": 0.7758, "step": 2039 }, { "epoch": 1.231210383338364, "grad_norm": 0.10111495852470398, "learning_rate": 0.00012154420921544209, "loss": 0.6707, "step": 2040 }, { "epoch": 1.2318140658013885, "grad_norm": 0.10861996561288834, "learning_rate": 0.00012150269821502698, "loss": 0.6847, "step": 2041 }, { "epoch": 1.232417748264413, "grad_norm": 0.10541880130767822, "learning_rate": 0.00012146118721461188, "loss": 0.6607, "step": 2042 }, { "epoch": 1.2330214307274374, "grad_norm": 0.11882983893156052, "learning_rate": 0.00012141967621419676, "loss": 0.6679, "step": 2043 }, { "epoch": 1.2336251131904619, "grad_norm": 0.10306091606616974, "learning_rate": 0.00012137816521378165, "loss": 0.6325, "step": 2044 }, { "epoch": 1.2342287956534863, "grad_norm": 0.11227353662252426, "learning_rate": 0.00012133665421336655, "loss": 0.6578, "step": 2045 }, { "epoch": 1.2348324781165108, "grad_norm": 0.11392517387866974, "learning_rate": 0.00012129514321295144, "loss": 0.6137, "step": 2046 }, { "epoch": 1.2354361605795352, "grad_norm": 0.117860347032547, "learning_rate": 0.00012125363221253634, "loss": 0.6464, "step": 2047 }, { "epoch": 1.2360398430425596, "grad_norm": 0.11536695808172226, "learning_rate": 0.00012121212121212122, "loss": 0.5613, "step": 2048 }, { "epoch": 1.236643525505584, "grad_norm": 0.12926429510116577, "learning_rate": 0.00012117061021170611, "loss": 0.588, "step": 2049 }, { "epoch": 1.2372472079686085, "grad_norm": 0.12854550778865814, "learning_rate": 0.00012112909921129101, "loss": 0.643, "step": 2050 }, { "epoch": 1.237850890431633, "grad_norm": 0.12910380959510803, "learning_rate": 0.0001210875882108759, "loss": 0.6016, "step": 2051 }, { "epoch": 1.2384545728946574, "grad_norm": 0.1356167048215866, "learning_rate": 0.00012104607721046077, "loss": 0.5593, "step": 2052 }, { "epoch": 1.2390582553576819, "grad_norm": 0.13904690742492676, "learning_rate": 0.00012100456621004568, "loss": 0.5238, "step": 2053 }, { "epoch": 1.2396619378207063, "grad_norm": 0.14844262599945068, "learning_rate": 0.00012096305520963056, "loss": 0.5047, "step": 2054 }, { "epoch": 1.2402656202837308, "grad_norm": 0.14652326703071594, "learning_rate": 0.00012092154420921544, "loss": 0.4741, "step": 2055 }, { "epoch": 1.2408693027467552, "grad_norm": 0.15764220058918, "learning_rate": 0.00012088003320880034, "loss": 0.3982, "step": 2056 }, { "epoch": 1.2414729852097797, "grad_norm": 0.146280899643898, "learning_rate": 0.00012083852220838523, "loss": 0.3012, "step": 2057 }, { "epoch": 1.2420766676728041, "grad_norm": 0.13532495498657227, "learning_rate": 0.00012079701120797011, "loss": 0.7918, "step": 2058 }, { "epoch": 1.2426803501358286, "grad_norm": 0.11222409456968307, "learning_rate": 0.00012075550020755501, "loss": 0.7279, "step": 2059 }, { "epoch": 1.243284032598853, "grad_norm": 0.12232893705368042, "learning_rate": 0.0001207139892071399, "loss": 0.8694, "step": 2060 }, { "epoch": 1.2438877150618775, "grad_norm": 0.11951932311058044, "learning_rate": 0.00012067247820672479, "loss": 0.6634, "step": 2061 }, { "epoch": 1.244491397524902, "grad_norm": 0.10638052970170975, "learning_rate": 0.00012063096720630969, "loss": 0.7437, "step": 2062 }, { "epoch": 1.2450950799879263, "grad_norm": 0.10910008102655411, "learning_rate": 0.00012058945620589457, "loss": 0.8246, "step": 2063 }, { "epoch": 1.2456987624509508, "grad_norm": 0.12002308666706085, "learning_rate": 0.00012054794520547945, "loss": 0.8136, "step": 2064 }, { "epoch": 1.2463024449139752, "grad_norm": 0.10318660736083984, "learning_rate": 0.00012050643420506436, "loss": 0.9517, "step": 2065 }, { "epoch": 1.2469061273769997, "grad_norm": 0.10524383932352066, "learning_rate": 0.00012046492320464923, "loss": 0.8019, "step": 2066 }, { "epoch": 1.2475098098400241, "grad_norm": 0.10615126043558121, "learning_rate": 0.00012042341220423412, "loss": 0.732, "step": 2067 }, { "epoch": 1.2481134923030486, "grad_norm": 0.10820569843053818, "learning_rate": 0.00012038190120381902, "loss": 0.9747, "step": 2068 }, { "epoch": 1.248717174766073, "grad_norm": 0.10168500244617462, "learning_rate": 0.0001203403902034039, "loss": 0.7427, "step": 2069 }, { "epoch": 1.2493208572290975, "grad_norm": 0.11007564514875412, "learning_rate": 0.0001202988792029888, "loss": 0.6818, "step": 2070 }, { "epoch": 1.249924539692122, "grad_norm": 0.10033000260591507, "learning_rate": 0.0001202573682025737, "loss": 0.7134, "step": 2071 }, { "epoch": 1.2505282221551464, "grad_norm": 0.11808007955551147, "learning_rate": 0.00012021585720215858, "loss": 0.7718, "step": 2072 }, { "epoch": 1.2511319046181708, "grad_norm": 0.11681666970252991, "learning_rate": 0.00012017434620174347, "loss": 0.7327, "step": 2073 }, { "epoch": 1.2517355870811953, "grad_norm": 0.11521414667367935, "learning_rate": 0.00012013283520132837, "loss": 0.754, "step": 2074 }, { "epoch": 1.2523392695442197, "grad_norm": 0.09546779096126556, "learning_rate": 0.00012009132420091325, "loss": 0.7399, "step": 2075 }, { "epoch": 1.2529429520072441, "grad_norm": 0.10306226462125778, "learning_rate": 0.00012004981320049813, "loss": 0.682, "step": 2076 }, { "epoch": 1.2535466344702686, "grad_norm": 0.11089253425598145, "learning_rate": 0.00012000830220008304, "loss": 0.868, "step": 2077 }, { "epoch": 1.254150316933293, "grad_norm": 0.12641939520835876, "learning_rate": 0.00011996679119966791, "loss": 0.8043, "step": 2078 }, { "epoch": 1.2547539993963175, "grad_norm": 0.11239389330148697, "learning_rate": 0.0001199252801992528, "loss": 0.6766, "step": 2079 }, { "epoch": 1.255357681859342, "grad_norm": 0.11025764048099518, "learning_rate": 0.0001198837691988377, "loss": 0.7516, "step": 2080 }, { "epoch": 1.2559613643223664, "grad_norm": 0.10319259762763977, "learning_rate": 0.00011984225819842259, "loss": 0.791, "step": 2081 }, { "epoch": 1.2565650467853908, "grad_norm": 0.1244928389787674, "learning_rate": 0.00011980074719800747, "loss": 0.8266, "step": 2082 }, { "epoch": 1.2571687292484153, "grad_norm": 0.09494119137525558, "learning_rate": 0.00011975923619759237, "loss": 0.8277, "step": 2083 }, { "epoch": 1.2577724117114397, "grad_norm": 0.11088497191667557, "learning_rate": 0.00011971772519717726, "loss": 0.7471, "step": 2084 }, { "epoch": 1.2583760941744642, "grad_norm": 0.10349318385124207, "learning_rate": 0.00011967621419676213, "loss": 0.8844, "step": 2085 }, { "epoch": 1.2589797766374886, "grad_norm": 0.09668376296758652, "learning_rate": 0.00011963470319634704, "loss": 0.7249, "step": 2086 }, { "epoch": 1.259583459100513, "grad_norm": 0.11646587401628494, "learning_rate": 0.00011959319219593193, "loss": 0.9458, "step": 2087 }, { "epoch": 1.2601871415635375, "grad_norm": 0.11103519797325134, "learning_rate": 0.0001195516811955168, "loss": 0.8437, "step": 2088 }, { "epoch": 1.260790824026562, "grad_norm": 0.12291695177555084, "learning_rate": 0.00011951017019510172, "loss": 0.8271, "step": 2089 }, { "epoch": 1.2613945064895864, "grad_norm": 0.10509349405765533, "learning_rate": 0.00011946865919468659, "loss": 0.7225, "step": 2090 }, { "epoch": 1.2619981889526108, "grad_norm": 0.09915387630462646, "learning_rate": 0.0001194271481942715, "loss": 0.5739, "step": 2091 }, { "epoch": 1.2626018714156353, "grad_norm": 0.10708332806825638, "learning_rate": 0.00011938563719385638, "loss": 0.6239, "step": 2092 }, { "epoch": 1.26320555387866, "grad_norm": 0.10963302105665207, "learning_rate": 0.00011934412619344126, "loss": 0.6454, "step": 2093 }, { "epoch": 1.2638092363416842, "grad_norm": 0.10599739104509354, "learning_rate": 0.00011930261519302616, "loss": 0.6203, "step": 2094 }, { "epoch": 1.2644129188047089, "grad_norm": 0.11157579720020294, "learning_rate": 0.00011926110419261105, "loss": 0.689, "step": 2095 }, { "epoch": 1.265016601267733, "grad_norm": 0.11258841305971146, "learning_rate": 0.00011921959319219594, "loss": 0.5814, "step": 2096 }, { "epoch": 1.2656202837307577, "grad_norm": 0.11933767050504684, "learning_rate": 0.00011917808219178084, "loss": 0.6181, "step": 2097 }, { "epoch": 1.266223966193782, "grad_norm": 0.1172049269080162, "learning_rate": 0.00011913657119136572, "loss": 0.6099, "step": 2098 }, { "epoch": 1.2668276486568066, "grad_norm": 0.1134132593870163, "learning_rate": 0.0001190950601909506, "loss": 0.6055, "step": 2099 }, { "epoch": 1.2674313311198309, "grad_norm": 0.12405924499034882, "learning_rate": 0.00011905354919053551, "loss": 0.5902, "step": 2100 }, { "epoch": 1.2680350135828555, "grad_norm": 0.13423797488212585, "learning_rate": 0.0001190120381901204, "loss": 0.6814, "step": 2101 }, { "epoch": 1.2686386960458798, "grad_norm": 0.13356977701187134, "learning_rate": 0.00011897052718970527, "loss": 0.6092, "step": 2102 }, { "epoch": 1.2692423785089044, "grad_norm": 0.13142277300357819, "learning_rate": 0.00011892901618929018, "loss": 0.4849, "step": 2103 }, { "epoch": 1.2698460609719286, "grad_norm": 0.1493036448955536, "learning_rate": 0.00011888750518887506, "loss": 0.5734, "step": 2104 }, { "epoch": 1.2704497434349533, "grad_norm": 0.14654318988323212, "learning_rate": 0.00011884599418845994, "loss": 0.4648, "step": 2105 }, { "epoch": 1.2710534258979775, "grad_norm": 0.1532881259918213, "learning_rate": 0.00011880448318804484, "loss": 0.3987, "step": 2106 }, { "epoch": 1.2716571083610022, "grad_norm": 0.16462919116020203, "learning_rate": 0.00011876297218762973, "loss": 0.3358, "step": 2107 }, { "epoch": 1.2722607908240264, "grad_norm": 0.11268685758113861, "learning_rate": 0.00011872146118721462, "loss": 0.6965, "step": 2108 }, { "epoch": 1.272864473287051, "grad_norm": 0.11173847317695618, "learning_rate": 0.00011867995018679952, "loss": 0.9126, "step": 2109 }, { "epoch": 1.2734681557500753, "grad_norm": 0.10636541247367859, "learning_rate": 0.0001186384391863844, "loss": 0.8093, "step": 2110 }, { "epoch": 1.2740718382131, "grad_norm": 0.11559230089187622, "learning_rate": 0.00011859692818596927, "loss": 0.7025, "step": 2111 }, { "epoch": 1.2746755206761244, "grad_norm": 0.11961853504180908, "learning_rate": 0.00011855541718555419, "loss": 0.7638, "step": 2112 }, { "epoch": 1.2752792031391489, "grad_norm": 0.11241693049669266, "learning_rate": 0.00011851390618513906, "loss": 0.7422, "step": 2113 }, { "epoch": 1.2758828856021733, "grad_norm": 0.11459380388259888, "learning_rate": 0.00011847239518472395, "loss": 1.0325, "step": 2114 }, { "epoch": 1.2764865680651978, "grad_norm": 0.1107303723692894, "learning_rate": 0.00011843088418430886, "loss": 0.6566, "step": 2115 }, { "epoch": 1.2770902505282222, "grad_norm": 0.11201652884483337, "learning_rate": 0.00011838937318389373, "loss": 0.7305, "step": 2116 }, { "epoch": 1.2776939329912467, "grad_norm": 0.1064920499920845, "learning_rate": 0.00011834786218347862, "loss": 0.6963, "step": 2117 }, { "epoch": 1.2782976154542711, "grad_norm": 0.11608701199293137, "learning_rate": 0.00011830635118306352, "loss": 0.7162, "step": 2118 }, { "epoch": 1.2789012979172956, "grad_norm": 0.10472101718187332, "learning_rate": 0.00011826484018264841, "loss": 0.683, "step": 2119 }, { "epoch": 1.27950498038032, "grad_norm": 0.12958820164203644, "learning_rate": 0.0001182233291822333, "loss": 0.8274, "step": 2120 }, { "epoch": 1.2801086628433445, "grad_norm": 0.1009947806596756, "learning_rate": 0.0001181818181818182, "loss": 0.6998, "step": 2121 }, { "epoch": 1.280712345306369, "grad_norm": 0.11905679106712341, "learning_rate": 0.00011814030718140308, "loss": 0.7364, "step": 2122 }, { "epoch": 1.2813160277693934, "grad_norm": 0.10903461277484894, "learning_rate": 0.00011809879618098795, "loss": 0.7088, "step": 2123 }, { "epoch": 1.2819197102324178, "grad_norm": 0.11340314149856567, "learning_rate": 0.00011805728518057287, "loss": 0.8662, "step": 2124 }, { "epoch": 1.2825233926954422, "grad_norm": 0.10403577238321304, "learning_rate": 0.00011801577418015774, "loss": 0.8047, "step": 2125 }, { "epoch": 1.2831270751584667, "grad_norm": 0.12345468252897263, "learning_rate": 0.00011797426317974263, "loss": 0.9449, "step": 2126 }, { "epoch": 1.2837307576214911, "grad_norm": 0.11308707296848297, "learning_rate": 0.00011793275217932753, "loss": 0.7631, "step": 2127 }, { "epoch": 1.2843344400845156, "grad_norm": 0.10067742317914963, "learning_rate": 0.00011789124117891241, "loss": 0.7624, "step": 2128 }, { "epoch": 1.28493812254754, "grad_norm": 0.11228195577859879, "learning_rate": 0.0001178497301784973, "loss": 1.1616, "step": 2129 }, { "epoch": 1.2855418050105645, "grad_norm": 0.10720618814229965, "learning_rate": 0.0001178082191780822, "loss": 1.3342, "step": 2130 }, { "epoch": 1.286145487473589, "grad_norm": 0.10839608311653137, "learning_rate": 0.00011776670817766709, "loss": 0.9353, "step": 2131 }, { "epoch": 1.2867491699366134, "grad_norm": 0.10751800239086151, "learning_rate": 0.00011772519717725197, "loss": 0.7667, "step": 2132 }, { "epoch": 1.2873528523996378, "grad_norm": 0.10154768079519272, "learning_rate": 0.00011768368617683687, "loss": 0.794, "step": 2133 }, { "epoch": 1.2879565348626623, "grad_norm": 0.10651742666959763, "learning_rate": 0.00011764217517642176, "loss": 0.7428, "step": 2134 }, { "epoch": 1.2885602173256867, "grad_norm": 0.1084497794508934, "learning_rate": 0.00011760066417600663, "loss": 0.7594, "step": 2135 }, { "epoch": 1.2891638997887112, "grad_norm": 0.11117968708276749, "learning_rate": 0.00011755915317559155, "loss": 0.7812, "step": 2136 }, { "epoch": 1.2897675822517356, "grad_norm": 0.112984798848629, "learning_rate": 0.00011751764217517642, "loss": 0.6982, "step": 2137 }, { "epoch": 1.29037126471476, "grad_norm": 0.10472138971090317, "learning_rate": 0.00011747613117476133, "loss": 0.6542, "step": 2138 }, { "epoch": 1.2909749471777845, "grad_norm": 0.12380527704954147, "learning_rate": 0.0001174346201743462, "loss": 0.7346, "step": 2139 }, { "epoch": 1.291578629640809, "grad_norm": 0.10141772031784058, "learning_rate": 0.00011739310917393109, "loss": 0.6896, "step": 2140 }, { "epoch": 1.2921823121038334, "grad_norm": 0.10354811698198318, "learning_rate": 0.00011735159817351599, "loss": 0.6743, "step": 2141 }, { "epoch": 1.2927859945668578, "grad_norm": 0.10275600850582123, "learning_rate": 0.00011731008717310088, "loss": 0.6862, "step": 2142 }, { "epoch": 1.2933896770298823, "grad_norm": 0.10765067487955093, "learning_rate": 0.00011726857617268576, "loss": 0.6372, "step": 2143 }, { "epoch": 1.2939933594929067, "grad_norm": 0.10741368681192398, "learning_rate": 0.00011722706517227066, "loss": 0.6173, "step": 2144 }, { "epoch": 1.2945970419559312, "grad_norm": 0.11568807065486908, "learning_rate": 0.00011718555417185555, "loss": 0.7006, "step": 2145 }, { "epoch": 1.2952007244189556, "grad_norm": 0.12116066366434097, "learning_rate": 0.00011714404317144044, "loss": 0.678, "step": 2146 }, { "epoch": 1.29580440688198, "grad_norm": 0.1318727433681488, "learning_rate": 0.00011710253217102534, "loss": 0.6605, "step": 2147 }, { "epoch": 1.2964080893450045, "grad_norm": 0.1201576441526413, "learning_rate": 0.00011706102117061022, "loss": 0.6411, "step": 2148 }, { "epoch": 1.297011771808029, "grad_norm": 0.12639600038528442, "learning_rate": 0.0001170195101701951, "loss": 0.6679, "step": 2149 }, { "epoch": 1.2976154542710534, "grad_norm": 0.12824396789073944, "learning_rate": 0.00011697799916978001, "loss": 0.5963, "step": 2150 }, { "epoch": 1.2982191367340778, "grad_norm": 0.1317749321460724, "learning_rate": 0.00011693648816936488, "loss": 0.6115, "step": 2151 }, { "epoch": 1.2988228191971023, "grad_norm": 0.14059890806674957, "learning_rate": 0.00011689497716894977, "loss": 0.5661, "step": 2152 }, { "epoch": 1.2994265016601267, "grad_norm": 0.14957548677921295, "learning_rate": 0.00011685346616853467, "loss": 0.616, "step": 2153 }, { "epoch": 1.3000301841231512, "grad_norm": 0.15705609321594238, "learning_rate": 0.00011681195516811956, "loss": 0.5566, "step": 2154 }, { "epoch": 1.3006338665861756, "grad_norm": 0.14872965216636658, "learning_rate": 0.00011677044416770444, "loss": 0.4887, "step": 2155 }, { "epoch": 1.3012375490492, "grad_norm": 0.1517983227968216, "learning_rate": 0.00011672893316728934, "loss": 0.4465, "step": 2156 }, { "epoch": 1.3018412315122245, "grad_norm": 0.15609890222549438, "learning_rate": 0.00011668742216687423, "loss": 0.4065, "step": 2157 }, { "epoch": 1.302444913975249, "grad_norm": 0.11183691769838333, "learning_rate": 0.00011664591116645912, "loss": 0.6092, "step": 2158 }, { "epoch": 1.3030485964382734, "grad_norm": 0.10590233653783798, "learning_rate": 0.00011660440016604402, "loss": 0.9179, "step": 2159 }, { "epoch": 1.3036522789012979, "grad_norm": 0.11257438361644745, "learning_rate": 0.0001165628891656289, "loss": 0.7511, "step": 2160 }, { "epoch": 1.3042559613643223, "grad_norm": 0.11735875904560089, "learning_rate": 0.00011652137816521377, "loss": 0.8342, "step": 2161 }, { "epoch": 1.3048596438273468, "grad_norm": 0.12115509063005447, "learning_rate": 0.00011647986716479869, "loss": 0.6921, "step": 2162 }, { "epoch": 1.3054633262903712, "grad_norm": 0.10376877337694168, "learning_rate": 0.00011643835616438356, "loss": 0.5839, "step": 2163 }, { "epoch": 1.3060670087533957, "grad_norm": 0.10715942084789276, "learning_rate": 0.00011639684516396845, "loss": 0.7511, "step": 2164 }, { "epoch": 1.30667069121642, "grad_norm": 0.1192198321223259, "learning_rate": 0.00011635533416355335, "loss": 0.7023, "step": 2165 }, { "epoch": 1.3072743736794445, "grad_norm": 0.10451126098632812, "learning_rate": 0.00011631382316313823, "loss": 0.642, "step": 2166 }, { "epoch": 1.307878056142469, "grad_norm": 0.10327422618865967, "learning_rate": 0.00011627231216272312, "loss": 0.7792, "step": 2167 }, { "epoch": 1.3084817386054934, "grad_norm": 0.10710068792104721, "learning_rate": 0.00011623080116230802, "loss": 0.705, "step": 2168 }, { "epoch": 1.3090854210685179, "grad_norm": 0.10589960962533951, "learning_rate": 0.00011618929016189291, "loss": 0.6591, "step": 2169 }, { "epoch": 1.3096891035315423, "grad_norm": 0.10254230350255966, "learning_rate": 0.0001161477791614778, "loss": 0.723, "step": 2170 }, { "epoch": 1.3102927859945668, "grad_norm": 0.10744060575962067, "learning_rate": 0.0001161062681610627, "loss": 0.7218, "step": 2171 }, { "epoch": 1.3108964684575912, "grad_norm": 0.11566371470689774, "learning_rate": 0.00011606475716064758, "loss": 0.6746, "step": 2172 }, { "epoch": 1.3115001509206157, "grad_norm": 0.10865778475999832, "learning_rate": 0.00011602324616023245, "loss": 0.706, "step": 2173 }, { "epoch": 1.3121038333836403, "grad_norm": 0.11026459187269211, "learning_rate": 0.00011598173515981737, "loss": 0.7467, "step": 2174 }, { "epoch": 1.3127075158466646, "grad_norm": 0.1640511453151703, "learning_rate": 0.00011594022415940224, "loss": 0.754, "step": 2175 }, { "epoch": 1.3133111983096892, "grad_norm": 0.11436708271503448, "learning_rate": 0.00011589871315898713, "loss": 0.8731, "step": 2176 }, { "epoch": 1.3139148807727135, "grad_norm": 0.11194545775651932, "learning_rate": 0.00011585720215857203, "loss": 0.8425, "step": 2177 }, { "epoch": 1.3145185632357381, "grad_norm": 0.10926983505487442, "learning_rate": 0.00011581569115815691, "loss": 0.695, "step": 2178 }, { "epoch": 1.3151222456987623, "grad_norm": 0.11056378483772278, "learning_rate": 0.0001157741801577418, "loss": 0.7545, "step": 2179 }, { "epoch": 1.315725928161787, "grad_norm": 0.11646270751953125, "learning_rate": 0.0001157326691573267, "loss": 0.697, "step": 2180 }, { "epoch": 1.3163296106248112, "grad_norm": 0.096307173371315, "learning_rate": 0.00011569115815691159, "loss": 0.7044, "step": 2181 }, { "epoch": 1.316933293087836, "grad_norm": 0.10772639513015747, "learning_rate": 0.00011564964715649649, "loss": 0.6897, "step": 2182 }, { "epoch": 1.3175369755508601, "grad_norm": 0.10433688759803772, "learning_rate": 0.00011560813615608137, "loss": 0.7438, "step": 2183 }, { "epoch": 1.3181406580138848, "grad_norm": 0.10850182920694351, "learning_rate": 0.00011556662515566626, "loss": 0.712, "step": 2184 }, { "epoch": 1.318744340476909, "grad_norm": 0.11556683480739594, "learning_rate": 0.00011552511415525116, "loss": 0.7897, "step": 2185 }, { "epoch": 1.3193480229399337, "grad_norm": 0.11677438765764236, "learning_rate": 0.00011548360315483605, "loss": 0.9773, "step": 2186 }, { "epoch": 1.319951705402958, "grad_norm": 0.10189380496740341, "learning_rate": 0.00011544209215442092, "loss": 0.7935, "step": 2187 }, { "epoch": 1.3205553878659826, "grad_norm": 0.10953009128570557, "learning_rate": 0.00011540058115400583, "loss": 0.7774, "step": 2188 }, { "epoch": 1.3211590703290068, "grad_norm": 0.10907643288373947, "learning_rate": 0.0001153590701535907, "loss": 0.6565, "step": 2189 }, { "epoch": 1.3217627527920315, "grad_norm": 0.10476800054311752, "learning_rate": 0.00011531755915317559, "loss": 0.6466, "step": 2190 }, { "epoch": 1.3223664352550557, "grad_norm": 0.11148995906114578, "learning_rate": 0.00011527604815276049, "loss": 0.5893, "step": 2191 }, { "epoch": 1.3229701177180804, "grad_norm": 0.10962202399969101, "learning_rate": 0.00011523453715234538, "loss": 0.6037, "step": 2192 }, { "epoch": 1.3235738001811048, "grad_norm": 0.11205828934907913, "learning_rate": 0.00011519302615193026, "loss": 0.6462, "step": 2193 }, { "epoch": 1.3241774826441293, "grad_norm": 0.11916244029998779, "learning_rate": 0.00011515151515151516, "loss": 0.6124, "step": 2194 }, { "epoch": 1.3247811651071537, "grad_norm": 0.11426917463541031, "learning_rate": 0.00011511000415110005, "loss": 0.6288, "step": 2195 }, { "epoch": 1.3253848475701782, "grad_norm": 0.11942315101623535, "learning_rate": 0.00011506849315068494, "loss": 0.6146, "step": 2196 }, { "epoch": 1.3259885300332026, "grad_norm": 0.1271522343158722, "learning_rate": 0.00011502698215026984, "loss": 0.6174, "step": 2197 }, { "epoch": 1.326592212496227, "grad_norm": 0.12273116409778595, "learning_rate": 0.00011498547114985472, "loss": 0.6165, "step": 2198 }, { "epoch": 1.3271958949592515, "grad_norm": 0.12854193150997162, "learning_rate": 0.0001149439601494396, "loss": 0.6234, "step": 2199 }, { "epoch": 1.327799577422276, "grad_norm": 0.12989364564418793, "learning_rate": 0.00011490244914902451, "loss": 0.5936, "step": 2200 }, { "epoch": 1.3284032598853004, "grad_norm": 0.13210608065128326, "learning_rate": 0.00011486093814860938, "loss": 0.5591, "step": 2201 }, { "epoch": 1.3290069423483248, "grad_norm": 0.1452115774154663, "learning_rate": 0.00011481942714819427, "loss": 0.5945, "step": 2202 }, { "epoch": 1.3296106248113493, "grad_norm": 0.14136159420013428, "learning_rate": 0.00011477791614777917, "loss": 0.5191, "step": 2203 }, { "epoch": 1.3302143072743737, "grad_norm": 0.1478489339351654, "learning_rate": 0.00011473640514736406, "loss": 0.4752, "step": 2204 }, { "epoch": 1.3308179897373982, "grad_norm": 0.15822367370128632, "learning_rate": 0.00011469489414694894, "loss": 0.4601, "step": 2205 }, { "epoch": 1.3314216722004226, "grad_norm": 0.1557387262582779, "learning_rate": 0.00011465338314653384, "loss": 0.4091, "step": 2206 }, { "epoch": 1.332025354663447, "grad_norm": 0.15302599966526031, "learning_rate": 0.00011461187214611873, "loss": 0.3174, "step": 2207 }, { "epoch": 1.3326290371264715, "grad_norm": 0.10820896923542023, "learning_rate": 0.0001145703611457036, "loss": 0.852, "step": 2208 }, { "epoch": 1.333232719589496, "grad_norm": 0.10538190603256226, "learning_rate": 0.00011452885014528852, "loss": 0.6777, "step": 2209 }, { "epoch": 1.3338364020525204, "grad_norm": 0.11455769091844559, "learning_rate": 0.0001144873391448734, "loss": 0.7658, "step": 2210 }, { "epoch": 1.3344400845155449, "grad_norm": 0.12761631608009338, "learning_rate": 0.00011444582814445828, "loss": 0.7603, "step": 2211 }, { "epoch": 1.3350437669785693, "grad_norm": 0.12457510828971863, "learning_rate": 0.00011440431714404319, "loss": 0.8002, "step": 2212 }, { "epoch": 1.3356474494415937, "grad_norm": 0.11467002332210541, "learning_rate": 0.00011436280614362806, "loss": 0.7013, "step": 2213 }, { "epoch": 1.3362511319046182, "grad_norm": 0.10846070200204849, "learning_rate": 0.00011432129514321295, "loss": 0.8422, "step": 2214 }, { "epoch": 1.3368548143676426, "grad_norm": 0.11062701046466827, "learning_rate": 0.00011427978414279785, "loss": 0.691, "step": 2215 }, { "epoch": 1.337458496830667, "grad_norm": 0.10322251170873642, "learning_rate": 0.00011423827314238273, "loss": 0.7036, "step": 2216 }, { "epoch": 1.3380621792936915, "grad_norm": 0.09952748566865921, "learning_rate": 0.00011419676214196762, "loss": 0.6875, "step": 2217 }, { "epoch": 1.338665861756716, "grad_norm": 0.11490236967802048, "learning_rate": 0.00011415525114155252, "loss": 0.6921, "step": 2218 }, { "epoch": 1.3392695442197404, "grad_norm": 0.1203780472278595, "learning_rate": 0.00011411374014113741, "loss": 0.7415, "step": 2219 }, { "epoch": 1.3398732266827649, "grad_norm": 0.11478865891695023, "learning_rate": 0.00011407222914072228, "loss": 0.7476, "step": 2220 }, { "epoch": 1.3404769091457893, "grad_norm": 0.10916067659854889, "learning_rate": 0.0001140307181403072, "loss": 0.7539, "step": 2221 }, { "epoch": 1.3410805916088138, "grad_norm": 0.11967265605926514, "learning_rate": 0.00011398920713989207, "loss": 0.834, "step": 2222 }, { "epoch": 1.3416842740718382, "grad_norm": 0.12789706885814667, "learning_rate": 0.00011394769613947695, "loss": 0.7649, "step": 2223 }, { "epoch": 1.3422879565348627, "grad_norm": 0.11024732142686844, "learning_rate": 0.00011390618513906187, "loss": 0.8246, "step": 2224 }, { "epoch": 1.342891638997887, "grad_norm": 0.10574118793010712, "learning_rate": 0.00011386467413864674, "loss": 0.6421, "step": 2225 }, { "epoch": 1.3434953214609116, "grad_norm": 0.1084161251783371, "learning_rate": 0.00011382316313823165, "loss": 0.8431, "step": 2226 }, { "epoch": 1.344099003923936, "grad_norm": 0.112635038793087, "learning_rate": 0.00011378165213781653, "loss": 0.7057, "step": 2227 }, { "epoch": 1.3447026863869604, "grad_norm": 0.11063949763774872, "learning_rate": 0.00011374014113740141, "loss": 0.7764, "step": 2228 }, { "epoch": 1.345306368849985, "grad_norm": 0.1124119758605957, "learning_rate": 0.00011369863013698631, "loss": 0.9151, "step": 2229 }, { "epoch": 1.3459100513130093, "grad_norm": 0.1197538748383522, "learning_rate": 0.0001136571191365712, "loss": 0.6366, "step": 2230 }, { "epoch": 1.3465137337760338, "grad_norm": 0.1143086701631546, "learning_rate": 0.00011361560813615609, "loss": 1.0755, "step": 2231 }, { "epoch": 1.3471174162390582, "grad_norm": 0.1160961166024208, "learning_rate": 0.00011357409713574099, "loss": 0.7346, "step": 2232 }, { "epoch": 1.3477210987020827, "grad_norm": 0.1389804482460022, "learning_rate": 0.00011353258613532587, "loss": 0.7935, "step": 2233 }, { "epoch": 1.3483247811651071, "grad_norm": 0.10732017457485199, "learning_rate": 0.00011349107513491075, "loss": 1.1968, "step": 2234 }, { "epoch": 1.3489284636281316, "grad_norm": 0.10339702665805817, "learning_rate": 0.00011344956413449566, "loss": 0.674, "step": 2235 }, { "epoch": 1.349532146091156, "grad_norm": 0.10818582028150558, "learning_rate": 0.00011340805313408053, "loss": 0.7626, "step": 2236 }, { "epoch": 1.3501358285541805, "grad_norm": 0.1044488251209259, "learning_rate": 0.00011336654213366542, "loss": 0.719, "step": 2237 }, { "epoch": 1.350739511017205, "grad_norm": 0.20112541317939758, "learning_rate": 0.00011332503113325033, "loss": 0.5892, "step": 2238 }, { "epoch": 1.3513431934802294, "grad_norm": 0.10670458525419235, "learning_rate": 0.0001132835201328352, "loss": 0.6436, "step": 2239 }, { "epoch": 1.3519468759432538, "grad_norm": 0.11629335582256317, "learning_rate": 0.00011324200913242009, "loss": 1.0747, "step": 2240 }, { "epoch": 1.3525505584062782, "grad_norm": 0.14042681455612183, "learning_rate": 0.00011320049813200499, "loss": 0.6851, "step": 2241 }, { "epoch": 1.3531542408693027, "grad_norm": 0.10726059973239899, "learning_rate": 0.00011315898713158988, "loss": 0.6776, "step": 2242 }, { "epoch": 1.3537579233323271, "grad_norm": 0.10431487113237381, "learning_rate": 0.00011311747613117476, "loss": 0.5565, "step": 2243 }, { "epoch": 1.3543616057953516, "grad_norm": 0.11104051023721695, "learning_rate": 0.00011307596513075966, "loss": 0.6523, "step": 2244 }, { "epoch": 1.354965288258376, "grad_norm": 0.1104440838098526, "learning_rate": 0.00011303445413034455, "loss": 0.6657, "step": 2245 }, { "epoch": 1.3555689707214005, "grad_norm": 0.11780980974435806, "learning_rate": 0.00011299294312992942, "loss": 0.6437, "step": 2246 }, { "epoch": 1.356172653184425, "grad_norm": 0.11362163722515106, "learning_rate": 0.00011295143212951434, "loss": 0.6607, "step": 2247 }, { "epoch": 1.3567763356474494, "grad_norm": 0.12305210530757904, "learning_rate": 0.00011290992112909921, "loss": 0.6784, "step": 2248 }, { "epoch": 1.3573800181104738, "grad_norm": 0.12743736803531647, "learning_rate": 0.0001128684101286841, "loss": 0.6501, "step": 2249 }, { "epoch": 1.3579837005734983, "grad_norm": 0.12665079534053802, "learning_rate": 0.000112826899128269, "loss": 0.6922, "step": 2250 }, { "epoch": 1.3585873830365227, "grad_norm": 0.12639720737934113, "learning_rate": 0.00011278538812785388, "loss": 0.5699, "step": 2251 }, { "epoch": 1.3591910654995472, "grad_norm": 0.12889716029167175, "learning_rate": 0.00011274387712743877, "loss": 0.5657, "step": 2252 }, { "epoch": 1.3597947479625718, "grad_norm": 0.14307141304016113, "learning_rate": 0.00011270236612702367, "loss": 0.5307, "step": 2253 }, { "epoch": 1.360398430425596, "grad_norm": 0.13598784804344177, "learning_rate": 0.00011266085512660856, "loss": 0.4857, "step": 2254 }, { "epoch": 1.3610021128886207, "grad_norm": 0.150955468416214, "learning_rate": 0.00011261934412619344, "loss": 0.4596, "step": 2255 }, { "epoch": 1.361605795351645, "grad_norm": 0.14729629456996918, "learning_rate": 0.00011257783312577834, "loss": 0.3831, "step": 2256 }, { "epoch": 1.3622094778146696, "grad_norm": 0.15139052271842957, "learning_rate": 0.00011253632212536323, "loss": 0.3392, "step": 2257 }, { "epoch": 1.3628131602776938, "grad_norm": 0.10566865652799606, "learning_rate": 0.0001124948111249481, "loss": 0.6181, "step": 2258 }, { "epoch": 1.3634168427407185, "grad_norm": 0.1218709796667099, "learning_rate": 0.00011245330012453302, "loss": 0.7976, "step": 2259 }, { "epoch": 1.3640205252037427, "grad_norm": 0.11592400819063187, "learning_rate": 0.00011241178912411789, "loss": 0.8621, "step": 2260 }, { "epoch": 1.3646242076667674, "grad_norm": 0.1122564896941185, "learning_rate": 0.00011237027812370278, "loss": 0.6828, "step": 2261 }, { "epoch": 1.3652278901297916, "grad_norm": 0.1253998577594757, "learning_rate": 0.00011232876712328768, "loss": 0.7367, "step": 2262 }, { "epoch": 1.3658315725928163, "grad_norm": 0.11441956460475922, "learning_rate": 0.00011228725612287256, "loss": 0.7129, "step": 2263 }, { "epoch": 1.3664352550558405, "grad_norm": 0.18277932703495026, "learning_rate": 0.00011224574512245745, "loss": 0.7095, "step": 2264 }, { "epoch": 1.3670389375188652, "grad_norm": 0.10515805333852768, "learning_rate": 0.00011220423412204235, "loss": 0.6439, "step": 2265 }, { "epoch": 1.3676426199818894, "grad_norm": 0.10004261136054993, "learning_rate": 0.00011216272312162724, "loss": 0.7665, "step": 2266 }, { "epoch": 1.368246302444914, "grad_norm": 0.1079486608505249, "learning_rate": 0.00011212121212121212, "loss": 0.6854, "step": 2267 }, { "epoch": 1.3688499849079383, "grad_norm": 0.11932551115751266, "learning_rate": 0.00011207970112079702, "loss": 0.7683, "step": 2268 }, { "epoch": 1.369453667370963, "grad_norm": 0.13196608424186707, "learning_rate": 0.00011203819012038191, "loss": 0.8149, "step": 2269 }, { "epoch": 1.3700573498339872, "grad_norm": 0.1305970698595047, "learning_rate": 0.00011199667911996681, "loss": 0.8266, "step": 2270 }, { "epoch": 1.3706610322970119, "grad_norm": 0.10973062366247177, "learning_rate": 0.0001119551681195517, "loss": 0.6498, "step": 2271 }, { "epoch": 1.371264714760036, "grad_norm": 0.11326909810304642, "learning_rate": 0.00011191365711913657, "loss": 0.7068, "step": 2272 }, { "epoch": 1.3718683972230608, "grad_norm": 0.1138983964920044, "learning_rate": 0.00011187214611872148, "loss": 0.8917, "step": 2273 }, { "epoch": 1.3724720796860852, "grad_norm": 0.10338523238897324, "learning_rate": 0.00011183063511830635, "loss": 0.6048, "step": 2274 }, { "epoch": 1.3730757621491096, "grad_norm": 0.13740071654319763, "learning_rate": 0.00011178912411789124, "loss": 0.7794, "step": 2275 }, { "epoch": 1.373679444612134, "grad_norm": 0.11318357288837433, "learning_rate": 0.00011174761311747614, "loss": 0.7224, "step": 2276 }, { "epoch": 1.3742831270751585, "grad_norm": 0.10989907383918762, "learning_rate": 0.00011170610211706103, "loss": 0.7604, "step": 2277 }, { "epoch": 1.374886809538183, "grad_norm": 0.11369361728429794, "learning_rate": 0.00011166459111664591, "loss": 0.6629, "step": 2278 }, { "epoch": 1.3754904920012074, "grad_norm": 0.11444233357906342, "learning_rate": 0.00011162308011623081, "loss": 0.6644, "step": 2279 }, { "epoch": 1.3760941744642319, "grad_norm": 0.11095111072063446, "learning_rate": 0.0001115815691158157, "loss": 0.8084, "step": 2280 }, { "epoch": 1.3766978569272563, "grad_norm": 0.11327888071537018, "learning_rate": 0.00011154005811540059, "loss": 0.9121, "step": 2281 }, { "epoch": 1.3773015393902808, "grad_norm": 0.11659125983715057, "learning_rate": 0.00011149854711498549, "loss": 0.7475, "step": 2282 }, { "epoch": 1.3779052218533052, "grad_norm": 0.11424364149570465, "learning_rate": 0.00011145703611457037, "loss": 0.9011, "step": 2283 }, { "epoch": 1.3785089043163297, "grad_norm": 0.11273805052042007, "learning_rate": 0.00011141552511415525, "loss": 0.6689, "step": 2284 }, { "epoch": 1.379112586779354, "grad_norm": 0.1099931076169014, "learning_rate": 0.00011137401411374016, "loss": 0.7201, "step": 2285 }, { "epoch": 1.3797162692423786, "grad_norm": 0.11520043015480042, "learning_rate": 0.00011133250311332503, "loss": 0.6714, "step": 2286 }, { "epoch": 1.380319951705403, "grad_norm": 0.10492314398288727, "learning_rate": 0.00011129099211290992, "loss": 0.8564, "step": 2287 }, { "epoch": 1.3809236341684274, "grad_norm": 0.10921221226453781, "learning_rate": 0.00011124948111249482, "loss": 0.7088, "step": 2288 }, { "epoch": 1.381527316631452, "grad_norm": 0.11280515789985657, "learning_rate": 0.0001112079701120797, "loss": 1.1316, "step": 2289 }, { "epoch": 1.3821309990944763, "grad_norm": 0.10194700956344604, "learning_rate": 0.00011116645911166459, "loss": 1.0434, "step": 2290 }, { "epoch": 1.3827346815575008, "grad_norm": 0.11088079959154129, "learning_rate": 0.00011112494811124949, "loss": 0.663, "step": 2291 }, { "epoch": 1.3833383640205252, "grad_norm": 0.10625745356082916, "learning_rate": 0.00011108343711083438, "loss": 0.6609, "step": 2292 }, { "epoch": 1.3839420464835497, "grad_norm": 0.11164417862892151, "learning_rate": 0.00011104192611041927, "loss": 0.646, "step": 2293 }, { "epoch": 1.3845457289465741, "grad_norm": 0.11619970202445984, "learning_rate": 0.00011100041511000417, "loss": 0.6225, "step": 2294 }, { "epoch": 1.3851494114095986, "grad_norm": 0.11362091451883316, "learning_rate": 0.00011095890410958905, "loss": 0.6535, "step": 2295 }, { "epoch": 1.385753093872623, "grad_norm": 0.11212702840566635, "learning_rate": 0.00011091739310917392, "loss": 0.585, "step": 2296 }, { "epoch": 1.3863567763356475, "grad_norm": 0.12629151344299316, "learning_rate": 0.00011087588210875884, "loss": 0.6642, "step": 2297 }, { "epoch": 1.386960458798672, "grad_norm": 0.12351097911596298, "learning_rate": 0.00011083437110834371, "loss": 0.6866, "step": 2298 }, { "epoch": 1.3875641412616964, "grad_norm": 0.11870943009853363, "learning_rate": 0.0001107928601079286, "loss": 0.6017, "step": 2299 }, { "epoch": 1.3881678237247208, "grad_norm": 0.12148743122816086, "learning_rate": 0.0001107513491075135, "loss": 0.574, "step": 2300 }, { "epoch": 1.3887715061877453, "grad_norm": 0.1272771656513214, "learning_rate": 0.00011070983810709838, "loss": 0.5676, "step": 2301 }, { "epoch": 1.3893751886507697, "grad_norm": 0.13474443554878235, "learning_rate": 0.00011066832710668327, "loss": 0.5647, "step": 2302 }, { "epoch": 1.3899788711137941, "grad_norm": 0.16455814242362976, "learning_rate": 0.00011062681610626817, "loss": 0.513, "step": 2303 }, { "epoch": 1.3905825535768186, "grad_norm": 0.15098370611667633, "learning_rate": 0.00011058530510585306, "loss": 0.5227, "step": 2304 }, { "epoch": 1.391186236039843, "grad_norm": 0.14636565744876862, "learning_rate": 0.00011054379410543793, "loss": 0.4745, "step": 2305 }, { "epoch": 1.3917899185028675, "grad_norm": 0.15192949771881104, "learning_rate": 0.00011050228310502284, "loss": 0.4152, "step": 2306 }, { "epoch": 1.392393600965892, "grad_norm": 0.15223491191864014, "learning_rate": 0.00011046077210460773, "loss": 0.3253, "step": 2307 }, { "epoch": 1.3929972834289164, "grad_norm": 0.10637323558330536, "learning_rate": 0.0001104192611041926, "loss": 0.6033, "step": 2308 }, { "epoch": 1.3936009658919408, "grad_norm": 0.11009783297777176, "learning_rate": 0.00011037775010377752, "loss": 0.6934, "step": 2309 }, { "epoch": 1.3942046483549653, "grad_norm": 0.10806847363710403, "learning_rate": 0.00011033623910336239, "loss": 0.6502, "step": 2310 }, { "epoch": 1.3948083308179897, "grad_norm": 0.11237577348947525, "learning_rate": 0.00011029472810294728, "loss": 0.684, "step": 2311 }, { "epoch": 1.3954120132810142, "grad_norm": 0.15474088490009308, "learning_rate": 0.00011025321710253218, "loss": 0.7796, "step": 2312 }, { "epoch": 1.3960156957440386, "grad_norm": 0.10955344140529633, "learning_rate": 0.00011021170610211706, "loss": 0.6851, "step": 2313 }, { "epoch": 1.396619378207063, "grad_norm": 0.12131255120038986, "learning_rate": 0.00011017019510170196, "loss": 0.7527, "step": 2314 }, { "epoch": 1.3972230606700875, "grad_norm": 0.10648848861455917, "learning_rate": 0.00011012868410128685, "loss": 0.7382, "step": 2315 }, { "epoch": 1.397826743133112, "grad_norm": 0.10780826210975647, "learning_rate": 0.00011008717310087174, "loss": 0.873, "step": 2316 }, { "epoch": 1.3984304255961364, "grad_norm": 0.11691659688949585, "learning_rate": 0.00011004566210045664, "loss": 1.0688, "step": 2317 }, { "epoch": 1.3990341080591608, "grad_norm": 0.1263553500175476, "learning_rate": 0.00011000415110004152, "loss": 0.7616, "step": 2318 }, { "epoch": 1.3996377905221853, "grad_norm": 0.1272163987159729, "learning_rate": 0.0001099626400996264, "loss": 0.8121, "step": 2319 }, { "epoch": 1.4002414729852097, "grad_norm": 0.267022967338562, "learning_rate": 0.00010992112909921131, "loss": 0.8719, "step": 2320 }, { "epoch": 1.4008451554482342, "grad_norm": 0.10211081057786942, "learning_rate": 0.0001098796180987962, "loss": 1.0426, "step": 2321 }, { "epoch": 1.4014488379112586, "grad_norm": 0.1085173562169075, "learning_rate": 0.00010983810709838107, "loss": 0.6862, "step": 2322 }, { "epoch": 1.402052520374283, "grad_norm": 0.1208515390753746, "learning_rate": 0.00010979659609796598, "loss": 0.7391, "step": 2323 }, { "epoch": 1.4026562028373075, "grad_norm": 0.10681469738483429, "learning_rate": 0.00010975508509755085, "loss": 0.7787, "step": 2324 }, { "epoch": 1.403259885300332, "grad_norm": 0.1286618411540985, "learning_rate": 0.00010971357409713574, "loss": 0.7521, "step": 2325 }, { "epoch": 1.4038635677633564, "grad_norm": 0.10427875816822052, "learning_rate": 0.00010967206309672064, "loss": 0.7142, "step": 2326 }, { "epoch": 1.4044672502263809, "grad_norm": 0.10928861796855927, "learning_rate": 0.00010963055209630553, "loss": 0.8745, "step": 2327 }, { "epoch": 1.4050709326894053, "grad_norm": 0.09785252064466476, "learning_rate": 0.00010958904109589041, "loss": 0.6134, "step": 2328 }, { "epoch": 1.4056746151524298, "grad_norm": 0.10797867923974991, "learning_rate": 0.00010954753009547531, "loss": 1.0501, "step": 2329 }, { "epoch": 1.4062782976154542, "grad_norm": 0.10527419298887253, "learning_rate": 0.0001095060190950602, "loss": 0.7023, "step": 2330 }, { "epoch": 1.4068819800784786, "grad_norm": 0.10159947723150253, "learning_rate": 0.00010946450809464507, "loss": 0.733, "step": 2331 }, { "epoch": 1.407485662541503, "grad_norm": 0.1160796582698822, "learning_rate": 0.00010942299709422999, "loss": 0.6167, "step": 2332 }, { "epoch": 1.4080893450045275, "grad_norm": 0.10533631592988968, "learning_rate": 0.00010938148609381486, "loss": 0.9244, "step": 2333 }, { "epoch": 1.4086930274675522, "grad_norm": 0.1092139482498169, "learning_rate": 0.00010933997509339975, "loss": 0.7907, "step": 2334 }, { "epoch": 1.4092967099305764, "grad_norm": 0.12767547369003296, "learning_rate": 0.00010929846409298466, "loss": 0.6244, "step": 2335 }, { "epoch": 1.409900392393601, "grad_norm": 0.1770038902759552, "learning_rate": 0.00010925695309256953, "loss": 0.8401, "step": 2336 }, { "epoch": 1.4105040748566253, "grad_norm": 0.10718080401420593, "learning_rate": 0.00010921544209215442, "loss": 0.6228, "step": 2337 }, { "epoch": 1.41110775731965, "grad_norm": 0.1158752515912056, "learning_rate": 0.00010917393109173932, "loss": 0.7027, "step": 2338 }, { "epoch": 1.4117114397826742, "grad_norm": 0.11484018713235855, "learning_rate": 0.0001091324200913242, "loss": 0.7482, "step": 2339 }, { "epoch": 1.4123151222456989, "grad_norm": 0.10645020753145218, "learning_rate": 0.00010909090909090909, "loss": 0.7191, "step": 2340 }, { "epoch": 1.412918804708723, "grad_norm": 0.11423823237419128, "learning_rate": 0.00010904939809049399, "loss": 0.6348, "step": 2341 }, { "epoch": 1.4135224871717478, "grad_norm": 0.10967587679624557, "learning_rate": 0.00010900788709007888, "loss": 0.6462, "step": 2342 }, { "epoch": 1.414126169634772, "grad_norm": 0.1130829006433487, "learning_rate": 0.00010896637608966375, "loss": 0.6599, "step": 2343 }, { "epoch": 1.4147298520977967, "grad_norm": 0.10537155717611313, "learning_rate": 0.00010892486508924867, "loss": 0.6092, "step": 2344 }, { "epoch": 1.415333534560821, "grad_norm": 0.11640435457229614, "learning_rate": 0.00010888335408883354, "loss": 0.6485, "step": 2345 }, { "epoch": 1.4159372170238456, "grad_norm": 0.13120651245117188, "learning_rate": 0.00010884184308841842, "loss": 0.8094, "step": 2346 }, { "epoch": 1.4165408994868698, "grad_norm": 0.12212494760751724, "learning_rate": 0.00010880033208800332, "loss": 0.642, "step": 2347 }, { "epoch": 1.4171445819498945, "grad_norm": 0.12393233925104141, "learning_rate": 0.00010875882108758821, "loss": 0.6366, "step": 2348 }, { "epoch": 1.4177482644129187, "grad_norm": 0.13876676559448242, "learning_rate": 0.0001087173100871731, "loss": 0.6859, "step": 2349 }, { "epoch": 1.4183519468759433, "grad_norm": 0.13311347365379333, "learning_rate": 0.000108675799086758, "loss": 0.6349, "step": 2350 }, { "epoch": 1.4189556293389676, "grad_norm": 0.13839563727378845, "learning_rate": 0.00010863428808634288, "loss": 0.5701, "step": 2351 }, { "epoch": 1.4195593118019922, "grad_norm": 0.14265376329421997, "learning_rate": 0.00010859277708592777, "loss": 0.5708, "step": 2352 }, { "epoch": 1.4201629942650165, "grad_norm": 0.1401066929101944, "learning_rate": 0.00010855126608551267, "loss": 0.5612, "step": 2353 }, { "epoch": 1.4207666767280411, "grad_norm": 0.14841662347316742, "learning_rate": 0.00010850975508509756, "loss": 0.5377, "step": 2354 }, { "epoch": 1.4213703591910656, "grad_norm": 0.15106201171875, "learning_rate": 0.00010846824408468243, "loss": 0.5121, "step": 2355 }, { "epoch": 1.42197404165409, "grad_norm": 0.15050731599330902, "learning_rate": 0.00010842673308426734, "loss": 0.3922, "step": 2356 }, { "epoch": 1.4225777241171145, "grad_norm": 0.1529148668050766, "learning_rate": 0.00010838522208385222, "loss": 0.3001, "step": 2357 }, { "epoch": 1.423181406580139, "grad_norm": 0.10441195219755173, "learning_rate": 0.0001083437110834371, "loss": 0.694, "step": 2358 }, { "epoch": 1.4237850890431634, "grad_norm": 0.11137279868125916, "learning_rate": 0.000108302200083022, "loss": 0.9643, "step": 2359 }, { "epoch": 1.4243887715061878, "grad_norm": 0.12313207983970642, "learning_rate": 0.00010826068908260689, "loss": 0.6247, "step": 2360 }, { "epoch": 1.4249924539692123, "grad_norm": 0.11363215744495392, "learning_rate": 0.00010821917808219179, "loss": 0.7187, "step": 2361 }, { "epoch": 1.4255961364322367, "grad_norm": 0.11889949440956116, "learning_rate": 0.00010817766708177668, "loss": 0.9015, "step": 2362 }, { "epoch": 1.4261998188952612, "grad_norm": 0.1457277536392212, "learning_rate": 0.00010813615608136156, "loss": 0.7921, "step": 2363 }, { "epoch": 1.4268035013582856, "grad_norm": 0.11033974587917328, "learning_rate": 0.00010809464508094646, "loss": 0.7434, "step": 2364 }, { "epoch": 1.42740718382131, "grad_norm": 0.1310395896434784, "learning_rate": 0.00010805313408053135, "loss": 0.8429, "step": 2365 }, { "epoch": 1.4280108662843345, "grad_norm": 0.11207132786512375, "learning_rate": 0.00010801162308011624, "loss": 1.0315, "step": 2366 }, { "epoch": 1.428614548747359, "grad_norm": 0.10667724162340164, "learning_rate": 0.00010797011207970114, "loss": 0.5591, "step": 2367 }, { "epoch": 1.4292182312103834, "grad_norm": 0.11251533776521683, "learning_rate": 0.00010792860107928602, "loss": 0.7458, "step": 2368 }, { "epoch": 1.4298219136734078, "grad_norm": 0.11157943308353424, "learning_rate": 0.0001078870900788709, "loss": 0.7979, "step": 2369 }, { "epoch": 1.4304255961364323, "grad_norm": 0.10930392146110535, "learning_rate": 0.00010784557907845581, "loss": 0.7159, "step": 2370 }, { "epoch": 1.4310292785994567, "grad_norm": 0.11821290105581284, "learning_rate": 0.00010780406807804068, "loss": 0.8176, "step": 2371 }, { "epoch": 1.4316329610624812, "grad_norm": 0.11287893354892731, "learning_rate": 0.00010776255707762557, "loss": 0.7326, "step": 2372 }, { "epoch": 1.4322366435255056, "grad_norm": 0.11231239885091782, "learning_rate": 0.00010772104607721047, "loss": 0.7556, "step": 2373 }, { "epoch": 1.43284032598853, "grad_norm": 0.10399778932332993, "learning_rate": 0.00010767953507679535, "loss": 0.7313, "step": 2374 }, { "epoch": 1.4334440084515545, "grad_norm": 0.112164206802845, "learning_rate": 0.00010763802407638024, "loss": 0.7218, "step": 2375 }, { "epoch": 1.434047690914579, "grad_norm": 0.10718587040901184, "learning_rate": 0.00010759651307596514, "loss": 0.7168, "step": 2376 }, { "epoch": 1.4346513733776034, "grad_norm": 0.1262284815311432, "learning_rate": 0.00010755500207555003, "loss": 0.7782, "step": 2377 }, { "epoch": 1.4352550558406278, "grad_norm": 0.11654514819383621, "learning_rate": 0.00010751349107513491, "loss": 0.7666, "step": 2378 }, { "epoch": 1.4358587383036523, "grad_norm": 0.10238191485404968, "learning_rate": 0.00010747198007471981, "loss": 0.7126, "step": 2379 }, { "epoch": 1.4364624207666767, "grad_norm": 0.11885840445756912, "learning_rate": 0.0001074304690743047, "loss": 0.9197, "step": 2380 }, { "epoch": 1.4370661032297012, "grad_norm": 0.11124838888645172, "learning_rate": 0.00010738895807388957, "loss": 0.6942, "step": 2381 }, { "epoch": 1.4376697856927256, "grad_norm": 0.10520321130752563, "learning_rate": 0.00010734744707347449, "loss": 1.0973, "step": 2382 }, { "epoch": 1.43827346815575, "grad_norm": 0.10521334409713745, "learning_rate": 0.00010730593607305936, "loss": 0.6247, "step": 2383 }, { "epoch": 1.4388771506187745, "grad_norm": 0.1028476133942604, "learning_rate": 0.00010726442507264425, "loss": 0.9452, "step": 2384 }, { "epoch": 1.439480833081799, "grad_norm": 0.10759977251291275, "learning_rate": 0.00010722291407222915, "loss": 0.7051, "step": 2385 }, { "epoch": 1.4400845155448234, "grad_norm": 0.13091117143630981, "learning_rate": 0.00010718140307181403, "loss": 0.8791, "step": 2386 }, { "epoch": 1.4406881980078479, "grad_norm": 0.11425816267728806, "learning_rate": 0.00010713989207139892, "loss": 0.7791, "step": 2387 }, { "epoch": 1.4412918804708723, "grad_norm": 0.10497016459703445, "learning_rate": 0.00010709838107098382, "loss": 0.6761, "step": 2388 }, { "epoch": 1.4418955629338968, "grad_norm": 0.12967948615550995, "learning_rate": 0.0001070568700705687, "loss": 0.9618, "step": 2389 }, { "epoch": 1.4424992453969212, "grad_norm": 0.11426857113838196, "learning_rate": 0.00010701535907015359, "loss": 0.761, "step": 2390 }, { "epoch": 1.4431029278599457, "grad_norm": 0.36786845326423645, "learning_rate": 0.00010697384806973849, "loss": 0.722, "step": 2391 }, { "epoch": 1.44370661032297, "grad_norm": 0.11083753407001495, "learning_rate": 0.00010693233706932338, "loss": 0.6985, "step": 2392 }, { "epoch": 1.4443102927859945, "grad_norm": 0.11332450807094574, "learning_rate": 0.00010689082606890825, "loss": 0.6547, "step": 2393 }, { "epoch": 1.444913975249019, "grad_norm": 0.11285028606653214, "learning_rate": 0.00010684931506849317, "loss": 0.6623, "step": 2394 }, { "epoch": 1.4455176577120434, "grad_norm": 0.12340085208415985, "learning_rate": 0.00010680780406807804, "loss": 0.7428, "step": 2395 }, { "epoch": 1.4461213401750679, "grad_norm": 0.11893676221370697, "learning_rate": 0.00010676629306766293, "loss": 0.6858, "step": 2396 }, { "epoch": 1.4467250226380923, "grad_norm": 0.11241084337234497, "learning_rate": 0.00010672478206724783, "loss": 0.6083, "step": 2397 }, { "epoch": 1.4473287051011168, "grad_norm": 0.12452124804258347, "learning_rate": 0.00010668327106683271, "loss": 0.6005, "step": 2398 }, { "epoch": 1.4479323875641412, "grad_norm": 0.3717198073863983, "learning_rate": 0.0001066417600664176, "loss": 0.651, "step": 2399 }, { "epoch": 1.4485360700271657, "grad_norm": 0.13178178668022156, "learning_rate": 0.0001066002490660025, "loss": 0.5641, "step": 2400 }, { "epoch": 1.4491397524901901, "grad_norm": 0.13347181677818298, "learning_rate": 0.00010655873806558738, "loss": 0.565, "step": 2401 }, { "epoch": 1.4497434349532146, "grad_norm": 0.13921667635440826, "learning_rate": 0.00010651722706517227, "loss": 0.5777, "step": 2402 }, { "epoch": 1.450347117416239, "grad_norm": 0.14336366951465607, "learning_rate": 0.00010647571606475717, "loss": 0.484, "step": 2403 }, { "epoch": 1.4509507998792635, "grad_norm": 0.16313937306404114, "learning_rate": 0.00010643420506434206, "loss": 0.5522, "step": 2404 }, { "epoch": 1.451554482342288, "grad_norm": 0.16395455598831177, "learning_rate": 0.00010639269406392696, "loss": 0.5376, "step": 2405 }, { "epoch": 1.4521581648053123, "grad_norm": 0.15675240755081177, "learning_rate": 0.00010635118306351184, "loss": 0.4248, "step": 2406 }, { "epoch": 1.4527618472683368, "grad_norm": 0.16776950657367706, "learning_rate": 0.00010630967206309672, "loss": 0.3375, "step": 2407 }, { "epoch": 1.4533655297313612, "grad_norm": 0.10852830857038498, "learning_rate": 0.00010626816106268163, "loss": 0.6665, "step": 2408 }, { "epoch": 1.4539692121943857, "grad_norm": 0.115415558218956, "learning_rate": 0.0001062266500622665, "loss": 0.7669, "step": 2409 }, { "epoch": 1.4545728946574101, "grad_norm": 0.12048669904470444, "learning_rate": 0.00010618513906185139, "loss": 0.7165, "step": 2410 }, { "epoch": 1.4551765771204346, "grad_norm": 0.15835049748420715, "learning_rate": 0.00010614362806143629, "loss": 0.7294, "step": 2411 }, { "epoch": 1.455780259583459, "grad_norm": 0.11197509616613388, "learning_rate": 0.00010610211706102118, "loss": 0.9129, "step": 2412 }, { "epoch": 1.4563839420464835, "grad_norm": 0.11068537831306458, "learning_rate": 0.00010606060606060606, "loss": 0.7544, "step": 2413 }, { "epoch": 1.456987624509508, "grad_norm": 0.1250140517950058, "learning_rate": 0.00010601909506019096, "loss": 0.8565, "step": 2414 }, { "epoch": 1.4575913069725326, "grad_norm": 0.11533838510513306, "learning_rate": 0.00010597758405977585, "loss": 0.7451, "step": 2415 }, { "epoch": 1.4581949894355568, "grad_norm": 0.11751358211040497, "learning_rate": 0.00010593607305936074, "loss": 0.852, "step": 2416 }, { "epoch": 1.4587986718985815, "grad_norm": 0.1017642691731453, "learning_rate": 0.00010589456205894564, "loss": 0.5441, "step": 2417 }, { "epoch": 1.4594023543616057, "grad_norm": 0.10722334682941437, "learning_rate": 0.00010585305105853052, "loss": 0.8963, "step": 2418 }, { "epoch": 1.4600060368246304, "grad_norm": 0.11673782765865326, "learning_rate": 0.0001058115400581154, "loss": 0.8146, "step": 2419 }, { "epoch": 1.4606097192876546, "grad_norm": 0.12002434581518173, "learning_rate": 0.00010577002905770031, "loss": 0.741, "step": 2420 }, { "epoch": 1.4612134017506793, "grad_norm": 0.11690894514322281, "learning_rate": 0.00010572851805728518, "loss": 0.7929, "step": 2421 }, { "epoch": 1.4618170842137035, "grad_norm": 0.12483794242143631, "learning_rate": 0.00010568700705687007, "loss": 0.8771, "step": 2422 }, { "epoch": 1.4624207666767282, "grad_norm": 0.11409945040941238, "learning_rate": 0.00010564549605645497, "loss": 0.6703, "step": 2423 }, { "epoch": 1.4630244491397524, "grad_norm": 0.11765747517347336, "learning_rate": 0.00010560398505603986, "loss": 0.6562, "step": 2424 }, { "epoch": 1.463628131602777, "grad_norm": 0.10898533463478088, "learning_rate": 0.00010556247405562474, "loss": 0.7042, "step": 2425 }, { "epoch": 1.4642318140658013, "grad_norm": 0.12223546206951141, "learning_rate": 0.00010552096305520964, "loss": 0.6604, "step": 2426 }, { "epoch": 1.464835496528826, "grad_norm": 0.10916266590356827, "learning_rate": 0.00010547945205479453, "loss": 0.7925, "step": 2427 }, { "epoch": 1.4654391789918502, "grad_norm": 0.1135077029466629, "learning_rate": 0.0001054379410543794, "loss": 0.7005, "step": 2428 }, { "epoch": 1.4660428614548748, "grad_norm": 0.11547960340976715, "learning_rate": 0.00010539643005396431, "loss": 0.7297, "step": 2429 }, { "epoch": 1.466646543917899, "grad_norm": 0.11045881360769272, "learning_rate": 0.0001053549190535492, "loss": 0.7663, "step": 2430 }, { "epoch": 1.4672502263809237, "grad_norm": 0.12327514588832855, "learning_rate": 0.00010531340805313407, "loss": 0.6968, "step": 2431 }, { "epoch": 1.467853908843948, "grad_norm": 0.11502881348133087, "learning_rate": 0.00010527189705271899, "loss": 0.7197, "step": 2432 }, { "epoch": 1.4684575913069726, "grad_norm": 0.11779715120792389, "learning_rate": 0.00010523038605230386, "loss": 0.7248, "step": 2433 }, { "epoch": 1.4690612737699968, "grad_norm": 0.11766118556261063, "learning_rate": 0.00010518887505188875, "loss": 0.7546, "step": 2434 }, { "epoch": 1.4696649562330215, "grad_norm": 0.11458957195281982, "learning_rate": 0.00010514736405147365, "loss": 0.7825, "step": 2435 }, { "epoch": 1.470268638696046, "grad_norm": 0.11159491539001465, "learning_rate": 0.00010510585305105853, "loss": 0.8465, "step": 2436 }, { "epoch": 1.4708723211590704, "grad_norm": 0.10881160199642181, "learning_rate": 0.00010506434205064342, "loss": 0.7381, "step": 2437 }, { "epoch": 1.4714760036220949, "grad_norm": 0.11549453437328339, "learning_rate": 0.00010502283105022832, "loss": 0.7784, "step": 2438 }, { "epoch": 1.4720796860851193, "grad_norm": 0.11238040775060654, "learning_rate": 0.0001049813200498132, "loss": 0.8611, "step": 2439 }, { "epoch": 1.4726833685481437, "grad_norm": 0.1089930310845375, "learning_rate": 0.00010493980904939808, "loss": 0.7186, "step": 2440 }, { "epoch": 1.4732870510111682, "grad_norm": 0.10445185750722885, "learning_rate": 0.00010489829804898299, "loss": 0.6156, "step": 2441 }, { "epoch": 1.4738907334741926, "grad_norm": 0.10923328250646591, "learning_rate": 0.00010485678704856787, "loss": 0.6738, "step": 2442 }, { "epoch": 1.474494415937217, "grad_norm": 0.11018381267786026, "learning_rate": 0.00010481527604815275, "loss": 0.6475, "step": 2443 }, { "epoch": 1.4750980984002415, "grad_norm": 0.11723464727401733, "learning_rate": 0.00010477376504773767, "loss": 0.7307, "step": 2444 }, { "epoch": 1.475701780863266, "grad_norm": 0.11130370199680328, "learning_rate": 0.00010473225404732254, "loss": 0.6028, "step": 2445 }, { "epoch": 1.4763054633262904, "grad_norm": 0.11799994856119156, "learning_rate": 0.00010469074304690743, "loss": 0.5884, "step": 2446 }, { "epoch": 1.4769091457893149, "grad_norm": 0.12066012620925903, "learning_rate": 0.00010464923204649233, "loss": 0.6483, "step": 2447 }, { "epoch": 1.4775128282523393, "grad_norm": 0.12716256082057953, "learning_rate": 0.00010460772104607721, "loss": 0.6412, "step": 2448 }, { "epoch": 1.4781165107153638, "grad_norm": 0.1431380957365036, "learning_rate": 0.00010456621004566211, "loss": 0.6544, "step": 2449 }, { "epoch": 1.4787201931783882, "grad_norm": 0.1343405693769455, "learning_rate": 0.000104524699045247, "loss": 0.5788, "step": 2450 }, { "epoch": 1.4793238756414127, "grad_norm": 0.1583533138036728, "learning_rate": 0.00010448318804483189, "loss": 0.6123, "step": 2451 }, { "epoch": 1.479927558104437, "grad_norm": 0.1386100798845291, "learning_rate": 0.00010444167704441679, "loss": 0.6458, "step": 2452 }, { "epoch": 1.4805312405674615, "grad_norm": 0.1477377563714981, "learning_rate": 0.00010440016604400167, "loss": 0.5432, "step": 2453 }, { "epoch": 1.481134923030486, "grad_norm": 0.14655126631259918, "learning_rate": 0.00010435865504358654, "loss": 0.4965, "step": 2454 }, { "epoch": 1.4817386054935104, "grad_norm": 0.15142911672592163, "learning_rate": 0.00010431714404317146, "loss": 0.4564, "step": 2455 }, { "epoch": 1.4823422879565349, "grad_norm": 0.1687132865190506, "learning_rate": 0.00010427563304275633, "loss": 0.4261, "step": 2456 }, { "epoch": 1.4829459704195593, "grad_norm": 0.15374094247817993, "learning_rate": 0.00010423412204234122, "loss": 0.2757, "step": 2457 }, { "epoch": 1.4835496528825838, "grad_norm": 0.11385196447372437, "learning_rate": 0.00010419261104192613, "loss": 0.6181, "step": 2458 }, { "epoch": 1.4841533353456082, "grad_norm": 0.12344599515199661, "learning_rate": 0.000104151100041511, "loss": 0.5809, "step": 2459 }, { "epoch": 1.4847570178086327, "grad_norm": 0.12225256860256195, "learning_rate": 0.00010410958904109589, "loss": 0.8097, "step": 2460 }, { "epoch": 1.4853607002716571, "grad_norm": 0.1283671259880066, "learning_rate": 0.00010406807804068079, "loss": 0.8238, "step": 2461 }, { "epoch": 1.4859643827346816, "grad_norm": 0.12050528824329376, "learning_rate": 0.00010402656704026568, "loss": 0.7068, "step": 2462 }, { "epoch": 1.486568065197706, "grad_norm": 0.12876033782958984, "learning_rate": 0.00010398505603985056, "loss": 0.7713, "step": 2463 }, { "epoch": 1.4871717476607305, "grad_norm": 0.12785589694976807, "learning_rate": 0.00010394354503943546, "loss": 0.9149, "step": 2464 }, { "epoch": 1.487775430123755, "grad_norm": 0.10582014918327332, "learning_rate": 0.00010390203403902035, "loss": 0.71, "step": 2465 }, { "epoch": 1.4883791125867794, "grad_norm": 0.10887051373720169, "learning_rate": 0.00010386052303860522, "loss": 1.0357, "step": 2466 }, { "epoch": 1.4889827950498038, "grad_norm": 0.10681470483541489, "learning_rate": 0.00010381901203819014, "loss": 0.7339, "step": 2467 }, { "epoch": 1.4895864775128282, "grad_norm": 0.13924428820610046, "learning_rate": 0.00010377750103777501, "loss": 0.6848, "step": 2468 }, { "epoch": 1.4901901599758527, "grad_norm": 0.10944285243749619, "learning_rate": 0.0001037359900373599, "loss": 0.7028, "step": 2469 }, { "epoch": 1.4907938424388771, "grad_norm": 0.11315060406923294, "learning_rate": 0.0001036944790369448, "loss": 0.7433, "step": 2470 }, { "epoch": 1.4913975249019016, "grad_norm": 0.11983177065849304, "learning_rate": 0.00010365296803652968, "loss": 0.7025, "step": 2471 }, { "epoch": 1.492001207364926, "grad_norm": 0.11219801008701324, "learning_rate": 0.00010361145703611457, "loss": 0.7046, "step": 2472 }, { "epoch": 1.4926048898279505, "grad_norm": 0.12625454366207123, "learning_rate": 0.00010356994603569947, "loss": 0.7736, "step": 2473 }, { "epoch": 1.493208572290975, "grad_norm": 0.12454172223806381, "learning_rate": 0.00010352843503528436, "loss": 0.7124, "step": 2474 }, { "epoch": 1.4938122547539994, "grad_norm": 0.11191127449274063, "learning_rate": 0.00010348692403486924, "loss": 0.8456, "step": 2475 }, { "epoch": 1.4944159372170238, "grad_norm": 0.10700973868370056, "learning_rate": 0.00010344541303445414, "loss": 0.8491, "step": 2476 }, { "epoch": 1.4950196196800483, "grad_norm": 0.7383321523666382, "learning_rate": 0.00010340390203403903, "loss": 0.8734, "step": 2477 }, { "epoch": 1.4956233021430727, "grad_norm": 0.11059489101171494, "learning_rate": 0.0001033623910336239, "loss": 0.6143, "step": 2478 }, { "epoch": 1.4962269846060972, "grad_norm": 0.11313536763191223, "learning_rate": 0.00010332088003320882, "loss": 0.6963, "step": 2479 }, { "epoch": 1.4968306670691216, "grad_norm": 0.11421024799346924, "learning_rate": 0.00010327936903279369, "loss": 0.7297, "step": 2480 }, { "epoch": 1.497434349532146, "grad_norm": 0.11083009093999863, "learning_rate": 0.00010323785803237857, "loss": 0.6467, "step": 2481 }, { "epoch": 1.4980380319951705, "grad_norm": 0.110516257584095, "learning_rate": 0.00010319634703196347, "loss": 1.0344, "step": 2482 }, { "epoch": 1.498641714458195, "grad_norm": 0.12053235620260239, "learning_rate": 0.00010315483603154836, "loss": 0.7158, "step": 2483 }, { "epoch": 1.4992453969212194, "grad_norm": 0.11220195889472961, "learning_rate": 0.00010311332503113325, "loss": 0.9486, "step": 2484 }, { "epoch": 1.4998490793842438, "grad_norm": 0.121260866522789, "learning_rate": 0.00010307181403071815, "loss": 0.6877, "step": 2485 }, { "epoch": 1.5004527618472685, "grad_norm": 0.11453468352556229, "learning_rate": 0.00010303030303030303, "loss": 0.6868, "step": 2486 }, { "epoch": 1.5010564443102927, "grad_norm": 0.11923350393772125, "learning_rate": 0.00010298879202988792, "loss": 0.7115, "step": 2487 }, { "epoch": 1.5016601267733174, "grad_norm": 0.11905788630247116, "learning_rate": 0.00010294728102947282, "loss": 0.894, "step": 2488 }, { "epoch": 1.5022638092363416, "grad_norm": 0.12174365669488907, "learning_rate": 0.00010290577002905771, "loss": 0.8226, "step": 2489 }, { "epoch": 1.5028674916993663, "grad_norm": 0.1140153631567955, "learning_rate": 0.00010286425902864258, "loss": 0.744, "step": 2490 }, { "epoch": 1.5034711741623905, "grad_norm": 0.11370176821947098, "learning_rate": 0.0001028227480282275, "loss": 0.6642, "step": 2491 }, { "epoch": 1.5040748566254152, "grad_norm": 0.11257719993591309, "learning_rate": 0.00010278123702781237, "loss": 0.6352, "step": 2492 }, { "epoch": 1.5046785390884394, "grad_norm": 0.11130896955728531, "learning_rate": 0.00010273972602739728, "loss": 0.6231, "step": 2493 }, { "epoch": 1.505282221551464, "grad_norm": 0.11869868636131287, "learning_rate": 0.00010269821502698215, "loss": 0.695, "step": 2494 }, { "epoch": 1.5058859040144883, "grad_norm": 0.12181151658296585, "learning_rate": 0.00010265670402656704, "loss": 0.6434, "step": 2495 }, { "epoch": 1.506489586477513, "grad_norm": 0.12335172295570374, "learning_rate": 0.00010261519302615194, "loss": 0.6727, "step": 2496 }, { "epoch": 1.5070932689405372, "grad_norm": 0.12331590801477432, "learning_rate": 0.00010257368202573683, "loss": 0.6231, "step": 2497 }, { "epoch": 1.5076969514035619, "grad_norm": 0.13102783262729645, "learning_rate": 0.00010253217102532171, "loss": 0.5846, "step": 2498 }, { "epoch": 1.508300633866586, "grad_norm": 0.1286454200744629, "learning_rate": 0.00010249066002490661, "loss": 0.6141, "step": 2499 }, { "epoch": 1.5089043163296108, "grad_norm": 0.1343620866537094, "learning_rate": 0.0001024491490244915, "loss": 0.6316, "step": 2500 }, { "epoch": 1.5089043163296108, "eval_loss": 0.795173168182373, "eval_runtime": 1219.0487, "eval_samples_per_second": 2.289, "eval_steps_per_second": 0.286, "step": 2500 }, { "epoch": 1.509507998792635, "grad_norm": 0.1346864253282547, "learning_rate": 0.00010240763802407639, "loss": 0.6343, "step": 2501 }, { "epoch": 1.5101116812556596, "grad_norm": 0.1306980401277542, "learning_rate": 0.00010236612702366129, "loss": 0.5615, "step": 2502 }, { "epoch": 1.5107153637186839, "grad_norm": 0.1505928337574005, "learning_rate": 0.00010232461602324617, "loss": 0.5591, "step": 2503 }, { "epoch": 1.5113190461817085, "grad_norm": 0.1430431306362152, "learning_rate": 0.00010228310502283104, "loss": 0.496, "step": 2504 }, { "epoch": 1.5119227286447328, "grad_norm": 0.1660575270652771, "learning_rate": 0.00010224159402241596, "loss": 0.4632, "step": 2505 }, { "epoch": 1.5125264111077574, "grad_norm": 0.17529623210430145, "learning_rate": 0.00010220008302200083, "loss": 0.4824, "step": 2506 }, { "epoch": 1.5131300935707817, "grad_norm": 0.15798205137252808, "learning_rate": 0.00010215857202158572, "loss": 0.3551, "step": 2507 }, { "epoch": 1.5137337760338063, "grad_norm": 0.12132739275693893, "learning_rate": 0.00010211706102117062, "loss": 0.9828, "step": 2508 }, { "epoch": 1.5143374584968305, "grad_norm": 0.12069849669933319, "learning_rate": 0.0001020755500207555, "loss": 0.7057, "step": 2509 }, { "epoch": 1.5149411409598552, "grad_norm": 0.1271534413099289, "learning_rate": 0.00010203403902034039, "loss": 0.7686, "step": 2510 }, { "epoch": 1.5155448234228794, "grad_norm": 0.10612853616476059, "learning_rate": 0.00010199252801992529, "loss": 0.7055, "step": 2511 }, { "epoch": 1.516148505885904, "grad_norm": 0.11403614282608032, "learning_rate": 0.00010195101701951018, "loss": 0.673, "step": 2512 }, { "epoch": 1.5167521883489283, "grad_norm": 0.11351923644542694, "learning_rate": 0.00010190950601909506, "loss": 0.6534, "step": 2513 }, { "epoch": 1.517355870811953, "grad_norm": 0.12284346669912338, "learning_rate": 0.00010186799501867996, "loss": 0.7367, "step": 2514 }, { "epoch": 1.5179595532749772, "grad_norm": 0.11047890037298203, "learning_rate": 0.00010182648401826485, "loss": 0.7392, "step": 2515 }, { "epoch": 1.518563235738002, "grad_norm": 0.13320279121398926, "learning_rate": 0.00010178497301784972, "loss": 0.7075, "step": 2516 }, { "epoch": 1.5191669182010261, "grad_norm": 0.1211395412683487, "learning_rate": 0.00010174346201743464, "loss": 0.7255, "step": 2517 }, { "epoch": 1.5197706006640508, "grad_norm": 0.10632450878620148, "learning_rate": 0.00010170195101701951, "loss": 0.6936, "step": 2518 }, { "epoch": 1.520374283127075, "grad_norm": 0.10908479243516922, "learning_rate": 0.0001016604400166044, "loss": 0.783, "step": 2519 }, { "epoch": 1.5209779655900997, "grad_norm": 0.11272008717060089, "learning_rate": 0.0001016189290161893, "loss": 0.8508, "step": 2520 }, { "epoch": 1.521581648053124, "grad_norm": 0.11356555670499802, "learning_rate": 0.00010157741801577418, "loss": 0.7808, "step": 2521 }, { "epoch": 1.5221853305161486, "grad_norm": 0.1155843734741211, "learning_rate": 0.00010153590701535907, "loss": 0.9635, "step": 2522 }, { "epoch": 1.5227890129791728, "grad_norm": 0.10973259806632996, "learning_rate": 0.00010149439601494397, "loss": 0.7365, "step": 2523 }, { "epoch": 1.5233926954421975, "grad_norm": 0.1167822778224945, "learning_rate": 0.00010145288501452886, "loss": 0.7307, "step": 2524 }, { "epoch": 1.5239963779052217, "grad_norm": 0.11712843924760818, "learning_rate": 0.00010141137401411374, "loss": 0.6832, "step": 2525 }, { "epoch": 1.5246000603682464, "grad_norm": 0.11597023159265518, "learning_rate": 0.00010136986301369864, "loss": 0.7971, "step": 2526 }, { "epoch": 1.5252037428312708, "grad_norm": 0.12328234314918518, "learning_rate": 0.00010132835201328353, "loss": 0.9113, "step": 2527 }, { "epoch": 1.5258074252942952, "grad_norm": 0.1276886761188507, "learning_rate": 0.0001012868410128684, "loss": 0.659, "step": 2528 }, { "epoch": 1.5264111077573197, "grad_norm": 0.11608514934778214, "learning_rate": 0.00010124533001245332, "loss": 0.6754, "step": 2529 }, { "epoch": 1.5270147902203441, "grad_norm": 0.10984505712985992, "learning_rate": 0.00010120381901203819, "loss": 0.9727, "step": 2530 }, { "epoch": 1.5276184726833686, "grad_norm": 0.1020493283867836, "learning_rate": 0.00010116230801162307, "loss": 0.6945, "step": 2531 }, { "epoch": 1.528222155146393, "grad_norm": 0.11108864098787308, "learning_rate": 0.00010112079701120797, "loss": 0.6623, "step": 2532 }, { "epoch": 1.5288258376094175, "grad_norm": 0.11104828864336014, "learning_rate": 0.00010107928601079286, "loss": 0.7971, "step": 2533 }, { "epoch": 1.529429520072442, "grad_norm": 0.11642879247665405, "learning_rate": 0.00010103777501037775, "loss": 0.7904, "step": 2534 }, { "epoch": 1.5300332025354664, "grad_norm": 0.11537434905767441, "learning_rate": 0.00010099626400996265, "loss": 0.7779, "step": 2535 }, { "epoch": 1.5306368849984908, "grad_norm": 0.11406029760837555, "learning_rate": 0.00010095475300954753, "loss": 0.6764, "step": 2536 }, { "epoch": 1.5312405674615153, "grad_norm": 0.12164659798145294, "learning_rate": 0.00010091324200913243, "loss": 0.6798, "step": 2537 }, { "epoch": 1.5318442499245397, "grad_norm": 0.11121895164251328, "learning_rate": 0.00010087173100871732, "loss": 0.9903, "step": 2538 }, { "epoch": 1.5324479323875642, "grad_norm": 0.13991357386112213, "learning_rate": 0.00010083022000830221, "loss": 0.6855, "step": 2539 }, { "epoch": 1.5330516148505886, "grad_norm": 0.11784554272890091, "learning_rate": 0.00010078870900788711, "loss": 0.6766, "step": 2540 }, { "epoch": 1.533655297313613, "grad_norm": 0.12097886949777603, "learning_rate": 0.000100747198007472, "loss": 0.7029, "step": 2541 }, { "epoch": 1.5342589797766375, "grad_norm": 0.12290269136428833, "learning_rate": 0.00010070568700705687, "loss": 0.6882, "step": 2542 }, { "epoch": 1.534862662239662, "grad_norm": 0.11816665530204773, "learning_rate": 0.00010066417600664178, "loss": 0.6425, "step": 2543 }, { "epoch": 1.5354663447026864, "grad_norm": 0.11926699429750443, "learning_rate": 0.00010062266500622665, "loss": 0.6564, "step": 2544 }, { "epoch": 1.5360700271657108, "grad_norm": 0.13197208940982819, "learning_rate": 0.00010058115400581154, "loss": 0.6045, "step": 2545 }, { "epoch": 1.5366737096287353, "grad_norm": 0.12189357727766037, "learning_rate": 0.00010053964300539644, "loss": 0.6137, "step": 2546 }, { "epoch": 1.5372773920917597, "grad_norm": 0.12850208580493927, "learning_rate": 0.00010049813200498133, "loss": 0.6533, "step": 2547 }, { "epoch": 1.5378810745547842, "grad_norm": 0.13015426695346832, "learning_rate": 0.00010045662100456621, "loss": 0.597, "step": 2548 }, { "epoch": 1.5384847570178086, "grad_norm": 0.13129258155822754, "learning_rate": 0.00010041511000415111, "loss": 0.5412, "step": 2549 }, { "epoch": 1.539088439480833, "grad_norm": 0.1495341658592224, "learning_rate": 0.000100373599003736, "loss": 0.6265, "step": 2550 }, { "epoch": 1.5396921219438575, "grad_norm": 0.1443036049604416, "learning_rate": 0.00010033208800332087, "loss": 0.5974, "step": 2551 }, { "epoch": 1.540295804406882, "grad_norm": 0.13055108487606049, "learning_rate": 0.00010029057700290579, "loss": 0.5123, "step": 2552 }, { "epoch": 1.5408994868699064, "grad_norm": 0.15532496571540833, "learning_rate": 0.00010024906600249067, "loss": 0.5273, "step": 2553 }, { "epoch": 1.5415031693329309, "grad_norm": 0.15928137302398682, "learning_rate": 0.00010020755500207555, "loss": 0.5119, "step": 2554 }, { "epoch": 1.5421068517959553, "grad_norm": 0.1558419167995453, "learning_rate": 0.00010016604400166046, "loss": 0.4332, "step": 2555 }, { "epoch": 1.5427105342589797, "grad_norm": 0.1683838963508606, "learning_rate": 0.00010012453300124533, "loss": 0.3724, "step": 2556 }, { "epoch": 1.5433142167220042, "grad_norm": 0.15534067153930664, "learning_rate": 0.00010008302200083022, "loss": 0.3148, "step": 2557 }, { "epoch": 1.5439178991850286, "grad_norm": 0.10773934423923492, "learning_rate": 0.00010004151100041512, "loss": 0.7099, "step": 2558 }, { "epoch": 1.544521581648053, "grad_norm": 0.1566229909658432, "learning_rate": 0.0001, "loss": 0.677, "step": 2559 }, { "epoch": 1.5451252641110775, "grad_norm": 0.12028037011623383, "learning_rate": 9.99584889995849e-05, "loss": 0.8106, "step": 2560 }, { "epoch": 1.545728946574102, "grad_norm": 0.1171046793460846, "learning_rate": 9.991697799916978e-05, "loss": 0.7519, "step": 2561 }, { "epoch": 1.5463326290371264, "grad_norm": 0.11625587195158005, "learning_rate": 9.987546699875468e-05, "loss": 0.7559, "step": 2562 }, { "epoch": 1.5469363115001509, "grad_norm": 0.11146189272403717, "learning_rate": 9.983395599833956e-05, "loss": 0.6246, "step": 2563 }, { "epoch": 1.5475399939631753, "grad_norm": 0.1186312735080719, "learning_rate": 9.979244499792445e-05, "loss": 0.8725, "step": 2564 }, { "epoch": 1.5481436764261998, "grad_norm": 0.11686369776725769, "learning_rate": 9.975093399750934e-05, "loss": 0.869, "step": 2565 }, { "epoch": 1.5487473588892242, "grad_norm": 0.11561167985200882, "learning_rate": 9.970942299709424e-05, "loss": 0.7023, "step": 2566 }, { "epoch": 1.5493510413522489, "grad_norm": 0.12024765461683273, "learning_rate": 9.966791199667912e-05, "loss": 0.9882, "step": 2567 }, { "epoch": 1.549954723815273, "grad_norm": 0.10977562516927719, "learning_rate": 9.962640099626401e-05, "loss": 0.7127, "step": 2568 }, { "epoch": 1.5505584062782978, "grad_norm": 0.10764128714799881, "learning_rate": 9.958488999584891e-05, "loss": 0.617, "step": 2569 }, { "epoch": 1.551162088741322, "grad_norm": 0.10783983021974564, "learning_rate": 9.954337899543378e-05, "loss": 0.6401, "step": 2570 }, { "epoch": 1.5517657712043467, "grad_norm": 0.12389583140611649, "learning_rate": 9.950186799501868e-05, "loss": 0.6535, "step": 2571 }, { "epoch": 1.552369453667371, "grad_norm": 0.11414086818695068, "learning_rate": 9.946035699460357e-05, "loss": 0.7015, "step": 2572 }, { "epoch": 1.5529731361303956, "grad_norm": 0.11838074028491974, "learning_rate": 9.941884599418847e-05, "loss": 0.9455, "step": 2573 }, { "epoch": 1.5535768185934198, "grad_norm": 0.11571443825960159, "learning_rate": 9.937733499377336e-05, "loss": 1.1223, "step": 2574 }, { "epoch": 1.5541805010564445, "grad_norm": 0.11583063751459122, "learning_rate": 9.933582399335824e-05, "loss": 0.7297, "step": 2575 }, { "epoch": 1.5547841835194687, "grad_norm": 0.11269959062337875, "learning_rate": 9.929431299294314e-05, "loss": 0.7613, "step": 2576 }, { "epoch": 1.5553878659824933, "grad_norm": 0.1328345686197281, "learning_rate": 9.925280199252802e-05, "loss": 0.7758, "step": 2577 }, { "epoch": 1.5559915484455176, "grad_norm": 0.12824542820453644, "learning_rate": 9.921129099211292e-05, "loss": 0.7867, "step": 2578 }, { "epoch": 1.5565952309085422, "grad_norm": 0.10252919793128967, "learning_rate": 9.91697799916978e-05, "loss": 0.6038, "step": 2579 }, { "epoch": 1.5571989133715665, "grad_norm": 0.11735723912715912, "learning_rate": 9.912826899128269e-05, "loss": 0.718, "step": 2580 }, { "epoch": 1.5578025958345911, "grad_norm": 0.12741461396217346, "learning_rate": 9.908675799086759e-05, "loss": 0.62, "step": 2581 }, { "epoch": 1.5584062782976154, "grad_norm": 0.11475570499897003, "learning_rate": 9.904524699045248e-05, "loss": 0.7747, "step": 2582 }, { "epoch": 1.55900996076064, "grad_norm": 0.11514543741941452, "learning_rate": 9.900373599003736e-05, "loss": 0.7556, "step": 2583 }, { "epoch": 1.5596136432236642, "grad_norm": 0.10353957116603851, "learning_rate": 9.896222498962225e-05, "loss": 0.6466, "step": 2584 }, { "epoch": 1.560217325686689, "grad_norm": 0.10758772492408752, "learning_rate": 9.892071398920715e-05, "loss": 1.2809, "step": 2585 }, { "epoch": 1.5608210081497131, "grad_norm": 0.11749143153429031, "learning_rate": 9.887920298879203e-05, "loss": 0.6888, "step": 2586 }, { "epoch": 1.5614246906127378, "grad_norm": 0.1256055384874344, "learning_rate": 9.883769198837692e-05, "loss": 0.7231, "step": 2587 }, { "epoch": 1.562028373075762, "grad_norm": 0.11417461931705475, "learning_rate": 9.879618098796182e-05, "loss": 0.7842, "step": 2588 }, { "epoch": 1.5626320555387867, "grad_norm": 0.10440021008253098, "learning_rate": 9.87546699875467e-05, "loss": 0.6953, "step": 2589 }, { "epoch": 1.563235738001811, "grad_norm": 0.10997917503118515, "learning_rate": 9.87131589871316e-05, "loss": 0.6638, "step": 2590 }, { "epoch": 1.5638394204648356, "grad_norm": 0.1075371578335762, "learning_rate": 9.867164798671648e-05, "loss": 0.5918, "step": 2591 }, { "epoch": 1.5644431029278598, "grad_norm": 0.12016437947750092, "learning_rate": 9.863013698630137e-05, "loss": 0.7686, "step": 2592 }, { "epoch": 1.5650467853908845, "grad_norm": 0.119369737803936, "learning_rate": 9.858862598588627e-05, "loss": 0.6284, "step": 2593 }, { "epoch": 1.5656504678539087, "grad_norm": 0.11695661395788193, "learning_rate": 9.854711498547115e-05, "loss": 0.6282, "step": 2594 }, { "epoch": 1.5662541503169334, "grad_norm": 0.11359156668186188, "learning_rate": 9.850560398505605e-05, "loss": 0.648, "step": 2595 }, { "epoch": 1.5668578327799576, "grad_norm": 0.11783526092767715, "learning_rate": 9.846409298464093e-05, "loss": 0.5769, "step": 2596 }, { "epoch": 1.5674615152429823, "grad_norm": 0.12867143750190735, "learning_rate": 9.842258198422583e-05, "loss": 0.7181, "step": 2597 }, { "epoch": 1.5680651977060065, "grad_norm": 0.12808844447135925, "learning_rate": 9.838107098381071e-05, "loss": 0.6346, "step": 2598 }, { "epoch": 1.5686688801690312, "grad_norm": 0.12455364316701889, "learning_rate": 9.83395599833956e-05, "loss": 0.6019, "step": 2599 }, { "epoch": 1.5692725626320554, "grad_norm": 0.1380644142627716, "learning_rate": 9.82980489829805e-05, "loss": 0.6872, "step": 2600 }, { "epoch": 1.56987624509508, "grad_norm": 0.14129561185836792, "learning_rate": 9.825653798256539e-05, "loss": 0.5949, "step": 2601 }, { "epoch": 1.5704799275581043, "grad_norm": 0.13899260759353638, "learning_rate": 9.821502698215027e-05, "loss": 0.5492, "step": 2602 }, { "epoch": 1.571083610021129, "grad_norm": 0.13930819928646088, "learning_rate": 9.817351598173516e-05, "loss": 0.5476, "step": 2603 }, { "epoch": 1.5716872924841532, "grad_norm": 0.14664794504642487, "learning_rate": 9.813200498132006e-05, "loss": 0.4718, "step": 2604 }, { "epoch": 1.5722909749471778, "grad_norm": 0.1534009575843811, "learning_rate": 9.809049398090495e-05, "loss": 0.4487, "step": 2605 }, { "epoch": 1.572894657410202, "grad_norm": 0.16464664041996002, "learning_rate": 9.804898298048983e-05, "loss": 0.4443, "step": 2606 }, { "epoch": 1.5734983398732267, "grad_norm": 0.15683351457118988, "learning_rate": 9.800747198007473e-05, "loss": 0.3545, "step": 2607 }, { "epoch": 1.5741020223362512, "grad_norm": 0.142044797539711, "learning_rate": 9.79659609796596e-05, "loss": 0.7322, "step": 2608 }, { "epoch": 1.5747057047992756, "grad_norm": 0.13781249523162842, "learning_rate": 9.79244499792445e-05, "loss": 0.7452, "step": 2609 }, { "epoch": 1.5753093872623, "grad_norm": 0.12717604637145996, "learning_rate": 9.788293897882939e-05, "loss": 0.749, "step": 2610 }, { "epoch": 1.5759130697253245, "grad_norm": 0.11919858306646347, "learning_rate": 9.784142797841428e-05, "loss": 0.7633, "step": 2611 }, { "epoch": 1.576516752188349, "grad_norm": 0.12019964307546616, "learning_rate": 9.779991697799918e-05, "loss": 0.732, "step": 2612 }, { "epoch": 1.5771204346513734, "grad_norm": 0.1292140632867813, "learning_rate": 9.775840597758406e-05, "loss": 0.7751, "step": 2613 }, { "epoch": 1.5777241171143979, "grad_norm": 0.11317221075296402, "learning_rate": 9.771689497716895e-05, "loss": 0.7186, "step": 2614 }, { "epoch": 1.5783277995774223, "grad_norm": 0.10719773173332214, "learning_rate": 9.767538397675384e-05, "loss": 0.7511, "step": 2615 }, { "epoch": 1.5789314820404468, "grad_norm": 0.11181782931089401, "learning_rate": 9.763387297633874e-05, "loss": 0.7537, "step": 2616 }, { "epoch": 1.5795351645034712, "grad_norm": 0.11379014700651169, "learning_rate": 9.759236197592362e-05, "loss": 0.7367, "step": 2617 }, { "epoch": 1.5801388469664956, "grad_norm": 0.10861673206090927, "learning_rate": 9.755085097550851e-05, "loss": 0.6676, "step": 2618 }, { "epoch": 1.58074252942952, "grad_norm": 0.11700250953435898, "learning_rate": 9.750933997509341e-05, "loss": 0.6992, "step": 2619 }, { "epoch": 1.5813462118925445, "grad_norm": 0.10913542658090591, "learning_rate": 9.74678289746783e-05, "loss": 0.7337, "step": 2620 }, { "epoch": 1.581949894355569, "grad_norm": 0.10216758400201797, "learning_rate": 9.742631797426318e-05, "loss": 0.658, "step": 2621 }, { "epoch": 1.5825535768185934, "grad_norm": 0.10746050626039505, "learning_rate": 9.738480697384807e-05, "loss": 0.673, "step": 2622 }, { "epoch": 1.5831572592816179, "grad_norm": 0.10291888564825058, "learning_rate": 9.734329597343297e-05, "loss": 0.7131, "step": 2623 }, { "epoch": 1.5837609417446423, "grad_norm": 0.11497275531291962, "learning_rate": 9.730178497301786e-05, "loss": 0.7278, "step": 2624 }, { "epoch": 1.5843646242076668, "grad_norm": 0.11286524683237076, "learning_rate": 9.726027397260274e-05, "loss": 0.7349, "step": 2625 }, { "epoch": 1.5849683066706912, "grad_norm": 0.12478066235780716, "learning_rate": 9.721876297218764e-05, "loss": 0.686, "step": 2626 }, { "epoch": 1.5855719891337157, "grad_norm": 0.1130586788058281, "learning_rate": 9.717725197177252e-05, "loss": 0.7609, "step": 2627 }, { "epoch": 1.58617567159674, "grad_norm": 0.11195287853479385, "learning_rate": 9.713574097135742e-05, "loss": 0.6909, "step": 2628 }, { "epoch": 1.5867793540597646, "grad_norm": 0.127894327044487, "learning_rate": 9.70942299709423e-05, "loss": 0.7867, "step": 2629 }, { "epoch": 1.587383036522789, "grad_norm": 0.10859086364507675, "learning_rate": 9.705271897052719e-05, "loss": 0.7013, "step": 2630 }, { "epoch": 1.5879867189858135, "grad_norm": 0.10940494388341904, "learning_rate": 9.701120797011209e-05, "loss": 0.9159, "step": 2631 }, { "epoch": 1.588590401448838, "grad_norm": 0.1171516478061676, "learning_rate": 9.696969696969698e-05, "loss": 0.6883, "step": 2632 }, { "epoch": 1.5891940839118623, "grad_norm": 0.14016135036945343, "learning_rate": 9.692818596928186e-05, "loss": 0.6465, "step": 2633 }, { "epoch": 1.5897977663748868, "grad_norm": 0.10493374615907669, "learning_rate": 9.688667496886675e-05, "loss": 0.6847, "step": 2634 }, { "epoch": 1.5904014488379112, "grad_norm": 0.12011600285768509, "learning_rate": 9.684516396845165e-05, "loss": 0.7248, "step": 2635 }, { "epoch": 1.5910051313009357, "grad_norm": 0.1263171285390854, "learning_rate": 9.680365296803654e-05, "loss": 0.646, "step": 2636 }, { "epoch": 1.5916088137639601, "grad_norm": 0.1184513196349144, "learning_rate": 9.676214196762142e-05, "loss": 0.6105, "step": 2637 }, { "epoch": 1.5922124962269846, "grad_norm": 0.13153387606143951, "learning_rate": 9.672063096720632e-05, "loss": 0.7279, "step": 2638 }, { "epoch": 1.592816178690009, "grad_norm": 0.12811937928199768, "learning_rate": 9.667911996679121e-05, "loss": 0.7132, "step": 2639 }, { "epoch": 1.5934198611530335, "grad_norm": 0.10892543196678162, "learning_rate": 9.66376089663761e-05, "loss": 0.8649, "step": 2640 }, { "epoch": 1.594023543616058, "grad_norm": 0.1102840006351471, "learning_rate": 9.659609796596098e-05, "loss": 0.685, "step": 2641 }, { "epoch": 1.5946272260790824, "grad_norm": 0.11115845292806625, "learning_rate": 9.655458696554588e-05, "loss": 0.5798, "step": 2642 }, { "epoch": 1.5952309085421068, "grad_norm": 0.10934832692146301, "learning_rate": 9.651307596513077e-05, "loss": 0.6023, "step": 2643 }, { "epoch": 1.5958345910051313, "grad_norm": 0.12175562977790833, "learning_rate": 9.647156496471565e-05, "loss": 0.7045, "step": 2644 }, { "epoch": 1.5964382734681557, "grad_norm": 0.12231041491031647, "learning_rate": 9.643005396430055e-05, "loss": 0.6613, "step": 2645 }, { "epoch": 1.5970419559311801, "grad_norm": 0.11286479979753494, "learning_rate": 9.638854296388543e-05, "loss": 0.6132, "step": 2646 }, { "epoch": 1.5976456383942046, "grad_norm": 0.13253097236156464, "learning_rate": 9.634703196347033e-05, "loss": 0.65, "step": 2647 }, { "epoch": 1.5982493208572293, "grad_norm": 0.1287652999162674, "learning_rate": 9.630552096305521e-05, "loss": 0.6567, "step": 2648 }, { "epoch": 1.5988530033202535, "grad_norm": 0.15067052841186523, "learning_rate": 9.62640099626401e-05, "loss": 0.6427, "step": 2649 }, { "epoch": 1.5994566857832782, "grad_norm": 0.13497310876846313, "learning_rate": 9.6222498962225e-05, "loss": 0.6178, "step": 2650 }, { "epoch": 1.6000603682463024, "grad_norm": 0.13561968505382538, "learning_rate": 9.618098796180989e-05, "loss": 0.5794, "step": 2651 }, { "epoch": 1.600664050709327, "grad_norm": 0.1425342857837677, "learning_rate": 9.613947696139477e-05, "loss": 0.5064, "step": 2652 }, { "epoch": 1.6012677331723513, "grad_norm": 0.13813789188861847, "learning_rate": 9.609796596097966e-05, "loss": 0.4899, "step": 2653 }, { "epoch": 1.601871415635376, "grad_norm": 0.14464135468006134, "learning_rate": 9.605645496056456e-05, "loss": 0.4724, "step": 2654 }, { "epoch": 1.6024750980984002, "grad_norm": 0.15810330212116241, "learning_rate": 9.601494396014945e-05, "loss": 0.4372, "step": 2655 }, { "epoch": 1.6030787805614248, "grad_norm": 0.17366820573806763, "learning_rate": 9.597343295973433e-05, "loss": 0.4398, "step": 2656 }, { "epoch": 1.603682463024449, "grad_norm": 0.159001424908638, "learning_rate": 9.593192195931923e-05, "loss": 0.2789, "step": 2657 }, { "epoch": 1.6042861454874737, "grad_norm": 0.2292790710926056, "learning_rate": 9.58904109589041e-05, "loss": 0.7525, "step": 2658 }, { "epoch": 1.604889827950498, "grad_norm": 0.10119739919900894, "learning_rate": 9.5848899958489e-05, "loss": 0.717, "step": 2659 }, { "epoch": 1.6054935104135226, "grad_norm": 0.10911940038204193, "learning_rate": 9.580738895807389e-05, "loss": 0.6715, "step": 2660 }, { "epoch": 1.6060971928765468, "grad_norm": 0.12329068779945374, "learning_rate": 9.576587795765879e-05, "loss": 0.7417, "step": 2661 }, { "epoch": 1.6067008753395715, "grad_norm": 0.12208274751901627, "learning_rate": 9.572436695724368e-05, "loss": 0.7459, "step": 2662 }, { "epoch": 1.6073045578025957, "grad_norm": 0.10721976310014725, "learning_rate": 9.568285595682856e-05, "loss": 0.6844, "step": 2663 }, { "epoch": 1.6079082402656204, "grad_norm": 0.1042000949382782, "learning_rate": 9.564134495641347e-05, "loss": 0.7077, "step": 2664 }, { "epoch": 1.6085119227286446, "grad_norm": 0.1249273270368576, "learning_rate": 9.559983395599834e-05, "loss": 0.7202, "step": 2665 }, { "epoch": 1.6091156051916693, "grad_norm": 0.11742489039897919, "learning_rate": 9.555832295558324e-05, "loss": 0.6975, "step": 2666 }, { "epoch": 1.6097192876546935, "grad_norm": 0.1132553368806839, "learning_rate": 9.551681195516812e-05, "loss": 0.9097, "step": 2667 }, { "epoch": 1.6103229701177182, "grad_norm": 0.1129176989197731, "learning_rate": 9.547530095475301e-05, "loss": 0.6419, "step": 2668 }, { "epoch": 1.6109266525807424, "grad_norm": 0.1597258597612381, "learning_rate": 9.543378995433791e-05, "loss": 0.7293, "step": 2669 }, { "epoch": 1.611530335043767, "grad_norm": 0.11292175203561783, "learning_rate": 9.53922789539228e-05, "loss": 0.8514, "step": 2670 }, { "epoch": 1.6121340175067913, "grad_norm": 0.11544401198625565, "learning_rate": 9.535076795350768e-05, "loss": 0.6806, "step": 2671 }, { "epoch": 1.612737699969816, "grad_norm": 0.10773265361785889, "learning_rate": 9.530925695309257e-05, "loss": 0.7818, "step": 2672 }, { "epoch": 1.6133413824328402, "grad_norm": 0.11048367619514465, "learning_rate": 9.526774595267747e-05, "loss": 0.6819, "step": 2673 }, { "epoch": 1.6139450648958649, "grad_norm": 0.12018372118473053, "learning_rate": 9.522623495226234e-05, "loss": 0.6918, "step": 2674 }, { "epoch": 1.614548747358889, "grad_norm": 0.1126914694905281, "learning_rate": 9.518472395184724e-05, "loss": 1.157, "step": 2675 }, { "epoch": 1.6151524298219138, "grad_norm": 0.11121730506420135, "learning_rate": 9.514321295143214e-05, "loss": 0.7578, "step": 2676 }, { "epoch": 1.615756112284938, "grad_norm": 0.1760278046131134, "learning_rate": 9.510170195101702e-05, "loss": 0.7201, "step": 2677 }, { "epoch": 1.6163597947479627, "grad_norm": 0.1170695498585701, "learning_rate": 9.506019095060192e-05, "loss": 0.7593, "step": 2678 }, { "epoch": 1.6169634772109869, "grad_norm": 0.12168256938457489, "learning_rate": 9.50186799501868e-05, "loss": 0.6685, "step": 2679 }, { "epoch": 1.6175671596740115, "grad_norm": 0.1391884833574295, "learning_rate": 9.497716894977169e-05, "loss": 0.8251, "step": 2680 }, { "epoch": 1.6181708421370358, "grad_norm": 0.11195441335439682, "learning_rate": 9.493565794935658e-05, "loss": 0.6997, "step": 2681 }, { "epoch": 1.6187745246000604, "grad_norm": 0.11107617616653442, "learning_rate": 9.489414694894148e-05, "loss": 0.6234, "step": 2682 }, { "epoch": 1.6193782070630847, "grad_norm": 0.1288166344165802, "learning_rate": 9.485263594852638e-05, "loss": 1.0108, "step": 2683 }, { "epoch": 1.6199818895261093, "grad_norm": 0.11174750328063965, "learning_rate": 9.481112494811125e-05, "loss": 0.633, "step": 2684 }, { "epoch": 1.6205855719891336, "grad_norm": 0.11930475383996964, "learning_rate": 9.476961394769615e-05, "loss": 0.7875, "step": 2685 }, { "epoch": 1.6211892544521582, "grad_norm": 0.1079435870051384, "learning_rate": 9.472810294728104e-05, "loss": 0.9182, "step": 2686 }, { "epoch": 1.6217929369151824, "grad_norm": 0.10946819186210632, "learning_rate": 9.468659194686592e-05, "loss": 0.6537, "step": 2687 }, { "epoch": 1.6223966193782071, "grad_norm": 0.1106157973408699, "learning_rate": 9.464508094645081e-05, "loss": 0.5829, "step": 2688 }, { "epoch": 1.6230003018412316, "grad_norm": 0.1129320040345192, "learning_rate": 9.460356994603571e-05, "loss": 0.7042, "step": 2689 }, { "epoch": 1.623603984304256, "grad_norm": 0.10891727358102798, "learning_rate": 9.45620589456206e-05, "loss": 0.6194, "step": 2690 }, { "epoch": 1.6242076667672805, "grad_norm": 0.1271827220916748, "learning_rate": 9.452054794520548e-05, "loss": 0.6827, "step": 2691 }, { "epoch": 1.624811349230305, "grad_norm": 0.10997018963098526, "learning_rate": 9.447903694479038e-05, "loss": 0.6144, "step": 2692 }, { "epoch": 1.6254150316933293, "grad_norm": 0.11678522080183029, "learning_rate": 9.443752594437525e-05, "loss": 0.6143, "step": 2693 }, { "epoch": 1.6260187141563538, "grad_norm": 0.13937309384346008, "learning_rate": 9.439601494396015e-05, "loss": 0.7645, "step": 2694 }, { "epoch": 1.6266223966193782, "grad_norm": 0.12499664723873138, "learning_rate": 9.435450394354504e-05, "loss": 0.6144, "step": 2695 }, { "epoch": 1.6272260790824027, "grad_norm": 0.12473144382238388, "learning_rate": 9.431299294312993e-05, "loss": 0.5597, "step": 2696 }, { "epoch": 1.6278297615454271, "grad_norm": 0.13701920211315155, "learning_rate": 9.427148194271483e-05, "loss": 0.6018, "step": 2697 }, { "epoch": 1.6284334440084516, "grad_norm": 0.12637366354465485, "learning_rate": 9.422997094229971e-05, "loss": 0.6128, "step": 2698 }, { "epoch": 1.629037126471476, "grad_norm": 0.12743206322193146, "learning_rate": 9.41884599418846e-05, "loss": 0.6142, "step": 2699 }, { "epoch": 1.6296408089345005, "grad_norm": 0.1346701979637146, "learning_rate": 9.414694894146949e-05, "loss": 0.6122, "step": 2700 }, { "epoch": 1.630244491397525, "grad_norm": 0.13191986083984375, "learning_rate": 9.410543794105439e-05, "loss": 0.566, "step": 2701 }, { "epoch": 1.6308481738605494, "grad_norm": 0.14156706631183624, "learning_rate": 9.406392694063927e-05, "loss": 0.5651, "step": 2702 }, { "epoch": 1.6314518563235738, "grad_norm": 0.1478833556175232, "learning_rate": 9.402241594022416e-05, "loss": 0.5533, "step": 2703 }, { "epoch": 1.6320555387865983, "grad_norm": 0.1488913893699646, "learning_rate": 9.398090493980906e-05, "loss": 0.5218, "step": 2704 }, { "epoch": 1.6326592212496227, "grad_norm": 0.1752762794494629, "learning_rate": 9.393939393939395e-05, "loss": 0.4899, "step": 2705 }, { "epoch": 1.6332629037126472, "grad_norm": 0.17121976613998413, "learning_rate": 9.389788293897883e-05, "loss": 0.4118, "step": 2706 }, { "epoch": 1.6338665861756716, "grad_norm": 0.16428853571414948, "learning_rate": 9.385637193856372e-05, "loss": 0.3176, "step": 2707 }, { "epoch": 1.634470268638696, "grad_norm": 0.11714405566453934, "learning_rate": 9.381486093814862e-05, "loss": 0.8348, "step": 2708 }, { "epoch": 1.6350739511017205, "grad_norm": 0.11836113035678864, "learning_rate": 9.37733499377335e-05, "loss": 0.7564, "step": 2709 }, { "epoch": 1.635677633564745, "grad_norm": 0.11199770867824554, "learning_rate": 9.373183893731839e-05, "loss": 0.6204, "step": 2710 }, { "epoch": 1.6362813160277694, "grad_norm": 0.12327376753091812, "learning_rate": 9.369032793690329e-05, "loss": 0.7127, "step": 2711 }, { "epoch": 1.6368849984907938, "grad_norm": 0.12553828954696655, "learning_rate": 9.364881693648817e-05, "loss": 0.7299, "step": 2712 }, { "epoch": 1.6374886809538183, "grad_norm": 0.10863874852657318, "learning_rate": 9.360730593607307e-05, "loss": 0.669, "step": 2713 }, { "epoch": 1.6380923634168427, "grad_norm": 0.12236540764570236, "learning_rate": 9.356579493565795e-05, "loss": 0.977, "step": 2714 }, { "epoch": 1.6386960458798672, "grad_norm": 0.1284298151731491, "learning_rate": 9.352428393524284e-05, "loss": 0.9, "step": 2715 }, { "epoch": 1.6392997283428916, "grad_norm": 0.12449759244918823, "learning_rate": 9.348277293482774e-05, "loss": 0.5975, "step": 2716 }, { "epoch": 1.639903410805916, "grad_norm": 0.11096858978271484, "learning_rate": 9.344126193441262e-05, "loss": 0.7423, "step": 2717 }, { "epoch": 1.6405070932689405, "grad_norm": 0.11665850877761841, "learning_rate": 9.339975093399751e-05, "loss": 0.7947, "step": 2718 }, { "epoch": 1.641110775731965, "grad_norm": 0.1213374212384224, "learning_rate": 9.33582399335824e-05, "loss": 0.7891, "step": 2719 }, { "epoch": 1.6417144581949894, "grad_norm": 0.11723334342241287, "learning_rate": 9.33167289331673e-05, "loss": 0.7008, "step": 2720 }, { "epoch": 1.6423181406580138, "grad_norm": 0.39810943603515625, "learning_rate": 9.327521793275218e-05, "loss": 0.7724, "step": 2721 }, { "epoch": 1.6429218231210383, "grad_norm": 0.1107415184378624, "learning_rate": 9.323370693233707e-05, "loss": 0.6663, "step": 2722 }, { "epoch": 1.6435255055840627, "grad_norm": 0.14163663983345032, "learning_rate": 9.319219593192197e-05, "loss": 0.8732, "step": 2723 }, { "epoch": 1.6441291880470872, "grad_norm": 0.11508408188819885, "learning_rate": 9.315068493150684e-05, "loss": 0.7499, "step": 2724 }, { "epoch": 1.6447328705101116, "grad_norm": 0.11651081591844559, "learning_rate": 9.310917393109174e-05, "loss": 0.6448, "step": 2725 }, { "epoch": 1.645336552973136, "grad_norm": 0.11762251704931259, "learning_rate": 9.306766293067663e-05, "loss": 0.6679, "step": 2726 }, { "epoch": 1.6459402354361605, "grad_norm": 0.11654434353113174, "learning_rate": 9.302615193026153e-05, "loss": 0.9322, "step": 2727 }, { "epoch": 1.646543917899185, "grad_norm": 0.11348237842321396, "learning_rate": 9.298464092984642e-05, "loss": 0.7319, "step": 2728 }, { "epoch": 1.6471476003622096, "grad_norm": 0.19426052272319794, "learning_rate": 9.29431299294313e-05, "loss": 0.7589, "step": 2729 }, { "epoch": 1.6477512828252339, "grad_norm": 0.1138731986284256, "learning_rate": 9.29016189290162e-05, "loss": 0.6751, "step": 2730 }, { "epoch": 1.6483549652882585, "grad_norm": 0.11389704793691635, "learning_rate": 9.286010792860108e-05, "loss": 0.8278, "step": 2731 }, { "epoch": 1.6489586477512828, "grad_norm": 0.13111789524555206, "learning_rate": 9.281859692818598e-05, "loss": 0.8243, "step": 2732 }, { "epoch": 1.6495623302143074, "grad_norm": 0.10878365486860275, "learning_rate": 9.277708592777086e-05, "loss": 0.9464, "step": 2733 }, { "epoch": 1.6501660126773317, "grad_norm": 0.11654222756624222, "learning_rate": 9.273557492735575e-05, "loss": 0.7815, "step": 2734 }, { "epoch": 1.6507696951403563, "grad_norm": 0.10811632871627808, "learning_rate": 9.269406392694065e-05, "loss": 0.7112, "step": 2735 }, { "epoch": 1.6513733776033805, "grad_norm": 0.1144120842218399, "learning_rate": 9.265255292652554e-05, "loss": 0.6915, "step": 2736 }, { "epoch": 1.6519770600664052, "grad_norm": 0.12281805276870728, "learning_rate": 9.261104192611042e-05, "loss": 0.7846, "step": 2737 }, { "epoch": 1.6525807425294294, "grad_norm": 0.11404890567064285, "learning_rate": 9.256953092569531e-05, "loss": 0.6841, "step": 2738 }, { "epoch": 1.653184424992454, "grad_norm": 0.10799200087785721, "learning_rate": 9.252801992528021e-05, "loss": 0.7854, "step": 2739 }, { "epoch": 1.6537881074554783, "grad_norm": 0.11798325926065445, "learning_rate": 9.24865089248651e-05, "loss": 0.6409, "step": 2740 }, { "epoch": 1.654391789918503, "grad_norm": 0.11283228546380997, "learning_rate": 9.244499792444998e-05, "loss": 0.7441, "step": 2741 }, { "epoch": 1.6549954723815272, "grad_norm": 0.11868225038051605, "learning_rate": 9.240348692403488e-05, "loss": 0.7046, "step": 2742 }, { "epoch": 1.655599154844552, "grad_norm": 0.11451592296361923, "learning_rate": 9.236197592361975e-05, "loss": 0.677, "step": 2743 }, { "epoch": 1.6562028373075761, "grad_norm": 0.11685509234666824, "learning_rate": 9.232046492320465e-05, "loss": 0.6602, "step": 2744 }, { "epoch": 1.6568065197706008, "grad_norm": 0.11831925809383392, "learning_rate": 9.227895392278954e-05, "loss": 0.7313, "step": 2745 }, { "epoch": 1.657410202233625, "grad_norm": 0.11828596144914627, "learning_rate": 9.223744292237443e-05, "loss": 0.5914, "step": 2746 }, { "epoch": 1.6580138846966497, "grad_norm": 0.12427350878715515, "learning_rate": 9.219593192195933e-05, "loss": 0.6278, "step": 2747 }, { "epoch": 1.658617567159674, "grad_norm": 0.12681810557842255, "learning_rate": 9.215442092154421e-05, "loss": 0.6496, "step": 2748 }, { "epoch": 1.6592212496226986, "grad_norm": 0.1341797262430191, "learning_rate": 9.211290992112911e-05, "loss": 0.6372, "step": 2749 }, { "epoch": 1.6598249320857228, "grad_norm": 0.1370631456375122, "learning_rate": 9.207139892071399e-05, "loss": 0.6669, "step": 2750 }, { "epoch": 1.6604286145487475, "grad_norm": 0.14159443974494934, "learning_rate": 9.202988792029889e-05, "loss": 0.5644, "step": 2751 }, { "epoch": 1.6610322970117717, "grad_norm": 0.13964007794857025, "learning_rate": 9.198837691988377e-05, "loss": 0.6035, "step": 2752 }, { "epoch": 1.6616359794747964, "grad_norm": 0.154313325881958, "learning_rate": 9.194686591946866e-05, "loss": 0.5195, "step": 2753 }, { "epoch": 1.6622396619378206, "grad_norm": 0.14789967238903046, "learning_rate": 9.190535491905356e-05, "loss": 0.5039, "step": 2754 }, { "epoch": 1.6628433444008452, "grad_norm": 0.1544695347547531, "learning_rate": 9.186384391863845e-05, "loss": 0.4487, "step": 2755 }, { "epoch": 1.6634470268638695, "grad_norm": 0.15225286781787872, "learning_rate": 9.182233291822333e-05, "loss": 0.4129, "step": 2756 }, { "epoch": 1.6640507093268941, "grad_norm": 0.16880232095718384, "learning_rate": 9.178082191780822e-05, "loss": 0.3468, "step": 2757 }, { "epoch": 1.6646543917899184, "grad_norm": 0.10654531419277191, "learning_rate": 9.173931091739312e-05, "loss": 0.6731, "step": 2758 }, { "epoch": 1.665258074252943, "grad_norm": 0.12524224817752838, "learning_rate": 9.1697799916978e-05, "loss": 0.7284, "step": 2759 }, { "epoch": 1.6658617567159673, "grad_norm": 0.11452171951532364, "learning_rate": 9.165628891656289e-05, "loss": 0.7422, "step": 2760 }, { "epoch": 1.6670691216420161, "grad_norm": 0.11638887971639633, "learning_rate": 9.161477791614779e-05, "loss": 0.7024, "step": 2761 }, { "epoch": 1.6676728041050408, "grad_norm": 0.1171191856265068, "learning_rate": 9.157326691573267e-05, "loss": 0.6618, "step": 2762 }, { "epoch": 1.668276486568065, "grad_norm": 0.11434746533632278, "learning_rate": 9.153175591531757e-05, "loss": 0.876, "step": 2763 }, { "epoch": 1.6688801690310897, "grad_norm": 0.11305181682109833, "learning_rate": 9.149024491490245e-05, "loss": 0.8247, "step": 2764 }, { "epoch": 1.669483851494114, "grad_norm": 0.10942018032073975, "learning_rate": 9.144873391448734e-05, "loss": 0.6116, "step": 2765 }, { "epoch": 1.6700875339571386, "grad_norm": 0.14481772482395172, "learning_rate": 9.140722291407224e-05, "loss": 0.7061, "step": 2766 }, { "epoch": 1.6706912164201628, "grad_norm": 0.12722566723823547, "learning_rate": 9.136571191365713e-05, "loss": 0.6226, "step": 2767 }, { "epoch": 1.6712948988831875, "grad_norm": 0.12751980125904083, "learning_rate": 9.132420091324201e-05, "loss": 0.7537, "step": 2768 }, { "epoch": 1.671898581346212, "grad_norm": 0.1445055454969406, "learning_rate": 9.12826899128269e-05, "loss": 0.7353, "step": 2769 }, { "epoch": 1.6725022638092364, "grad_norm": 0.15946505963802338, "learning_rate": 9.12411789124118e-05, "loss": 0.6124, "step": 2770 }, { "epoch": 1.6731059462722608, "grad_norm": 0.11702273786067963, "learning_rate": 9.119966791199668e-05, "loss": 0.7526, "step": 2771 }, { "epoch": 1.6737096287352853, "grad_norm": 0.12233686447143555, "learning_rate": 9.115815691158157e-05, "loss": 0.6447, "step": 2772 }, { "epoch": 1.6743133111983097, "grad_norm": 0.12842793762683868, "learning_rate": 9.111664591116647e-05, "loss": 0.8035, "step": 2773 }, { "epoch": 1.6749169936613342, "grad_norm": 0.27307209372520447, "learning_rate": 9.107513491075136e-05, "loss": 0.8121, "step": 2774 }, { "epoch": 1.6755206761243586, "grad_norm": 0.12133491784334183, "learning_rate": 9.103362391033624e-05, "loss": 0.6252, "step": 2775 }, { "epoch": 1.676124358587383, "grad_norm": 0.12983252108097076, "learning_rate": 9.099211290992113e-05, "loss": 0.6484, "step": 2776 }, { "epoch": 1.6767280410504075, "grad_norm": 0.12295020371675491, "learning_rate": 9.095060190950603e-05, "loss": 0.6794, "step": 2777 }, { "epoch": 1.677331723513432, "grad_norm": 0.12450043857097626, "learning_rate": 9.090909090909092e-05, "loss": 0.7732, "step": 2778 }, { "epoch": 1.6779354059764564, "grad_norm": 0.15557287633419037, "learning_rate": 9.08675799086758e-05, "loss": 0.601, "step": 2779 }, { "epoch": 1.6785390884394809, "grad_norm": 0.1215049996972084, "learning_rate": 9.08260689082607e-05, "loss": 0.6817, "step": 2780 }, { "epoch": 1.6791427709025053, "grad_norm": 0.11101005971431732, "learning_rate": 9.078455790784558e-05, "loss": 0.5621, "step": 2781 }, { "epoch": 1.6797464533655297, "grad_norm": 0.11614114046096802, "learning_rate": 9.074304690743048e-05, "loss": 0.6261, "step": 2782 }, { "epoch": 1.6803501358285542, "grad_norm": 0.11843936145305634, "learning_rate": 9.070153590701536e-05, "loss": 0.8192, "step": 2783 }, { "epoch": 1.6809538182915786, "grad_norm": 0.12000729143619537, "learning_rate": 9.066002490660025e-05, "loss": 0.6353, "step": 2784 }, { "epoch": 1.681557500754603, "grad_norm": 0.11501980572938919, "learning_rate": 9.061851390618515e-05, "loss": 0.6151, "step": 2785 }, { "epoch": 1.6821611832176275, "grad_norm": 0.13076992332935333, "learning_rate": 9.057700290577004e-05, "loss": 0.7825, "step": 2786 }, { "epoch": 1.682764865680652, "grad_norm": 0.13332267105579376, "learning_rate": 9.053549190535492e-05, "loss": 0.8015, "step": 2787 }, { "epoch": 1.6833685481436764, "grad_norm": 0.12166225165128708, "learning_rate": 9.049398090493981e-05, "loss": 0.6739, "step": 2788 }, { "epoch": 1.6839722306067009, "grad_norm": 0.13338027894496918, "learning_rate": 9.045246990452471e-05, "loss": 0.6793, "step": 2789 }, { "epoch": 1.6845759130697253, "grad_norm": 0.11463173478841782, "learning_rate": 9.041095890410958e-05, "loss": 0.535, "step": 2790 }, { "epoch": 1.6851795955327498, "grad_norm": 0.11622385680675507, "learning_rate": 9.036944790369448e-05, "loss": 0.6049, "step": 2791 }, { "epoch": 1.6857832779957742, "grad_norm": 0.1293102502822876, "learning_rate": 9.032793690327938e-05, "loss": 0.6065, "step": 2792 }, { "epoch": 1.6863869604587987, "grad_norm": 0.12283488363027573, "learning_rate": 9.028642590286426e-05, "loss": 0.5595, "step": 2793 }, { "epoch": 1.686990642921823, "grad_norm": 0.13364657759666443, "learning_rate": 9.024491490244916e-05, "loss": 0.5544, "step": 2794 }, { "epoch": 1.6875943253848475, "grad_norm": 0.1329110860824585, "learning_rate": 9.020340390203404e-05, "loss": 0.584, "step": 2795 }, { "epoch": 1.688198007847872, "grad_norm": 0.13042087852954865, "learning_rate": 9.016189290161894e-05, "loss": 0.5376, "step": 2796 }, { "epoch": 1.6888016903108964, "grad_norm": 0.13576170802116394, "learning_rate": 9.012038190120381e-05, "loss": 0.4989, "step": 2797 }, { "epoch": 1.6894053727739209, "grad_norm": 0.14754711091518402, "learning_rate": 9.007887090078871e-05, "loss": 0.5445, "step": 2798 }, { "epoch": 1.6900090552369453, "grad_norm": 0.14908315241336823, "learning_rate": 9.003735990037361e-05, "loss": 0.5666, "step": 2799 }, { "epoch": 1.6906127376999698, "grad_norm": 0.1546728014945984, "learning_rate": 8.999584889995849e-05, "loss": 0.472, "step": 2800 }, { "epoch": 1.6912164201629942, "grad_norm": 0.16218960285186768, "learning_rate": 8.995433789954339e-05, "loss": 0.3952, "step": 2801 }, { "epoch": 1.6918201026260187, "grad_norm": 0.1644655466079712, "learning_rate": 8.991282689912827e-05, "loss": 0.4062, "step": 2802 }, { "epoch": 1.6924237850890431, "grad_norm": 0.1662607192993164, "learning_rate": 8.987131589871316e-05, "loss": 0.3727, "step": 2803 }, { "epoch": 1.6930274675520676, "grad_norm": 0.1667158454656601, "learning_rate": 8.982980489829805e-05, "loss": 0.3083, "step": 2804 }, { "epoch": 1.693631150015092, "grad_norm": 0.16382873058319092, "learning_rate": 8.978829389788295e-05, "loss": 0.2529, "step": 2805 }, { "epoch": 1.6942348324781165, "grad_norm": 0.1675097644329071, "learning_rate": 8.974678289746783e-05, "loss": 0.2192, "step": 2806 }, { "epoch": 1.694838514941141, "grad_norm": 0.13541610538959503, "learning_rate": 8.970527189705272e-05, "loss": 0.6514, "step": 2807 }, { "epoch": 1.6954421974041654, "grad_norm": 0.13940273225307465, "learning_rate": 8.966376089663762e-05, "loss": 0.6512, "step": 2808 }, { "epoch": 1.69604587986719, "grad_norm": 0.12449961155653, "learning_rate": 8.962224989622249e-05, "loss": 0.6299, "step": 2809 }, { "epoch": 1.6966495623302142, "grad_norm": 0.14902494847774506, "learning_rate": 8.958073889580739e-05, "loss": 0.8532, "step": 2810 }, { "epoch": 1.697253244793239, "grad_norm": 0.14396223425865173, "learning_rate": 8.953922789539228e-05, "loss": 0.782, "step": 2811 }, { "epoch": 1.6978569272562631, "grad_norm": 0.1220279112458229, "learning_rate": 8.949771689497717e-05, "loss": 0.6884, "step": 2812 }, { "epoch": 1.6984606097192878, "grad_norm": 0.12583515048027039, "learning_rate": 8.945620589456207e-05, "loss": 0.7678, "step": 2813 }, { "epoch": 1.699064292182312, "grad_norm": 0.12447942793369293, "learning_rate": 8.941469489414695e-05, "loss": 0.6479, "step": 2814 }, { "epoch": 1.6996679746453367, "grad_norm": 0.1457148939371109, "learning_rate": 8.937318389373184e-05, "loss": 0.6261, "step": 2815 }, { "epoch": 1.700271657108361, "grad_norm": 0.11962111294269562, "learning_rate": 8.933167289331673e-05, "loss": 0.6798, "step": 2816 }, { "epoch": 1.7008753395713856, "grad_norm": 0.13384521007537842, "learning_rate": 8.929016189290163e-05, "loss": 0.7494, "step": 2817 }, { "epoch": 1.7014790220344098, "grad_norm": 0.12299605458974838, "learning_rate": 8.924865089248651e-05, "loss": 0.6161, "step": 2818 }, { "epoch": 1.7020827044974345, "grad_norm": 0.13254275918006897, "learning_rate": 8.92071398920714e-05, "loss": 0.693, "step": 2819 }, { "epoch": 1.7026863869604587, "grad_norm": 0.13070107996463776, "learning_rate": 8.91656288916563e-05, "loss": 0.5099, "step": 2820 }, { "epoch": 1.7032900694234834, "grad_norm": 0.13564686477184296, "learning_rate": 8.912411789124119e-05, "loss": 0.6848, "step": 2821 }, { "epoch": 1.7038937518865076, "grad_norm": 0.12641029059886932, "learning_rate": 8.908260689082607e-05, "loss": 0.655, "step": 2822 }, { "epoch": 1.7044974343495323, "grad_norm": 0.11833371967077255, "learning_rate": 8.904109589041096e-05, "loss": 0.6897, "step": 2823 }, { "epoch": 1.7051011168125565, "grad_norm": 0.1325460523366928, "learning_rate": 8.899958488999586e-05, "loss": 0.677, "step": 2824 }, { "epoch": 1.7057047992755812, "grad_norm": 0.13542653620243073, "learning_rate": 8.895807388958074e-05, "loss": 0.636, "step": 2825 }, { "epoch": 1.7063084817386054, "grad_norm": 0.1281089037656784, "learning_rate": 8.891656288916563e-05, "loss": 0.6886, "step": 2826 }, { "epoch": 1.70691216420163, "grad_norm": 0.11729513853788376, "learning_rate": 8.887505188875053e-05, "loss": 0.6064, "step": 2827 }, { "epoch": 1.7075158466646543, "grad_norm": 0.12129988521337509, "learning_rate": 8.88335408883354e-05, "loss": 0.6609, "step": 2828 }, { "epoch": 1.708119529127679, "grad_norm": 0.12838177382946014, "learning_rate": 8.87920298879203e-05, "loss": 0.615, "step": 2829 }, { "epoch": 1.7087232115907032, "grad_norm": 0.12746667861938477, "learning_rate": 8.875051888750519e-05, "loss": 0.6424, "step": 2830 }, { "epoch": 1.7093268940537278, "grad_norm": 0.12252197414636612, "learning_rate": 8.870900788709008e-05, "loss": 0.588, "step": 2831 }, { "epoch": 1.709930576516752, "grad_norm": 0.11774613708257675, "learning_rate": 8.866749688667498e-05, "loss": 0.8513, "step": 2832 }, { "epoch": 1.7105342589797767, "grad_norm": 0.13314837217330933, "learning_rate": 8.862598588625986e-05, "loss": 0.7945, "step": 2833 }, { "epoch": 1.711137941442801, "grad_norm": 0.14223603904247284, "learning_rate": 8.858447488584475e-05, "loss": 0.6475, "step": 2834 }, { "epoch": 1.7117416239058256, "grad_norm": 0.1323416829109192, "learning_rate": 8.854296388542964e-05, "loss": 0.9064, "step": 2835 }, { "epoch": 1.7123453063688499, "grad_norm": 0.13644298911094666, "learning_rate": 8.850145288501454e-05, "loss": 0.6499, "step": 2836 }, { "epoch": 1.7129489888318745, "grad_norm": 0.12705856561660767, "learning_rate": 8.845994188459942e-05, "loss": 0.846, "step": 2837 }, { "epoch": 1.7135526712948987, "grad_norm": 0.13383372128009796, "learning_rate": 8.841843088418431e-05, "loss": 0.671, "step": 2838 }, { "epoch": 1.7141563537579234, "grad_norm": 0.12026913464069366, "learning_rate": 8.837691988376921e-05, "loss": 0.6495, "step": 2839 }, { "epoch": 1.7147600362209476, "grad_norm": 0.12273270636796951, "learning_rate": 8.83354088833541e-05, "loss": 0.679, "step": 2840 }, { "epoch": 1.7153637186839723, "grad_norm": 0.1200748011469841, "learning_rate": 8.829389788293898e-05, "loss": 0.6017, "step": 2841 }, { "epoch": 1.7159674011469965, "grad_norm": 0.11923687905073166, "learning_rate": 8.825238688252387e-05, "loss": 0.5635, "step": 2842 }, { "epoch": 1.7165710836100212, "grad_norm": 0.1298258900642395, "learning_rate": 8.821087588210877e-05, "loss": 0.5879, "step": 2843 }, { "epoch": 1.7171747660730454, "grad_norm": 0.13525626063346863, "learning_rate": 8.816936488169366e-05, "loss": 0.5725, "step": 2844 }, { "epoch": 1.71777844853607, "grad_norm": 0.13261830806732178, "learning_rate": 8.812785388127854e-05, "loss": 0.511, "step": 2845 }, { "epoch": 1.7183821309990943, "grad_norm": 0.1352899670600891, "learning_rate": 8.808634288086344e-05, "loss": 0.5725, "step": 2846 }, { "epoch": 1.718985813462119, "grad_norm": 0.12940122187137604, "learning_rate": 8.804483188044831e-05, "loss": 0.5499, "step": 2847 }, { "epoch": 1.7195894959251434, "grad_norm": 0.16278207302093506, "learning_rate": 8.800332088003321e-05, "loss": 0.6085, "step": 2848 }, { "epoch": 1.7201931783881679, "grad_norm": 0.16731347143650055, "learning_rate": 8.79618098796181e-05, "loss": 0.57, "step": 2849 }, { "epoch": 1.7207968608511923, "grad_norm": 0.15214033424854279, "learning_rate": 8.792029887920299e-05, "loss": 0.5269, "step": 2850 }, { "epoch": 1.7214005433142168, "grad_norm": 0.1611013114452362, "learning_rate": 8.787878787878789e-05, "loss": 0.5705, "step": 2851 }, { "epoch": 1.7220042257772412, "grad_norm": 0.16054251790046692, "learning_rate": 8.783727687837277e-05, "loss": 0.4106, "step": 2852 }, { "epoch": 1.7226079082402657, "grad_norm": 0.16081520915031433, "learning_rate": 8.779576587795766e-05, "loss": 0.3376, "step": 2853 }, { "epoch": 1.72321159070329, "grad_norm": 0.18261933326721191, "learning_rate": 8.775425487754255e-05, "loss": 0.3801, "step": 2854 }, { "epoch": 1.7238152731663146, "grad_norm": 0.1675270050764084, "learning_rate": 8.771274387712745e-05, "loss": 0.3022, "step": 2855 }, { "epoch": 1.724418955629339, "grad_norm": 0.17773354053497314, "learning_rate": 8.767123287671233e-05, "loss": 0.2563, "step": 2856 }, { "epoch": 1.7250226380923634, "grad_norm": 0.12368903309106827, "learning_rate": 8.762972187629722e-05, "loss": 0.6228, "step": 2857 }, { "epoch": 1.725626320555388, "grad_norm": 0.12369142472743988, "learning_rate": 8.758821087588212e-05, "loss": 0.5617, "step": 2858 }, { "epoch": 1.7262300030184123, "grad_norm": 0.1579550802707672, "learning_rate": 8.7546699875467e-05, "loss": 0.6571, "step": 2859 }, { "epoch": 1.7268336854814368, "grad_norm": 0.13555946946144104, "learning_rate": 8.75051888750519e-05, "loss": 0.7208, "step": 2860 }, { "epoch": 1.7274373679444612, "grad_norm": 0.14086709916591644, "learning_rate": 8.746367787463678e-05, "loss": 0.801, "step": 2861 }, { "epoch": 1.7280410504074857, "grad_norm": 0.16081978380680084, "learning_rate": 8.742216687422168e-05, "loss": 0.6317, "step": 2862 }, { "epoch": 1.7286447328705101, "grad_norm": 0.1376749724149704, "learning_rate": 8.738065587380657e-05, "loss": 0.6912, "step": 2863 }, { "epoch": 1.7292484153335346, "grad_norm": 0.1608782410621643, "learning_rate": 8.733914487339145e-05, "loss": 0.6217, "step": 2864 }, { "epoch": 1.729852097796559, "grad_norm": 0.1334768831729889, "learning_rate": 8.729763387297635e-05, "loss": 0.9018, "step": 2865 }, { "epoch": 1.7304557802595835, "grad_norm": 0.1475462019443512, "learning_rate": 8.725612287256123e-05, "loss": 0.5775, "step": 2866 }, { "epoch": 1.731059462722608, "grad_norm": 0.26409459114074707, "learning_rate": 8.721461187214613e-05, "loss": 0.6337, "step": 2867 }, { "epoch": 1.7316631451856324, "grad_norm": 0.1443539261817932, "learning_rate": 8.717310087173101e-05, "loss": 0.619, "step": 2868 }, { "epoch": 1.7322668276486568, "grad_norm": 0.1351659595966339, "learning_rate": 8.71315898713159e-05, "loss": 0.6511, "step": 2869 }, { "epoch": 1.7328705101116813, "grad_norm": 0.1312374323606491, "learning_rate": 8.70900788709008e-05, "loss": 0.6636, "step": 2870 }, { "epoch": 1.7334741925747057, "grad_norm": 0.13947537541389465, "learning_rate": 8.704856787048569e-05, "loss": 0.682, "step": 2871 }, { "epoch": 1.7340778750377301, "grad_norm": 0.1194947212934494, "learning_rate": 8.700705687007057e-05, "loss": 0.6474, "step": 2872 }, { "epoch": 1.7346815575007546, "grad_norm": 0.11692965775728226, "learning_rate": 8.696554586965546e-05, "loss": 0.6318, "step": 2873 }, { "epoch": 1.735285239963779, "grad_norm": 0.13894155621528625, "learning_rate": 8.692403486924036e-05, "loss": 0.642, "step": 2874 }, { "epoch": 1.7358889224268035, "grad_norm": 0.1313973218202591, "learning_rate": 8.688252386882524e-05, "loss": 0.8355, "step": 2875 }, { "epoch": 1.736492604889828, "grad_norm": 0.15785734355449677, "learning_rate": 8.684101286841013e-05, "loss": 0.7546, "step": 2876 }, { "epoch": 1.7370962873528524, "grad_norm": 0.12201385200023651, "learning_rate": 8.679950186799503e-05, "loss": 0.5731, "step": 2877 }, { "epoch": 1.7376999698158768, "grad_norm": 0.13196420669555664, "learning_rate": 8.67579908675799e-05, "loss": 0.6438, "step": 2878 }, { "epoch": 1.7383036522789013, "grad_norm": 0.1272536814212799, "learning_rate": 8.67164798671648e-05, "loss": 0.5866, "step": 2879 }, { "epoch": 1.7389073347419257, "grad_norm": 0.1259777694940567, "learning_rate": 8.667496886674969e-05, "loss": 0.677, "step": 2880 }, { "epoch": 1.7395110172049502, "grad_norm": 0.13071627914905548, "learning_rate": 8.663345786633458e-05, "loss": 0.7244, "step": 2881 }, { "epoch": 1.7401146996679746, "grad_norm": 0.12744002044200897, "learning_rate": 8.659194686591948e-05, "loss": 0.9371, "step": 2882 }, { "epoch": 1.740718382130999, "grad_norm": 0.137882798910141, "learning_rate": 8.655043586550436e-05, "loss": 0.6717, "step": 2883 }, { "epoch": 1.7413220645940235, "grad_norm": 0.13243485987186432, "learning_rate": 8.650892486508926e-05, "loss": 0.7217, "step": 2884 }, { "epoch": 1.741925747057048, "grad_norm": 0.13238345086574554, "learning_rate": 8.646741386467414e-05, "loss": 0.7141, "step": 2885 }, { "epoch": 1.7425294295200724, "grad_norm": 0.13204292953014374, "learning_rate": 8.642590286425904e-05, "loss": 0.7224, "step": 2886 }, { "epoch": 1.7431331119830968, "grad_norm": 0.12200792878866196, "learning_rate": 8.638439186384392e-05, "loss": 0.585, "step": 2887 }, { "epoch": 1.7437367944461213, "grad_norm": 0.12141503393650055, "learning_rate": 8.634288086342881e-05, "loss": 0.9708, "step": 2888 }, { "epoch": 1.7443404769091457, "grad_norm": 0.14114977419376373, "learning_rate": 8.630136986301371e-05, "loss": 0.5942, "step": 2889 }, { "epoch": 1.7449441593721704, "grad_norm": 0.12927395105361938, "learning_rate": 8.62598588625986e-05, "loss": 0.6257, "step": 2890 }, { "epoch": 1.7455478418351946, "grad_norm": 0.1255059540271759, "learning_rate": 8.621834786218348e-05, "loss": 0.5921, "step": 2891 }, { "epoch": 1.7461515242982193, "grad_norm": 0.1200440302491188, "learning_rate": 8.617683686176837e-05, "loss": 0.5587, "step": 2892 }, { "epoch": 1.7467552067612435, "grad_norm": 0.12957656383514404, "learning_rate": 8.613532586135327e-05, "loss": 0.5773, "step": 2893 }, { "epoch": 1.7473588892242682, "grad_norm": 0.13232208788394928, "learning_rate": 8.609381486093814e-05, "loss": 0.681, "step": 2894 }, { "epoch": 1.7479625716872924, "grad_norm": 0.12879258394241333, "learning_rate": 8.605230386052304e-05, "loss": 0.5747, "step": 2895 }, { "epoch": 1.748566254150317, "grad_norm": 0.1347392499446869, "learning_rate": 8.601079286010794e-05, "loss": 0.5334, "step": 2896 }, { "epoch": 1.7491699366133413, "grad_norm": 0.16375549137592316, "learning_rate": 8.596928185969282e-05, "loss": 0.5834, "step": 2897 }, { "epoch": 1.749773619076366, "grad_norm": 0.14500340819358826, "learning_rate": 8.592777085927772e-05, "loss": 0.539, "step": 2898 }, { "epoch": 1.7503773015393902, "grad_norm": 0.16076421737670898, "learning_rate": 8.58862598588626e-05, "loss": 0.5071, "step": 2899 }, { "epoch": 1.7509809840024149, "grad_norm": 0.14887742698192596, "learning_rate": 8.584474885844749e-05, "loss": 0.4643, "step": 2900 }, { "epoch": 1.751584666465439, "grad_norm": 0.16193953156471252, "learning_rate": 8.580323785803237e-05, "loss": 0.494, "step": 2901 }, { "epoch": 1.7521883489284638, "grad_norm": 0.1675998717546463, "learning_rate": 8.576172685761727e-05, "loss": 0.3624, "step": 2902 }, { "epoch": 1.752792031391488, "grad_norm": 0.18320401012897491, "learning_rate": 8.572021585720216e-05, "loss": 0.4361, "step": 2903 }, { "epoch": 1.7533957138545126, "grad_norm": 0.1881968080997467, "learning_rate": 8.567870485678705e-05, "loss": 0.3647, "step": 2904 }, { "epoch": 1.7539993963175369, "grad_norm": 0.2042665183544159, "learning_rate": 8.563719385637195e-05, "loss": 0.3493, "step": 2905 }, { "epoch": 1.7546030787805615, "grad_norm": 0.18004922568798065, "learning_rate": 8.559568285595683e-05, "loss": 0.2261, "step": 2906 }, { "epoch": 1.7552067612435858, "grad_norm": 0.15333791077136993, "learning_rate": 8.555417185554172e-05, "loss": 1.0165, "step": 2907 }, { "epoch": 1.7558104437066104, "grad_norm": 0.1424827128648758, "learning_rate": 8.551266085512661e-05, "loss": 0.6105, "step": 2908 }, { "epoch": 1.7564141261696347, "grad_norm": 0.13473393023014069, "learning_rate": 8.547114985471151e-05, "loss": 0.6586, "step": 2909 }, { "epoch": 1.7570178086326593, "grad_norm": 0.1452847421169281, "learning_rate": 8.54296388542964e-05, "loss": 0.7158, "step": 2910 }, { "epoch": 1.7576214910956836, "grad_norm": 0.1423470824956894, "learning_rate": 8.538812785388128e-05, "loss": 0.6961, "step": 2911 }, { "epoch": 1.7582251735587082, "grad_norm": 0.12046578526496887, "learning_rate": 8.534661685346618e-05, "loss": 0.8319, "step": 2912 }, { "epoch": 1.7588288560217324, "grad_norm": 0.13641320168972015, "learning_rate": 8.530510585305105e-05, "loss": 0.6154, "step": 2913 }, { "epoch": 1.7594325384847571, "grad_norm": 0.1376158744096756, "learning_rate": 8.526359485263595e-05, "loss": 0.6259, "step": 2914 }, { "epoch": 1.7600362209477813, "grad_norm": 0.1257615089416504, "learning_rate": 8.522208385222084e-05, "loss": 0.6727, "step": 2915 }, { "epoch": 1.760639903410806, "grad_norm": 0.12842705845832825, "learning_rate": 8.518057285180573e-05, "loss": 0.8247, "step": 2916 }, { "epoch": 1.7612435858738302, "grad_norm": 0.12923744320869446, "learning_rate": 8.513906185139063e-05, "loss": 0.7409, "step": 2917 }, { "epoch": 1.761847268336855, "grad_norm": 0.17179526388645172, "learning_rate": 8.509755085097551e-05, "loss": 0.6223, "step": 2918 }, { "epoch": 1.7624509507998791, "grad_norm": 0.1274694800376892, "learning_rate": 8.50560398505604e-05, "loss": 0.6175, "step": 2919 }, { "epoch": 1.7630546332629038, "grad_norm": 0.1250637173652649, "learning_rate": 8.501452885014529e-05, "loss": 0.8064, "step": 2920 }, { "epoch": 1.763658315725928, "grad_norm": 0.12304326891899109, "learning_rate": 8.497301784973019e-05, "loss": 0.6699, "step": 2921 }, { "epoch": 1.7642619981889527, "grad_norm": 0.1374034285545349, "learning_rate": 8.493150684931507e-05, "loss": 0.6121, "step": 2922 }, { "epoch": 1.764865680651977, "grad_norm": 0.13162750005722046, "learning_rate": 8.488999584889996e-05, "loss": 0.7042, "step": 2923 }, { "epoch": 1.7654693631150016, "grad_norm": 0.13185490667819977, "learning_rate": 8.484848484848486e-05, "loss": 0.6263, "step": 2924 }, { "epoch": 1.7660730455780258, "grad_norm": 0.1368507295846939, "learning_rate": 8.480697384806973e-05, "loss": 0.695, "step": 2925 }, { "epoch": 1.7666767280410505, "grad_norm": 0.12848018109798431, "learning_rate": 8.476546284765463e-05, "loss": 0.5866, "step": 2926 }, { "epoch": 1.7672804105040747, "grad_norm": 0.12568457424640656, "learning_rate": 8.472395184723952e-05, "loss": 0.7202, "step": 2927 }, { "epoch": 1.7678840929670994, "grad_norm": 0.14278379082679749, "learning_rate": 8.468244084682442e-05, "loss": 0.8685, "step": 2928 }, { "epoch": 1.7684877754301238, "grad_norm": 0.1360083371400833, "learning_rate": 8.46409298464093e-05, "loss": 0.6535, "step": 2929 }, { "epoch": 1.7690914578931483, "grad_norm": 0.12538328766822815, "learning_rate": 8.459941884599419e-05, "loss": 0.614, "step": 2930 }, { "epoch": 1.7696951403561727, "grad_norm": 0.13750702142715454, "learning_rate": 8.455790784557909e-05, "loss": 0.6894, "step": 2931 }, { "epoch": 1.7702988228191971, "grad_norm": 0.12563706934452057, "learning_rate": 8.451639684516396e-05, "loss": 0.6747, "step": 2932 }, { "epoch": 1.7709025052822216, "grad_norm": 0.14879825711250305, "learning_rate": 8.447488584474886e-05, "loss": 0.747, "step": 2933 }, { "epoch": 1.771506187745246, "grad_norm": 0.12372935563325882, "learning_rate": 8.443337484433375e-05, "loss": 0.5759, "step": 2934 }, { "epoch": 1.7721098702082705, "grad_norm": 0.13434575498104095, "learning_rate": 8.439186384391864e-05, "loss": 0.8362, "step": 2935 }, { "epoch": 1.772713552671295, "grad_norm": 0.12838785350322723, "learning_rate": 8.435035284350354e-05, "loss": 0.8738, "step": 2936 }, { "epoch": 1.7733172351343194, "grad_norm": 0.12814517319202423, "learning_rate": 8.430884184308842e-05, "loss": 0.7118, "step": 2937 }, { "epoch": 1.7739209175973438, "grad_norm": 0.11639917641878128, "learning_rate": 8.426733084267331e-05, "loss": 0.7066, "step": 2938 }, { "epoch": 1.7745246000603683, "grad_norm": 0.1350032538175583, "learning_rate": 8.42258198422582e-05, "loss": 0.7287, "step": 2939 }, { "epoch": 1.7751282825233927, "grad_norm": 0.12561342120170593, "learning_rate": 8.41843088418431e-05, "loss": 0.6189, "step": 2940 }, { "epoch": 1.7757319649864172, "grad_norm": 0.13215911388397217, "learning_rate": 8.414279784142798e-05, "loss": 0.5686, "step": 2941 }, { "epoch": 1.7763356474494416, "grad_norm": 0.12100309878587723, "learning_rate": 8.410128684101287e-05, "loss": 0.5502, "step": 2942 }, { "epoch": 1.776939329912466, "grad_norm": 0.1172775849699974, "learning_rate": 8.405977584059777e-05, "loss": 0.5211, "step": 2943 }, { "epoch": 1.7775430123754905, "grad_norm": 0.13135451078414917, "learning_rate": 8.401826484018264e-05, "loss": 0.6114, "step": 2944 }, { "epoch": 1.778146694838515, "grad_norm": 0.16509322822093964, "learning_rate": 8.397675383976754e-05, "loss": 0.5613, "step": 2945 }, { "epoch": 1.7787503773015394, "grad_norm": 0.14752458035945892, "learning_rate": 8.393524283935243e-05, "loss": 0.5444, "step": 2946 }, { "epoch": 1.7793540597645638, "grad_norm": 0.15376845002174377, "learning_rate": 8.389373183893732e-05, "loss": 0.5722, "step": 2947 }, { "epoch": 1.7799577422275883, "grad_norm": 0.14886072278022766, "learning_rate": 8.385222083852222e-05, "loss": 0.5796, "step": 2948 }, { "epoch": 1.7805614246906127, "grad_norm": 0.14707441627979279, "learning_rate": 8.38107098381071e-05, "loss": 0.5235, "step": 2949 }, { "epoch": 1.7811651071536372, "grad_norm": 0.16511821746826172, "learning_rate": 8.3769198837692e-05, "loss": 0.6322, "step": 2950 }, { "epoch": 1.7817687896166616, "grad_norm": 0.1711074262857437, "learning_rate": 8.372768783727688e-05, "loss": 0.4643, "step": 2951 }, { "epoch": 1.782372472079686, "grad_norm": 0.1574665904045105, "learning_rate": 8.368617683686178e-05, "loss": 0.3831, "step": 2952 }, { "epoch": 1.7829761545427105, "grad_norm": 0.17709819972515106, "learning_rate": 8.364466583644666e-05, "loss": 0.3542, "step": 2953 }, { "epoch": 1.783579837005735, "grad_norm": 0.1887846142053604, "learning_rate": 8.360315483603155e-05, "loss": 0.3804, "step": 2954 }, { "epoch": 1.7841835194687594, "grad_norm": 0.19664205610752106, "learning_rate": 8.356164383561645e-05, "loss": 0.3055, "step": 2955 }, { "epoch": 1.7847872019317839, "grad_norm": 0.1803441047668457, "learning_rate": 8.352013283520133e-05, "loss": 0.2447, "step": 2956 }, { "epoch": 1.7853908843948083, "grad_norm": 0.13248445093631744, "learning_rate": 8.347862183478622e-05, "loss": 0.76, "step": 2957 }, { "epoch": 1.7859945668578328, "grad_norm": 0.12137053906917572, "learning_rate": 8.343711083437111e-05, "loss": 0.6176, "step": 2958 }, { "epoch": 1.7865982493208572, "grad_norm": 0.1513027399778366, "learning_rate": 8.339559983395601e-05, "loss": 0.611, "step": 2959 }, { "epoch": 1.7872019317838816, "grad_norm": 0.15577901899814606, "learning_rate": 8.33540888335409e-05, "loss": 0.6373, "step": 2960 }, { "epoch": 1.787805614246906, "grad_norm": 0.14755643904209137, "learning_rate": 8.331257783312578e-05, "loss": 1.0106, "step": 2961 }, { "epoch": 1.7884092967099305, "grad_norm": 0.127186581492424, "learning_rate": 8.327106683271068e-05, "loss": 0.7419, "step": 2962 }, { "epoch": 1.789012979172955, "grad_norm": 0.14396396279335022, "learning_rate": 8.322955583229555e-05, "loss": 0.6834, "step": 2963 }, { "epoch": 1.7896166616359794, "grad_norm": 0.12821994721889496, "learning_rate": 8.318804483188045e-05, "loss": 0.9047, "step": 2964 }, { "epoch": 1.7902203440990039, "grad_norm": 0.12685029208660126, "learning_rate": 8.314653383146534e-05, "loss": 0.5295, "step": 2965 }, { "epoch": 1.7908240265620283, "grad_norm": 0.16082331538200378, "learning_rate": 8.310502283105023e-05, "loss": 0.6977, "step": 2966 }, { "epoch": 1.7914277090250528, "grad_norm": 0.13325533270835876, "learning_rate": 8.306351183063513e-05, "loss": 1.0607, "step": 2967 }, { "epoch": 1.7920313914880772, "grad_norm": 0.12702691555023193, "learning_rate": 8.302200083022001e-05, "loss": 0.6663, "step": 2968 }, { "epoch": 1.7926350739511017, "grad_norm": 0.13634531199932098, "learning_rate": 8.29804898298049e-05, "loss": 0.6418, "step": 2969 }, { "epoch": 1.7932387564141261, "grad_norm": 0.11988352239131927, "learning_rate": 8.293897882938979e-05, "loss": 0.7847, "step": 2970 }, { "epoch": 1.7938424388771508, "grad_norm": 0.13620884716510773, "learning_rate": 8.289746782897469e-05, "loss": 0.6376, "step": 2971 }, { "epoch": 1.794446121340175, "grad_norm": 0.13789290189743042, "learning_rate": 8.285595682855957e-05, "loss": 1.144, "step": 2972 }, { "epoch": 1.7950498038031997, "grad_norm": 0.12296643108129501, "learning_rate": 8.281444582814446e-05, "loss": 0.6537, "step": 2973 }, { "epoch": 1.795653486266224, "grad_norm": 0.13792656362056732, "learning_rate": 8.277293482772936e-05, "loss": 0.628, "step": 2974 }, { "epoch": 1.7962571687292486, "grad_norm": 0.13332706689834595, "learning_rate": 8.273142382731425e-05, "loss": 0.584, "step": 2975 }, { "epoch": 1.7968608511922728, "grad_norm": 0.11899926513433456, "learning_rate": 8.268991282689913e-05, "loss": 0.6707, "step": 2976 }, { "epoch": 1.7974645336552975, "grad_norm": 0.3628057539463043, "learning_rate": 8.264840182648402e-05, "loss": 0.6138, "step": 2977 }, { "epoch": 1.7980682161183217, "grad_norm": 0.1424243003129959, "learning_rate": 8.260689082606892e-05, "loss": 0.708, "step": 2978 }, { "epoch": 1.7986718985813464, "grad_norm": 0.13742388784885406, "learning_rate": 8.25653798256538e-05, "loss": 0.7981, "step": 2979 }, { "epoch": 1.7992755810443706, "grad_norm": 0.12499826401472092, "learning_rate": 8.252386882523869e-05, "loss": 0.7833, "step": 2980 }, { "epoch": 1.7998792635073952, "grad_norm": 0.13351675868034363, "learning_rate": 8.248235782482359e-05, "loss": 0.9222, "step": 2981 }, { "epoch": 1.8004829459704195, "grad_norm": 0.13743843138217926, "learning_rate": 8.244084682440846e-05, "loss": 0.6614, "step": 2982 }, { "epoch": 1.8010866284334441, "grad_norm": 0.14949648082256317, "learning_rate": 8.239933582399336e-05, "loss": 0.5954, "step": 2983 }, { "epoch": 1.8016903108964684, "grad_norm": 0.1315966248512268, "learning_rate": 8.235782482357825e-05, "loss": 0.7275, "step": 2984 }, { "epoch": 1.802293993359493, "grad_norm": 0.16671594977378845, "learning_rate": 8.231631382316314e-05, "loss": 0.7025, "step": 2985 }, { "epoch": 1.8028976758225173, "grad_norm": 0.12334276735782623, "learning_rate": 8.227480282274804e-05, "loss": 0.6606, "step": 2986 }, { "epoch": 1.803501358285542, "grad_norm": 0.11940109729766846, "learning_rate": 8.223329182233292e-05, "loss": 0.6042, "step": 2987 }, { "epoch": 1.8041050407485661, "grad_norm": 0.1273908019065857, "learning_rate": 8.219178082191781e-05, "loss": 0.6492, "step": 2988 }, { "epoch": 1.8047087232115908, "grad_norm": 0.12909753620624542, "learning_rate": 8.21502698215027e-05, "loss": 0.5704, "step": 2989 }, { "epoch": 1.805312405674615, "grad_norm": 0.1314334124326706, "learning_rate": 8.21087588210876e-05, "loss": 0.548, "step": 2990 }, { "epoch": 1.8059160881376397, "grad_norm": 0.14163191616535187, "learning_rate": 8.206724782067248e-05, "loss": 0.6069, "step": 2991 }, { "epoch": 1.806519770600664, "grad_norm": 0.14366085827350616, "learning_rate": 8.202573682025737e-05, "loss": 0.5977, "step": 2992 }, { "epoch": 1.8071234530636886, "grad_norm": 0.1461668461561203, "learning_rate": 8.198422581984227e-05, "loss": 0.6007, "step": 2993 }, { "epoch": 1.8077271355267128, "grad_norm": 0.14245660603046417, "learning_rate": 8.194271481942716e-05, "loss": 0.5456, "step": 2994 }, { "epoch": 1.8083308179897375, "grad_norm": 0.15712513029575348, "learning_rate": 8.190120381901204e-05, "loss": 0.522, "step": 2995 }, { "epoch": 1.8089345004527617, "grad_norm": 0.14800165593624115, "learning_rate": 8.185969281859693e-05, "loss": 0.541, "step": 2996 }, { "epoch": 1.8095381829157864, "grad_norm": 0.1499776840209961, "learning_rate": 8.181818181818183e-05, "loss": 0.5566, "step": 2997 }, { "epoch": 1.8101418653788106, "grad_norm": 0.15904314815998077, "learning_rate": 8.177667081776672e-05, "loss": 0.533, "step": 2998 }, { "epoch": 1.8107455478418353, "grad_norm": 0.17498090863227844, "learning_rate": 8.17351598173516e-05, "loss": 0.535, "step": 2999 }, { "epoch": 1.8113492303048595, "grad_norm": 0.16436955332756042, "learning_rate": 8.16936488169365e-05, "loss": 0.5246, "step": 3000 }, { "epoch": 1.8113492303048595, "eval_loss": 0.6211307644844055, "eval_runtime": 1059.4646, "eval_samples_per_second": 2.633, "eval_steps_per_second": 0.329, "step": 3000 }, { "epoch": 1.8119529127678842, "grad_norm": 0.173828125, "learning_rate": 8.165213781652138e-05, "loss": 0.4371, "step": 3001 }, { "epoch": 1.8125565952309084, "grad_norm": 0.203125, "learning_rate": 8.161062681610628e-05, "loss": 0.4749, "step": 3002 }, { "epoch": 1.813160277693933, "grad_norm": 0.1767578125, "learning_rate": 8.156911581569116e-05, "loss": 0.3594, "step": 3003 }, { "epoch": 1.8137639601569573, "grad_norm": 0.1884765625, "learning_rate": 8.152760481527605e-05, "loss": 0.3817, "step": 3004 }, { "epoch": 1.814367642619982, "grad_norm": 0.185546875, "learning_rate": 8.148609381486095e-05, "loss": 0.3289, "step": 3005 }, { "epoch": 1.8149713250830062, "grad_norm": 0.185546875, "learning_rate": 8.144458281444583e-05, "loss": 0.276, "step": 3006 }, { "epoch": 1.8155750075460309, "grad_norm": 0.154296875, "learning_rate": 8.140307181403072e-05, "loss": 0.6628, "step": 3007 }, { "epoch": 1.816178690009055, "grad_norm": 0.1298828125, "learning_rate": 8.136156081361561e-05, "loss": 0.6329, "step": 3008 }, { "epoch": 1.8167823724720797, "grad_norm": 0.134765625, "learning_rate": 8.132004981320051e-05, "loss": 0.6207, "step": 3009 }, { "epoch": 1.8173860549351042, "grad_norm": 0.1376953125, "learning_rate": 8.127853881278538e-05, "loss": 0.7047, "step": 3010 }, { "epoch": 1.8179897373981286, "grad_norm": 0.171875, "learning_rate": 8.123702781237028e-05, "loss": 0.717, "step": 3011 }, { "epoch": 1.818593419861153, "grad_norm": 0.134765625, "learning_rate": 8.119551681195518e-05, "loss": 0.5587, "step": 3012 }, { "epoch": 1.8191971023241775, "grad_norm": 0.14453125, "learning_rate": 8.115400581154005e-05, "loss": 0.7154, "step": 3013 }, { "epoch": 1.819800784787202, "grad_norm": 0.1337890625, "learning_rate": 8.111249481112495e-05, "loss": 0.6538, "step": 3014 }, { "epoch": 1.8204044672502264, "grad_norm": 0.1328125, "learning_rate": 8.107098381070984e-05, "loss": 0.6695, "step": 3015 }, { "epoch": 1.8210081497132509, "grad_norm": 0.134765625, "learning_rate": 8.102947281029473e-05, "loss": 0.8783, "step": 3016 }, { "epoch": 1.8216118321762753, "grad_norm": 0.13671875, "learning_rate": 8.098796180987961e-05, "loss": 0.6434, "step": 3017 }, { "epoch": 1.8222155146392998, "grad_norm": 0.1220703125, "learning_rate": 8.094645080946451e-05, "loss": 0.6594, "step": 3018 }, { "epoch": 1.8228191971023242, "grad_norm": 0.142578125, "learning_rate": 8.090493980904941e-05, "loss": 0.6878, "step": 3019 }, { "epoch": 1.8234228795653487, "grad_norm": 0.125, "learning_rate": 8.086342880863429e-05, "loss": 0.7494, "step": 3020 }, { "epoch": 1.824026562028373, "grad_norm": 0.1337890625, "learning_rate": 8.082191780821919e-05, "loss": 0.9943, "step": 3021 }, { "epoch": 1.8246302444913975, "grad_norm": 0.1279296875, "learning_rate": 8.078040680780407e-05, "loss": 0.636, "step": 3022 }, { "epoch": 1.825233926954422, "grad_norm": 0.140625, "learning_rate": 8.073889580738896e-05, "loss": 0.7553, "step": 3023 }, { "epoch": 1.8258376094174464, "grad_norm": 0.1337890625, "learning_rate": 8.069738480697385e-05, "loss": 0.6998, "step": 3024 }, { "epoch": 1.8264412918804709, "grad_norm": 0.1318359375, "learning_rate": 8.065587380655875e-05, "loss": 0.6978, "step": 3025 }, { "epoch": 1.8270449743434953, "grad_norm": 0.11767578125, "learning_rate": 8.061436280614363e-05, "loss": 0.6634, "step": 3026 }, { "epoch": 1.8276486568065198, "grad_norm": 0.1357421875, "learning_rate": 8.057285180572852e-05, "loss": 0.6892, "step": 3027 }, { "epoch": 1.8282523392695442, "grad_norm": 0.138671875, "learning_rate": 8.053134080531342e-05, "loss": 0.7321, "step": 3028 }, { "epoch": 1.8288560217325687, "grad_norm": 0.123046875, "learning_rate": 8.048982980489829e-05, "loss": 0.6714, "step": 3029 }, { "epoch": 1.8294597041955931, "grad_norm": 0.1298828125, "learning_rate": 8.044831880448319e-05, "loss": 0.7105, "step": 3030 }, { "epoch": 1.8300633866586176, "grad_norm": 0.12890625, "learning_rate": 8.040680780406808e-05, "loss": 0.7032, "step": 3031 }, { "epoch": 1.830667069121642, "grad_norm": 0.1201171875, "learning_rate": 8.036529680365296e-05, "loss": 0.6389, "step": 3032 }, { "epoch": 1.8312707515846665, "grad_norm": 0.1220703125, "learning_rate": 8.032378580323786e-05, "loss": 0.8036, "step": 3033 }, { "epoch": 1.831874434047691, "grad_norm": 0.1455078125, "learning_rate": 8.028227480282275e-05, "loss": 0.6143, "step": 3034 }, { "epoch": 1.8324781165107153, "grad_norm": 0.134765625, "learning_rate": 8.024076380240764e-05, "loss": 0.6233, "step": 3035 }, { "epoch": 1.8330817989737398, "grad_norm": 0.138671875, "learning_rate": 8.019925280199252e-05, "loss": 0.5732, "step": 3036 }, { "epoch": 1.8336854814367642, "grad_norm": 0.125, "learning_rate": 8.015774180157742e-05, "loss": 0.7028, "step": 3037 }, { "epoch": 1.8342891638997887, "grad_norm": 0.1494140625, "learning_rate": 8.011623080116231e-05, "loss": 0.7947, "step": 3038 }, { "epoch": 1.8348928463628131, "grad_norm": 0.12890625, "learning_rate": 8.00747198007472e-05, "loss": 0.6311, "step": 3039 }, { "epoch": 1.8354965288258376, "grad_norm": 0.1376953125, "learning_rate": 8.00332088003321e-05, "loss": 0.6732, "step": 3040 }, { "epoch": 1.836100211288862, "grad_norm": 0.1259765625, "learning_rate": 7.999169779991698e-05, "loss": 0.5363, "step": 3041 }, { "epoch": 1.8367038937518865, "grad_norm": 0.126953125, "learning_rate": 7.995018679950187e-05, "loss": 0.5187, "step": 3042 }, { "epoch": 1.837307576214911, "grad_norm": 0.1376953125, "learning_rate": 7.990867579908676e-05, "loss": 0.6697, "step": 3043 }, { "epoch": 1.8379112586779354, "grad_norm": 0.12890625, "learning_rate": 7.986716479867166e-05, "loss": 0.5733, "step": 3044 }, { "epoch": 1.8385149411409598, "grad_norm": 0.142578125, "learning_rate": 7.982565379825654e-05, "loss": 0.5313, "step": 3045 }, { "epoch": 1.8391186236039843, "grad_norm": 0.1474609375, "learning_rate": 7.978414279784143e-05, "loss": 0.5485, "step": 3046 }, { "epoch": 1.8397223060670087, "grad_norm": 0.15625, "learning_rate": 7.974263179742633e-05, "loss": 0.5707, "step": 3047 }, { "epoch": 1.8403259885300332, "grad_norm": 0.1435546875, "learning_rate": 7.97011207970112e-05, "loss": 0.5065, "step": 3048 }, { "epoch": 1.8409296709930576, "grad_norm": 0.158203125, "learning_rate": 7.96596097965961e-05, "loss": 0.5392, "step": 3049 }, { "epoch": 1.8415333534560823, "grad_norm": 0.1533203125, "learning_rate": 7.961809879618099e-05, "loss": 0.4614, "step": 3050 }, { "epoch": 1.8421370359191065, "grad_norm": 0.171875, "learning_rate": 7.957658779576588e-05, "loss": 0.4686, "step": 3051 }, { "epoch": 1.8427407183821312, "grad_norm": 0.169921875, "learning_rate": 7.953507679535078e-05, "loss": 0.429, "step": 3052 }, { "epoch": 1.8433444008451554, "grad_norm": 0.173828125, "learning_rate": 7.949356579493566e-05, "loss": 0.3595, "step": 3053 }, { "epoch": 1.84394808330818, "grad_norm": 0.1806640625, "learning_rate": 7.945205479452055e-05, "loss": 0.3526, "step": 3054 }, { "epoch": 1.8445517657712043, "grad_norm": 0.17578125, "learning_rate": 7.941054379410544e-05, "loss": 0.3008, "step": 3055 }, { "epoch": 1.845155448234229, "grad_norm": 0.17578125, "learning_rate": 7.936903279369034e-05, "loss": 0.222, "step": 3056 }, { "epoch": 1.8457591306972532, "grad_norm": 0.1513671875, "learning_rate": 7.932752179327522e-05, "loss": 0.99, "step": 3057 }, { "epoch": 1.8463628131602778, "grad_norm": 0.1328125, "learning_rate": 7.928601079286011e-05, "loss": 0.6689, "step": 3058 }, { "epoch": 1.846966495623302, "grad_norm": 0.12890625, "learning_rate": 7.924449979244501e-05, "loss": 0.5613, "step": 3059 }, { "epoch": 1.8475701780863267, "grad_norm": 0.125, "learning_rate": 7.920298879202988e-05, "loss": 0.6002, "step": 3060 }, { "epoch": 1.848173860549351, "grad_norm": 0.15234375, "learning_rate": 7.916147779161478e-05, "loss": 0.6666, "step": 3061 }, { "epoch": 1.8487775430123756, "grad_norm": 0.13671875, "learning_rate": 7.911996679119967e-05, "loss": 0.6661, "step": 3062 }, { "epoch": 1.8493812254753998, "grad_norm": 0.1328125, "learning_rate": 7.907845579078457e-05, "loss": 0.588, "step": 3063 }, { "epoch": 1.8499849079384245, "grad_norm": 0.14453125, "learning_rate": 7.903694479036945e-05, "loss": 0.9441, "step": 3064 }, { "epoch": 1.8505885904014487, "grad_norm": 0.134765625, "learning_rate": 7.899543378995434e-05, "loss": 0.9012, "step": 3065 }, { "epoch": 1.8511922728644734, "grad_norm": 0.1240234375, "learning_rate": 7.895392278953924e-05, "loss": 0.8596, "step": 3066 }, { "epoch": 1.8517959553274976, "grad_norm": 0.1337890625, "learning_rate": 7.891241178912411e-05, "loss": 0.6868, "step": 3067 }, { "epoch": 1.8523996377905223, "grad_norm": 0.1474609375, "learning_rate": 7.887090078870901e-05, "loss": 0.6323, "step": 3068 }, { "epoch": 1.8530033202535465, "grad_norm": 0.1279296875, "learning_rate": 7.88293897882939e-05, "loss": 0.5647, "step": 3069 }, { "epoch": 1.8536070027165712, "grad_norm": 0.1279296875, "learning_rate": 7.878787878787879e-05, "loss": 0.6628, "step": 3070 }, { "epoch": 1.8542106851795954, "grad_norm": 0.1416015625, "learning_rate": 7.874636778746369e-05, "loss": 0.6885, "step": 3071 }, { "epoch": 1.85481436764262, "grad_norm": 0.1201171875, "learning_rate": 7.870485678704857e-05, "loss": 0.5712, "step": 3072 }, { "epoch": 1.8554180501056443, "grad_norm": 0.1318359375, "learning_rate": 7.866334578663346e-05, "loss": 0.6282, "step": 3073 }, { "epoch": 1.856021732568669, "grad_norm": 0.1279296875, "learning_rate": 7.862183478621835e-05, "loss": 0.6369, "step": 3074 }, { "epoch": 1.8566254150316932, "grad_norm": 0.134765625, "learning_rate": 7.858032378580325e-05, "loss": 0.6647, "step": 3075 }, { "epoch": 1.8572290974947179, "grad_norm": 0.1474609375, "learning_rate": 7.853881278538813e-05, "loss": 0.9152, "step": 3076 }, { "epoch": 1.857832779957742, "grad_norm": 0.12255859375, "learning_rate": 7.849730178497302e-05, "loss": 0.6018, "step": 3077 }, { "epoch": 1.8584364624207668, "grad_norm": 0.14453125, "learning_rate": 7.845579078455792e-05, "loss": 0.9967, "step": 3078 }, { "epoch": 1.859040144883791, "grad_norm": 0.11328125, "learning_rate": 7.841427978414279e-05, "loss": 0.6408, "step": 3079 }, { "epoch": 1.8596438273468157, "grad_norm": 0.1279296875, "learning_rate": 7.837276878372769e-05, "loss": 0.7113, "step": 3080 }, { "epoch": 1.8602475098098399, "grad_norm": 0.1357421875, "learning_rate": 7.833125778331258e-05, "loss": 0.6914, "step": 3081 }, { "epoch": 1.8608511922728646, "grad_norm": 0.125, "learning_rate": 7.828974678289747e-05, "loss": 0.8815, "step": 3082 }, { "epoch": 1.8614548747358888, "grad_norm": 0.1298828125, "learning_rate": 7.824823578248237e-05, "loss": 0.9352, "step": 3083 }, { "epoch": 1.8620585571989134, "grad_norm": 0.140625, "learning_rate": 7.820672478206725e-05, "loss": 0.6636, "step": 3084 }, { "epoch": 1.8626622396619377, "grad_norm": 0.13671875, "learning_rate": 7.816521378165215e-05, "loss": 0.6623, "step": 3085 }, { "epoch": 1.8632659221249623, "grad_norm": 0.1318359375, "learning_rate": 7.812370278123702e-05, "loss": 0.8932, "step": 3086 }, { "epoch": 1.8638696045879866, "grad_norm": 0.130859375, "learning_rate": 7.808219178082192e-05, "loss": 0.5604, "step": 3087 }, { "epoch": 1.8644732870510112, "grad_norm": 0.1455078125, "learning_rate": 7.804068078040681e-05, "loss": 0.6196, "step": 3088 }, { "epoch": 1.8650769695140355, "grad_norm": 0.1318359375, "learning_rate": 7.79991697799917e-05, "loss": 0.6475, "step": 3089 }, { "epoch": 1.8656806519770601, "grad_norm": 0.134765625, "learning_rate": 7.79576587795766e-05, "loss": 0.6318, "step": 3090 }, { "epoch": 1.8662843344400846, "grad_norm": 0.134765625, "learning_rate": 7.791614777916148e-05, "loss": 0.5979, "step": 3091 }, { "epoch": 1.866888016903109, "grad_norm": 0.130859375, "learning_rate": 7.787463677874637e-05, "loss": 0.5466, "step": 3092 }, { "epoch": 1.8674916993661335, "grad_norm": 0.13671875, "learning_rate": 7.783312577833126e-05, "loss": 0.6918, "step": 3093 }, { "epoch": 1.868095381829158, "grad_norm": 0.1552734375, "learning_rate": 7.779161477791616e-05, "loss": 0.8258, "step": 3094 }, { "epoch": 1.8686990642921824, "grad_norm": 0.169921875, "learning_rate": 7.775010377750104e-05, "loss": 0.5167, "step": 3095 }, { "epoch": 1.8693027467552068, "grad_norm": 0.13671875, "learning_rate": 7.770859277708593e-05, "loss": 0.5426, "step": 3096 }, { "epoch": 1.8699064292182312, "grad_norm": 0.1416015625, "learning_rate": 7.766708177667083e-05, "loss": 0.5094, "step": 3097 }, { "epoch": 1.8705101116812557, "grad_norm": 0.14453125, "learning_rate": 7.76255707762557e-05, "loss": 0.5342, "step": 3098 }, { "epoch": 1.8711137941442801, "grad_norm": 0.1455078125, "learning_rate": 7.75840597758406e-05, "loss": 0.5153, "step": 3099 }, { "epoch": 1.8717174766073046, "grad_norm": 0.1630859375, "learning_rate": 7.754254877542549e-05, "loss": 0.5314, "step": 3100 }, { "epoch": 1.872321159070329, "grad_norm": 0.1513671875, "learning_rate": 7.750103777501038e-05, "loss": 0.4814, "step": 3101 }, { "epoch": 1.8729248415333535, "grad_norm": 0.162109375, "learning_rate": 7.745952677459528e-05, "loss": 0.4209, "step": 3102 }, { "epoch": 1.873528523996378, "grad_norm": 0.177734375, "learning_rate": 7.741801577418016e-05, "loss": 0.4853, "step": 3103 }, { "epoch": 1.8741322064594024, "grad_norm": 0.1875, "learning_rate": 7.737650477376505e-05, "loss": 0.427, "step": 3104 }, { "epoch": 1.8747358889224268, "grad_norm": 0.1728515625, "learning_rate": 7.733499377334994e-05, "loss": 0.2704, "step": 3105 }, { "epoch": 1.8753395713854513, "grad_norm": 0.1748046875, "learning_rate": 7.729348277293484e-05, "loss": 0.2182, "step": 3106 }, { "epoch": 1.8759432538484757, "grad_norm": 0.1376953125, "learning_rate": 7.725197177251972e-05, "loss": 0.6389, "step": 3107 }, { "epoch": 1.8765469363115002, "grad_norm": 0.1357421875, "learning_rate": 7.721046077210461e-05, "loss": 0.7304, "step": 3108 }, { "epoch": 1.8771506187745246, "grad_norm": 0.1318359375, "learning_rate": 7.716894977168951e-05, "loss": 0.622, "step": 3109 }, { "epoch": 1.877754301237549, "grad_norm": 0.140625, "learning_rate": 7.71274387712744e-05, "loss": 0.6608, "step": 3110 }, { "epoch": 1.8783579837005735, "grad_norm": 0.13671875, "learning_rate": 7.708592777085928e-05, "loss": 0.6979, "step": 3111 }, { "epoch": 1.878961666163598, "grad_norm": 0.14453125, "learning_rate": 7.704441677044417e-05, "loss": 0.64, "step": 3112 }, { "epoch": 1.8795653486266224, "grad_norm": 0.1396484375, "learning_rate": 7.700290577002907e-05, "loss": 0.846, "step": 3113 }, { "epoch": 1.8801690310896468, "grad_norm": 0.134765625, "learning_rate": 7.696139476961395e-05, "loss": 0.6574, "step": 3114 }, { "epoch": 1.8807727135526713, "grad_norm": 0.12890625, "learning_rate": 7.691988376919884e-05, "loss": 0.6364, "step": 3115 }, { "epoch": 1.8813763960156957, "grad_norm": 0.1591796875, "learning_rate": 7.687837276878374e-05, "loss": 0.916, "step": 3116 }, { "epoch": 1.8819800784787202, "grad_norm": 0.134765625, "learning_rate": 7.683686176836861e-05, "loss": 0.7091, "step": 3117 }, { "epoch": 1.8825837609417446, "grad_norm": 0.1435546875, "learning_rate": 7.679535076795351e-05, "loss": 0.72, "step": 3118 }, { "epoch": 1.883187443404769, "grad_norm": 0.1318359375, "learning_rate": 7.67538397675384e-05, "loss": 0.6347, "step": 3119 }, { "epoch": 1.8837911258677935, "grad_norm": 0.12451171875, "learning_rate": 7.671232876712329e-05, "loss": 0.6204, "step": 3120 }, { "epoch": 1.884394808330818, "grad_norm": 0.126953125, "learning_rate": 7.667081776670819e-05, "loss": 0.5928, "step": 3121 }, { "epoch": 1.8849984907938424, "grad_norm": 0.1240234375, "learning_rate": 7.662930676629307e-05, "loss": 0.6527, "step": 3122 }, { "epoch": 1.8856021732568669, "grad_norm": 0.1279296875, "learning_rate": 7.658779576587796e-05, "loss": 0.6877, "step": 3123 }, { "epoch": 1.8862058557198913, "grad_norm": 0.146484375, "learning_rate": 7.654628476546285e-05, "loss": 0.792, "step": 3124 }, { "epoch": 1.8868095381829157, "grad_norm": 0.1328125, "learning_rate": 7.650477376504775e-05, "loss": 0.6725, "step": 3125 }, { "epoch": 1.8874132206459402, "grad_norm": 0.1298828125, "learning_rate": 7.646326276463262e-05, "loss": 0.7279, "step": 3126 }, { "epoch": 1.8880169031089646, "grad_norm": 0.1318359375, "learning_rate": 7.642175176421752e-05, "loss": 0.6495, "step": 3127 }, { "epoch": 1.888620585571989, "grad_norm": 0.130859375, "learning_rate": 7.638024076380242e-05, "loss": 0.516, "step": 3128 }, { "epoch": 1.8892242680350135, "grad_norm": 0.1279296875, "learning_rate": 7.63387297633873e-05, "loss": 0.9977, "step": 3129 }, { "epoch": 1.889827950498038, "grad_norm": 0.12060546875, "learning_rate": 7.629721876297219e-05, "loss": 0.5642, "step": 3130 }, { "epoch": 1.8904316329610626, "grad_norm": 0.1455078125, "learning_rate": 7.625570776255708e-05, "loss": 0.9672, "step": 3131 }, { "epoch": 1.8910353154240869, "grad_norm": 0.126953125, "learning_rate": 7.621419676214198e-05, "loss": 0.6098, "step": 3132 }, { "epoch": 1.8916389978871115, "grad_norm": 0.1474609375, "learning_rate": 7.617268576172685e-05, "loss": 0.5991, "step": 3133 }, { "epoch": 1.8922426803501358, "grad_norm": 0.1240234375, "learning_rate": 7.613117476131175e-05, "loss": 0.585, "step": 3134 }, { "epoch": 1.8928463628131604, "grad_norm": 0.12158203125, "learning_rate": 7.608966376089665e-05, "loss": 0.6032, "step": 3135 }, { "epoch": 1.8934500452761847, "grad_norm": 0.1396484375, "learning_rate": 7.604815276048153e-05, "loss": 0.5951, "step": 3136 }, { "epoch": 1.8940537277392093, "grad_norm": 0.1220703125, "learning_rate": 7.600664176006643e-05, "loss": 0.5891, "step": 3137 }, { "epoch": 1.8946574102022335, "grad_norm": 0.134765625, "learning_rate": 7.596513075965131e-05, "loss": 0.6856, "step": 3138 }, { "epoch": 1.8952610926652582, "grad_norm": 0.1279296875, "learning_rate": 7.59236197592362e-05, "loss": 0.5943, "step": 3139 }, { "epoch": 1.8958647751282824, "grad_norm": 0.1279296875, "learning_rate": 7.588210875882108e-05, "loss": 0.5702, "step": 3140 }, { "epoch": 1.896468457591307, "grad_norm": 0.1357421875, "learning_rate": 7.584059775840598e-05, "loss": 0.6605, "step": 3141 }, { "epoch": 1.8970721400543313, "grad_norm": 0.1298828125, "learning_rate": 7.579908675799087e-05, "loss": 0.5738, "step": 3142 }, { "epoch": 1.897675822517356, "grad_norm": 0.146484375, "learning_rate": 7.575757575757576e-05, "loss": 0.6339, "step": 3143 }, { "epoch": 1.8982795049803802, "grad_norm": 0.13671875, "learning_rate": 7.571606475716066e-05, "loss": 0.6243, "step": 3144 }, { "epoch": 1.898883187443405, "grad_norm": 0.1611328125, "learning_rate": 7.567455375674553e-05, "loss": 0.5782, "step": 3145 }, { "epoch": 1.8994868699064291, "grad_norm": 0.1435546875, "learning_rate": 7.563304275633043e-05, "loss": 0.4687, "step": 3146 }, { "epoch": 1.9000905523694538, "grad_norm": 0.14453125, "learning_rate": 7.559153175591532e-05, "loss": 0.4899, "step": 3147 }, { "epoch": 1.900694234832478, "grad_norm": 0.150390625, "learning_rate": 7.55500207555002e-05, "loss": 0.4926, "step": 3148 }, { "epoch": 1.9012979172955027, "grad_norm": 0.1552734375, "learning_rate": 7.55085097550851e-05, "loss": 0.511, "step": 3149 }, { "epoch": 1.901901599758527, "grad_norm": 0.1728515625, "learning_rate": 7.546699875466999e-05, "loss": 0.556, "step": 3150 }, { "epoch": 1.9025052822215516, "grad_norm": 0.1669921875, "learning_rate": 7.542548775425489e-05, "loss": 0.4537, "step": 3151 }, { "epoch": 1.9031089646845758, "grad_norm": 0.1669921875, "learning_rate": 7.538397675383976e-05, "loss": 0.4078, "step": 3152 }, { "epoch": 1.9037126471476005, "grad_norm": 0.1875, "learning_rate": 7.534246575342466e-05, "loss": 0.4259, "step": 3153 }, { "epoch": 1.9043163296106247, "grad_norm": 0.18359375, "learning_rate": 7.530095475300955e-05, "loss": 0.3296, "step": 3154 }, { "epoch": 1.9049200120736494, "grad_norm": 0.193359375, "learning_rate": 7.525944375259444e-05, "loss": 0.3254, "step": 3155 }, { "epoch": 1.9055236945366736, "grad_norm": 0.1748046875, "learning_rate": 7.521793275217934e-05, "loss": 0.21, "step": 3156 }, { "epoch": 1.9061273769996983, "grad_norm": 0.1357421875, "learning_rate": 7.517642175176422e-05, "loss": 0.604, "step": 3157 }, { "epoch": 1.9067310594627225, "grad_norm": 0.134765625, "learning_rate": 7.513491075134911e-05, "loss": 0.6113, "step": 3158 }, { "epoch": 1.9073347419257471, "grad_norm": 0.1494140625, "learning_rate": 7.5093399750934e-05, "loss": 0.6336, "step": 3159 }, { "epoch": 1.9079384243887714, "grad_norm": 0.1376953125, "learning_rate": 7.50518887505189e-05, "loss": 0.6376, "step": 3160 }, { "epoch": 1.908542106851796, "grad_norm": 0.13671875, "learning_rate": 7.501037775010378e-05, "loss": 0.6476, "step": 3161 }, { "epoch": 1.9091457893148203, "grad_norm": 0.1240234375, "learning_rate": 7.496886674968867e-05, "loss": 0.6033, "step": 3162 }, { "epoch": 1.909749471777845, "grad_norm": 0.1376953125, "learning_rate": 7.492735574927357e-05, "loss": 0.6158, "step": 3163 }, { "epoch": 1.9103531542408692, "grad_norm": 0.134765625, "learning_rate": 7.488584474885844e-05, "loss": 0.6094, "step": 3164 }, { "epoch": 1.9109568367038938, "grad_norm": 0.140625, "learning_rate": 7.484433374844334e-05, "loss": 0.6744, "step": 3165 }, { "epoch": 1.911560519166918, "grad_norm": 0.341796875, "learning_rate": 7.480282274802823e-05, "loss": 0.6104, "step": 3166 }, { "epoch": 1.9121642016299427, "grad_norm": 0.1396484375, "learning_rate": 7.476131174761311e-05, "loss": 0.6194, "step": 3167 }, { "epoch": 1.912767884092967, "grad_norm": 0.126953125, "learning_rate": 7.471980074719801e-05, "loss": 0.8527, "step": 3168 }, { "epoch": 1.9133715665559916, "grad_norm": 0.1572265625, "learning_rate": 7.46782897467829e-05, "loss": 0.6211, "step": 3169 }, { "epoch": 1.9139752490190158, "grad_norm": 0.146484375, "learning_rate": 7.463677874636779e-05, "loss": 0.6325, "step": 3170 }, { "epoch": 1.9145789314820405, "grad_norm": 0.1513671875, "learning_rate": 7.459526774595267e-05, "loss": 0.7137, "step": 3171 }, { "epoch": 1.915182613945065, "grad_norm": 0.15234375, "learning_rate": 7.455375674553757e-05, "loss": 0.6791, "step": 3172 }, { "epoch": 1.9157862964080894, "grad_norm": 0.1181640625, "learning_rate": 7.451224574512246e-05, "loss": 0.6181, "step": 3173 }, { "epoch": 1.9163899788711138, "grad_norm": 0.15234375, "learning_rate": 7.447073474470735e-05, "loss": 0.6748, "step": 3174 }, { "epoch": 1.9169936613341383, "grad_norm": 0.1533203125, "learning_rate": 7.442922374429225e-05, "loss": 0.7586, "step": 3175 }, { "epoch": 1.9175973437971627, "grad_norm": 0.1376953125, "learning_rate": 7.438771274387713e-05, "loss": 0.6438, "step": 3176 }, { "epoch": 1.9182010262601872, "grad_norm": 0.130859375, "learning_rate": 7.434620174346202e-05, "loss": 0.8398, "step": 3177 }, { "epoch": 1.9188047087232116, "grad_norm": 0.1494140625, "learning_rate": 7.43046907430469e-05, "loss": 0.6768, "step": 3178 }, { "epoch": 1.919408391186236, "grad_norm": 0.1484375, "learning_rate": 7.42631797426318e-05, "loss": 0.707, "step": 3179 }, { "epoch": 1.9200120736492605, "grad_norm": 0.1318359375, "learning_rate": 7.422166874221669e-05, "loss": 0.9903, "step": 3180 }, { "epoch": 1.920615756112285, "grad_norm": 0.1318359375, "learning_rate": 7.418015774180158e-05, "loss": 0.8726, "step": 3181 }, { "epoch": 1.9212194385753094, "grad_norm": 0.125, "learning_rate": 7.413864674138648e-05, "loss": 0.6792, "step": 3182 }, { "epoch": 1.9218231210383339, "grad_norm": 0.1396484375, "learning_rate": 7.409713574097135e-05, "loss": 0.6856, "step": 3183 }, { "epoch": 1.9224268035013583, "grad_norm": 0.134765625, "learning_rate": 7.405562474055625e-05, "loss": 0.6322, "step": 3184 }, { "epoch": 1.9230304859643828, "grad_norm": 0.1435546875, "learning_rate": 7.401411374014114e-05, "loss": 0.7408, "step": 3185 }, { "epoch": 1.9236341684274072, "grad_norm": 0.1328125, "learning_rate": 7.397260273972603e-05, "loss": 0.6326, "step": 3186 }, { "epoch": 1.9242378508904316, "grad_norm": 0.1328125, "learning_rate": 7.393109173931093e-05, "loss": 1.1231, "step": 3187 }, { "epoch": 1.924841533353456, "grad_norm": 0.1337890625, "learning_rate": 7.388958073889581e-05, "loss": 0.5966, "step": 3188 }, { "epoch": 1.9254452158164805, "grad_norm": 0.1357421875, "learning_rate": 7.38480697384807e-05, "loss": 0.6252, "step": 3189 }, { "epoch": 1.926048898279505, "grad_norm": 0.14453125, "learning_rate": 7.380655873806558e-05, "loss": 0.6251, "step": 3190 }, { "epoch": 1.9266525807425294, "grad_norm": 0.1435546875, "learning_rate": 7.376504773765048e-05, "loss": 0.6284, "step": 3191 }, { "epoch": 1.9272562632055539, "grad_norm": 0.1318359375, "learning_rate": 7.372353673723537e-05, "loss": 0.5263, "step": 3192 }, { "epoch": 1.9278599456685783, "grad_norm": 0.1513671875, "learning_rate": 7.368202573682026e-05, "loss": 0.5516, "step": 3193 }, { "epoch": 1.9284636281316028, "grad_norm": 0.138671875, "learning_rate": 7.364051473640516e-05, "loss": 0.5675, "step": 3194 }, { "epoch": 1.9290673105946272, "grad_norm": 0.146484375, "learning_rate": 7.359900373599004e-05, "loss": 0.5492, "step": 3195 }, { "epoch": 1.9296709930576517, "grad_norm": 0.14453125, "learning_rate": 7.355749273557493e-05, "loss": 0.5312, "step": 3196 }, { "epoch": 1.930274675520676, "grad_norm": 0.150390625, "learning_rate": 7.351598173515982e-05, "loss": 0.5808, "step": 3197 }, { "epoch": 1.9308783579837006, "grad_norm": 0.1533203125, "learning_rate": 7.347447073474472e-05, "loss": 0.5105, "step": 3198 }, { "epoch": 1.931482040446725, "grad_norm": 0.1513671875, "learning_rate": 7.34329597343296e-05, "loss": 0.514, "step": 3199 }, { "epoch": 1.9320857229097494, "grad_norm": 0.17578125, "learning_rate": 7.339144873391449e-05, "loss": 0.5566, "step": 3200 }, { "epoch": 1.932689405372774, "grad_norm": 0.1748046875, "learning_rate": 7.334993773349939e-05, "loss": 0.4912, "step": 3201 }, { "epoch": 1.9332930878357983, "grad_norm": 0.181640625, "learning_rate": 7.330842673308426e-05, "loss": 0.4308, "step": 3202 }, { "epoch": 1.9338967702988228, "grad_norm": 0.16796875, "learning_rate": 7.326691573266916e-05, "loss": 0.3551, "step": 3203 }, { "epoch": 1.9345004527618472, "grad_norm": 0.181640625, "learning_rate": 7.322540473225405e-05, "loss": 0.3277, "step": 3204 }, { "epoch": 1.9351041352248717, "grad_norm": 0.171875, "learning_rate": 7.318389373183894e-05, "loss": 0.2927, "step": 3205 }, { "epoch": 1.9357078176878961, "grad_norm": 0.189453125, "learning_rate": 7.314238273142384e-05, "loss": 0.2479, "step": 3206 }, { "epoch": 1.9363115001509206, "grad_norm": 0.1650390625, "learning_rate": 7.310087173100872e-05, "loss": 0.5836, "step": 3207 }, { "epoch": 1.936915182613945, "grad_norm": 0.140625, "learning_rate": 7.305936073059361e-05, "loss": 0.5894, "step": 3208 }, { "epoch": 1.9375188650769695, "grad_norm": 0.1259765625, "learning_rate": 7.30178497301785e-05, "loss": 0.6468, "step": 3209 }, { "epoch": 1.938122547539994, "grad_norm": 0.14453125, "learning_rate": 7.29763387297634e-05, "loss": 0.6436, "step": 3210 }, { "epoch": 1.9387262300030184, "grad_norm": 0.140625, "learning_rate": 7.293482772934828e-05, "loss": 0.6471, "step": 3211 }, { "epoch": 1.939329912466043, "grad_norm": 0.1435546875, "learning_rate": 7.289331672893317e-05, "loss": 0.6474, "step": 3212 }, { "epoch": 1.9399335949290673, "grad_norm": 0.1376953125, "learning_rate": 7.285180572851807e-05, "loss": 0.8865, "step": 3213 }, { "epoch": 1.940537277392092, "grad_norm": 0.12060546875, "learning_rate": 7.281029472810294e-05, "loss": 0.9, "step": 3214 }, { "epoch": 1.9411409598551161, "grad_norm": 0.1337890625, "learning_rate": 7.276878372768784e-05, "loss": 0.6733, "step": 3215 }, { "epoch": 1.9417446423181408, "grad_norm": 0.13671875, "learning_rate": 7.272727272727273e-05, "loss": 0.6217, "step": 3216 }, { "epoch": 1.942348324781165, "grad_norm": 0.1396484375, "learning_rate": 7.268576172685761e-05, "loss": 0.5774, "step": 3217 }, { "epoch": 1.9429520072441897, "grad_norm": 0.1298828125, "learning_rate": 7.264425072644251e-05, "loss": 0.7204, "step": 3218 }, { "epoch": 1.943555689707214, "grad_norm": 0.1552734375, "learning_rate": 7.26027397260274e-05, "loss": 0.709, "step": 3219 }, { "epoch": 1.9441593721702386, "grad_norm": 0.1416015625, "learning_rate": 7.25612287256123e-05, "loss": 0.685, "step": 3220 }, { "epoch": 1.9447630546332628, "grad_norm": 0.14453125, "learning_rate": 7.251971772519717e-05, "loss": 0.6819, "step": 3221 }, { "epoch": 1.9453667370962875, "grad_norm": 0.1396484375, "learning_rate": 7.247820672478207e-05, "loss": 0.6286, "step": 3222 }, { "epoch": 1.9459704195593117, "grad_norm": 0.126953125, "learning_rate": 7.243669572436696e-05, "loss": 0.6284, "step": 3223 }, { "epoch": 1.9465741020223364, "grad_norm": 0.138671875, "learning_rate": 7.239518472395185e-05, "loss": 0.7257, "step": 3224 }, { "epoch": 1.9471777844853606, "grad_norm": 0.1259765625, "learning_rate": 7.235367372353675e-05, "loss": 0.6584, "step": 3225 }, { "epoch": 1.9477814669483853, "grad_norm": 0.1337890625, "learning_rate": 7.231216272312163e-05, "loss": 0.6025, "step": 3226 }, { "epoch": 1.9483851494114095, "grad_norm": 0.1748046875, "learning_rate": 7.227065172270652e-05, "loss": 0.6027, "step": 3227 }, { "epoch": 1.9489888318744342, "grad_norm": 0.173828125, "learning_rate": 7.22291407222914e-05, "loss": 0.7539, "step": 3228 }, { "epoch": 1.9495925143374584, "grad_norm": 0.150390625, "learning_rate": 7.218762972187631e-05, "loss": 0.6194, "step": 3229 }, { "epoch": 1.950196196800483, "grad_norm": 0.1474609375, "learning_rate": 7.21461187214612e-05, "loss": 0.7173, "step": 3230 }, { "epoch": 1.9507998792635073, "grad_norm": 0.154296875, "learning_rate": 7.210460772104608e-05, "loss": 0.7522, "step": 3231 }, { "epoch": 1.951403561726532, "grad_norm": 0.1689453125, "learning_rate": 7.206309672063098e-05, "loss": 0.7973, "step": 3232 }, { "epoch": 1.9520072441895562, "grad_norm": 0.1337890625, "learning_rate": 7.202158572021585e-05, "loss": 0.7033, "step": 3233 }, { "epoch": 1.9526109266525808, "grad_norm": 0.1533203125, "learning_rate": 7.198007471980075e-05, "loss": 0.6033, "step": 3234 }, { "epoch": 1.953214609115605, "grad_norm": 0.140625, "learning_rate": 7.193856371938564e-05, "loss": 0.5346, "step": 3235 }, { "epoch": 1.9538182915786297, "grad_norm": 0.146484375, "learning_rate": 7.189705271897053e-05, "loss": 0.666, "step": 3236 }, { "epoch": 1.954421974041654, "grad_norm": 0.12890625, "learning_rate": 7.185554171855543e-05, "loss": 0.6379, "step": 3237 }, { "epoch": 1.9550256565046786, "grad_norm": 0.2041015625, "learning_rate": 7.181403071814031e-05, "loss": 0.5682, "step": 3238 }, { "epoch": 1.9556293389677029, "grad_norm": 0.134765625, "learning_rate": 7.17725197177252e-05, "loss": 0.697, "step": 3239 }, { "epoch": 1.9562330214307275, "grad_norm": 0.12890625, "learning_rate": 7.173100871731009e-05, "loss": 0.5934, "step": 3240 }, { "epoch": 1.9568367038937517, "grad_norm": 0.134765625, "learning_rate": 7.168949771689499e-05, "loss": 0.5425, "step": 3241 }, { "epoch": 1.9574403863567764, "grad_norm": 0.134765625, "learning_rate": 7.164798671647987e-05, "loss": 0.5661, "step": 3242 }, { "epoch": 1.9580440688198006, "grad_norm": 0.138671875, "learning_rate": 7.160647571606476e-05, "loss": 0.598, "step": 3243 }, { "epoch": 1.9586477512828253, "grad_norm": 0.140625, "learning_rate": 7.156496471564966e-05, "loss": 0.5434, "step": 3244 }, { "epoch": 1.9592514337458495, "grad_norm": 0.14453125, "learning_rate": 7.152345371523454e-05, "loss": 0.5406, "step": 3245 }, { "epoch": 1.9598551162088742, "grad_norm": 0.15625, "learning_rate": 7.148194271481943e-05, "loss": 0.5284, "step": 3246 }, { "epoch": 1.9604587986718984, "grad_norm": 0.154296875, "learning_rate": 7.144043171440432e-05, "loss": 0.5847, "step": 3247 }, { "epoch": 1.961062481134923, "grad_norm": 0.1533203125, "learning_rate": 7.139892071398922e-05, "loss": 0.5552, "step": 3248 }, { "epoch": 1.9616661635979473, "grad_norm": 0.1611328125, "learning_rate": 7.135740971357409e-05, "loss": 0.4877, "step": 3249 }, { "epoch": 1.962269846060972, "grad_norm": 0.1669921875, "learning_rate": 7.131589871315899e-05, "loss": 0.5421, "step": 3250 }, { "epoch": 1.9628735285239962, "grad_norm": 0.1796875, "learning_rate": 7.127438771274389e-05, "loss": 0.4781, "step": 3251 }, { "epoch": 1.9634772109870209, "grad_norm": 0.1650390625, "learning_rate": 7.123287671232876e-05, "loss": 0.4282, "step": 3252 }, { "epoch": 1.9640808934500453, "grad_norm": 0.1806640625, "learning_rate": 7.119136571191366e-05, "loss": 0.4592, "step": 3253 }, { "epoch": 1.9646845759130698, "grad_norm": 0.18359375, "learning_rate": 7.114985471149855e-05, "loss": 0.3757, "step": 3254 }, { "epoch": 1.9652882583760942, "grad_norm": 0.2041015625, "learning_rate": 7.110834371108344e-05, "loss": 0.3974, "step": 3255 }, { "epoch": 1.9658919408391187, "grad_norm": 0.1767578125, "learning_rate": 7.106683271066832e-05, "loss": 0.249, "step": 3256 }, { "epoch": 1.9664956233021431, "grad_norm": 0.1279296875, "learning_rate": 7.102532171025322e-05, "loss": 0.643, "step": 3257 }, { "epoch": 1.9670993057651676, "grad_norm": 0.1259765625, "learning_rate": 7.098381070983811e-05, "loss": 0.5703, "step": 3258 }, { "epoch": 1.967702988228192, "grad_norm": 0.138671875, "learning_rate": 7.0942299709423e-05, "loss": 0.6712, "step": 3259 }, { "epoch": 1.9683066706912165, "grad_norm": 0.138671875, "learning_rate": 7.09007887090079e-05, "loss": 0.6473, "step": 3260 }, { "epoch": 1.968910353154241, "grad_norm": 0.1552734375, "learning_rate": 7.085927770859277e-05, "loss": 0.7151, "step": 3261 }, { "epoch": 1.9695140356172653, "grad_norm": 0.1416015625, "learning_rate": 7.081776670817767e-05, "loss": 0.6657, "step": 3262 }, { "epoch": 1.9701177180802898, "grad_norm": 0.15625, "learning_rate": 7.077625570776256e-05, "loss": 0.7846, "step": 3263 }, { "epoch": 1.9707214005433142, "grad_norm": 0.1611328125, "learning_rate": 7.073474470734746e-05, "loss": 0.6625, "step": 3264 }, { "epoch": 1.9713250830063387, "grad_norm": 0.14453125, "learning_rate": 7.069323370693234e-05, "loss": 0.5718, "step": 3265 }, { "epoch": 1.9719287654693631, "grad_norm": 0.1416015625, "learning_rate": 7.065172270651723e-05, "loss": 0.855, "step": 3266 }, { "epoch": 1.9725324479323876, "grad_norm": 0.140625, "learning_rate": 7.061021170610213e-05, "loss": 0.8964, "step": 3267 }, { "epoch": 1.973136130395412, "grad_norm": 0.146484375, "learning_rate": 7.0568700705687e-05, "loss": 0.9168, "step": 3268 }, { "epoch": 1.9737398128584365, "grad_norm": 0.1455078125, "learning_rate": 7.05271897052719e-05, "loss": 0.6553, "step": 3269 }, { "epoch": 1.974343495321461, "grad_norm": 0.1396484375, "learning_rate": 7.048567870485679e-05, "loss": 0.6188, "step": 3270 }, { "epoch": 1.9749471777844854, "grad_norm": 0.1337890625, "learning_rate": 7.044416770444167e-05, "loss": 0.5445, "step": 3271 }, { "epoch": 1.9755508602475098, "grad_norm": 0.1396484375, "learning_rate": 7.040265670402657e-05, "loss": 0.6776, "step": 3272 }, { "epoch": 1.9761545427105343, "grad_norm": 0.140625, "learning_rate": 7.036114570361146e-05, "loss": 0.6244, "step": 3273 }, { "epoch": 1.9767582251735587, "grad_norm": 0.1416015625, "learning_rate": 7.031963470319635e-05, "loss": 0.6687, "step": 3274 }, { "epoch": 1.9773619076365831, "grad_norm": 0.11962890625, "learning_rate": 7.027812370278123e-05, "loss": 0.5678, "step": 3275 }, { "epoch": 1.9779655900996076, "grad_norm": 0.1298828125, "learning_rate": 7.023661270236613e-05, "loss": 0.7597, "step": 3276 }, { "epoch": 1.978569272562632, "grad_norm": 0.1279296875, "learning_rate": 7.019510170195102e-05, "loss": 0.6088, "step": 3277 }, { "epoch": 1.9791729550256565, "grad_norm": 0.12109375, "learning_rate": 7.015359070153591e-05, "loss": 0.8278, "step": 3278 }, { "epoch": 1.979776637488681, "grad_norm": 0.138671875, "learning_rate": 7.011207970112081e-05, "loss": 1.011, "step": 3279 }, { "epoch": 1.9803803199517054, "grad_norm": 0.1259765625, "learning_rate": 7.007056870070568e-05, "loss": 0.5667, "step": 3280 }, { "epoch": 1.9809840024147298, "grad_norm": 0.126953125, "learning_rate": 7.002905770029058e-05, "loss": 0.7015, "step": 3281 }, { "epoch": 1.9815876848777543, "grad_norm": 0.1318359375, "learning_rate": 6.998754669987547e-05, "loss": 0.614, "step": 3282 }, { "epoch": 1.9821913673407787, "grad_norm": 0.146484375, "learning_rate": 6.994603569946035e-05, "loss": 0.6907, "step": 3283 }, { "epoch": 1.9827950498038032, "grad_norm": 0.12890625, "learning_rate": 6.990452469904525e-05, "loss": 0.5913, "step": 3284 }, { "epoch": 1.9833987322668276, "grad_norm": 0.134765625, "learning_rate": 6.986301369863014e-05, "loss": 0.6824, "step": 3285 }, { "epoch": 1.984002414729852, "grad_norm": 0.1181640625, "learning_rate": 6.982150269821504e-05, "loss": 0.6013, "step": 3286 }, { "epoch": 1.9846060971928765, "grad_norm": 0.14453125, "learning_rate": 6.977999169779991e-05, "loss": 0.8288, "step": 3287 }, { "epoch": 1.985209779655901, "grad_norm": 0.1318359375, "learning_rate": 6.973848069738481e-05, "loss": 0.6383, "step": 3288 }, { "epoch": 1.9858134621189254, "grad_norm": 0.1259765625, "learning_rate": 6.96969696969697e-05, "loss": 0.7101, "step": 3289 }, { "epoch": 1.9864171445819498, "grad_norm": 0.123046875, "learning_rate": 6.965545869655459e-05, "loss": 0.7906, "step": 3290 }, { "epoch": 1.9870208270449743, "grad_norm": 0.1259765625, "learning_rate": 6.961394769613949e-05, "loss": 0.5729, "step": 3291 }, { "epoch": 1.9876245095079987, "grad_norm": 0.1376953125, "learning_rate": 6.957243669572437e-05, "loss": 0.6291, "step": 3292 }, { "epoch": 1.9882281919710234, "grad_norm": 0.1513671875, "learning_rate": 6.953092569530926e-05, "loss": 0.6279, "step": 3293 }, { "epoch": 1.9888318744340476, "grad_norm": 0.1376953125, "learning_rate": 6.948941469489415e-05, "loss": 0.5872, "step": 3294 }, { "epoch": 1.9894355568970723, "grad_norm": 0.150390625, "learning_rate": 6.944790369447905e-05, "loss": 0.5892, "step": 3295 }, { "epoch": 1.9900392393600965, "grad_norm": 0.140625, "learning_rate": 6.940639269406393e-05, "loss": 0.5677, "step": 3296 }, { "epoch": 1.9906429218231212, "grad_norm": 0.1513671875, "learning_rate": 6.936488169364882e-05, "loss": 0.5897, "step": 3297 }, { "epoch": 1.9912466042861454, "grad_norm": 0.14453125, "learning_rate": 6.932337069323372e-05, "loss": 0.5832, "step": 3298 }, { "epoch": 1.99185028674917, "grad_norm": 0.1533203125, "learning_rate": 6.928185969281859e-05, "loss": 0.5267, "step": 3299 }, { "epoch": 1.9924539692121943, "grad_norm": 0.1572265625, "learning_rate": 6.924034869240349e-05, "loss": 0.491, "step": 3300 }, { "epoch": 1.993057651675219, "grad_norm": 0.177734375, "learning_rate": 6.919883769198838e-05, "loss": 0.4929, "step": 3301 }, { "epoch": 1.9936613341382432, "grad_norm": 0.171875, "learning_rate": 6.915732669157326e-05, "loss": 0.4367, "step": 3302 }, { "epoch": 1.9942650166012679, "grad_norm": 0.1845703125, "learning_rate": 6.911581569115816e-05, "loss": 0.3382, "step": 3303 }, { "epoch": 1.994868699064292, "grad_norm": 0.1953125, "learning_rate": 6.907430469074305e-05, "loss": 0.3819, "step": 3304 }, { "epoch": 1.9954723815273168, "grad_norm": 0.1875, "learning_rate": 6.903279369032794e-05, "loss": 0.3094, "step": 3305 }, { "epoch": 1.996076063990341, "grad_norm": 0.220703125, "learning_rate": 6.899128268991282e-05, "loss": 0.2772, "step": 3306 }, { "epoch": 1.9966797464533657, "grad_norm": 0.1416015625, "learning_rate": 6.894977168949772e-05, "loss": 0.8034, "step": 3307 }, { "epoch": 1.9972834289163899, "grad_norm": 0.1484375, "learning_rate": 6.890826068908261e-05, "loss": 0.6767, "step": 3308 }, { "epoch": 1.9978871113794145, "grad_norm": 0.142578125, "learning_rate": 6.88667496886675e-05, "loss": 0.7063, "step": 3309 }, { "epoch": 1.9984907938424388, "grad_norm": 0.23828125, "learning_rate": 6.88252386882524e-05, "loss": 0.5946, "step": 3310 }, { "epoch": 1.9990944763054634, "grad_norm": 0.1298828125, "learning_rate": 6.878372768783728e-05, "loss": 0.7957, "step": 3311 }, { "epoch": 1.9996981587684877, "grad_norm": 0.1494140625, "learning_rate": 6.874221668742217e-05, "loss": 0.4724, "step": 3312 }, { "epoch": 2.0006036824630247, "grad_norm": 0.279296875, "learning_rate": 6.870070568700706e-05, "loss": 0.9675, "step": 3313 }, { "epoch": 2.001207364926049, "grad_norm": 0.1337890625, "learning_rate": 6.865919468659196e-05, "loss": 0.6052, "step": 3314 }, { "epoch": 2.0018110473890736, "grad_norm": 0.140625, "learning_rate": 6.861768368617684e-05, "loss": 0.675, "step": 3315 }, { "epoch": 2.002414729852098, "grad_norm": 0.12451171875, "learning_rate": 6.857617268576173e-05, "loss": 0.7636, "step": 3316 }, { "epoch": 2.0030184123151225, "grad_norm": 0.1337890625, "learning_rate": 6.853466168534663e-05, "loss": 0.5467, "step": 3317 }, { "epoch": 2.0036220947781467, "grad_norm": 0.1328125, "learning_rate": 6.84931506849315e-05, "loss": 0.6109, "step": 3318 }, { "epoch": 2.0042257772411713, "grad_norm": 0.1376953125, "learning_rate": 6.84516396845164e-05, "loss": 0.7048, "step": 3319 }, { "epoch": 2.0048294597041956, "grad_norm": 0.12060546875, "learning_rate": 6.841012868410129e-05, "loss": 1.0679, "step": 3320 }, { "epoch": 2.0054331421672202, "grad_norm": 0.1494140625, "learning_rate": 6.836861768368617e-05, "loss": 0.5886, "step": 3321 }, { "epoch": 2.0060368246302445, "grad_norm": 0.142578125, "learning_rate": 6.832710668327108e-05, "loss": 0.6399, "step": 3322 }, { "epoch": 2.006640507093269, "grad_norm": 0.134765625, "learning_rate": 6.828559568285596e-05, "loss": 0.6309, "step": 3323 }, { "epoch": 2.0072441895562934, "grad_norm": 0.130859375, "learning_rate": 6.824408468244085e-05, "loss": 0.5685, "step": 3324 }, { "epoch": 2.007847872019318, "grad_norm": 0.1279296875, "learning_rate": 6.820257368202573e-05, "loss": 0.5241, "step": 3325 }, { "epoch": 2.0084515544823422, "grad_norm": 0.1533203125, "learning_rate": 6.816106268161063e-05, "loss": 0.6275, "step": 3326 }, { "epoch": 2.009055236945367, "grad_norm": 0.1494140625, "learning_rate": 6.811955168119552e-05, "loss": 0.853, "step": 3327 }, { "epoch": 2.009658919408391, "grad_norm": 0.1337890625, "learning_rate": 6.807804068078041e-05, "loss": 0.5848, "step": 3328 }, { "epoch": 2.010262601871416, "grad_norm": 0.134765625, "learning_rate": 6.803652968036531e-05, "loss": 0.8565, "step": 3329 }, { "epoch": 2.01086628433444, "grad_norm": 0.1484375, "learning_rate": 6.79950186799502e-05, "loss": 0.7739, "step": 3330 }, { "epoch": 2.0114699667974647, "grad_norm": 0.1396484375, "learning_rate": 6.795350767953508e-05, "loss": 0.6635, "step": 3331 }, { "epoch": 2.012073649260489, "grad_norm": 0.12890625, "learning_rate": 6.791199667911997e-05, "loss": 0.5807, "step": 3332 }, { "epoch": 2.0126773317235136, "grad_norm": 0.1376953125, "learning_rate": 6.787048567870487e-05, "loss": 0.6374, "step": 3333 }, { "epoch": 2.013281014186538, "grad_norm": 0.125, "learning_rate": 6.782897467828975e-05, "loss": 0.5274, "step": 3334 }, { "epoch": 2.0138846966495625, "grad_norm": 0.1728515625, "learning_rate": 6.778746367787464e-05, "loss": 0.6502, "step": 3335 }, { "epoch": 2.0144883791125867, "grad_norm": 0.1455078125, "learning_rate": 6.774595267745954e-05, "loss": 0.4959, "step": 3336 }, { "epoch": 2.0150920615756114, "grad_norm": 0.1337890625, "learning_rate": 6.770444167704441e-05, "loss": 0.6071, "step": 3337 }, { "epoch": 2.0156957440386356, "grad_norm": 0.1357421875, "learning_rate": 6.766293067662931e-05, "loss": 0.5929, "step": 3338 }, { "epoch": 2.0162994265016603, "grad_norm": 0.15234375, "learning_rate": 6.76214196762142e-05, "loss": 0.6494, "step": 3339 }, { "epoch": 2.0169031089646845, "grad_norm": 0.150390625, "learning_rate": 6.757990867579909e-05, "loss": 1.0521, "step": 3340 }, { "epoch": 2.017506791427709, "grad_norm": 0.1337890625, "learning_rate": 6.753839767538399e-05, "loss": 0.6239, "step": 3341 }, { "epoch": 2.0181104738907334, "grad_norm": 0.134765625, "learning_rate": 6.749688667496887e-05, "loss": 0.6032, "step": 3342 }, { "epoch": 2.018714156353758, "grad_norm": 0.130859375, "learning_rate": 6.745537567455376e-05, "loss": 0.5677, "step": 3343 }, { "epoch": 2.0193178388167823, "grad_norm": 0.1318359375, "learning_rate": 6.741386467413865e-05, "loss": 0.5771, "step": 3344 }, { "epoch": 2.019921521279807, "grad_norm": 0.1357421875, "learning_rate": 6.737235367372355e-05, "loss": 0.5857, "step": 3345 }, { "epoch": 2.020525203742831, "grad_norm": 0.138671875, "learning_rate": 6.733084267330843e-05, "loss": 0.548, "step": 3346 }, { "epoch": 2.021128886205856, "grad_norm": 0.13671875, "learning_rate": 6.728933167289332e-05, "loss": 0.5641, "step": 3347 }, { "epoch": 2.02173256866888, "grad_norm": 0.126953125, "learning_rate": 6.724782067247822e-05, "loss": 0.5014, "step": 3348 }, { "epoch": 2.0223362511319047, "grad_norm": 0.1494140625, "learning_rate": 6.720630967206309e-05, "loss": 0.5817, "step": 3349 }, { "epoch": 2.022939933594929, "grad_norm": 0.1416015625, "learning_rate": 6.716479867164799e-05, "loss": 0.5907, "step": 3350 }, { "epoch": 2.0235436160579536, "grad_norm": 0.1484375, "learning_rate": 6.712328767123288e-05, "loss": 0.5136, "step": 3351 }, { "epoch": 2.024147298520978, "grad_norm": 0.150390625, "learning_rate": 6.708177667081778e-05, "loss": 0.5174, "step": 3352 }, { "epoch": 2.0247509809840025, "grad_norm": 0.1494140625, "learning_rate": 6.704026567040266e-05, "loss": 0.4647, "step": 3353 }, { "epoch": 2.0253546634470267, "grad_norm": 0.1591796875, "learning_rate": 6.699875466998755e-05, "loss": 0.4701, "step": 3354 }, { "epoch": 2.0259583459100514, "grad_norm": 0.1640625, "learning_rate": 6.695724366957245e-05, "loss": 0.4639, "step": 3355 }, { "epoch": 2.0265620283730756, "grad_norm": 0.169921875, "learning_rate": 6.691573266915732e-05, "loss": 0.4349, "step": 3356 }, { "epoch": 2.0271657108361003, "grad_norm": 0.181640625, "learning_rate": 6.687422166874222e-05, "loss": 0.4562, "step": 3357 }, { "epoch": 2.0277693932991245, "grad_norm": 0.1728515625, "learning_rate": 6.683271066832711e-05, "loss": 0.3564, "step": 3358 }, { "epoch": 2.028373075762149, "grad_norm": 0.201171875, "learning_rate": 6.6791199667912e-05, "loss": 0.3151, "step": 3359 }, { "epoch": 2.0289767582251734, "grad_norm": 0.1943359375, "learning_rate": 6.67496886674969e-05, "loss": 0.2686, "step": 3360 }, { "epoch": 2.029580440688198, "grad_norm": 0.2080078125, "learning_rate": 6.670817766708178e-05, "loss": 0.2815, "step": 3361 }, { "epoch": 2.0301841231512223, "grad_norm": 0.1923828125, "learning_rate": 6.666666666666667e-05, "loss": 0.2146, "step": 3362 }, { "epoch": 2.030787805614247, "grad_norm": 0.154296875, "learning_rate": 6.662515566625156e-05, "loss": 0.6592, "step": 3363 }, { "epoch": 2.031391488077271, "grad_norm": 0.158203125, "learning_rate": 6.658364466583646e-05, "loss": 0.6675, "step": 3364 }, { "epoch": 2.031995170540296, "grad_norm": 0.1494140625, "learning_rate": 6.654213366542133e-05, "loss": 0.6109, "step": 3365 }, { "epoch": 2.03259885300332, "grad_norm": 0.14453125, "learning_rate": 6.650062266500623e-05, "loss": 0.5674, "step": 3366 }, { "epoch": 2.0332025354663448, "grad_norm": 0.1416015625, "learning_rate": 6.645911166459113e-05, "loss": 0.5863, "step": 3367 }, { "epoch": 2.033806217929369, "grad_norm": 0.1435546875, "learning_rate": 6.6417600664176e-05, "loss": 0.7262, "step": 3368 }, { "epoch": 2.0344099003923937, "grad_norm": 0.142578125, "learning_rate": 6.63760896637609e-05, "loss": 0.8392, "step": 3369 }, { "epoch": 2.035013582855418, "grad_norm": 0.1376953125, "learning_rate": 6.633457866334579e-05, "loss": 0.5431, "step": 3370 }, { "epoch": 2.0356172653184426, "grad_norm": 0.1298828125, "learning_rate": 6.629306766293068e-05, "loss": 0.4897, "step": 3371 }, { "epoch": 2.036220947781467, "grad_norm": 0.1484375, "learning_rate": 6.625155666251556e-05, "loss": 0.7426, "step": 3372 }, { "epoch": 2.0368246302444915, "grad_norm": 0.138671875, "learning_rate": 6.621004566210046e-05, "loss": 0.8515, "step": 3373 }, { "epoch": 2.0374283127075157, "grad_norm": 0.1259765625, "learning_rate": 6.616853466168536e-05, "loss": 0.4825, "step": 3374 }, { "epoch": 2.0380319951705403, "grad_norm": 0.1376953125, "learning_rate": 6.612702366127023e-05, "loss": 0.6163, "step": 3375 }, { "epoch": 2.0386356776335646, "grad_norm": 0.1435546875, "learning_rate": 6.608551266085513e-05, "loss": 0.4817, "step": 3376 }, { "epoch": 2.0392393600965892, "grad_norm": 0.142578125, "learning_rate": 6.604400166044002e-05, "loss": 0.7172, "step": 3377 }, { "epoch": 2.0398430425596135, "grad_norm": 0.140625, "learning_rate": 6.600249066002491e-05, "loss": 0.6068, "step": 3378 }, { "epoch": 2.040446725022638, "grad_norm": 0.1455078125, "learning_rate": 6.59609796596098e-05, "loss": 0.8924, "step": 3379 }, { "epoch": 2.0410504074856624, "grad_norm": 0.134765625, "learning_rate": 6.59194686591947e-05, "loss": 0.8522, "step": 3380 }, { "epoch": 2.041654089948687, "grad_norm": 0.1298828125, "learning_rate": 6.587795765877958e-05, "loss": 0.7332, "step": 3381 }, { "epoch": 2.0422577724117112, "grad_norm": 0.1416015625, "learning_rate": 6.583644665836447e-05, "loss": 0.5677, "step": 3382 }, { "epoch": 2.042861454874736, "grad_norm": 0.15234375, "learning_rate": 6.579493565794937e-05, "loss": 0.8008, "step": 3383 }, { "epoch": 2.04346513733776, "grad_norm": 0.1376953125, "learning_rate": 6.575342465753424e-05, "loss": 0.6705, "step": 3384 }, { "epoch": 2.044068819800785, "grad_norm": 0.1376953125, "learning_rate": 6.571191365711914e-05, "loss": 0.5933, "step": 3385 }, { "epoch": 2.044672502263809, "grad_norm": 0.1845703125, "learning_rate": 6.567040265670403e-05, "loss": 0.8533, "step": 3386 }, { "epoch": 2.0452761847268337, "grad_norm": 0.1689453125, "learning_rate": 6.562889165628891e-05, "loss": 0.7038, "step": 3387 }, { "epoch": 2.045879867189858, "grad_norm": 0.1357421875, "learning_rate": 6.558738065587381e-05, "loss": 0.584, "step": 3388 }, { "epoch": 2.0464835496528826, "grad_norm": 0.14453125, "learning_rate": 6.55458696554587e-05, "loss": 0.9452, "step": 3389 }, { "epoch": 2.047087232115907, "grad_norm": 0.1611328125, "learning_rate": 6.550435865504359e-05, "loss": 0.5957, "step": 3390 }, { "epoch": 2.0476909145789315, "grad_norm": 0.1298828125, "learning_rate": 6.546284765462847e-05, "loss": 0.6208, "step": 3391 }, { "epoch": 2.0482945970419557, "grad_norm": 0.15234375, "learning_rate": 6.542133665421337e-05, "loss": 0.7178, "step": 3392 }, { "epoch": 2.0488982795049804, "grad_norm": 0.255859375, "learning_rate": 6.537982565379826e-05, "loss": 0.7035, "step": 3393 }, { "epoch": 2.049501961968005, "grad_norm": 0.154296875, "learning_rate": 6.533831465338315e-05, "loss": 0.7374, "step": 3394 }, { "epoch": 2.0501056444310293, "grad_norm": 0.130859375, "learning_rate": 6.529680365296805e-05, "loss": 0.5717, "step": 3395 }, { "epoch": 2.050709326894054, "grad_norm": 0.1357421875, "learning_rate": 6.525529265255293e-05, "loss": 0.5723, "step": 3396 }, { "epoch": 2.051313009357078, "grad_norm": 0.1591796875, "learning_rate": 6.521378165213782e-05, "loss": 0.6278, "step": 3397 }, { "epoch": 2.051916691820103, "grad_norm": 0.140625, "learning_rate": 6.51722706517227e-05, "loss": 0.5791, "step": 3398 }, { "epoch": 2.052520374283127, "grad_norm": 0.1416015625, "learning_rate": 6.51307596513076e-05, "loss": 0.5202, "step": 3399 }, { "epoch": 2.0531240567461517, "grad_norm": 0.1455078125, "learning_rate": 6.508924865089249e-05, "loss": 0.4888, "step": 3400 }, { "epoch": 2.053727739209176, "grad_norm": 0.1416015625, "learning_rate": 6.504773765047738e-05, "loss": 0.5308, "step": 3401 }, { "epoch": 2.0543314216722006, "grad_norm": 0.1552734375, "learning_rate": 6.500622665006228e-05, "loss": 0.4807, "step": 3402 }, { "epoch": 2.054935104135225, "grad_norm": 0.1708984375, "learning_rate": 6.496471564964715e-05, "loss": 0.5628, "step": 3403 }, { "epoch": 2.0555387865982495, "grad_norm": 0.17578125, "learning_rate": 6.492320464923205e-05, "loss": 0.5691, "step": 3404 }, { "epoch": 2.0561424690612737, "grad_norm": 0.15625, "learning_rate": 6.488169364881694e-05, "loss": 0.4539, "step": 3405 }, { "epoch": 2.0567461515242984, "grad_norm": 0.162109375, "learning_rate": 6.484018264840182e-05, "loss": 0.598, "step": 3406 }, { "epoch": 2.0573498339873226, "grad_norm": 0.1689453125, "learning_rate": 6.479867164798672e-05, "loss": 0.417, "step": 3407 }, { "epoch": 2.0579535164503473, "grad_norm": 0.1845703125, "learning_rate": 6.475716064757161e-05, "loss": 0.3549, "step": 3408 }, { "epoch": 2.0585571989133715, "grad_norm": 0.19921875, "learning_rate": 6.47156496471565e-05, "loss": 0.4513, "step": 3409 }, { "epoch": 2.059160881376396, "grad_norm": 0.18359375, "learning_rate": 6.467413864674138e-05, "loss": 0.3228, "step": 3410 }, { "epoch": 2.0597645638394204, "grad_norm": 0.2109375, "learning_rate": 6.463262764632628e-05, "loss": 0.3144, "step": 3411 }, { "epoch": 2.060368246302445, "grad_norm": 0.1826171875, "learning_rate": 6.459111664591117e-05, "loss": 0.2016, "step": 3412 }, { "epoch": 2.0609719287654693, "grad_norm": 0.1630859375, "learning_rate": 6.454960564549606e-05, "loss": 0.5913, "step": 3413 }, { "epoch": 2.061575611228494, "grad_norm": 0.14453125, "learning_rate": 6.450809464508096e-05, "loss": 1.0476, "step": 3414 }, { "epoch": 2.062179293691518, "grad_norm": 0.14453125, "learning_rate": 6.446658364466583e-05, "loss": 0.6198, "step": 3415 }, { "epoch": 2.062782976154543, "grad_norm": 0.1533203125, "learning_rate": 6.442507264425073e-05, "loss": 0.5562, "step": 3416 }, { "epoch": 2.063386658617567, "grad_norm": 0.150390625, "learning_rate": 6.438356164383562e-05, "loss": 0.646, "step": 3417 }, { "epoch": 2.0639903410805918, "grad_norm": 0.1416015625, "learning_rate": 6.434205064342052e-05, "loss": 0.6686, "step": 3418 }, { "epoch": 2.064594023543616, "grad_norm": 0.1650390625, "learning_rate": 6.43005396430054e-05, "loss": 0.8418, "step": 3419 }, { "epoch": 2.0651977060066407, "grad_norm": 0.142578125, "learning_rate": 6.425902864259029e-05, "loss": 1.04, "step": 3420 }, { "epoch": 2.065801388469665, "grad_norm": 0.1669921875, "learning_rate": 6.421751764217519e-05, "loss": 0.6007, "step": 3421 }, { "epoch": 2.0664050709326895, "grad_norm": 0.1435546875, "learning_rate": 6.417600664176006e-05, "loss": 0.6781, "step": 3422 }, { "epoch": 2.0670087533957138, "grad_norm": 0.1455078125, "learning_rate": 6.413449564134496e-05, "loss": 0.6264, "step": 3423 }, { "epoch": 2.0676124358587384, "grad_norm": 0.341796875, "learning_rate": 6.409298464092985e-05, "loss": 0.7689, "step": 3424 }, { "epoch": 2.0682161183217627, "grad_norm": 0.1318359375, "learning_rate": 6.405147364051474e-05, "loss": 0.5546, "step": 3425 }, { "epoch": 2.0688198007847873, "grad_norm": 0.130859375, "learning_rate": 6.400996264009964e-05, "loss": 0.7563, "step": 3426 }, { "epoch": 2.0694234832478116, "grad_norm": 0.13671875, "learning_rate": 6.396845163968452e-05, "loss": 0.6584, "step": 3427 }, { "epoch": 2.0700271657108362, "grad_norm": 0.1455078125, "learning_rate": 6.392694063926941e-05, "loss": 0.6751, "step": 3428 }, { "epoch": 2.0706308481738604, "grad_norm": 0.130859375, "learning_rate": 6.38854296388543e-05, "loss": 0.607, "step": 3429 }, { "epoch": 2.071234530636885, "grad_norm": 0.1689453125, "learning_rate": 6.38439186384392e-05, "loss": 0.6411, "step": 3430 }, { "epoch": 2.0718382130999093, "grad_norm": 0.1240234375, "learning_rate": 6.380240763802408e-05, "loss": 0.6765, "step": 3431 }, { "epoch": 2.072441895562934, "grad_norm": 0.14453125, "learning_rate": 6.376089663760897e-05, "loss": 1.0625, "step": 3432 }, { "epoch": 2.0730455780259582, "grad_norm": 0.13671875, "learning_rate": 6.371938563719387e-05, "loss": 0.8033, "step": 3433 }, { "epoch": 2.073649260488983, "grad_norm": 0.134765625, "learning_rate": 6.367787463677874e-05, "loss": 0.5758, "step": 3434 }, { "epoch": 2.074252942952007, "grad_norm": 0.154296875, "learning_rate": 6.363636363636364e-05, "loss": 0.8254, "step": 3435 }, { "epoch": 2.074856625415032, "grad_norm": 0.1357421875, "learning_rate": 6.359485263594853e-05, "loss": 0.5519, "step": 3436 }, { "epoch": 2.075460307878056, "grad_norm": 0.1435546875, "learning_rate": 6.355334163553341e-05, "loss": 0.5636, "step": 3437 }, { "epoch": 2.0760639903410807, "grad_norm": 0.1640625, "learning_rate": 6.351183063511831e-05, "loss": 0.7238, "step": 3438 }, { "epoch": 2.076667672804105, "grad_norm": 0.1279296875, "learning_rate": 6.34703196347032e-05, "loss": 0.6205, "step": 3439 }, { "epoch": 2.0772713552671296, "grad_norm": 0.138671875, "learning_rate": 6.342880863428809e-05, "loss": 0.5848, "step": 3440 }, { "epoch": 2.077875037730154, "grad_norm": 0.140625, "learning_rate": 6.338729763387297e-05, "loss": 0.7892, "step": 3441 }, { "epoch": 2.0784787201931785, "grad_norm": 0.1494140625, "learning_rate": 6.334578663345787e-05, "loss": 0.6707, "step": 3442 }, { "epoch": 2.0790824026562027, "grad_norm": 0.1357421875, "learning_rate": 6.330427563304276e-05, "loss": 0.7932, "step": 3443 }, { "epoch": 2.0796860851192274, "grad_norm": 0.1474609375, "learning_rate": 6.326276463262765e-05, "loss": 0.662, "step": 3444 }, { "epoch": 2.0802897675822516, "grad_norm": 0.1572265625, "learning_rate": 6.322125363221255e-05, "loss": 0.7459, "step": 3445 }, { "epoch": 2.0808934500452763, "grad_norm": 0.1455078125, "learning_rate": 6.317974263179743e-05, "loss": 0.5336, "step": 3446 }, { "epoch": 2.0814971325083005, "grad_norm": 0.146484375, "learning_rate": 6.313823163138232e-05, "loss": 0.619, "step": 3447 }, { "epoch": 2.082100814971325, "grad_norm": 0.146484375, "learning_rate": 6.30967206309672e-05, "loss": 0.5999, "step": 3448 }, { "epoch": 2.0827044974343494, "grad_norm": 0.1337890625, "learning_rate": 6.30552096305521e-05, "loss": 0.5155, "step": 3449 }, { "epoch": 2.083308179897374, "grad_norm": 0.162109375, "learning_rate": 6.301369863013699e-05, "loss": 0.5144, "step": 3450 }, { "epoch": 2.0839118623603983, "grad_norm": 0.1513671875, "learning_rate": 6.297218762972188e-05, "loss": 0.5631, "step": 3451 }, { "epoch": 2.084515544823423, "grad_norm": 0.15234375, "learning_rate": 6.293067662930678e-05, "loss": 0.6323, "step": 3452 }, { "epoch": 2.085119227286447, "grad_norm": 0.162109375, "learning_rate": 6.288916562889165e-05, "loss": 0.447, "step": 3453 }, { "epoch": 2.085722909749472, "grad_norm": 0.16015625, "learning_rate": 6.284765462847655e-05, "loss": 0.5228, "step": 3454 }, { "epoch": 2.086326592212496, "grad_norm": 0.166015625, "learning_rate": 6.280614362806144e-05, "loss": 0.4278, "step": 3455 }, { "epoch": 2.0869302746755207, "grad_norm": 0.177734375, "learning_rate": 6.276463262764632e-05, "loss": 0.4298, "step": 3456 }, { "epoch": 2.087533957138545, "grad_norm": 0.18359375, "learning_rate": 6.272312162723122e-05, "loss": 0.4683, "step": 3457 }, { "epoch": 2.0881376396015696, "grad_norm": 0.18359375, "learning_rate": 6.268161062681611e-05, "loss": 0.4114, "step": 3458 }, { "epoch": 2.088741322064594, "grad_norm": 0.2021484375, "learning_rate": 6.2640099626401e-05, "loss": 0.4019, "step": 3459 }, { "epoch": 2.0893450045276185, "grad_norm": 0.203125, "learning_rate": 6.259858862598588e-05, "loss": 0.3359, "step": 3460 }, { "epoch": 2.0899486869906427, "grad_norm": 0.205078125, "learning_rate": 6.255707762557078e-05, "loss": 0.2717, "step": 3461 }, { "epoch": 2.0905523694536674, "grad_norm": 0.1962890625, "learning_rate": 6.251556662515566e-05, "loss": 0.1975, "step": 3462 }, { "epoch": 2.0911560519166916, "grad_norm": 0.142578125, "learning_rate": 6.247405562474056e-05, "loss": 0.9222, "step": 3463 }, { "epoch": 2.0917597343797163, "grad_norm": 0.142578125, "learning_rate": 6.243254462432546e-05, "loss": 0.5585, "step": 3464 }, { "epoch": 2.0923634168427405, "grad_norm": 0.1630859375, "learning_rate": 6.239103362391034e-05, "loss": 0.653, "step": 3465 }, { "epoch": 2.092967099305765, "grad_norm": 0.1474609375, "learning_rate": 6.234952262349523e-05, "loss": 0.6304, "step": 3466 }, { "epoch": 2.0935707817687894, "grad_norm": 0.154296875, "learning_rate": 6.230801162308012e-05, "loss": 0.6062, "step": 3467 }, { "epoch": 2.094174464231814, "grad_norm": 0.15625, "learning_rate": 6.226650062266502e-05, "loss": 0.5847, "step": 3468 }, { "epoch": 2.0947781466948383, "grad_norm": 0.1455078125, "learning_rate": 6.222498962224989e-05, "loss": 0.6474, "step": 3469 }, { "epoch": 2.095381829157863, "grad_norm": 0.1337890625, "learning_rate": 6.218347862183479e-05, "loss": 0.6167, "step": 3470 }, { "epoch": 2.095985511620887, "grad_norm": 0.1630859375, "learning_rate": 6.214196762141969e-05, "loss": 0.6131, "step": 3471 }, { "epoch": 2.096589194083912, "grad_norm": 0.1728515625, "learning_rate": 6.210045662100456e-05, "loss": 0.7248, "step": 3472 }, { "epoch": 2.097192876546936, "grad_norm": 0.177734375, "learning_rate": 6.205894562058946e-05, "loss": 0.5809, "step": 3473 }, { "epoch": 2.0977965590099608, "grad_norm": 0.150390625, "learning_rate": 6.201743462017435e-05, "loss": 0.6752, "step": 3474 }, { "epoch": 2.0984002414729854, "grad_norm": 0.1513671875, "learning_rate": 6.197592361975924e-05, "loss": 0.6793, "step": 3475 }, { "epoch": 2.0990039239360097, "grad_norm": 0.1455078125, "learning_rate": 6.193441261934412e-05, "loss": 0.6169, "step": 3476 }, { "epoch": 2.0996076063990343, "grad_norm": 0.1484375, "learning_rate": 6.189290161892902e-05, "loss": 0.5645, "step": 3477 }, { "epoch": 2.1002112888620585, "grad_norm": 0.1376953125, "learning_rate": 6.185139061851391e-05, "loss": 0.5911, "step": 3478 }, { "epoch": 2.100814971325083, "grad_norm": 0.146484375, "learning_rate": 6.18098796180988e-05, "loss": 0.5954, "step": 3479 }, { "epoch": 2.1014186537881074, "grad_norm": 0.134765625, "learning_rate": 6.17683686176837e-05, "loss": 0.5374, "step": 3480 }, { "epoch": 2.102022336251132, "grad_norm": 0.14453125, "learning_rate": 6.172685761726857e-05, "loss": 0.5509, "step": 3481 }, { "epoch": 2.1026260187141563, "grad_norm": 0.1376953125, "learning_rate": 6.168534661685347e-05, "loss": 0.6542, "step": 3482 }, { "epoch": 2.103229701177181, "grad_norm": 0.1494140625, "learning_rate": 6.164383561643835e-05, "loss": 0.7288, "step": 3483 }, { "epoch": 2.1038333836402052, "grad_norm": 0.1376953125, "learning_rate": 6.160232461602324e-05, "loss": 0.6775, "step": 3484 }, { "epoch": 2.10443706610323, "grad_norm": 0.1396484375, "learning_rate": 6.156081361560814e-05, "loss": 0.6655, "step": 3485 }, { "epoch": 2.105040748566254, "grad_norm": 0.142578125, "learning_rate": 6.151930261519303e-05, "loss": 0.5989, "step": 3486 }, { "epoch": 2.105644431029279, "grad_norm": 0.1416015625, "learning_rate": 6.147779161477793e-05, "loss": 0.6795, "step": 3487 }, { "epoch": 2.106248113492303, "grad_norm": 0.1396484375, "learning_rate": 6.14362806143628e-05, "loss": 0.6076, "step": 3488 }, { "epoch": 2.1068517959553277, "grad_norm": 0.1337890625, "learning_rate": 6.13947696139477e-05, "loss": 0.5628, "step": 3489 }, { "epoch": 2.107455478418352, "grad_norm": 0.1455078125, "learning_rate": 6.13532586135326e-05, "loss": 0.6294, "step": 3490 }, { "epoch": 2.1080591608813766, "grad_norm": 0.1435546875, "learning_rate": 6.131174761311747e-05, "loss": 0.6435, "step": 3491 }, { "epoch": 2.108662843344401, "grad_norm": 0.13671875, "learning_rate": 6.127023661270237e-05, "loss": 0.6564, "step": 3492 }, { "epoch": 2.1092665258074255, "grad_norm": 0.1435546875, "learning_rate": 6.122872561228726e-05, "loss": 0.6102, "step": 3493 }, { "epoch": 2.1098702082704497, "grad_norm": 0.1328125, "learning_rate": 6.118721461187215e-05, "loss": 0.5999, "step": 3494 }, { "epoch": 2.1104738907334744, "grad_norm": 0.1416015625, "learning_rate": 6.114570361145703e-05, "loss": 0.6853, "step": 3495 }, { "epoch": 2.1110775731964986, "grad_norm": 0.1357421875, "learning_rate": 6.110419261104193e-05, "loss": 0.5913, "step": 3496 }, { "epoch": 2.1116812556595232, "grad_norm": 0.1328125, "learning_rate": 6.106268161062682e-05, "loss": 0.5238, "step": 3497 }, { "epoch": 2.1122849381225475, "grad_norm": 0.1494140625, "learning_rate": 6.1021170610211706e-05, "loss": 0.5639, "step": 3498 }, { "epoch": 2.112888620585572, "grad_norm": 0.1455078125, "learning_rate": 6.09796596097966e-05, "loss": 0.5235, "step": 3499 }, { "epoch": 2.1134923030485964, "grad_norm": 0.1484375, "learning_rate": 6.0938148609381486e-05, "loss": 0.5606, "step": 3500 }, { "epoch": 2.1134923030485964, "eval_loss": 0.617886483669281, "eval_runtime": 1059.7393, "eval_samples_per_second": 2.633, "eval_steps_per_second": 0.329, "step": 3500 }, { "epoch": 2.114095985511621, "grad_norm": 0.1513671875, "learning_rate": 6.089663760896638e-05, "loss": 0.5319, "step": 3501 }, { "epoch": 2.1146996679746453, "grad_norm": 0.1474609375, "learning_rate": 6.085512660855127e-05, "loss": 0.4808, "step": 3502 }, { "epoch": 2.11530335043767, "grad_norm": 0.1630859375, "learning_rate": 6.081361560813615e-05, "loss": 0.5266, "step": 3503 }, { "epoch": 2.115907032900694, "grad_norm": 0.1806640625, "learning_rate": 6.0772104607721045e-05, "loss": 0.5523, "step": 3504 }, { "epoch": 2.116510715363719, "grad_norm": 0.1708984375, "learning_rate": 6.073059360730594e-05, "loss": 0.4785, "step": 3505 }, { "epoch": 2.117114397826743, "grad_norm": 0.1669921875, "learning_rate": 6.0689082606890825e-05, "loss": 0.4426, "step": 3506 }, { "epoch": 2.1177180802897677, "grad_norm": 0.177734375, "learning_rate": 6.064757160647572e-05, "loss": 0.4489, "step": 3507 }, { "epoch": 2.118321762752792, "grad_norm": 0.19921875, "learning_rate": 6.060606060606061e-05, "loss": 0.4412, "step": 3508 }, { "epoch": 2.1189254452158166, "grad_norm": 0.1845703125, "learning_rate": 6.0564549605645505e-05, "loss": 0.3421, "step": 3509 }, { "epoch": 2.119529127678841, "grad_norm": 0.1982421875, "learning_rate": 6.0523038605230384e-05, "loss": 0.3627, "step": 3510 }, { "epoch": 2.1201328101418655, "grad_norm": 0.1845703125, "learning_rate": 6.048152760481528e-05, "loss": 0.265, "step": 3511 }, { "epoch": 2.1207364926048897, "grad_norm": 0.1982421875, "learning_rate": 6.044001660440017e-05, "loss": 0.2138, "step": 3512 }, { "epoch": 2.1213401750679144, "grad_norm": 0.138671875, "learning_rate": 6.039850560398506e-05, "loss": 0.574, "step": 3513 }, { "epoch": 2.1219438575309386, "grad_norm": 0.158203125, "learning_rate": 6.035699460356995e-05, "loss": 0.754, "step": 3514 }, { "epoch": 2.1225475399939633, "grad_norm": 0.1396484375, "learning_rate": 6.0315483603154844e-05, "loss": 0.6438, "step": 3515 }, { "epoch": 2.1231512224569875, "grad_norm": 0.1376953125, "learning_rate": 6.0273972602739724e-05, "loss": 0.6454, "step": 3516 }, { "epoch": 2.123754904920012, "grad_norm": 0.1357421875, "learning_rate": 6.023246160232462e-05, "loss": 0.5961, "step": 3517 }, { "epoch": 2.1243585873830364, "grad_norm": 0.1572265625, "learning_rate": 6.019095060190951e-05, "loss": 0.6337, "step": 3518 }, { "epoch": 2.124962269846061, "grad_norm": 0.1357421875, "learning_rate": 6.01494396014944e-05, "loss": 0.58, "step": 3519 }, { "epoch": 2.1255659523090853, "grad_norm": 0.173828125, "learning_rate": 6.010792860107929e-05, "loss": 0.8145, "step": 3520 }, { "epoch": 2.12616963477211, "grad_norm": 0.12890625, "learning_rate": 6.006641760066418e-05, "loss": 0.5888, "step": 3521 }, { "epoch": 2.126773317235134, "grad_norm": 0.1494140625, "learning_rate": 6.002490660024906e-05, "loss": 0.5517, "step": 3522 }, { "epoch": 2.127376999698159, "grad_norm": 0.171875, "learning_rate": 5.9983395599833956e-05, "loss": 0.7016, "step": 3523 }, { "epoch": 2.127980682161183, "grad_norm": 0.154296875, "learning_rate": 5.994188459941885e-05, "loss": 0.6212, "step": 3524 }, { "epoch": 2.1285843646242077, "grad_norm": 0.140625, "learning_rate": 5.9900373599003736e-05, "loss": 0.8909, "step": 3525 }, { "epoch": 2.129188047087232, "grad_norm": 0.150390625, "learning_rate": 5.985886259858863e-05, "loss": 0.8154, "step": 3526 }, { "epoch": 2.1297917295502566, "grad_norm": 0.1533203125, "learning_rate": 5.981735159817352e-05, "loss": 0.6854, "step": 3527 }, { "epoch": 2.130395412013281, "grad_norm": 0.1416015625, "learning_rate": 5.97758405977584e-05, "loss": 0.5829, "step": 3528 }, { "epoch": 2.1309990944763055, "grad_norm": 0.134765625, "learning_rate": 5.9734329597343295e-05, "loss": 0.7917, "step": 3529 }, { "epoch": 2.1316027769393298, "grad_norm": 0.138671875, "learning_rate": 5.969281859692819e-05, "loss": 0.6034, "step": 3530 }, { "epoch": 2.1322064594023544, "grad_norm": 0.140625, "learning_rate": 5.965130759651308e-05, "loss": 0.6514, "step": 3531 }, { "epoch": 2.1328101418653787, "grad_norm": 0.1298828125, "learning_rate": 5.960979659609797e-05, "loss": 0.5277, "step": 3532 }, { "epoch": 2.1334138243284033, "grad_norm": 0.1552734375, "learning_rate": 5.956828559568286e-05, "loss": 0.6354, "step": 3533 }, { "epoch": 2.1340175067914275, "grad_norm": 0.140625, "learning_rate": 5.9526774595267755e-05, "loss": 0.7139, "step": 3534 }, { "epoch": 2.134621189254452, "grad_norm": 0.1484375, "learning_rate": 5.9485263594852635e-05, "loss": 0.6376, "step": 3535 }, { "epoch": 2.1352248717174764, "grad_norm": 0.1640625, "learning_rate": 5.944375259443753e-05, "loss": 0.6315, "step": 3536 }, { "epoch": 2.135828554180501, "grad_norm": 0.1318359375, "learning_rate": 5.940224159402242e-05, "loss": 0.5425, "step": 3537 }, { "epoch": 2.1364322366435253, "grad_norm": 0.1337890625, "learning_rate": 5.936073059360731e-05, "loss": 0.6503, "step": 3538 }, { "epoch": 2.13703591910655, "grad_norm": 0.1494140625, "learning_rate": 5.93192195931922e-05, "loss": 0.6238, "step": 3539 }, { "epoch": 2.137639601569574, "grad_norm": 0.185546875, "learning_rate": 5.9277708592777094e-05, "loss": 0.6117, "step": 3540 }, { "epoch": 2.138243284032599, "grad_norm": 0.1337890625, "learning_rate": 5.9236197592361974e-05, "loss": 0.7165, "step": 3541 }, { "epoch": 2.138846966495623, "grad_norm": 0.1513671875, "learning_rate": 5.919468659194687e-05, "loss": 0.7887, "step": 3542 }, { "epoch": 2.139450648958648, "grad_norm": 0.15625, "learning_rate": 5.915317559153176e-05, "loss": 0.7581, "step": 3543 }, { "epoch": 2.140054331421672, "grad_norm": 0.142578125, "learning_rate": 5.911166459111665e-05, "loss": 0.6213, "step": 3544 }, { "epoch": 2.1406580138846967, "grad_norm": 0.142578125, "learning_rate": 5.907015359070154e-05, "loss": 0.7873, "step": 3545 }, { "epoch": 2.141261696347721, "grad_norm": 0.142578125, "learning_rate": 5.902864259028643e-05, "loss": 0.6151, "step": 3546 }, { "epoch": 2.1418653788107456, "grad_norm": 0.1357421875, "learning_rate": 5.898713158987131e-05, "loss": 0.5618, "step": 3547 }, { "epoch": 2.14246906127377, "grad_norm": 0.1416015625, "learning_rate": 5.8945620589456206e-05, "loss": 0.5792, "step": 3548 }, { "epoch": 2.1430727437367945, "grad_norm": 0.1572265625, "learning_rate": 5.89041095890411e-05, "loss": 0.5894, "step": 3549 }, { "epoch": 2.1436764261998187, "grad_norm": 0.15234375, "learning_rate": 5.8862598588625986e-05, "loss": 0.5326, "step": 3550 }, { "epoch": 2.1442801086628434, "grad_norm": 0.1513671875, "learning_rate": 5.882108758821088e-05, "loss": 0.5008, "step": 3551 }, { "epoch": 2.144883791125868, "grad_norm": 0.1435546875, "learning_rate": 5.877957658779577e-05, "loss": 0.4708, "step": 3552 }, { "epoch": 2.1454874735888922, "grad_norm": 0.158203125, "learning_rate": 5.8738065587380666e-05, "loss": 0.4923, "step": 3553 }, { "epoch": 2.1460911560519165, "grad_norm": 0.1650390625, "learning_rate": 5.8696554586965546e-05, "loss": 0.4978, "step": 3554 }, { "epoch": 2.146694838514941, "grad_norm": 0.1708984375, "learning_rate": 5.865504358655044e-05, "loss": 0.4812, "step": 3555 }, { "epoch": 2.147298520977966, "grad_norm": 0.1650390625, "learning_rate": 5.861353258613533e-05, "loss": 0.4216, "step": 3556 }, { "epoch": 2.14790220344099, "grad_norm": 0.177734375, "learning_rate": 5.857202158572022e-05, "loss": 0.4385, "step": 3557 }, { "epoch": 2.1485058859040143, "grad_norm": 0.177734375, "learning_rate": 5.853051058530511e-05, "loss": 0.3954, "step": 3558 }, { "epoch": 2.149109568367039, "grad_norm": 0.189453125, "learning_rate": 5.8488999584890005e-05, "loss": 0.3699, "step": 3559 }, { "epoch": 2.1497132508300636, "grad_norm": 0.2001953125, "learning_rate": 5.8447488584474885e-05, "loss": 0.332, "step": 3560 }, { "epoch": 2.150316933293088, "grad_norm": 0.208984375, "learning_rate": 5.840597758405978e-05, "loss": 0.2789, "step": 3561 }, { "epoch": 2.1509206157561125, "grad_norm": 0.1904296875, "learning_rate": 5.836446658364467e-05, "loss": 0.2135, "step": 3562 }, { "epoch": 2.1515242982191367, "grad_norm": 0.15234375, "learning_rate": 5.832295558322956e-05, "loss": 0.6707, "step": 3563 }, { "epoch": 2.1521279806821614, "grad_norm": 0.138671875, "learning_rate": 5.828144458281445e-05, "loss": 0.5971, "step": 3564 }, { "epoch": 2.1527316631451856, "grad_norm": 0.1416015625, "learning_rate": 5.8239933582399344e-05, "loss": 0.7411, "step": 3565 }, { "epoch": 2.1533353456082103, "grad_norm": 0.150390625, "learning_rate": 5.8198422581984224e-05, "loss": 0.7833, "step": 3566 }, { "epoch": 2.1539390280712345, "grad_norm": 0.1474609375, "learning_rate": 5.815691158156912e-05, "loss": 0.519, "step": 3567 }, { "epoch": 2.154542710534259, "grad_norm": 0.138671875, "learning_rate": 5.811540058115401e-05, "loss": 0.5645, "step": 3568 }, { "epoch": 2.1551463929972834, "grad_norm": 0.1748046875, "learning_rate": 5.80738895807389e-05, "loss": 0.5951, "step": 3569 }, { "epoch": 2.155750075460308, "grad_norm": 0.150390625, "learning_rate": 5.803237858032379e-05, "loss": 0.7003, "step": 3570 }, { "epoch": 2.1563537579233323, "grad_norm": 0.1689453125, "learning_rate": 5.7990867579908683e-05, "loss": 0.7031, "step": 3571 }, { "epoch": 2.156957440386357, "grad_norm": 0.158203125, "learning_rate": 5.794935657949356e-05, "loss": 0.8678, "step": 3572 }, { "epoch": 2.157561122849381, "grad_norm": 0.16015625, "learning_rate": 5.7907845579078456e-05, "loss": 0.6421, "step": 3573 }, { "epoch": 2.158164805312406, "grad_norm": 0.15234375, "learning_rate": 5.786633457866335e-05, "loss": 0.631, "step": 3574 }, { "epoch": 2.15876848777543, "grad_norm": 0.15234375, "learning_rate": 5.782482357824824e-05, "loss": 0.5944, "step": 3575 }, { "epoch": 2.1593721702384547, "grad_norm": 0.1484375, "learning_rate": 5.778331257783313e-05, "loss": 0.6048, "step": 3576 }, { "epoch": 2.159975852701479, "grad_norm": 0.146484375, "learning_rate": 5.774180157741802e-05, "loss": 0.6322, "step": 3577 }, { "epoch": 2.1605795351645036, "grad_norm": 0.134765625, "learning_rate": 5.7700290577002916e-05, "loss": 0.8564, "step": 3578 }, { "epoch": 2.161183217627528, "grad_norm": 0.1533203125, "learning_rate": 5.7658779576587796e-05, "loss": 0.6573, "step": 3579 }, { "epoch": 2.1617869000905525, "grad_norm": 0.1455078125, "learning_rate": 5.761726857617269e-05, "loss": 0.6145, "step": 3580 }, { "epoch": 2.1623905825535767, "grad_norm": 0.15234375, "learning_rate": 5.757575757575758e-05, "loss": 0.6561, "step": 3581 }, { "epoch": 2.1629942650166014, "grad_norm": 0.150390625, "learning_rate": 5.753424657534247e-05, "loss": 0.5926, "step": 3582 }, { "epoch": 2.1635979474796256, "grad_norm": 0.15234375, "learning_rate": 5.749273557492736e-05, "loss": 0.5751, "step": 3583 }, { "epoch": 2.1642016299426503, "grad_norm": 0.1416015625, "learning_rate": 5.7451224574512255e-05, "loss": 0.6293, "step": 3584 }, { "epoch": 2.1648053124056745, "grad_norm": 0.13671875, "learning_rate": 5.7409713574097135e-05, "loss": 0.6692, "step": 3585 }, { "epoch": 2.165408994868699, "grad_norm": 0.12890625, "learning_rate": 5.736820257368203e-05, "loss": 0.7354, "step": 3586 }, { "epoch": 2.1660126773317234, "grad_norm": 0.1513671875, "learning_rate": 5.732669157326692e-05, "loss": 0.6266, "step": 3587 }, { "epoch": 2.166616359794748, "grad_norm": 0.14453125, "learning_rate": 5.72851805728518e-05, "loss": 0.6857, "step": 3588 }, { "epoch": 2.1672200422577723, "grad_norm": 0.1435546875, "learning_rate": 5.72436695724367e-05, "loss": 0.7397, "step": 3589 }, { "epoch": 2.167823724720797, "grad_norm": 0.1318359375, "learning_rate": 5.7202158572021594e-05, "loss": 0.5876, "step": 3590 }, { "epoch": 2.168427407183821, "grad_norm": 0.1357421875, "learning_rate": 5.7160647571606474e-05, "loss": 0.5402, "step": 3591 }, { "epoch": 2.169031089646846, "grad_norm": 0.142578125, "learning_rate": 5.711913657119137e-05, "loss": 0.6663, "step": 3592 }, { "epoch": 2.16963477210987, "grad_norm": 0.166015625, "learning_rate": 5.707762557077626e-05, "loss": 0.8411, "step": 3593 }, { "epoch": 2.1702384545728948, "grad_norm": 0.14453125, "learning_rate": 5.703611457036114e-05, "loss": 0.5573, "step": 3594 }, { "epoch": 2.170842137035919, "grad_norm": 0.150390625, "learning_rate": 5.6994603569946034e-05, "loss": 0.5112, "step": 3595 }, { "epoch": 2.1714458194989437, "grad_norm": 0.1318359375, "learning_rate": 5.6953092569530934e-05, "loss": 0.5262, "step": 3596 }, { "epoch": 2.172049501961968, "grad_norm": 0.140625, "learning_rate": 5.691158156911583e-05, "loss": 0.5988, "step": 3597 }, { "epoch": 2.1726531844249926, "grad_norm": 0.1435546875, "learning_rate": 5.687007056870071e-05, "loss": 0.5237, "step": 3598 }, { "epoch": 2.173256866888017, "grad_norm": 0.1494140625, "learning_rate": 5.68285595682856e-05, "loss": 0.7144, "step": 3599 }, { "epoch": 2.1738605493510414, "grad_norm": 0.158203125, "learning_rate": 5.678704856787049e-05, "loss": 0.5637, "step": 3600 }, { "epoch": 2.1744642318140657, "grad_norm": 0.15625, "learning_rate": 5.674553756745537e-05, "loss": 0.5446, "step": 3601 }, { "epoch": 2.1750679142770903, "grad_norm": 0.162109375, "learning_rate": 5.6704026567040266e-05, "loss": 0.5356, "step": 3602 }, { "epoch": 2.1756715967401146, "grad_norm": 0.1650390625, "learning_rate": 5.6662515566625166e-05, "loss": 0.4565, "step": 3603 }, { "epoch": 2.1762752792031392, "grad_norm": 0.1708984375, "learning_rate": 5.6621004566210046e-05, "loss": 0.4962, "step": 3604 }, { "epoch": 2.1768789616661635, "grad_norm": 0.1650390625, "learning_rate": 5.657949356579494e-05, "loss": 0.4551, "step": 3605 }, { "epoch": 2.177482644129188, "grad_norm": 0.189453125, "learning_rate": 5.653798256537983e-05, "loss": 0.5525, "step": 3606 }, { "epoch": 2.1780863265922124, "grad_norm": 0.185546875, "learning_rate": 5.649647156496471e-05, "loss": 0.4038, "step": 3607 }, { "epoch": 2.178690009055237, "grad_norm": 0.1845703125, "learning_rate": 5.6454960564549605e-05, "loss": 0.3714, "step": 3608 }, { "epoch": 2.1792936915182612, "grad_norm": 0.1875, "learning_rate": 5.64134495641345e-05, "loss": 0.3403, "step": 3609 }, { "epoch": 2.179897373981286, "grad_norm": 0.220703125, "learning_rate": 5.6371938563719385e-05, "loss": 0.3037, "step": 3610 }, { "epoch": 2.18050105644431, "grad_norm": 0.197265625, "learning_rate": 5.633042756330428e-05, "loss": 0.23, "step": 3611 }, { "epoch": 2.181104738907335, "grad_norm": 0.197265625, "learning_rate": 5.628891656288917e-05, "loss": 0.2189, "step": 3612 }, { "epoch": 2.181708421370359, "grad_norm": 0.1494140625, "learning_rate": 5.624740556247405e-05, "loss": 0.85, "step": 3613 }, { "epoch": 2.1823121038333837, "grad_norm": 0.150390625, "learning_rate": 5.6205894562058945e-05, "loss": 0.6516, "step": 3614 }, { "epoch": 2.182915786296408, "grad_norm": 0.142578125, "learning_rate": 5.616438356164384e-05, "loss": 0.6577, "step": 3615 }, { "epoch": 2.1835194687594326, "grad_norm": 0.1416015625, "learning_rate": 5.6122872561228724e-05, "loss": 0.5393, "step": 3616 }, { "epoch": 2.184123151222457, "grad_norm": 0.134765625, "learning_rate": 5.608136156081362e-05, "loss": 0.5632, "step": 3617 }, { "epoch": 2.1847268336854815, "grad_norm": 0.171875, "learning_rate": 5.603985056039851e-05, "loss": 0.6989, "step": 3618 }, { "epoch": 2.1853305161485057, "grad_norm": 0.1630859375, "learning_rate": 5.5998339559983404e-05, "loss": 0.5195, "step": 3619 }, { "epoch": 2.1859341986115304, "grad_norm": 0.1435546875, "learning_rate": 5.5956828559568284e-05, "loss": 0.6129, "step": 3620 }, { "epoch": 2.1865378810745546, "grad_norm": 0.1611328125, "learning_rate": 5.591531755915318e-05, "loss": 0.6626, "step": 3621 }, { "epoch": 2.1871415635375793, "grad_norm": 0.14453125, "learning_rate": 5.587380655873807e-05, "loss": 0.8414, "step": 3622 }, { "epoch": 2.1877452460006035, "grad_norm": 0.140625, "learning_rate": 5.583229555832296e-05, "loss": 0.5994, "step": 3623 }, { "epoch": 2.188348928463628, "grad_norm": 0.1455078125, "learning_rate": 5.579078455790785e-05, "loss": 0.5107, "step": 3624 }, { "epoch": 2.1889526109266524, "grad_norm": 0.1669921875, "learning_rate": 5.574927355749274e-05, "loss": 0.612, "step": 3625 }, { "epoch": 2.189556293389677, "grad_norm": 0.1474609375, "learning_rate": 5.570776255707762e-05, "loss": 0.6054, "step": 3626 }, { "epoch": 2.1901599758527013, "grad_norm": 0.154296875, "learning_rate": 5.5666251556662516e-05, "loss": 0.6852, "step": 3627 }, { "epoch": 2.190763658315726, "grad_norm": 0.1396484375, "learning_rate": 5.562474055624741e-05, "loss": 0.6717, "step": 3628 }, { "epoch": 2.19136734077875, "grad_norm": 0.125, "learning_rate": 5.5583229555832296e-05, "loss": 0.5286, "step": 3629 }, { "epoch": 2.191971023241775, "grad_norm": 0.1279296875, "learning_rate": 5.554171855541719e-05, "loss": 0.627, "step": 3630 }, { "epoch": 2.192574705704799, "grad_norm": 0.1357421875, "learning_rate": 5.550020755500208e-05, "loss": 0.5631, "step": 3631 }, { "epoch": 2.1931783881678237, "grad_norm": 0.13671875, "learning_rate": 5.545869655458696e-05, "loss": 0.644, "step": 3632 }, { "epoch": 2.1937820706308484, "grad_norm": 0.142578125, "learning_rate": 5.5417185554171856e-05, "loss": 0.6144, "step": 3633 }, { "epoch": 2.1943857530938726, "grad_norm": 0.1318359375, "learning_rate": 5.537567455375675e-05, "loss": 0.5612, "step": 3634 }, { "epoch": 2.194989435556897, "grad_norm": 0.130859375, "learning_rate": 5.5334163553341635e-05, "loss": 0.8231, "step": 3635 }, { "epoch": 2.1955931180199215, "grad_norm": 0.1650390625, "learning_rate": 5.529265255292653e-05, "loss": 0.6544, "step": 3636 }, { "epoch": 2.196196800482946, "grad_norm": 0.13671875, "learning_rate": 5.525114155251142e-05, "loss": 0.5665, "step": 3637 }, { "epoch": 2.1968004829459704, "grad_norm": 0.1494140625, "learning_rate": 5.52096305520963e-05, "loss": 0.6914, "step": 3638 }, { "epoch": 2.1974041654089946, "grad_norm": 0.1484375, "learning_rate": 5.5168119551681195e-05, "loss": 0.6857, "step": 3639 }, { "epoch": 2.1980078478720193, "grad_norm": 0.140625, "learning_rate": 5.512660855126609e-05, "loss": 0.5609, "step": 3640 }, { "epoch": 2.198611530335044, "grad_norm": 0.15234375, "learning_rate": 5.508509755085098e-05, "loss": 0.5702, "step": 3641 }, { "epoch": 2.199215212798068, "grad_norm": 0.14453125, "learning_rate": 5.504358655043587e-05, "loss": 0.6845, "step": 3642 }, { "epoch": 2.199818895261093, "grad_norm": 0.1298828125, "learning_rate": 5.500207555002076e-05, "loss": 0.8029, "step": 3643 }, { "epoch": 2.200422577724117, "grad_norm": 0.1279296875, "learning_rate": 5.4960564549605654e-05, "loss": 0.582, "step": 3644 }, { "epoch": 2.2010262601871418, "grad_norm": 0.216796875, "learning_rate": 5.4919053549190534e-05, "loss": 0.6569, "step": 3645 }, { "epoch": 2.201629942650166, "grad_norm": 0.1435546875, "learning_rate": 5.487754254877543e-05, "loss": 0.5649, "step": 3646 }, { "epoch": 2.2022336251131907, "grad_norm": 0.142578125, "learning_rate": 5.483603154836032e-05, "loss": 0.5068, "step": 3647 }, { "epoch": 2.202837307576215, "grad_norm": 0.146484375, "learning_rate": 5.479452054794521e-05, "loss": 0.5033, "step": 3648 }, { "epoch": 2.2034409900392395, "grad_norm": 0.1591796875, "learning_rate": 5.47530095475301e-05, "loss": 0.5824, "step": 3649 }, { "epoch": 2.2040446725022638, "grad_norm": 0.14453125, "learning_rate": 5.4711498547114994e-05, "loss": 0.5361, "step": 3650 }, { "epoch": 2.2046483549652884, "grad_norm": 0.169921875, "learning_rate": 5.466998754669987e-05, "loss": 0.5451, "step": 3651 }, { "epoch": 2.2052520374283127, "grad_norm": 0.1484375, "learning_rate": 5.4628476546284766e-05, "loss": 0.4929, "step": 3652 }, { "epoch": 2.2058557198913373, "grad_norm": 0.16796875, "learning_rate": 5.458696554586966e-05, "loss": 0.5349, "step": 3653 }, { "epoch": 2.2064594023543616, "grad_norm": 0.1767578125, "learning_rate": 5.4545454545454546e-05, "loss": 0.5587, "step": 3654 }, { "epoch": 2.2070630848173862, "grad_norm": 0.1845703125, "learning_rate": 5.450394354503944e-05, "loss": 0.4678, "step": 3655 }, { "epoch": 2.2076667672804104, "grad_norm": 0.181640625, "learning_rate": 5.446243254462433e-05, "loss": 0.4529, "step": 3656 }, { "epoch": 2.208270449743435, "grad_norm": 0.2041015625, "learning_rate": 5.442092154420921e-05, "loss": 0.4281, "step": 3657 }, { "epoch": 2.2088741322064593, "grad_norm": 0.1787109375, "learning_rate": 5.4379410543794106e-05, "loss": 0.3754, "step": 3658 }, { "epoch": 2.209477814669484, "grad_norm": 0.3203125, "learning_rate": 5.4337899543379e-05, "loss": 0.3509, "step": 3659 }, { "epoch": 2.2100814971325082, "grad_norm": 0.201171875, "learning_rate": 5.4296388542963885e-05, "loss": 0.368, "step": 3660 }, { "epoch": 2.210685179595533, "grad_norm": 0.2041015625, "learning_rate": 5.425487754254878e-05, "loss": 0.2797, "step": 3661 }, { "epoch": 2.211288862058557, "grad_norm": 0.189453125, "learning_rate": 5.421336654213367e-05, "loss": 0.19, "step": 3662 }, { "epoch": 2.211892544521582, "grad_norm": 0.1552734375, "learning_rate": 5.417185554171855e-05, "loss": 1.0725, "step": 3663 }, { "epoch": 2.212496226984606, "grad_norm": 0.234375, "learning_rate": 5.4130344541303445e-05, "loss": 0.655, "step": 3664 }, { "epoch": 2.2130999094476307, "grad_norm": 0.154296875, "learning_rate": 5.408883354088834e-05, "loss": 0.693, "step": 3665 }, { "epoch": 2.213703591910655, "grad_norm": 0.146484375, "learning_rate": 5.404732254047323e-05, "loss": 0.8072, "step": 3666 }, { "epoch": 2.2143072743736796, "grad_norm": 0.1572265625, "learning_rate": 5.400581154005812e-05, "loss": 0.686, "step": 3667 }, { "epoch": 2.214910956836704, "grad_norm": 0.1474609375, "learning_rate": 5.396430053964301e-05, "loss": 0.9237, "step": 3668 }, { "epoch": 2.2155146392997285, "grad_norm": 0.1494140625, "learning_rate": 5.3922789539227904e-05, "loss": 0.5754, "step": 3669 }, { "epoch": 2.2161183217627527, "grad_norm": 0.1474609375, "learning_rate": 5.3881278538812784e-05, "loss": 0.6025, "step": 3670 }, { "epoch": 2.2167220042257774, "grad_norm": 0.1513671875, "learning_rate": 5.383976753839768e-05, "loss": 0.7161, "step": 3671 }, { "epoch": 2.2173256866888016, "grad_norm": 0.134765625, "learning_rate": 5.379825653798257e-05, "loss": 0.8643, "step": 3672 }, { "epoch": 2.2179293691518263, "grad_norm": 0.142578125, "learning_rate": 5.375674553756746e-05, "loss": 0.7475, "step": 3673 }, { "epoch": 2.2185330516148505, "grad_norm": 0.1240234375, "learning_rate": 5.371523453715235e-05, "loss": 0.5257, "step": 3674 }, { "epoch": 2.219136734077875, "grad_norm": 0.1474609375, "learning_rate": 5.3673723536737244e-05, "loss": 0.6046, "step": 3675 }, { "epoch": 2.2197404165408994, "grad_norm": 0.146484375, "learning_rate": 5.3632212536322123e-05, "loss": 0.5928, "step": 3676 }, { "epoch": 2.220344099003924, "grad_norm": 0.154296875, "learning_rate": 5.359070153590702e-05, "loss": 0.7284, "step": 3677 }, { "epoch": 2.2209477814669483, "grad_norm": 0.1494140625, "learning_rate": 5.354919053549191e-05, "loss": 0.6325, "step": 3678 }, { "epoch": 2.221551463929973, "grad_norm": 0.162109375, "learning_rate": 5.3507679535076796e-05, "loss": 0.7032, "step": 3679 }, { "epoch": 2.222155146392997, "grad_norm": 0.1455078125, "learning_rate": 5.346616853466169e-05, "loss": 0.5552, "step": 3680 }, { "epoch": 2.222758828856022, "grad_norm": 0.154296875, "learning_rate": 5.342465753424658e-05, "loss": 0.6397, "step": 3681 }, { "epoch": 2.223362511319046, "grad_norm": 0.1318359375, "learning_rate": 5.338314653383146e-05, "loss": 0.6766, "step": 3682 }, { "epoch": 2.2239661937820707, "grad_norm": 0.15234375, "learning_rate": 5.3341635533416356e-05, "loss": 0.6068, "step": 3683 }, { "epoch": 2.224569876245095, "grad_norm": 0.1513671875, "learning_rate": 5.330012453300125e-05, "loss": 0.7731, "step": 3684 }, { "epoch": 2.2251735587081196, "grad_norm": 0.1552734375, "learning_rate": 5.3258613532586136e-05, "loss": 0.6333, "step": 3685 }, { "epoch": 2.225777241171144, "grad_norm": 0.1435546875, "learning_rate": 5.321710253217103e-05, "loss": 0.6319, "step": 3686 }, { "epoch": 2.2263809236341685, "grad_norm": 0.142578125, "learning_rate": 5.317559153175592e-05, "loss": 0.6213, "step": 3687 }, { "epoch": 2.2269846060971927, "grad_norm": 0.1396484375, "learning_rate": 5.3134080531340815e-05, "loss": 0.6248, "step": 3688 }, { "epoch": 2.2275882885602174, "grad_norm": 0.1357421875, "learning_rate": 5.3092569530925695e-05, "loss": 0.6109, "step": 3689 }, { "epoch": 2.2281919710232416, "grad_norm": 0.1396484375, "learning_rate": 5.305105853051059e-05, "loss": 0.8635, "step": 3690 }, { "epoch": 2.2287956534862663, "grad_norm": 0.1455078125, "learning_rate": 5.300954753009548e-05, "loss": 0.5466, "step": 3691 }, { "epoch": 2.2293993359492905, "grad_norm": 0.169921875, "learning_rate": 5.296803652968037e-05, "loss": 0.7101, "step": 3692 }, { "epoch": 2.230003018412315, "grad_norm": 0.1435546875, "learning_rate": 5.292652552926526e-05, "loss": 0.6589, "step": 3693 }, { "epoch": 2.2306067008753394, "grad_norm": 0.1474609375, "learning_rate": 5.2885014528850155e-05, "loss": 0.6498, "step": 3694 }, { "epoch": 2.231210383338364, "grad_norm": 0.1435546875, "learning_rate": 5.2843503528435034e-05, "loss": 0.5665, "step": 3695 }, { "epoch": 2.2318140658013883, "grad_norm": 0.146484375, "learning_rate": 5.280199252801993e-05, "loss": 0.5706, "step": 3696 }, { "epoch": 2.232417748264413, "grad_norm": 0.140625, "learning_rate": 5.276048152760482e-05, "loss": 0.55, "step": 3697 }, { "epoch": 2.233021430727437, "grad_norm": 0.162109375, "learning_rate": 5.27189705271897e-05, "loss": 0.5897, "step": 3698 }, { "epoch": 2.233625113190462, "grad_norm": 0.193359375, "learning_rate": 5.26774595267746e-05, "loss": 0.6028, "step": 3699 }, { "epoch": 2.234228795653486, "grad_norm": 0.162109375, "learning_rate": 5.2635948526359494e-05, "loss": 0.5925, "step": 3700 }, { "epoch": 2.2348324781165108, "grad_norm": 0.1455078125, "learning_rate": 5.2594437525944374e-05, "loss": 0.5198, "step": 3701 }, { "epoch": 2.235436160579535, "grad_norm": 0.158203125, "learning_rate": 5.255292652552927e-05, "loss": 0.5255, "step": 3702 }, { "epoch": 2.2360398430425596, "grad_norm": 0.16015625, "learning_rate": 5.251141552511416e-05, "loss": 0.5007, "step": 3703 }, { "epoch": 2.236643525505584, "grad_norm": 0.171875, "learning_rate": 5.246990452469904e-05, "loss": 0.5518, "step": 3704 }, { "epoch": 2.2372472079686085, "grad_norm": 0.173828125, "learning_rate": 5.242839352428393e-05, "loss": 0.487, "step": 3705 }, { "epoch": 2.2378508904316328, "grad_norm": 0.1845703125, "learning_rate": 5.238688252386883e-05, "loss": 0.4804, "step": 3706 }, { "epoch": 2.2384545728946574, "grad_norm": 0.181640625, "learning_rate": 5.234537152345371e-05, "loss": 0.4207, "step": 3707 }, { "epoch": 2.2390582553576817, "grad_norm": 0.1787109375, "learning_rate": 5.2303860523038606e-05, "loss": 0.391, "step": 3708 }, { "epoch": 2.2396619378207063, "grad_norm": 0.1982421875, "learning_rate": 5.22623495226235e-05, "loss": 0.3689, "step": 3709 }, { "epoch": 2.2402656202837306, "grad_norm": 0.2080078125, "learning_rate": 5.222083852220839e-05, "loss": 0.34, "step": 3710 }, { "epoch": 2.240869302746755, "grad_norm": 0.2109375, "learning_rate": 5.217932752179327e-05, "loss": 0.2817, "step": 3711 }, { "epoch": 2.2414729852097794, "grad_norm": 0.2080078125, "learning_rate": 5.2137816521378166e-05, "loss": 0.2022, "step": 3712 }, { "epoch": 2.242076667672804, "grad_norm": 0.140625, "learning_rate": 5.2096305520963066e-05, "loss": 0.5713, "step": 3713 }, { "epoch": 2.242680350135829, "grad_norm": 0.1533203125, "learning_rate": 5.2054794520547945e-05, "loss": 0.5972, "step": 3714 }, { "epoch": 2.243284032598853, "grad_norm": 0.1572265625, "learning_rate": 5.201328352013284e-05, "loss": 1.3534, "step": 3715 }, { "epoch": 2.2438877150618772, "grad_norm": 0.13671875, "learning_rate": 5.197177251971773e-05, "loss": 0.544, "step": 3716 }, { "epoch": 2.244491397524902, "grad_norm": 0.14453125, "learning_rate": 5.193026151930261e-05, "loss": 0.7116, "step": 3717 }, { "epoch": 2.2450950799879266, "grad_norm": 0.1474609375, "learning_rate": 5.1888750518887505e-05, "loss": 0.4888, "step": 3718 }, { "epoch": 2.245698762450951, "grad_norm": 0.1494140625, "learning_rate": 5.18472395184724e-05, "loss": 0.6472, "step": 3719 }, { "epoch": 2.246302444913975, "grad_norm": 0.181640625, "learning_rate": 5.1805728518057285e-05, "loss": 0.7131, "step": 3720 }, { "epoch": 2.2469061273769997, "grad_norm": 0.1357421875, "learning_rate": 5.176421751764218e-05, "loss": 0.6801, "step": 3721 }, { "epoch": 2.2475098098400244, "grad_norm": 0.146484375, "learning_rate": 5.172270651722707e-05, "loss": 0.5962, "step": 3722 }, { "epoch": 2.2481134923030486, "grad_norm": 0.1455078125, "learning_rate": 5.168119551681195e-05, "loss": 0.6611, "step": 3723 }, { "epoch": 2.2487171747660732, "grad_norm": 0.1337890625, "learning_rate": 5.1639684516396844e-05, "loss": 0.6505, "step": 3724 }, { "epoch": 2.2493208572290975, "grad_norm": 0.1318359375, "learning_rate": 5.159817351598174e-05, "loss": 0.5929, "step": 3725 }, { "epoch": 2.249924539692122, "grad_norm": 0.1455078125, "learning_rate": 5.1556662515566624e-05, "loss": 0.6366, "step": 3726 }, { "epoch": 2.2505282221551464, "grad_norm": 0.158203125, "learning_rate": 5.151515151515152e-05, "loss": 0.7009, "step": 3727 }, { "epoch": 2.251131904618171, "grad_norm": 0.1572265625, "learning_rate": 5.147364051473641e-05, "loss": 0.6801, "step": 3728 }, { "epoch": 2.2517355870811953, "grad_norm": 0.146484375, "learning_rate": 5.143212951432129e-05, "loss": 0.7714, "step": 3729 }, { "epoch": 2.25233926954422, "grad_norm": 0.1357421875, "learning_rate": 5.139061851390618e-05, "loss": 0.5927, "step": 3730 }, { "epoch": 2.252942952007244, "grad_norm": 0.1484375, "learning_rate": 5.1349107513491076e-05, "loss": 0.581, "step": 3731 }, { "epoch": 2.253546634470269, "grad_norm": 0.1484375, "learning_rate": 5.130759651307597e-05, "loss": 0.6646, "step": 3732 }, { "epoch": 2.254150316933293, "grad_norm": 0.138671875, "learning_rate": 5.1266085512660856e-05, "loss": 0.6749, "step": 3733 }, { "epoch": 2.2547539993963177, "grad_norm": 0.1513671875, "learning_rate": 5.122457451224575e-05, "loss": 0.7835, "step": 3734 }, { "epoch": 2.255357681859342, "grad_norm": 0.13671875, "learning_rate": 5.118306351183064e-05, "loss": 0.5993, "step": 3735 }, { "epoch": 2.2559613643223666, "grad_norm": 0.146484375, "learning_rate": 5.114155251141552e-05, "loss": 0.8175, "step": 3736 }, { "epoch": 2.256565046785391, "grad_norm": 0.1767578125, "learning_rate": 5.1100041511000416e-05, "loss": 0.5801, "step": 3737 }, { "epoch": 2.2571687292484155, "grad_norm": 0.1533203125, "learning_rate": 5.105853051058531e-05, "loss": 0.6434, "step": 3738 }, { "epoch": 2.2577724117114397, "grad_norm": 0.197265625, "learning_rate": 5.1017019510170195e-05, "loss": 0.6335, "step": 3739 }, { "epoch": 2.2583760941744644, "grad_norm": 0.16015625, "learning_rate": 5.097550850975509e-05, "loss": 0.8821, "step": 3740 }, { "epoch": 2.2589797766374886, "grad_norm": 0.1611328125, "learning_rate": 5.093399750933998e-05, "loss": 0.6041, "step": 3741 }, { "epoch": 2.2595834591005133, "grad_norm": 0.19140625, "learning_rate": 5.089248650892486e-05, "loss": 0.8298, "step": 3742 }, { "epoch": 2.2601871415635375, "grad_norm": 0.142578125, "learning_rate": 5.0850975508509755e-05, "loss": 0.7614, "step": 3743 }, { "epoch": 2.260790824026562, "grad_norm": 0.1689453125, "learning_rate": 5.080946450809465e-05, "loss": 0.508, "step": 3744 }, { "epoch": 2.2613945064895864, "grad_norm": 0.275390625, "learning_rate": 5.0767953507679535e-05, "loss": 0.7969, "step": 3745 }, { "epoch": 2.261998188952611, "grad_norm": 0.1337890625, "learning_rate": 5.072644250726443e-05, "loss": 0.4903, "step": 3746 }, { "epoch": 2.2626018714156353, "grad_norm": 0.1484375, "learning_rate": 5.068493150684932e-05, "loss": 0.5706, "step": 3747 }, { "epoch": 2.26320555387866, "grad_norm": 0.1474609375, "learning_rate": 5.06434205064342e-05, "loss": 0.5735, "step": 3748 }, { "epoch": 2.263809236341684, "grad_norm": 0.1455078125, "learning_rate": 5.0601909506019094e-05, "loss": 0.568, "step": 3749 }, { "epoch": 2.264412918804709, "grad_norm": 0.16796875, "learning_rate": 5.056039850560399e-05, "loss": 0.5466, "step": 3750 }, { "epoch": 2.265016601267733, "grad_norm": 0.154296875, "learning_rate": 5.0518887505188874e-05, "loss": 0.5017, "step": 3751 }, { "epoch": 2.2656202837307577, "grad_norm": 0.1611328125, "learning_rate": 5.047737650477377e-05, "loss": 0.4916, "step": 3752 }, { "epoch": 2.266223966193782, "grad_norm": 0.1611328125, "learning_rate": 5.043586550435866e-05, "loss": 0.5218, "step": 3753 }, { "epoch": 2.2668276486568066, "grad_norm": 0.1640625, "learning_rate": 5.0394354503943554e-05, "loss": 0.4766, "step": 3754 }, { "epoch": 2.267431331119831, "grad_norm": 0.1708984375, "learning_rate": 5.0352843503528433e-05, "loss": 0.5308, "step": 3755 }, { "epoch": 2.2680350135828555, "grad_norm": 0.1728515625, "learning_rate": 5.031133250311333e-05, "loss": 0.4462, "step": 3756 }, { "epoch": 2.2686386960458798, "grad_norm": 0.1806640625, "learning_rate": 5.026982150269822e-05, "loss": 0.3928, "step": 3757 }, { "epoch": 2.2692423785089044, "grad_norm": 0.193359375, "learning_rate": 5.0228310502283106e-05, "loss": 0.3917, "step": 3758 }, { "epoch": 2.2698460609719286, "grad_norm": 0.1875, "learning_rate": 5.0186799501868e-05, "loss": 0.349, "step": 3759 }, { "epoch": 2.2704497434349533, "grad_norm": 0.203125, "learning_rate": 5.014528850145289e-05, "loss": 0.3521, "step": 3760 }, { "epoch": 2.2710534258979775, "grad_norm": 0.2138671875, "learning_rate": 5.010377750103777e-05, "loss": 0.2588, "step": 3761 }, { "epoch": 2.271657108361002, "grad_norm": 0.1767578125, "learning_rate": 5.0062266500622666e-05, "loss": 0.1769, "step": 3762 }, { "epoch": 2.2722607908240264, "grad_norm": 0.1728515625, "learning_rate": 5.002075550020756e-05, "loss": 0.6932, "step": 3763 }, { "epoch": 2.272864473287051, "grad_norm": 0.1533203125, "learning_rate": 4.997924449979245e-05, "loss": 0.6689, "step": 3764 }, { "epoch": 2.2734681557500753, "grad_norm": 0.1552734375, "learning_rate": 4.993773349937734e-05, "loss": 1.0518, "step": 3765 }, { "epoch": 2.2740718382131, "grad_norm": 0.1474609375, "learning_rate": 4.9896222498962225e-05, "loss": 0.6322, "step": 3766 }, { "epoch": 2.274675520676124, "grad_norm": 0.1533203125, "learning_rate": 4.985471149854712e-05, "loss": 0.6422, "step": 3767 }, { "epoch": 2.275279203139149, "grad_norm": 0.1484375, "learning_rate": 4.9813200498132005e-05, "loss": 0.6127, "step": 3768 }, { "epoch": 2.275882885602173, "grad_norm": 0.1416015625, "learning_rate": 4.977168949771689e-05, "loss": 0.6145, "step": 3769 }, { "epoch": 2.276486568065198, "grad_norm": 0.146484375, "learning_rate": 4.9730178497301785e-05, "loss": 0.5599, "step": 3770 }, { "epoch": 2.277090250528222, "grad_norm": 0.142578125, "learning_rate": 4.968866749688668e-05, "loss": 0.8418, "step": 3771 }, { "epoch": 2.2776939329912467, "grad_norm": 0.1376953125, "learning_rate": 4.964715649647157e-05, "loss": 0.562, "step": 3772 }, { "epoch": 2.278297615454271, "grad_norm": 0.1494140625, "learning_rate": 4.960564549605646e-05, "loss": 0.6149, "step": 3773 }, { "epoch": 2.2789012979172956, "grad_norm": 0.1494140625, "learning_rate": 4.9564134495641344e-05, "loss": 0.6435, "step": 3774 }, { "epoch": 2.27950498038032, "grad_norm": 0.1494140625, "learning_rate": 4.952262349522624e-05, "loss": 0.7169, "step": 3775 }, { "epoch": 2.2801086628433445, "grad_norm": 0.1357421875, "learning_rate": 4.9481112494811124e-05, "loss": 0.9838, "step": 3776 }, { "epoch": 2.2807123453063687, "grad_norm": 0.150390625, "learning_rate": 4.943960149439602e-05, "loss": 0.6183, "step": 3777 }, { "epoch": 2.2813160277693934, "grad_norm": 0.138671875, "learning_rate": 4.939809049398091e-05, "loss": 0.5653, "step": 3778 }, { "epoch": 2.2819197102324176, "grad_norm": 0.1298828125, "learning_rate": 4.93565794935658e-05, "loss": 0.8278, "step": 3779 }, { "epoch": 2.2825233926954422, "grad_norm": 0.142578125, "learning_rate": 4.9315068493150684e-05, "loss": 0.6817, "step": 3780 }, { "epoch": 2.2831270751584665, "grad_norm": 0.154296875, "learning_rate": 4.927355749273558e-05, "loss": 0.548, "step": 3781 }, { "epoch": 2.283730757621491, "grad_norm": 0.142578125, "learning_rate": 4.923204649232046e-05, "loss": 0.6265, "step": 3782 }, { "epoch": 2.2843344400845154, "grad_norm": 0.1484375, "learning_rate": 4.9190535491905357e-05, "loss": 0.5528, "step": 3783 }, { "epoch": 2.28493812254754, "grad_norm": 0.1513671875, "learning_rate": 4.914902449149025e-05, "loss": 0.9304, "step": 3784 }, { "epoch": 2.2855418050105643, "grad_norm": 0.154296875, "learning_rate": 4.9107513491075136e-05, "loss": 0.6049, "step": 3785 }, { "epoch": 2.286145487473589, "grad_norm": 0.140625, "learning_rate": 4.906600249066003e-05, "loss": 0.6067, "step": 3786 }, { "epoch": 2.2867491699366136, "grad_norm": 0.1416015625, "learning_rate": 4.9024491490244916e-05, "loss": 0.5666, "step": 3787 }, { "epoch": 2.287352852399638, "grad_norm": 0.1533203125, "learning_rate": 4.89829804898298e-05, "loss": 0.6094, "step": 3788 }, { "epoch": 2.287956534862662, "grad_norm": 0.140625, "learning_rate": 4.8941469489414696e-05, "loss": 0.4883, "step": 3789 }, { "epoch": 2.2885602173256867, "grad_norm": 0.140625, "learning_rate": 4.889995848899959e-05, "loss": 0.5744, "step": 3790 }, { "epoch": 2.2891638997887114, "grad_norm": 0.134765625, "learning_rate": 4.8858447488584476e-05, "loss": 0.6248, "step": 3791 }, { "epoch": 2.2897675822517356, "grad_norm": 0.1474609375, "learning_rate": 4.881693648816937e-05, "loss": 0.8144, "step": 3792 }, { "epoch": 2.29037126471476, "grad_norm": 0.14453125, "learning_rate": 4.8775425487754255e-05, "loss": 0.6317, "step": 3793 }, { "epoch": 2.2909749471777845, "grad_norm": 0.15234375, "learning_rate": 4.873391448733915e-05, "loss": 0.6998, "step": 3794 }, { "epoch": 2.291578629640809, "grad_norm": 0.1357421875, "learning_rate": 4.8692403486924035e-05, "loss": 0.5846, "step": 3795 }, { "epoch": 2.2921823121038334, "grad_norm": 0.13671875, "learning_rate": 4.865089248650893e-05, "loss": 0.5276, "step": 3796 }, { "epoch": 2.2927859945668576, "grad_norm": 0.16015625, "learning_rate": 4.860938148609382e-05, "loss": 0.5752, "step": 3797 }, { "epoch": 2.2933896770298823, "grad_norm": 0.1513671875, "learning_rate": 4.856787048567871e-05, "loss": 0.5444, "step": 3798 }, { "epoch": 2.293993359492907, "grad_norm": 0.140625, "learning_rate": 4.8526359485263595e-05, "loss": 0.5088, "step": 3799 }, { "epoch": 2.294597041955931, "grad_norm": 0.16015625, "learning_rate": 4.848484848484849e-05, "loss": 0.5237, "step": 3800 }, { "epoch": 2.2952007244189554, "grad_norm": 0.162109375, "learning_rate": 4.8443337484433374e-05, "loss": 0.5165, "step": 3801 }, { "epoch": 2.29580440688198, "grad_norm": 0.1669921875, "learning_rate": 4.840182648401827e-05, "loss": 0.5137, "step": 3802 }, { "epoch": 2.2964080893450047, "grad_norm": 0.16015625, "learning_rate": 4.836031548360316e-05, "loss": 0.5314, "step": 3803 }, { "epoch": 2.297011771808029, "grad_norm": 0.17578125, "learning_rate": 4.831880448318805e-05, "loss": 0.5597, "step": 3804 }, { "epoch": 2.297615454271053, "grad_norm": 0.1748046875, "learning_rate": 4.827729348277294e-05, "loss": 0.492, "step": 3805 }, { "epoch": 2.298219136734078, "grad_norm": 0.1826171875, "learning_rate": 4.823578248235783e-05, "loss": 0.4438, "step": 3806 }, { "epoch": 2.2988228191971025, "grad_norm": 0.1806640625, "learning_rate": 4.8194271481942714e-05, "loss": 0.3985, "step": 3807 }, { "epoch": 2.2994265016601267, "grad_norm": 0.1884765625, "learning_rate": 4.815276048152761e-05, "loss": 0.513, "step": 3808 }, { "epoch": 2.3000301841231514, "grad_norm": 0.2109375, "learning_rate": 4.81112494811125e-05, "loss": 0.3765, "step": 3809 }, { "epoch": 2.3006338665861756, "grad_norm": 0.1943359375, "learning_rate": 4.8069738480697387e-05, "loss": 0.2697, "step": 3810 }, { "epoch": 2.3012375490492003, "grad_norm": 0.185546875, "learning_rate": 4.802822748028228e-05, "loss": 0.2245, "step": 3811 }, { "epoch": 2.3018412315122245, "grad_norm": 0.2001953125, "learning_rate": 4.7986716479867166e-05, "loss": 0.1918, "step": 3812 }, { "epoch": 2.302444913975249, "grad_norm": 0.1484375, "learning_rate": 4.794520547945205e-05, "loss": 0.7845, "step": 3813 }, { "epoch": 2.3030485964382734, "grad_norm": 0.12255859375, "learning_rate": 4.7903694479036946e-05, "loss": 0.5505, "step": 3814 }, { "epoch": 2.303652278901298, "grad_norm": 0.1533203125, "learning_rate": 4.786218347862184e-05, "loss": 0.601, "step": 3815 }, { "epoch": 2.3042559613643223, "grad_norm": 0.15234375, "learning_rate": 4.782067247820673e-05, "loss": 0.5139, "step": 3816 }, { "epoch": 2.304859643827347, "grad_norm": 0.1318359375, "learning_rate": 4.777916147779162e-05, "loss": 0.6804, "step": 3817 }, { "epoch": 2.305463326290371, "grad_norm": 0.1435546875, "learning_rate": 4.7737650477376505e-05, "loss": 0.5875, "step": 3818 }, { "epoch": 2.306067008753396, "grad_norm": 0.1748046875, "learning_rate": 4.76961394769614e-05, "loss": 0.6261, "step": 3819 }, { "epoch": 2.30667069121642, "grad_norm": 0.1396484375, "learning_rate": 4.7654628476546285e-05, "loss": 0.597, "step": 3820 }, { "epoch": 2.3072743736794448, "grad_norm": 0.14453125, "learning_rate": 4.761311747613117e-05, "loss": 0.4979, "step": 3821 }, { "epoch": 2.307878056142469, "grad_norm": 0.13671875, "learning_rate": 4.757160647571607e-05, "loss": 0.5344, "step": 3822 }, { "epoch": 2.3084817386054937, "grad_norm": 0.154296875, "learning_rate": 4.753009547530096e-05, "loss": 0.575, "step": 3823 }, { "epoch": 2.309085421068518, "grad_norm": 0.154296875, "learning_rate": 4.7488584474885845e-05, "loss": 0.6303, "step": 3824 }, { "epoch": 2.3096891035315426, "grad_norm": 0.1474609375, "learning_rate": 4.744707347447074e-05, "loss": 0.65, "step": 3825 }, { "epoch": 2.3102927859945668, "grad_norm": 0.1611328125, "learning_rate": 4.7405562474055624e-05, "loss": 0.6018, "step": 3826 }, { "epoch": 2.3108964684575914, "grad_norm": 0.16015625, "learning_rate": 4.736405147364052e-05, "loss": 0.6526, "step": 3827 }, { "epoch": 2.3115001509206157, "grad_norm": 0.140625, "learning_rate": 4.7322540473225404e-05, "loss": 0.5563, "step": 3828 }, { "epoch": 2.3121038333836403, "grad_norm": 0.13671875, "learning_rate": 4.72810294728103e-05, "loss": 0.6763, "step": 3829 }, { "epoch": 2.3127075158466646, "grad_norm": 0.1484375, "learning_rate": 4.723951847239519e-05, "loss": 0.5513, "step": 3830 }, { "epoch": 2.3133111983096892, "grad_norm": 0.146484375, "learning_rate": 4.719800747198008e-05, "loss": 0.7, "step": 3831 }, { "epoch": 2.3139148807727135, "grad_norm": 0.140625, "learning_rate": 4.7156496471564964e-05, "loss": 0.6488, "step": 3832 }, { "epoch": 2.314518563235738, "grad_norm": 0.15625, "learning_rate": 4.711498547114986e-05, "loss": 0.6514, "step": 3833 }, { "epoch": 2.3151222456987623, "grad_norm": 0.1435546875, "learning_rate": 4.7073474470734743e-05, "loss": 0.5488, "step": 3834 }, { "epoch": 2.315725928161787, "grad_norm": 0.146484375, "learning_rate": 4.703196347031964e-05, "loss": 0.6489, "step": 3835 }, { "epoch": 2.3163296106248112, "grad_norm": 0.146484375, "learning_rate": 4.699045246990453e-05, "loss": 0.5646, "step": 3836 }, { "epoch": 2.316933293087836, "grad_norm": 0.1494140625, "learning_rate": 4.6948941469489416e-05, "loss": 0.6441, "step": 3837 }, { "epoch": 2.31753697555086, "grad_norm": 0.1435546875, "learning_rate": 4.690743046907431e-05, "loss": 0.665, "step": 3838 }, { "epoch": 2.318140658013885, "grad_norm": 0.12890625, "learning_rate": 4.6865919468659196e-05, "loss": 0.5684, "step": 3839 }, { "epoch": 2.318744340476909, "grad_norm": 0.1376953125, "learning_rate": 4.682440846824408e-05, "loss": 0.5394, "step": 3840 }, { "epoch": 2.3193480229399337, "grad_norm": 0.154296875, "learning_rate": 4.6782897467828976e-05, "loss": 0.5669, "step": 3841 }, { "epoch": 2.319951705402958, "grad_norm": 0.1572265625, "learning_rate": 4.674138646741387e-05, "loss": 0.6004, "step": 3842 }, { "epoch": 2.3205553878659826, "grad_norm": 0.1533203125, "learning_rate": 4.6699875466998756e-05, "loss": 0.6637, "step": 3843 }, { "epoch": 2.321159070329007, "grad_norm": 0.1337890625, "learning_rate": 4.665836446658365e-05, "loss": 0.5828, "step": 3844 }, { "epoch": 2.3217627527920315, "grad_norm": 0.13671875, "learning_rate": 4.6616853466168535e-05, "loss": 0.5728, "step": 3845 }, { "epoch": 2.3223664352550557, "grad_norm": 0.1376953125, "learning_rate": 4.657534246575342e-05, "loss": 0.5247, "step": 3846 }, { "epoch": 2.3229701177180804, "grad_norm": 0.1416015625, "learning_rate": 4.6533831465338315e-05, "loss": 0.5778, "step": 3847 }, { "epoch": 2.3235738001811046, "grad_norm": 0.1416015625, "learning_rate": 4.649232046492321e-05, "loss": 0.4809, "step": 3848 }, { "epoch": 2.3241774826441293, "grad_norm": 0.1591796875, "learning_rate": 4.64508094645081e-05, "loss": 0.5791, "step": 3849 }, { "epoch": 2.3247811651071535, "grad_norm": 0.154296875, "learning_rate": 4.640929846409299e-05, "loss": 0.4888, "step": 3850 }, { "epoch": 2.325384847570178, "grad_norm": 0.1630859375, "learning_rate": 4.6367787463677875e-05, "loss": 0.5856, "step": 3851 }, { "epoch": 2.3259885300332024, "grad_norm": 0.1650390625, "learning_rate": 4.632627646326277e-05, "loss": 0.5226, "step": 3852 }, { "epoch": 2.326592212496227, "grad_norm": 0.16796875, "learning_rate": 4.6284765462847654e-05, "loss": 0.494, "step": 3853 }, { "epoch": 2.3271958949592513, "grad_norm": 0.1689453125, "learning_rate": 4.624325446243255e-05, "loss": 0.4667, "step": 3854 }, { "epoch": 2.327799577422276, "grad_norm": 0.177734375, "learning_rate": 4.620174346201744e-05, "loss": 0.4776, "step": 3855 }, { "epoch": 2.3284032598853, "grad_norm": 0.1806640625, "learning_rate": 4.616023246160233e-05, "loss": 0.4472, "step": 3856 }, { "epoch": 2.329006942348325, "grad_norm": 0.1884765625, "learning_rate": 4.6118721461187214e-05, "loss": 0.3986, "step": 3857 }, { "epoch": 2.329610624811349, "grad_norm": 0.19140625, "learning_rate": 4.607721046077211e-05, "loss": 0.3597, "step": 3858 }, { "epoch": 2.3302143072743737, "grad_norm": 0.201171875, "learning_rate": 4.6035699460356994e-05, "loss": 0.3309, "step": 3859 }, { "epoch": 2.330817989737398, "grad_norm": 0.2119140625, "learning_rate": 4.599418845994189e-05, "loss": 0.2997, "step": 3860 }, { "epoch": 2.3314216722004226, "grad_norm": 0.2080078125, "learning_rate": 4.595267745952678e-05, "loss": 0.2452, "step": 3861 }, { "epoch": 2.332025354663447, "grad_norm": 0.21484375, "learning_rate": 4.5911166459111667e-05, "loss": 0.1978, "step": 3862 }, { "epoch": 2.3326290371264715, "grad_norm": 0.14453125, "learning_rate": 4.586965545869656e-05, "loss": 0.6661, "step": 3863 }, { "epoch": 2.3332327195894957, "grad_norm": 0.1552734375, "learning_rate": 4.5828144458281446e-05, "loss": 0.5825, "step": 3864 }, { "epoch": 2.3338364020525204, "grad_norm": 0.158203125, "learning_rate": 4.578663345786633e-05, "loss": 0.6307, "step": 3865 }, { "epoch": 2.3344400845155446, "grad_norm": 0.36328125, "learning_rate": 4.5745122457451226e-05, "loss": 0.555, "step": 3866 }, { "epoch": 2.3350437669785693, "grad_norm": 0.150390625, "learning_rate": 4.570361145703612e-05, "loss": 0.5548, "step": 3867 }, { "epoch": 2.335647449441594, "grad_norm": 0.1435546875, "learning_rate": 4.5662100456621006e-05, "loss": 0.7686, "step": 3868 }, { "epoch": 2.336251131904618, "grad_norm": 0.1591796875, "learning_rate": 4.56205894562059e-05, "loss": 0.8087, "step": 3869 }, { "epoch": 2.3368548143676424, "grad_norm": 0.1572265625, "learning_rate": 4.5579078455790786e-05, "loss": 0.7414, "step": 3870 }, { "epoch": 2.337458496830667, "grad_norm": 0.1494140625, "learning_rate": 4.553756745537568e-05, "loss": 0.5163, "step": 3871 }, { "epoch": 2.3380621792936918, "grad_norm": 0.1455078125, "learning_rate": 4.5496056454960565e-05, "loss": 0.6429, "step": 3872 }, { "epoch": 2.338665861756716, "grad_norm": 0.1533203125, "learning_rate": 4.545454545454546e-05, "loss": 0.6131, "step": 3873 }, { "epoch": 2.33926954421974, "grad_norm": 0.13671875, "learning_rate": 4.541303445413035e-05, "loss": 0.546, "step": 3874 }, { "epoch": 2.339873226682765, "grad_norm": 0.1484375, "learning_rate": 4.537152345371524e-05, "loss": 0.7007, "step": 3875 }, { "epoch": 2.3404769091457895, "grad_norm": 0.1435546875, "learning_rate": 4.5330012453300125e-05, "loss": 0.503, "step": 3876 }, { "epoch": 2.3410805916088138, "grad_norm": 0.15625, "learning_rate": 4.528850145288502e-05, "loss": 0.6473, "step": 3877 }, { "epoch": 2.341684274071838, "grad_norm": 0.1474609375, "learning_rate": 4.5246990452469905e-05, "loss": 0.6571, "step": 3878 }, { "epoch": 2.3422879565348627, "grad_norm": 0.1416015625, "learning_rate": 4.520547945205479e-05, "loss": 0.5842, "step": 3879 }, { "epoch": 2.3428916389978873, "grad_norm": 0.1416015625, "learning_rate": 4.516396845163969e-05, "loss": 0.5991, "step": 3880 }, { "epoch": 2.3434953214609116, "grad_norm": 0.1455078125, "learning_rate": 4.512245745122458e-05, "loss": 0.6484, "step": 3881 }, { "epoch": 2.3440990039239358, "grad_norm": 0.1513671875, "learning_rate": 4.508094645080947e-05, "loss": 0.603, "step": 3882 }, { "epoch": 2.3447026863869604, "grad_norm": 0.146484375, "learning_rate": 4.503943545039436e-05, "loss": 0.6013, "step": 3883 }, { "epoch": 2.345306368849985, "grad_norm": 0.150390625, "learning_rate": 4.4997924449979244e-05, "loss": 0.6968, "step": 3884 }, { "epoch": 2.3459100513130093, "grad_norm": 0.1533203125, "learning_rate": 4.495641344956414e-05, "loss": 0.7278, "step": 3885 }, { "epoch": 2.3465137337760336, "grad_norm": 0.1455078125, "learning_rate": 4.4914902449149024e-05, "loss": 0.5987, "step": 3886 }, { "epoch": 2.3471174162390582, "grad_norm": 0.1533203125, "learning_rate": 4.487339144873392e-05, "loss": 0.6939, "step": 3887 }, { "epoch": 2.347721098702083, "grad_norm": 0.1630859375, "learning_rate": 4.483188044831881e-05, "loss": 0.6199, "step": 3888 }, { "epoch": 2.348324781165107, "grad_norm": 0.16015625, "learning_rate": 4.4790369447903697e-05, "loss": 0.653, "step": 3889 }, { "epoch": 2.348928463628132, "grad_norm": 0.1376953125, "learning_rate": 4.474885844748858e-05, "loss": 0.5988, "step": 3890 }, { "epoch": 2.349532146091156, "grad_norm": 0.1513671875, "learning_rate": 4.4707347447073476e-05, "loss": 0.603, "step": 3891 }, { "epoch": 2.3501358285541807, "grad_norm": 0.142578125, "learning_rate": 4.466583644665836e-05, "loss": 0.5959, "step": 3892 }, { "epoch": 2.350739511017205, "grad_norm": 0.13671875, "learning_rate": 4.4624325446243256e-05, "loss": 0.6052, "step": 3893 }, { "epoch": 2.3513431934802296, "grad_norm": 0.1484375, "learning_rate": 4.458281444582815e-05, "loss": 1.0572, "step": 3894 }, { "epoch": 2.351946875943254, "grad_norm": 0.1640625, "learning_rate": 4.4541303445413036e-05, "loss": 0.6273, "step": 3895 }, { "epoch": 2.3525505584062785, "grad_norm": 0.15625, "learning_rate": 4.449979244499793e-05, "loss": 0.6219, "step": 3896 }, { "epoch": 2.3531542408693027, "grad_norm": 0.142578125, "learning_rate": 4.4458281444582815e-05, "loss": 0.5061, "step": 3897 }, { "epoch": 2.3537579233323274, "grad_norm": 0.150390625, "learning_rate": 4.44167704441677e-05, "loss": 0.5157, "step": 3898 }, { "epoch": 2.3543616057953516, "grad_norm": 0.1455078125, "learning_rate": 4.4375259443752595e-05, "loss": 0.5173, "step": 3899 }, { "epoch": 2.3549652882583763, "grad_norm": 0.15234375, "learning_rate": 4.433374844333749e-05, "loss": 0.5796, "step": 3900 }, { "epoch": 2.3555689707214005, "grad_norm": 0.1650390625, "learning_rate": 4.4292237442922375e-05, "loss": 0.5505, "step": 3901 }, { "epoch": 2.356172653184425, "grad_norm": 0.169921875, "learning_rate": 4.425072644250727e-05, "loss": 0.5425, "step": 3902 }, { "epoch": 2.3567763356474494, "grad_norm": 0.1708984375, "learning_rate": 4.4209215442092155e-05, "loss": 0.4759, "step": 3903 }, { "epoch": 2.357380018110474, "grad_norm": 0.1787109375, "learning_rate": 4.416770444167705e-05, "loss": 0.5116, "step": 3904 }, { "epoch": 2.3579837005734983, "grad_norm": 0.16796875, "learning_rate": 4.4126193441261934e-05, "loss": 0.3866, "step": 3905 }, { "epoch": 2.358587383036523, "grad_norm": 0.171875, "learning_rate": 4.408468244084683e-05, "loss": 0.4082, "step": 3906 }, { "epoch": 2.359191065499547, "grad_norm": 0.19140625, "learning_rate": 4.404317144043172e-05, "loss": 0.4422, "step": 3907 }, { "epoch": 2.359794747962572, "grad_norm": 0.1865234375, "learning_rate": 4.400166044001661e-05, "loss": 0.4258, "step": 3908 }, { "epoch": 2.360398430425596, "grad_norm": 0.1826171875, "learning_rate": 4.3960149439601494e-05, "loss": 0.3006, "step": 3909 }, { "epoch": 2.3610021128886207, "grad_norm": 0.1923828125, "learning_rate": 4.391863843918639e-05, "loss": 0.2906, "step": 3910 }, { "epoch": 2.361605795351645, "grad_norm": 0.2060546875, "learning_rate": 4.3877127438771274e-05, "loss": 0.2704, "step": 3911 }, { "epoch": 2.3622094778146696, "grad_norm": 0.1787109375, "learning_rate": 4.383561643835617e-05, "loss": 0.191, "step": 3912 }, { "epoch": 2.362813160277694, "grad_norm": 0.146484375, "learning_rate": 4.379410543794106e-05, "loss": 0.6088, "step": 3913 }, { "epoch": 2.3634168427407185, "grad_norm": 0.1376953125, "learning_rate": 4.375259443752595e-05, "loss": 0.7435, "step": 3914 }, { "epoch": 2.3640205252037427, "grad_norm": 0.15234375, "learning_rate": 4.371108343711084e-05, "loss": 0.6249, "step": 3915 }, { "epoch": 2.3646242076667674, "grad_norm": 0.15234375, "learning_rate": 4.3669572436695726e-05, "loss": 0.6178, "step": 3916 }, { "epoch": 2.3652278901297916, "grad_norm": 0.1455078125, "learning_rate": 4.362806143628061e-05, "loss": 0.6073, "step": 3917 }, { "epoch": 2.3658315725928163, "grad_norm": 0.1474609375, "learning_rate": 4.3586550435865506e-05, "loss": 0.647, "step": 3918 }, { "epoch": 2.3664352550558405, "grad_norm": 0.14453125, "learning_rate": 4.35450394354504e-05, "loss": 0.6807, "step": 3919 }, { "epoch": 2.367038937518865, "grad_norm": 0.1357421875, "learning_rate": 4.3503528435035286e-05, "loss": 0.5945, "step": 3920 }, { "epoch": 2.3676426199818894, "grad_norm": 0.1416015625, "learning_rate": 4.346201743462018e-05, "loss": 0.6119, "step": 3921 }, { "epoch": 2.368246302444914, "grad_norm": 0.1640625, "learning_rate": 4.3420506434205066e-05, "loss": 0.6251, "step": 3922 }, { "epoch": 2.3688499849079383, "grad_norm": 0.162109375, "learning_rate": 4.337899543378995e-05, "loss": 0.5501, "step": 3923 }, { "epoch": 2.369453667370963, "grad_norm": 0.14453125, "learning_rate": 4.3337484433374845e-05, "loss": 0.6416, "step": 3924 }, { "epoch": 2.370057349833987, "grad_norm": 0.1533203125, "learning_rate": 4.329597343295974e-05, "loss": 0.6151, "step": 3925 }, { "epoch": 2.370661032297012, "grad_norm": 0.1611328125, "learning_rate": 4.325446243254463e-05, "loss": 0.7269, "step": 3926 }, { "epoch": 2.371264714760036, "grad_norm": 0.1435546875, "learning_rate": 4.321295143212952e-05, "loss": 0.5769, "step": 3927 }, { "epoch": 2.3718683972230608, "grad_norm": 0.140625, "learning_rate": 4.3171440431714405e-05, "loss": 0.6409, "step": 3928 }, { "epoch": 2.372472079686085, "grad_norm": 0.14453125, "learning_rate": 4.31299294312993e-05, "loss": 0.6153, "step": 3929 }, { "epoch": 2.3730757621491096, "grad_norm": 0.14453125, "learning_rate": 4.3088418430884185e-05, "loss": 0.5554, "step": 3930 }, { "epoch": 2.373679444612134, "grad_norm": 0.1669921875, "learning_rate": 4.304690743046907e-05, "loss": 0.5879, "step": 3931 }, { "epoch": 2.3742831270751585, "grad_norm": 0.1494140625, "learning_rate": 4.300539643005397e-05, "loss": 0.5892, "step": 3932 }, { "epoch": 2.3748868095381828, "grad_norm": 0.1689453125, "learning_rate": 4.296388542963886e-05, "loss": 0.621, "step": 3933 }, { "epoch": 2.3754904920012074, "grad_norm": 0.15234375, "learning_rate": 4.2922374429223744e-05, "loss": 1.0716, "step": 3934 }, { "epoch": 2.3760941744642317, "grad_norm": 0.1533203125, "learning_rate": 4.288086342880864e-05, "loss": 0.6236, "step": 3935 }, { "epoch": 2.3766978569272563, "grad_norm": 0.142578125, "learning_rate": 4.2839352428393524e-05, "loss": 0.6244, "step": 3936 }, { "epoch": 2.3773015393902805, "grad_norm": 0.150390625, "learning_rate": 4.279784142797842e-05, "loss": 0.6411, "step": 3937 }, { "epoch": 2.377905221853305, "grad_norm": 0.1396484375, "learning_rate": 4.2756330427563304e-05, "loss": 0.517, "step": 3938 }, { "epoch": 2.3785089043163294, "grad_norm": 0.14453125, "learning_rate": 4.27148194271482e-05, "loss": 0.8545, "step": 3939 }, { "epoch": 2.379112586779354, "grad_norm": 0.154296875, "learning_rate": 4.267330842673309e-05, "loss": 0.9618, "step": 3940 }, { "epoch": 2.3797162692423783, "grad_norm": 0.1494140625, "learning_rate": 4.2631797426317977e-05, "loss": 0.6273, "step": 3941 }, { "epoch": 2.380319951705403, "grad_norm": 0.13671875, "learning_rate": 4.259028642590286e-05, "loss": 0.9013, "step": 3942 }, { "epoch": 2.3809236341684272, "grad_norm": 0.1494140625, "learning_rate": 4.2548775425487756e-05, "loss": 0.8099, "step": 3943 }, { "epoch": 2.381527316631452, "grad_norm": 0.1494140625, "learning_rate": 4.250726442507264e-05, "loss": 0.633, "step": 3944 }, { "epoch": 2.382130999094476, "grad_norm": 0.1640625, "learning_rate": 4.2465753424657536e-05, "loss": 0.5849, "step": 3945 }, { "epoch": 2.382734681557501, "grad_norm": 0.1376953125, "learning_rate": 4.242424242424243e-05, "loss": 0.5141, "step": 3946 }, { "epoch": 2.383338364020525, "grad_norm": 0.142578125, "learning_rate": 4.2382731423827316e-05, "loss": 0.5301, "step": 3947 }, { "epoch": 2.3839420464835497, "grad_norm": 0.142578125, "learning_rate": 4.234122042341221e-05, "loss": 0.5945, "step": 3948 }, { "epoch": 2.3845457289465744, "grad_norm": 0.146484375, "learning_rate": 4.2299709422997096e-05, "loss": 0.5041, "step": 3949 }, { "epoch": 2.3851494114095986, "grad_norm": 0.15625, "learning_rate": 4.225819842258198e-05, "loss": 0.5563, "step": 3950 }, { "epoch": 2.385753093872623, "grad_norm": 0.1611328125, "learning_rate": 4.2216687422166875e-05, "loss": 0.5933, "step": 3951 }, { "epoch": 2.3863567763356475, "grad_norm": 0.1650390625, "learning_rate": 4.217517642175177e-05, "loss": 0.5283, "step": 3952 }, { "epoch": 2.386960458798672, "grad_norm": 0.171875, "learning_rate": 4.2133665421336655e-05, "loss": 0.5161, "step": 3953 }, { "epoch": 2.3875641412616964, "grad_norm": 0.1708984375, "learning_rate": 4.209215442092155e-05, "loss": 0.5154, "step": 3954 }, { "epoch": 2.3881678237247206, "grad_norm": 0.1630859375, "learning_rate": 4.2050643420506435e-05, "loss": 0.4278, "step": 3955 }, { "epoch": 2.3887715061877453, "grad_norm": 0.177734375, "learning_rate": 4.200913242009132e-05, "loss": 0.5358, "step": 3956 }, { "epoch": 2.38937518865077, "grad_norm": 0.1796875, "learning_rate": 4.1967621419676215e-05, "loss": 0.4306, "step": 3957 }, { "epoch": 2.389978871113794, "grad_norm": 0.1904296875, "learning_rate": 4.192611041926111e-05, "loss": 0.4228, "step": 3958 }, { "epoch": 2.3905825535768184, "grad_norm": 0.1826171875, "learning_rate": 4.1884599418846e-05, "loss": 0.3486, "step": 3959 }, { "epoch": 2.391186236039843, "grad_norm": 0.205078125, "learning_rate": 4.184308841843089e-05, "loss": 0.3173, "step": 3960 }, { "epoch": 2.3917899185028677, "grad_norm": 0.212890625, "learning_rate": 4.1801577418015774e-05, "loss": 0.328, "step": 3961 }, { "epoch": 2.392393600965892, "grad_norm": 0.2001953125, "learning_rate": 4.176006641760067e-05, "loss": 0.2109, "step": 3962 }, { "epoch": 2.392997283428916, "grad_norm": 0.14453125, "learning_rate": 4.1718555417185554e-05, "loss": 0.7375, "step": 3963 }, { "epoch": 2.393600965891941, "grad_norm": 0.146484375, "learning_rate": 4.167704441677045e-05, "loss": 0.515, "step": 3964 }, { "epoch": 2.3942046483549655, "grad_norm": 0.150390625, "learning_rate": 4.163553341635534e-05, "loss": 0.6026, "step": 3965 }, { "epoch": 2.3948083308179897, "grad_norm": 0.154296875, "learning_rate": 4.159402241594023e-05, "loss": 0.5353, "step": 3966 }, { "epoch": 2.395412013281014, "grad_norm": 0.1376953125, "learning_rate": 4.155251141552511e-05, "loss": 0.6007, "step": 3967 }, { "epoch": 2.3960156957440386, "grad_norm": 0.134765625, "learning_rate": 4.1511000415110007e-05, "loss": 0.5639, "step": 3968 }, { "epoch": 2.3966193782070633, "grad_norm": 0.1396484375, "learning_rate": 4.146948941469489e-05, "loss": 0.5642, "step": 3969 }, { "epoch": 2.3972230606700875, "grad_norm": 0.146484375, "learning_rate": 4.1427978414279786e-05, "loss": 0.584, "step": 3970 }, { "epoch": 2.397826743133112, "grad_norm": 0.1416015625, "learning_rate": 4.138646741386468e-05, "loss": 0.5897, "step": 3971 }, { "epoch": 2.3984304255961364, "grad_norm": 0.1484375, "learning_rate": 4.1344956413449566e-05, "loss": 0.7358, "step": 3972 }, { "epoch": 2.399034108059161, "grad_norm": 0.177734375, "learning_rate": 4.130344541303446e-05, "loss": 0.5912, "step": 3973 }, { "epoch": 2.3996377905221853, "grad_norm": 0.158203125, "learning_rate": 4.1261934412619346e-05, "loss": 0.8172, "step": 3974 }, { "epoch": 2.40024147298521, "grad_norm": 0.1376953125, "learning_rate": 4.122042341220423e-05, "loss": 0.8762, "step": 3975 }, { "epoch": 2.400845155448234, "grad_norm": 0.1484375, "learning_rate": 4.1178912411789126e-05, "loss": 0.6193, "step": 3976 }, { "epoch": 2.401448837911259, "grad_norm": 0.1591796875, "learning_rate": 4.113740141137402e-05, "loss": 0.6507, "step": 3977 }, { "epoch": 2.402052520374283, "grad_norm": 0.158203125, "learning_rate": 4.1095890410958905e-05, "loss": 0.6294, "step": 3978 }, { "epoch": 2.4026562028373077, "grad_norm": 0.1669921875, "learning_rate": 4.10543794105438e-05, "loss": 0.6852, "step": 3979 }, { "epoch": 2.403259885300332, "grad_norm": 0.189453125, "learning_rate": 4.1012868410128685e-05, "loss": 0.6219, "step": 3980 }, { "epoch": 2.4038635677633566, "grad_norm": 0.138671875, "learning_rate": 4.097135740971358e-05, "loss": 0.5864, "step": 3981 }, { "epoch": 2.404467250226381, "grad_norm": 0.1572265625, "learning_rate": 4.0929846409298465e-05, "loss": 0.5816, "step": 3982 }, { "epoch": 2.4050709326894055, "grad_norm": 0.1689453125, "learning_rate": 4.088833540888336e-05, "loss": 0.7003, "step": 3983 }, { "epoch": 2.4056746151524298, "grad_norm": 0.140625, "learning_rate": 4.084682440846825e-05, "loss": 0.6031, "step": 3984 }, { "epoch": 2.4062782976154544, "grad_norm": 0.1435546875, "learning_rate": 4.080531340805314e-05, "loss": 0.9704, "step": 3985 }, { "epoch": 2.4068819800784786, "grad_norm": 0.142578125, "learning_rate": 4.0763802407638024e-05, "loss": 0.5736, "step": 3986 }, { "epoch": 2.4074856625415033, "grad_norm": 0.1455078125, "learning_rate": 4.072229140722292e-05, "loss": 0.8293, "step": 3987 }, { "epoch": 2.4080893450045275, "grad_norm": 0.1494140625, "learning_rate": 4.0680780406807804e-05, "loss": 0.5689, "step": 3988 }, { "epoch": 2.408693027467552, "grad_norm": 0.1396484375, "learning_rate": 4.063926940639269e-05, "loss": 0.6113, "step": 3989 }, { "epoch": 2.4092967099305764, "grad_norm": 0.1298828125, "learning_rate": 4.059775840597759e-05, "loss": 0.5141, "step": 3990 }, { "epoch": 2.409900392393601, "grad_norm": 0.1435546875, "learning_rate": 4.055624740556248e-05, "loss": 0.8989, "step": 3991 }, { "epoch": 2.4105040748566253, "grad_norm": 0.140625, "learning_rate": 4.0514736405147363e-05, "loss": 0.5546, "step": 3992 }, { "epoch": 2.41110775731965, "grad_norm": 0.1435546875, "learning_rate": 4.047322540473226e-05, "loss": 0.5768, "step": 3993 }, { "epoch": 2.411711439782674, "grad_norm": 0.373046875, "learning_rate": 4.043171440431714e-05, "loss": 0.5518, "step": 3994 }, { "epoch": 2.412315122245699, "grad_norm": 0.1328125, "learning_rate": 4.0390203403902036e-05, "loss": 0.5835, "step": 3995 }, { "epoch": 2.412918804708723, "grad_norm": 0.14453125, "learning_rate": 4.034869240348692e-05, "loss": 0.8638, "step": 3996 }, { "epoch": 2.4135224871717478, "grad_norm": 0.138671875, "learning_rate": 4.0307181403071816e-05, "loss": 0.5474, "step": 3997 }, { "epoch": 2.414126169634772, "grad_norm": 0.13671875, "learning_rate": 4.026567040265671e-05, "loss": 0.5179, "step": 3998 }, { "epoch": 2.4147298520977967, "grad_norm": 0.150390625, "learning_rate": 4.0224159402241596e-05, "loss": 0.5755, "step": 3999 }, { "epoch": 2.415333534560821, "grad_norm": 0.15234375, "learning_rate": 4.018264840182648e-05, "loss": 0.4952, "step": 4000 }, { "epoch": 2.415333534560821, "eval_loss": 0.6154825687408447, "eval_runtime": 1059.7325, "eval_samples_per_second": 2.633, "eval_steps_per_second": 0.329, "step": 4000 }, { "epoch": 2.4159372170238456, "grad_norm": 0.1552734375, "learning_rate": 4.0141137401411376e-05, "loss": 0.5103, "step": 4001 }, { "epoch": 2.41654089948687, "grad_norm": 0.1650390625, "learning_rate": 4.009962640099626e-05, "loss": 0.5184, "step": 4002 }, { "epoch": 2.4171445819498945, "grad_norm": 0.1640625, "learning_rate": 4.0058115400581155e-05, "loss": 0.4408, "step": 4003 }, { "epoch": 2.4177482644129187, "grad_norm": 0.1591796875, "learning_rate": 4.001660440016605e-05, "loss": 0.531, "step": 4004 }, { "epoch": 2.4183519468759433, "grad_norm": 0.1845703125, "learning_rate": 3.9975093399750935e-05, "loss": 0.5268, "step": 4005 }, { "epoch": 2.4189556293389676, "grad_norm": 0.1748046875, "learning_rate": 3.993358239933583e-05, "loss": 0.4319, "step": 4006 }, { "epoch": 2.4195593118019922, "grad_norm": 0.1748046875, "learning_rate": 3.9892071398920715e-05, "loss": 0.3856, "step": 4007 }, { "epoch": 2.4201629942650165, "grad_norm": 0.1982421875, "learning_rate": 3.98505603985056e-05, "loss": 0.4184, "step": 4008 }, { "epoch": 2.420766676728041, "grad_norm": 0.2099609375, "learning_rate": 3.9809049398090495e-05, "loss": 0.4034, "step": 4009 }, { "epoch": 2.4213703591910654, "grad_norm": 0.193359375, "learning_rate": 3.976753839767539e-05, "loss": 0.2712, "step": 4010 }, { "epoch": 2.42197404165409, "grad_norm": 0.1875, "learning_rate": 3.9726027397260274e-05, "loss": 0.2546, "step": 4011 }, { "epoch": 2.4225777241171143, "grad_norm": 0.185546875, "learning_rate": 3.968451639684517e-05, "loss": 0.2069, "step": 4012 }, { "epoch": 2.423181406580139, "grad_norm": 0.16015625, "learning_rate": 3.9643005396430054e-05, "loss": 0.6653, "step": 4013 }, { "epoch": 2.423785089043163, "grad_norm": 0.1484375, "learning_rate": 3.960149439601494e-05, "loss": 0.5395, "step": 4014 }, { "epoch": 2.424388771506188, "grad_norm": 0.1357421875, "learning_rate": 3.9559983395599834e-05, "loss": 0.6241, "step": 4015 }, { "epoch": 2.424992453969212, "grad_norm": 0.1357421875, "learning_rate": 3.951847239518473e-05, "loss": 0.628, "step": 4016 }, { "epoch": 2.4255961364322367, "grad_norm": 0.1640625, "learning_rate": 3.947696139476962e-05, "loss": 0.6188, "step": 4017 }, { "epoch": 2.426199818895261, "grad_norm": 0.15234375, "learning_rate": 3.943545039435451e-05, "loss": 0.7127, "step": 4018 }, { "epoch": 2.4268035013582856, "grad_norm": 0.154296875, "learning_rate": 3.939393939393939e-05, "loss": 0.8119, "step": 4019 }, { "epoch": 2.42740718382131, "grad_norm": 0.1484375, "learning_rate": 3.935242839352429e-05, "loss": 0.5477, "step": 4020 }, { "epoch": 2.4280108662843345, "grad_norm": 0.138671875, "learning_rate": 3.931091739310917e-05, "loss": 0.557, "step": 4021 }, { "epoch": 2.4286145487473587, "grad_norm": 0.1435546875, "learning_rate": 3.9269406392694066e-05, "loss": 0.8675, "step": 4022 }, { "epoch": 2.4292182312103834, "grad_norm": 0.1328125, "learning_rate": 3.922789539227896e-05, "loss": 0.7214, "step": 4023 }, { "epoch": 2.4298219136734076, "grad_norm": 0.14453125, "learning_rate": 3.9186384391863846e-05, "loss": 0.6073, "step": 4024 }, { "epoch": 2.4304255961364323, "grad_norm": 0.1376953125, "learning_rate": 3.914487339144873e-05, "loss": 0.6403, "step": 4025 }, { "epoch": 2.4310292785994565, "grad_norm": 0.146484375, "learning_rate": 3.9103362391033626e-05, "loss": 0.7011, "step": 4026 }, { "epoch": 2.431632961062481, "grad_norm": 0.1416015625, "learning_rate": 3.906185139061851e-05, "loss": 0.6148, "step": 4027 }, { "epoch": 2.4322366435255054, "grad_norm": 0.1533203125, "learning_rate": 3.9020340390203406e-05, "loss": 0.743, "step": 4028 }, { "epoch": 2.43284032598853, "grad_norm": 0.1494140625, "learning_rate": 3.89788293897883e-05, "loss": 0.6212, "step": 4029 }, { "epoch": 2.4334440084515547, "grad_norm": 0.1513671875, "learning_rate": 3.8937318389373185e-05, "loss": 0.5285, "step": 4030 }, { "epoch": 2.434047690914579, "grad_norm": 0.140625, "learning_rate": 3.889580738895808e-05, "loss": 0.7145, "step": 4031 }, { "epoch": 2.434651373377603, "grad_norm": 0.1865234375, "learning_rate": 3.8854296388542965e-05, "loss": 0.6085, "step": 4032 }, { "epoch": 2.435255055840628, "grad_norm": 0.1474609375, "learning_rate": 3.881278538812785e-05, "loss": 0.8197, "step": 4033 }, { "epoch": 2.4358587383036525, "grad_norm": 0.142578125, "learning_rate": 3.8771274387712745e-05, "loss": 0.5681, "step": 4034 }, { "epoch": 2.4364624207666767, "grad_norm": 0.1640625, "learning_rate": 3.872976338729764e-05, "loss": 0.6784, "step": 4035 }, { "epoch": 2.437066103229701, "grad_norm": 0.15234375, "learning_rate": 3.8688252386882525e-05, "loss": 1.1645, "step": 4036 }, { "epoch": 2.4376697856927256, "grad_norm": 0.15234375, "learning_rate": 3.864674138646742e-05, "loss": 0.8407, "step": 4037 }, { "epoch": 2.4382734681557503, "grad_norm": 0.17578125, "learning_rate": 3.8605230386052304e-05, "loss": 0.5816, "step": 4038 }, { "epoch": 2.4388771506187745, "grad_norm": 0.208984375, "learning_rate": 3.85637193856372e-05, "loss": 0.9043, "step": 4039 }, { "epoch": 2.4394808330817987, "grad_norm": 0.1435546875, "learning_rate": 3.8522208385222084e-05, "loss": 0.5051, "step": 4040 }, { "epoch": 2.4400845155448234, "grad_norm": 0.1650390625, "learning_rate": 3.848069738480698e-05, "loss": 0.5917, "step": 4041 }, { "epoch": 2.440688198007848, "grad_norm": 0.15625, "learning_rate": 3.843918638439187e-05, "loss": 0.6743, "step": 4042 }, { "epoch": 2.4412918804708723, "grad_norm": 0.1455078125, "learning_rate": 3.839767538397676e-05, "loss": 0.6768, "step": 4043 }, { "epoch": 2.4418955629338965, "grad_norm": 0.1416015625, "learning_rate": 3.8356164383561644e-05, "loss": 0.7161, "step": 4044 }, { "epoch": 2.442499245396921, "grad_norm": 0.1416015625, "learning_rate": 3.831465338314654e-05, "loss": 0.5666, "step": 4045 }, { "epoch": 2.443102927859946, "grad_norm": 0.1435546875, "learning_rate": 3.827314238273142e-05, "loss": 0.5713, "step": 4046 }, { "epoch": 2.44370661032297, "grad_norm": 0.142578125, "learning_rate": 3.823163138231631e-05, "loss": 0.5426, "step": 4047 }, { "epoch": 2.4443102927859943, "grad_norm": 0.1533203125, "learning_rate": 3.819012038190121e-05, "loss": 0.5295, "step": 4048 }, { "epoch": 2.444913975249019, "grad_norm": 0.158203125, "learning_rate": 3.8148609381486096e-05, "loss": 0.5478, "step": 4049 }, { "epoch": 2.4455176577120437, "grad_norm": 0.1748046875, "learning_rate": 3.810709838107099e-05, "loss": 0.5445, "step": 4050 }, { "epoch": 2.446121340175068, "grad_norm": 0.185546875, "learning_rate": 3.8065587380655876e-05, "loss": 0.5635, "step": 4051 }, { "epoch": 2.4467250226380926, "grad_norm": 0.1640625, "learning_rate": 3.802407638024076e-05, "loss": 0.5435, "step": 4052 }, { "epoch": 2.4473287051011168, "grad_norm": 0.1572265625, "learning_rate": 3.7982565379825656e-05, "loss": 0.4776, "step": 4053 }, { "epoch": 2.4479323875641414, "grad_norm": 0.171875, "learning_rate": 3.794105437941054e-05, "loss": 0.4868, "step": 4054 }, { "epoch": 2.4485360700271657, "grad_norm": 0.1787109375, "learning_rate": 3.7899543378995436e-05, "loss": 0.4523, "step": 4055 }, { "epoch": 2.4491397524901903, "grad_norm": 0.1962890625, "learning_rate": 3.785803237858033e-05, "loss": 0.3794, "step": 4056 }, { "epoch": 2.4497434349532146, "grad_norm": 0.193359375, "learning_rate": 3.7816521378165215e-05, "loss": 0.4109, "step": 4057 }, { "epoch": 2.4503471174162392, "grad_norm": 0.2001953125, "learning_rate": 3.77750103777501e-05, "loss": 0.3978, "step": 4058 }, { "epoch": 2.4509507998792635, "grad_norm": 0.189453125, "learning_rate": 3.7733499377334995e-05, "loss": 0.3167, "step": 4059 }, { "epoch": 2.451554482342288, "grad_norm": 0.2041015625, "learning_rate": 3.769198837691988e-05, "loss": 0.3251, "step": 4060 }, { "epoch": 2.4521581648053123, "grad_norm": 0.189453125, "learning_rate": 3.7650477376504775e-05, "loss": 0.241, "step": 4061 }, { "epoch": 2.452761847268337, "grad_norm": 0.20703125, "learning_rate": 3.760896637608967e-05, "loss": 0.2344, "step": 4062 }, { "epoch": 2.4533655297313612, "grad_norm": 0.1435546875, "learning_rate": 3.7567455375674554e-05, "loss": 0.6431, "step": 4063 }, { "epoch": 2.453969212194386, "grad_norm": 0.14453125, "learning_rate": 3.752594437525945e-05, "loss": 0.5608, "step": 4064 }, { "epoch": 2.45457289465741, "grad_norm": 0.158203125, "learning_rate": 3.7484433374844334e-05, "loss": 0.6186, "step": 4065 }, { "epoch": 2.455176577120435, "grad_norm": 0.1337890625, "learning_rate": 3.744292237442922e-05, "loss": 0.5959, "step": 4066 }, { "epoch": 2.455780259583459, "grad_norm": 0.1552734375, "learning_rate": 3.7401411374014114e-05, "loss": 0.5077, "step": 4067 }, { "epoch": 2.4563839420464837, "grad_norm": 0.1513671875, "learning_rate": 3.735990037359901e-05, "loss": 0.578, "step": 4068 }, { "epoch": 2.456987624509508, "grad_norm": 0.154296875, "learning_rate": 3.7318389373183894e-05, "loss": 0.6015, "step": 4069 }, { "epoch": 2.4575913069725326, "grad_norm": 0.146484375, "learning_rate": 3.727687837276879e-05, "loss": 0.6267, "step": 4070 }, { "epoch": 2.458194989435557, "grad_norm": 0.1494140625, "learning_rate": 3.7235367372353673e-05, "loss": 0.6992, "step": 4071 }, { "epoch": 2.4587986718985815, "grad_norm": 0.15234375, "learning_rate": 3.719385637193857e-05, "loss": 0.6082, "step": 4072 }, { "epoch": 2.4594023543616057, "grad_norm": 0.150390625, "learning_rate": 3.715234537152345e-05, "loss": 0.6808, "step": 4073 }, { "epoch": 2.4600060368246304, "grad_norm": 0.154296875, "learning_rate": 3.7110834371108346e-05, "loss": 0.6117, "step": 4074 }, { "epoch": 2.4606097192876546, "grad_norm": 0.146484375, "learning_rate": 3.706932337069324e-05, "loss": 0.9315, "step": 4075 }, { "epoch": 2.4612134017506793, "grad_norm": 0.1513671875, "learning_rate": 3.7027812370278126e-05, "loss": 0.647, "step": 4076 }, { "epoch": 2.4618170842137035, "grad_norm": 0.1611328125, "learning_rate": 3.698630136986301e-05, "loss": 0.7295, "step": 4077 }, { "epoch": 2.462420766676728, "grad_norm": 0.1494140625, "learning_rate": 3.6944790369447906e-05, "loss": 0.6171, "step": 4078 }, { "epoch": 2.4630244491397524, "grad_norm": 0.1474609375, "learning_rate": 3.690327936903279e-05, "loss": 0.698, "step": 4079 }, { "epoch": 2.463628131602777, "grad_norm": 0.1435546875, "learning_rate": 3.6861768368617686e-05, "loss": 0.5728, "step": 4080 }, { "epoch": 2.4642318140658013, "grad_norm": 0.13671875, "learning_rate": 3.682025736820258e-05, "loss": 0.6312, "step": 4081 }, { "epoch": 2.464835496528826, "grad_norm": 0.1533203125, "learning_rate": 3.6778746367787465e-05, "loss": 0.6361, "step": 4082 }, { "epoch": 2.46543917899185, "grad_norm": 0.138671875, "learning_rate": 3.673723536737236e-05, "loss": 0.6073, "step": 4083 }, { "epoch": 2.466042861454875, "grad_norm": 0.16796875, "learning_rate": 3.6695724366957245e-05, "loss": 0.5762, "step": 4084 }, { "epoch": 2.466646543917899, "grad_norm": 0.146484375, "learning_rate": 3.665421336654213e-05, "loss": 0.704, "step": 4085 }, { "epoch": 2.4672502263809237, "grad_norm": 0.2255859375, "learning_rate": 3.6612702366127025e-05, "loss": 0.5739, "step": 4086 }, { "epoch": 2.467853908843948, "grad_norm": 0.1640625, "learning_rate": 3.657119136571192e-05, "loss": 0.6475, "step": 4087 }, { "epoch": 2.4684575913069726, "grad_norm": 0.1591796875, "learning_rate": 3.6529680365296805e-05, "loss": 0.6407, "step": 4088 }, { "epoch": 2.469061273769997, "grad_norm": 0.1357421875, "learning_rate": 3.64881693648817e-05, "loss": 0.6102, "step": 4089 }, { "epoch": 2.4696649562330215, "grad_norm": 0.138671875, "learning_rate": 3.6446658364466584e-05, "loss": 0.5091, "step": 4090 }, { "epoch": 2.4702686386960457, "grad_norm": 0.146484375, "learning_rate": 3.640514736405147e-05, "loss": 0.5409, "step": 4091 }, { "epoch": 2.4708723211590704, "grad_norm": 0.142578125, "learning_rate": 3.6363636363636364e-05, "loss": 0.7817, "step": 4092 }, { "epoch": 2.4714760036220946, "grad_norm": 0.1513671875, "learning_rate": 3.632212536322126e-05, "loss": 0.604, "step": 4093 }, { "epoch": 2.4720796860851193, "grad_norm": 0.1396484375, "learning_rate": 3.628061436280615e-05, "loss": 0.6486, "step": 4094 }, { "epoch": 2.4726833685481435, "grad_norm": 0.1552734375, "learning_rate": 3.623910336239104e-05, "loss": 0.5922, "step": 4095 }, { "epoch": 2.473287051011168, "grad_norm": 0.1494140625, "learning_rate": 3.6197592361975924e-05, "loss": 0.5334, "step": 4096 }, { "epoch": 2.4738907334741924, "grad_norm": 0.166015625, "learning_rate": 3.615608136156082e-05, "loss": 0.5537, "step": 4097 }, { "epoch": 2.474494415937217, "grad_norm": 0.146484375, "learning_rate": 3.61145703611457e-05, "loss": 0.5546, "step": 4098 }, { "epoch": 2.4750980984002413, "grad_norm": 0.1572265625, "learning_rate": 3.60730593607306e-05, "loss": 0.4821, "step": 4099 }, { "epoch": 2.475701780863266, "grad_norm": 0.1650390625, "learning_rate": 3.603154836031549e-05, "loss": 0.5078, "step": 4100 }, { "epoch": 2.47630546332629, "grad_norm": 0.1552734375, "learning_rate": 3.5990037359900376e-05, "loss": 0.4714, "step": 4101 }, { "epoch": 2.476909145789315, "grad_norm": 0.17578125, "learning_rate": 3.594852635948526e-05, "loss": 0.5351, "step": 4102 }, { "epoch": 2.477512828252339, "grad_norm": 0.169921875, "learning_rate": 3.5907015359070156e-05, "loss": 0.5389, "step": 4103 }, { "epoch": 2.4781165107153638, "grad_norm": 0.1796875, "learning_rate": 3.586550435865504e-05, "loss": 0.5224, "step": 4104 }, { "epoch": 2.478720193178388, "grad_norm": 0.18359375, "learning_rate": 3.5823993358239936e-05, "loss": 0.4301, "step": 4105 }, { "epoch": 2.4793238756414127, "grad_norm": 0.181640625, "learning_rate": 3.578248235782483e-05, "loss": 0.4746, "step": 4106 }, { "epoch": 2.479927558104437, "grad_norm": 0.19140625, "learning_rate": 3.5740971357409716e-05, "loss": 0.3572, "step": 4107 }, { "epoch": 2.4805312405674615, "grad_norm": 0.2041015625, "learning_rate": 3.569946035699461e-05, "loss": 0.4036, "step": 4108 }, { "epoch": 2.4811349230304858, "grad_norm": 0.21484375, "learning_rate": 3.5657949356579495e-05, "loss": 0.3364, "step": 4109 }, { "epoch": 2.4817386054935104, "grad_norm": 0.1962890625, "learning_rate": 3.561643835616438e-05, "loss": 0.3264, "step": 4110 }, { "epoch": 2.482342287956535, "grad_norm": 0.2119140625, "learning_rate": 3.5574927355749275e-05, "loss": 0.2525, "step": 4111 }, { "epoch": 2.4829459704195593, "grad_norm": 0.353515625, "learning_rate": 3.553341635533416e-05, "loss": 0.2355, "step": 4112 }, { "epoch": 2.4835496528825836, "grad_norm": 0.13671875, "learning_rate": 3.5491905354919055e-05, "loss": 0.6404, "step": 4113 }, { "epoch": 2.4841533353456082, "grad_norm": 0.15625, "learning_rate": 3.545039435450395e-05, "loss": 0.6651, "step": 4114 }, { "epoch": 2.484757017808633, "grad_norm": 0.146484375, "learning_rate": 3.5408883354088835e-05, "loss": 0.685, "step": 4115 }, { "epoch": 2.485360700271657, "grad_norm": 0.1357421875, "learning_rate": 3.536737235367373e-05, "loss": 0.5109, "step": 4116 }, { "epoch": 2.4859643827346813, "grad_norm": 0.140625, "learning_rate": 3.5325861353258614e-05, "loss": 0.7178, "step": 4117 }, { "epoch": 2.486568065197706, "grad_norm": 0.1513671875, "learning_rate": 3.52843503528435e-05, "loss": 0.5863, "step": 4118 }, { "epoch": 2.4871717476607307, "grad_norm": 0.146484375, "learning_rate": 3.5242839352428394e-05, "loss": 0.6693, "step": 4119 }, { "epoch": 2.487775430123755, "grad_norm": 0.1513671875, "learning_rate": 3.520132835201329e-05, "loss": 0.6018, "step": 4120 }, { "epoch": 2.488379112586779, "grad_norm": 0.1494140625, "learning_rate": 3.5159817351598174e-05, "loss": 0.6141, "step": 4121 }, { "epoch": 2.488982795049804, "grad_norm": 0.1552734375, "learning_rate": 3.511830635118307e-05, "loss": 0.7566, "step": 4122 }, { "epoch": 2.4895864775128285, "grad_norm": 0.1513671875, "learning_rate": 3.5076795350767954e-05, "loss": 0.6085, "step": 4123 }, { "epoch": 2.4901901599758527, "grad_norm": 0.14453125, "learning_rate": 3.503528435035284e-05, "loss": 0.6169, "step": 4124 }, { "epoch": 2.490793842438877, "grad_norm": 0.1494140625, "learning_rate": 3.499377334993773e-05, "loss": 1.1005, "step": 4125 }, { "epoch": 2.4913975249019016, "grad_norm": 0.1611328125, "learning_rate": 3.4952262349522627e-05, "loss": 0.6369, "step": 4126 }, { "epoch": 2.4920012073649263, "grad_norm": 0.140625, "learning_rate": 3.491075134910752e-05, "loss": 0.7739, "step": 4127 }, { "epoch": 2.4926048898279505, "grad_norm": 0.1416015625, "learning_rate": 3.4869240348692406e-05, "loss": 0.6243, "step": 4128 }, { "epoch": 2.4932085722909747, "grad_norm": 0.158203125, "learning_rate": 3.482772934827729e-05, "loss": 0.6468, "step": 4129 }, { "epoch": 2.4938122547539994, "grad_norm": 0.1474609375, "learning_rate": 3.4786218347862186e-05, "loss": 0.6467, "step": 4130 }, { "epoch": 2.494415937217024, "grad_norm": 0.1474609375, "learning_rate": 3.474470734744707e-05, "loss": 0.6936, "step": 4131 }, { "epoch": 2.4950196196800483, "grad_norm": 0.138671875, "learning_rate": 3.4703196347031966e-05, "loss": 0.6583, "step": 4132 }, { "epoch": 2.495623302143073, "grad_norm": 0.13671875, "learning_rate": 3.466168534661686e-05, "loss": 0.55, "step": 4133 }, { "epoch": 2.496226984606097, "grad_norm": 0.14453125, "learning_rate": 3.4620174346201746e-05, "loss": 0.6066, "step": 4134 }, { "epoch": 2.496830667069122, "grad_norm": 0.1328125, "learning_rate": 3.457866334578663e-05, "loss": 0.6148, "step": 4135 }, { "epoch": 2.497434349532146, "grad_norm": 0.1455078125, "learning_rate": 3.4537152345371525e-05, "loss": 0.6258, "step": 4136 }, { "epoch": 2.4980380319951707, "grad_norm": 0.142578125, "learning_rate": 3.449564134495641e-05, "loss": 0.6906, "step": 4137 }, { "epoch": 2.498641714458195, "grad_norm": 0.1435546875, "learning_rate": 3.4454130344541305e-05, "loss": 0.601, "step": 4138 }, { "epoch": 2.4992453969212196, "grad_norm": 0.1435546875, "learning_rate": 3.44126193441262e-05, "loss": 0.6109, "step": 4139 }, { "epoch": 2.499849079384244, "grad_norm": 0.138671875, "learning_rate": 3.4371108343711085e-05, "loss": 0.6139, "step": 4140 }, { "epoch": 2.5004527618472685, "grad_norm": 0.1357421875, "learning_rate": 3.432959734329598e-05, "loss": 0.4505, "step": 4141 }, { "epoch": 2.5010564443102927, "grad_norm": 0.1376953125, "learning_rate": 3.4288086342880864e-05, "loss": 0.5743, "step": 4142 }, { "epoch": 2.5016601267733174, "grad_norm": 0.138671875, "learning_rate": 3.424657534246575e-05, "loss": 0.6745, "step": 4143 }, { "epoch": 2.5022638092363416, "grad_norm": 0.1396484375, "learning_rate": 3.4205064342050644e-05, "loss": 0.7918, "step": 4144 }, { "epoch": 2.5028674916993663, "grad_norm": 0.1533203125, "learning_rate": 3.416355334163554e-05, "loss": 0.5862, "step": 4145 }, { "epoch": 2.5034711741623905, "grad_norm": 0.1435546875, "learning_rate": 3.4122042341220424e-05, "loss": 0.6077, "step": 4146 }, { "epoch": 2.504074856625415, "grad_norm": 0.142578125, "learning_rate": 3.408053134080532e-05, "loss": 0.5726, "step": 4147 }, { "epoch": 2.5046785390884394, "grad_norm": 0.1435546875, "learning_rate": 3.4039020340390204e-05, "loss": 0.5069, "step": 4148 }, { "epoch": 2.505282221551464, "grad_norm": 0.1474609375, "learning_rate": 3.39975093399751e-05, "loss": 0.5979, "step": 4149 }, { "epoch": 2.5058859040144883, "grad_norm": 0.1552734375, "learning_rate": 3.3955998339559983e-05, "loss": 0.5018, "step": 4150 }, { "epoch": 2.506489586477513, "grad_norm": 0.158203125, "learning_rate": 3.391448733914488e-05, "loss": 0.4882, "step": 4151 }, { "epoch": 2.507093268940537, "grad_norm": 0.158203125, "learning_rate": 3.387297633872977e-05, "loss": 0.5086, "step": 4152 }, { "epoch": 2.507696951403562, "grad_norm": 0.1669921875, "learning_rate": 3.3831465338314656e-05, "loss": 0.523, "step": 4153 }, { "epoch": 2.508300633866586, "grad_norm": 0.1923828125, "learning_rate": 3.378995433789954e-05, "loss": 0.4776, "step": 4154 }, { "epoch": 2.5089043163296108, "grad_norm": 0.1796875, "learning_rate": 3.3748443337484436e-05, "loss": 0.5318, "step": 4155 }, { "epoch": 2.509507998792635, "grad_norm": 0.1875, "learning_rate": 3.370693233706932e-05, "loss": 0.4694, "step": 4156 }, { "epoch": 2.5101116812556596, "grad_norm": 0.1865234375, "learning_rate": 3.3665421336654216e-05, "loss": 0.4135, "step": 4157 }, { "epoch": 2.510715363718684, "grad_norm": 0.18359375, "learning_rate": 3.362391033623911e-05, "loss": 0.3618, "step": 4158 }, { "epoch": 2.5113190461817085, "grad_norm": 0.20703125, "learning_rate": 3.3582399335823996e-05, "loss": 0.412, "step": 4159 }, { "epoch": 2.5119227286447328, "grad_norm": 0.2080078125, "learning_rate": 3.354088833540889e-05, "loss": 0.3636, "step": 4160 }, { "epoch": 2.5125264111077574, "grad_norm": 0.2021484375, "learning_rate": 3.3499377334993775e-05, "loss": 0.2725, "step": 4161 }, { "epoch": 2.5131300935707817, "grad_norm": 0.2177734375, "learning_rate": 3.345786633457866e-05, "loss": 0.2317, "step": 4162 }, { "epoch": 2.5137337760338063, "grad_norm": 0.1396484375, "learning_rate": 3.3416355334163555e-05, "loss": 0.5467, "step": 4163 }, { "epoch": 2.5143374584968305, "grad_norm": 0.1455078125, "learning_rate": 3.337484433374845e-05, "loss": 0.6671, "step": 4164 }, { "epoch": 2.514941140959855, "grad_norm": 0.1533203125, "learning_rate": 3.3333333333333335e-05, "loss": 0.653, "step": 4165 }, { "epoch": 2.5155448234228794, "grad_norm": 0.1533203125, "learning_rate": 3.329182233291823e-05, "loss": 0.6325, "step": 4166 }, { "epoch": 2.516148505885904, "grad_norm": 0.1533203125, "learning_rate": 3.3250311332503115e-05, "loss": 0.593, "step": 4167 }, { "epoch": 2.5167521883489283, "grad_norm": 0.162109375, "learning_rate": 3.3208800332088e-05, "loss": 0.6592, "step": 4168 }, { "epoch": 2.517355870811953, "grad_norm": 0.15234375, "learning_rate": 3.3167289331672894e-05, "loss": 0.6527, "step": 4169 }, { "epoch": 2.5179595532749772, "grad_norm": 0.1474609375, "learning_rate": 3.312577833125778e-05, "loss": 0.6054, "step": 4170 }, { "epoch": 2.518563235738002, "grad_norm": 0.150390625, "learning_rate": 3.308426733084268e-05, "loss": 0.6671, "step": 4171 }, { "epoch": 2.519166918201026, "grad_norm": 0.1318359375, "learning_rate": 3.304275633042757e-05, "loss": 0.527, "step": 4172 }, { "epoch": 2.519770600664051, "grad_norm": 0.1474609375, "learning_rate": 3.3001245330012454e-05, "loss": 0.5836, "step": 4173 }, { "epoch": 2.520374283127075, "grad_norm": 0.1416015625, "learning_rate": 3.295973432959735e-05, "loss": 0.5882, "step": 4174 }, { "epoch": 2.5209779655900997, "grad_norm": 0.14453125, "learning_rate": 3.2918223329182234e-05, "loss": 0.6148, "step": 4175 }, { "epoch": 2.521581648053124, "grad_norm": 0.16796875, "learning_rate": 3.287671232876712e-05, "loss": 0.6586, "step": 4176 }, { "epoch": 2.5221853305161486, "grad_norm": 0.13671875, "learning_rate": 3.2835201328352013e-05, "loss": 0.496, "step": 4177 }, { "epoch": 2.522789012979173, "grad_norm": 0.1396484375, "learning_rate": 3.279369032793691e-05, "loss": 0.6182, "step": 4178 }, { "epoch": 2.5233926954421975, "grad_norm": 0.15625, "learning_rate": 3.275217932752179e-05, "loss": 0.5696, "step": 4179 }, { "epoch": 2.5239963779052217, "grad_norm": 0.1474609375, "learning_rate": 3.2710668327106686e-05, "loss": 0.643, "step": 4180 }, { "epoch": 2.5246000603682464, "grad_norm": 0.1474609375, "learning_rate": 3.266915732669157e-05, "loss": 0.6102, "step": 4181 }, { "epoch": 2.5252037428312706, "grad_norm": 0.15234375, "learning_rate": 3.2627646326276466e-05, "loss": 0.5995, "step": 4182 }, { "epoch": 2.5258074252942952, "grad_norm": 0.1630859375, "learning_rate": 3.258613532586135e-05, "loss": 0.5864, "step": 4183 }, { "epoch": 2.52641110775732, "grad_norm": 0.1474609375, "learning_rate": 3.2544624325446246e-05, "loss": 0.5564, "step": 4184 }, { "epoch": 2.527014790220344, "grad_norm": 0.15234375, "learning_rate": 3.250311332503114e-05, "loss": 0.5808, "step": 4185 }, { "epoch": 2.5276184726833684, "grad_norm": 0.2119140625, "learning_rate": 3.2461602324616026e-05, "loss": 0.5563, "step": 4186 }, { "epoch": 2.528222155146393, "grad_norm": 0.140625, "learning_rate": 3.242009132420091e-05, "loss": 0.5779, "step": 4187 }, { "epoch": 2.5288258376094177, "grad_norm": 0.14453125, "learning_rate": 3.2378580323785805e-05, "loss": 0.6378, "step": 4188 }, { "epoch": 2.529429520072442, "grad_norm": 0.1484375, "learning_rate": 3.233706932337069e-05, "loss": 0.5168, "step": 4189 }, { "epoch": 2.530033202535466, "grad_norm": 0.146484375, "learning_rate": 3.2295558322955585e-05, "loss": 0.6191, "step": 4190 }, { "epoch": 2.530636884998491, "grad_norm": 0.1474609375, "learning_rate": 3.225404732254048e-05, "loss": 0.6251, "step": 4191 }, { "epoch": 2.5312405674615155, "grad_norm": 0.1533203125, "learning_rate": 3.2212536322125365e-05, "loss": 0.592, "step": 4192 }, { "epoch": 2.5318442499245397, "grad_norm": 0.1474609375, "learning_rate": 3.217102532171026e-05, "loss": 0.5686, "step": 4193 }, { "epoch": 2.532447932387564, "grad_norm": 0.138671875, "learning_rate": 3.2129514321295145e-05, "loss": 0.7272, "step": 4194 }, { "epoch": 2.5330516148505886, "grad_norm": 0.1328125, "learning_rate": 3.208800332088003e-05, "loss": 0.684, "step": 4195 }, { "epoch": 2.5336552973136133, "grad_norm": 0.1455078125, "learning_rate": 3.2046492320464924e-05, "loss": 0.5861, "step": 4196 }, { "epoch": 2.5342589797766375, "grad_norm": 0.1484375, "learning_rate": 3.200498132004982e-05, "loss": 0.555, "step": 4197 }, { "epoch": 2.5348626622396617, "grad_norm": 0.154296875, "learning_rate": 3.1963470319634704e-05, "loss": 0.5677, "step": 4198 }, { "epoch": 2.5354663447026864, "grad_norm": 0.1533203125, "learning_rate": 3.19219593192196e-05, "loss": 0.5287, "step": 4199 }, { "epoch": 2.536070027165711, "grad_norm": 0.1640625, "learning_rate": 3.1880448318804484e-05, "loss": 0.5441, "step": 4200 }, { "epoch": 2.5366737096287353, "grad_norm": 0.16015625, "learning_rate": 3.183893731838937e-05, "loss": 0.4927, "step": 4201 }, { "epoch": 2.5372773920917595, "grad_norm": 0.1572265625, "learning_rate": 3.1797426317974264e-05, "loss": 0.4615, "step": 4202 }, { "epoch": 2.537881074554784, "grad_norm": 0.16015625, "learning_rate": 3.175591531755916e-05, "loss": 0.4678, "step": 4203 }, { "epoch": 2.538484757017809, "grad_norm": 0.1689453125, "learning_rate": 3.171440431714404e-05, "loss": 0.48, "step": 4204 }, { "epoch": 2.539088439480833, "grad_norm": 0.1796875, "learning_rate": 3.1672893316728937e-05, "loss": 0.5278, "step": 4205 }, { "epoch": 2.5396921219438573, "grad_norm": 0.1826171875, "learning_rate": 3.163138231631382e-05, "loss": 0.4566, "step": 4206 }, { "epoch": 2.540295804406882, "grad_norm": 0.1796875, "learning_rate": 3.1589871315898716e-05, "loss": 0.4041, "step": 4207 }, { "epoch": 2.5408994868699066, "grad_norm": 0.2021484375, "learning_rate": 3.15483603154836e-05, "loss": 0.4396, "step": 4208 }, { "epoch": 2.541503169332931, "grad_norm": 0.1962890625, "learning_rate": 3.1506849315068496e-05, "loss": 0.4315, "step": 4209 }, { "epoch": 2.542106851795955, "grad_norm": 0.201171875, "learning_rate": 3.146533831465339e-05, "loss": 0.318, "step": 4210 }, { "epoch": 2.5427105342589797, "grad_norm": 0.201171875, "learning_rate": 3.1423827314238276e-05, "loss": 0.2812, "step": 4211 }, { "epoch": 2.5433142167220044, "grad_norm": 0.1953125, "learning_rate": 3.138231631382316e-05, "loss": 0.2069, "step": 4212 }, { "epoch": 2.5439178991850286, "grad_norm": 0.1357421875, "learning_rate": 3.1340805313408056e-05, "loss": 0.5955, "step": 4213 }, { "epoch": 2.544521581648053, "grad_norm": 0.1357421875, "learning_rate": 3.129929431299294e-05, "loss": 0.5779, "step": 4214 }, { "epoch": 2.5451252641110775, "grad_norm": 0.142578125, "learning_rate": 3.125778331257783e-05, "loss": 0.6802, "step": 4215 }, { "epoch": 2.545728946574102, "grad_norm": 0.1630859375, "learning_rate": 3.121627231216273e-05, "loss": 0.8038, "step": 4216 }, { "epoch": 2.5463326290371264, "grad_norm": 0.1435546875, "learning_rate": 3.1174761311747615e-05, "loss": 0.8096, "step": 4217 }, { "epoch": 2.5469363115001507, "grad_norm": 0.140625, "learning_rate": 3.113325031133251e-05, "loss": 1.203, "step": 4218 }, { "epoch": 2.5475399939631753, "grad_norm": 0.1435546875, "learning_rate": 3.1091739310917395e-05, "loss": 0.7647, "step": 4219 }, { "epoch": 2.5481436764262, "grad_norm": 0.1455078125, "learning_rate": 3.105022831050228e-05, "loss": 0.5275, "step": 4220 }, { "epoch": 2.548747358889224, "grad_norm": 0.13671875, "learning_rate": 3.1008717310087175e-05, "loss": 0.658, "step": 4221 }, { "epoch": 2.549351041352249, "grad_norm": 0.150390625, "learning_rate": 3.096720630967206e-05, "loss": 0.6599, "step": 4222 }, { "epoch": 2.549954723815273, "grad_norm": 0.1513671875, "learning_rate": 3.0925695309256954e-05, "loss": 0.6281, "step": 4223 }, { "epoch": 2.5505584062782978, "grad_norm": 0.1572265625, "learning_rate": 3.088418430884185e-05, "loss": 0.6384, "step": 4224 }, { "epoch": 2.551162088741322, "grad_norm": 0.14453125, "learning_rate": 3.0842673308426734e-05, "loss": 0.6109, "step": 4225 }, { "epoch": 2.5517657712043467, "grad_norm": 0.154296875, "learning_rate": 3.080116230801162e-05, "loss": 0.7078, "step": 4226 }, { "epoch": 2.552369453667371, "grad_norm": 0.1455078125, "learning_rate": 3.0759651307596514e-05, "loss": 0.5598, "step": 4227 }, { "epoch": 2.5529731361303956, "grad_norm": 0.1650390625, "learning_rate": 3.07181403071814e-05, "loss": 0.8109, "step": 4228 }, { "epoch": 2.55357681859342, "grad_norm": 0.1904296875, "learning_rate": 3.06766293067663e-05, "loss": 0.6628, "step": 4229 }, { "epoch": 2.5541805010564445, "grad_norm": 0.1533203125, "learning_rate": 3.063511830635119e-05, "loss": 0.6804, "step": 4230 }, { "epoch": 2.5547841835194687, "grad_norm": 0.1435546875, "learning_rate": 3.059360730593607e-05, "loss": 0.5877, "step": 4231 }, { "epoch": 2.5553878659824933, "grad_norm": 0.1630859375, "learning_rate": 3.0552096305520966e-05, "loss": 0.656, "step": 4232 }, { "epoch": 2.5559915484455176, "grad_norm": 0.1513671875, "learning_rate": 3.0510585305105853e-05, "loss": 0.676, "step": 4233 }, { "epoch": 2.5565952309085422, "grad_norm": 0.1435546875, "learning_rate": 3.0469074304690743e-05, "loss": 0.5534, "step": 4234 }, { "epoch": 2.5571989133715665, "grad_norm": 0.1513671875, "learning_rate": 3.0427563304275636e-05, "loss": 0.6103, "step": 4235 }, { "epoch": 2.557802595834591, "grad_norm": 0.150390625, "learning_rate": 3.0386052303860523e-05, "loss": 0.6122, "step": 4236 }, { "epoch": 2.5584062782976154, "grad_norm": 0.14453125, "learning_rate": 3.0344541303445412e-05, "loss": 0.6039, "step": 4237 }, { "epoch": 2.55900996076064, "grad_norm": 0.185546875, "learning_rate": 3.0303030303030306e-05, "loss": 0.5834, "step": 4238 }, { "epoch": 2.5596136432236642, "grad_norm": 0.1474609375, "learning_rate": 3.0261519302615192e-05, "loss": 0.6298, "step": 4239 }, { "epoch": 2.560217325686689, "grad_norm": 0.1396484375, "learning_rate": 3.0220008302200085e-05, "loss": 0.5479, "step": 4240 }, { "epoch": 2.560821008149713, "grad_norm": 0.1533203125, "learning_rate": 3.0178497301784975e-05, "loss": 0.5281, "step": 4241 }, { "epoch": 2.561424690612738, "grad_norm": 0.169921875, "learning_rate": 3.0136986301369862e-05, "loss": 0.7109, "step": 4242 }, { "epoch": 2.562028373075762, "grad_norm": 0.1416015625, "learning_rate": 3.0095475300954755e-05, "loss": 0.6514, "step": 4243 }, { "epoch": 2.5626320555387867, "grad_norm": 0.162109375, "learning_rate": 3.0053964300539645e-05, "loss": 1.1654, "step": 4244 }, { "epoch": 2.563235738001811, "grad_norm": 0.158203125, "learning_rate": 3.001245330012453e-05, "loss": 0.5638, "step": 4245 }, { "epoch": 2.5638394204648356, "grad_norm": 0.1376953125, "learning_rate": 2.9970942299709425e-05, "loss": 0.5556, "step": 4246 }, { "epoch": 2.56444310292786, "grad_norm": 0.1552734375, "learning_rate": 2.9929431299294315e-05, "loss": 0.6126, "step": 4247 }, { "epoch": 2.5650467853908845, "grad_norm": 0.150390625, "learning_rate": 2.98879202988792e-05, "loss": 0.5597, "step": 4248 }, { "epoch": 2.5656504678539087, "grad_norm": 0.1552734375, "learning_rate": 2.9846409298464094e-05, "loss": 0.5701, "step": 4249 }, { "epoch": 2.5662541503169334, "grad_norm": 0.1533203125, "learning_rate": 2.9804898298048984e-05, "loss": 0.5262, "step": 4250 }, { "epoch": 2.5668578327799576, "grad_norm": 0.1650390625, "learning_rate": 2.9763387297633877e-05, "loss": 0.6605, "step": 4251 }, { "epoch": 2.5674615152429823, "grad_norm": 0.1728515625, "learning_rate": 2.9721876297218764e-05, "loss": 0.5195, "step": 4252 }, { "epoch": 2.5680651977060065, "grad_norm": 0.1708984375, "learning_rate": 2.9680365296803654e-05, "loss": 0.5135, "step": 4253 }, { "epoch": 2.568668880169031, "grad_norm": 0.1708984375, "learning_rate": 2.9638854296388547e-05, "loss": 0.55, "step": 4254 }, { "epoch": 2.5692725626320554, "grad_norm": 0.173828125, "learning_rate": 2.9597343295973434e-05, "loss": 0.4785, "step": 4255 }, { "epoch": 2.56987624509508, "grad_norm": 0.171875, "learning_rate": 2.9555832295558323e-05, "loss": 0.4183, "step": 4256 }, { "epoch": 2.5704799275581043, "grad_norm": 0.1875, "learning_rate": 2.9514321295143217e-05, "loss": 0.4437, "step": 4257 }, { "epoch": 2.571083610021129, "grad_norm": 0.19140625, "learning_rate": 2.9472810294728103e-05, "loss": 0.4089, "step": 4258 }, { "epoch": 2.571687292484153, "grad_norm": 0.2001953125, "learning_rate": 2.9431299294312993e-05, "loss": 0.355, "step": 4259 }, { "epoch": 2.572290974947178, "grad_norm": 0.2265625, "learning_rate": 2.9389788293897886e-05, "loss": 0.4149, "step": 4260 }, { "epoch": 2.572894657410202, "grad_norm": 0.1923828125, "learning_rate": 2.9348277293482773e-05, "loss": 0.2763, "step": 4261 }, { "epoch": 2.5734983398732267, "grad_norm": 0.212890625, "learning_rate": 2.9306766293067666e-05, "loss": 0.2371, "step": 4262 }, { "epoch": 2.574102022336251, "grad_norm": 0.1591796875, "learning_rate": 2.9265255292652556e-05, "loss": 0.7012, "step": 4263 }, { "epoch": 2.5747057047992756, "grad_norm": 0.302734375, "learning_rate": 2.9223744292237442e-05, "loss": 0.7176, "step": 4264 }, { "epoch": 2.5753093872623003, "grad_norm": 0.1416015625, "learning_rate": 2.9182233291822336e-05, "loss": 0.6191, "step": 4265 }, { "epoch": 2.5759130697253245, "grad_norm": 0.1611328125, "learning_rate": 2.9140722291407226e-05, "loss": 0.7192, "step": 4266 }, { "epoch": 2.5765167521883487, "grad_norm": 0.1435546875, "learning_rate": 2.9099211290992112e-05, "loss": 0.6505, "step": 4267 }, { "epoch": 2.5771204346513734, "grad_norm": 0.1416015625, "learning_rate": 2.9057700290577005e-05, "loss": 0.6055, "step": 4268 }, { "epoch": 2.577724117114398, "grad_norm": 0.1572265625, "learning_rate": 2.9016189290161895e-05, "loss": 0.7176, "step": 4269 }, { "epoch": 2.5783277995774223, "grad_norm": 0.1474609375, "learning_rate": 2.897467828974678e-05, "loss": 0.7773, "step": 4270 }, { "epoch": 2.5789314820404465, "grad_norm": 0.1416015625, "learning_rate": 2.8933167289331675e-05, "loss": 0.5553, "step": 4271 }, { "epoch": 2.579535164503471, "grad_norm": 0.1484375, "learning_rate": 2.8891656288916565e-05, "loss": 0.7196, "step": 4272 }, { "epoch": 2.580138846966496, "grad_norm": 0.138671875, "learning_rate": 2.8850145288501458e-05, "loss": 0.5476, "step": 4273 }, { "epoch": 2.58074252942952, "grad_norm": 0.1650390625, "learning_rate": 2.8808634288086344e-05, "loss": 0.6434, "step": 4274 }, { "epoch": 2.5813462118925443, "grad_norm": 0.15234375, "learning_rate": 2.8767123287671234e-05, "loss": 0.8742, "step": 4275 }, { "epoch": 2.581949894355569, "grad_norm": 0.1484375, "learning_rate": 2.8725612287256128e-05, "loss": 0.6174, "step": 4276 }, { "epoch": 2.5825535768185937, "grad_norm": 0.146484375, "learning_rate": 2.8684101286841014e-05, "loss": 0.6064, "step": 4277 }, { "epoch": 2.583157259281618, "grad_norm": 0.1435546875, "learning_rate": 2.86425902864259e-05, "loss": 0.5538, "step": 4278 }, { "epoch": 2.583760941744642, "grad_norm": 0.1474609375, "learning_rate": 2.8601079286010797e-05, "loss": 0.5924, "step": 4279 }, { "epoch": 2.5843646242076668, "grad_norm": 0.1455078125, "learning_rate": 2.8559568285595684e-05, "loss": 0.5754, "step": 4280 }, { "epoch": 2.5849683066706914, "grad_norm": 0.142578125, "learning_rate": 2.851805728518057e-05, "loss": 0.5529, "step": 4281 }, { "epoch": 2.5855719891337157, "grad_norm": 0.181640625, "learning_rate": 2.8476546284765467e-05, "loss": 0.7275, "step": 4282 }, { "epoch": 2.58617567159674, "grad_norm": 0.1455078125, "learning_rate": 2.8435035284350353e-05, "loss": 0.6515, "step": 4283 }, { "epoch": 2.5867793540597646, "grad_norm": 0.1484375, "learning_rate": 2.8393524283935247e-05, "loss": 0.6439, "step": 4284 }, { "epoch": 2.5873830365227892, "grad_norm": 0.1533203125, "learning_rate": 2.8352013283520133e-05, "loss": 0.5381, "step": 4285 }, { "epoch": 2.5879867189858135, "grad_norm": 0.1513671875, "learning_rate": 2.8310502283105023e-05, "loss": 0.865, "step": 4286 }, { "epoch": 2.5885904014488377, "grad_norm": 0.138671875, "learning_rate": 2.8268991282689916e-05, "loss": 0.7643, "step": 4287 }, { "epoch": 2.5891940839118623, "grad_norm": 0.1611328125, "learning_rate": 2.8227480282274803e-05, "loss": 0.6419, "step": 4288 }, { "epoch": 2.589797766374887, "grad_norm": 0.16015625, "learning_rate": 2.8185969281859693e-05, "loss": 0.6687, "step": 4289 }, { "epoch": 2.5904014488379112, "grad_norm": 0.13671875, "learning_rate": 2.8144458281444586e-05, "loss": 0.7835, "step": 4290 }, { "epoch": 2.5910051313009355, "grad_norm": 0.16015625, "learning_rate": 2.8102947281029472e-05, "loss": 0.6415, "step": 4291 }, { "epoch": 2.59160881376396, "grad_norm": 0.154296875, "learning_rate": 2.8061436280614362e-05, "loss": 0.7106, "step": 4292 }, { "epoch": 2.592212496226985, "grad_norm": 0.1455078125, "learning_rate": 2.8019925280199255e-05, "loss": 0.617, "step": 4293 }, { "epoch": 2.592816178690009, "grad_norm": 0.1494140625, "learning_rate": 2.7978414279784142e-05, "loss": 0.595, "step": 4294 }, { "epoch": 2.5934198611530332, "grad_norm": 0.1357421875, "learning_rate": 2.7936903279369035e-05, "loss": 0.5337, "step": 4295 }, { "epoch": 2.594023543616058, "grad_norm": 0.150390625, "learning_rate": 2.7895392278953925e-05, "loss": 0.5098, "step": 4296 }, { "epoch": 2.5946272260790826, "grad_norm": 0.1533203125, "learning_rate": 2.785388127853881e-05, "loss": 0.519, "step": 4297 }, { "epoch": 2.595230908542107, "grad_norm": 0.1533203125, "learning_rate": 2.7812370278123705e-05, "loss": 0.5765, "step": 4298 }, { "epoch": 2.595834591005131, "grad_norm": 0.1552734375, "learning_rate": 2.7770859277708595e-05, "loss": 0.5621, "step": 4299 }, { "epoch": 2.5964382734681557, "grad_norm": 0.154296875, "learning_rate": 2.772934827729348e-05, "loss": 0.5247, "step": 4300 }, { "epoch": 2.5970419559311804, "grad_norm": 0.1494140625, "learning_rate": 2.7687837276878374e-05, "loss": 0.455, "step": 4301 }, { "epoch": 2.5976456383942046, "grad_norm": 0.154296875, "learning_rate": 2.7646326276463264e-05, "loss": 0.4566, "step": 4302 }, { "epoch": 2.5982493208572293, "grad_norm": 0.1669921875, "learning_rate": 2.760481527604815e-05, "loss": 0.4368, "step": 4303 }, { "epoch": 2.5988530033202535, "grad_norm": 0.169921875, "learning_rate": 2.7563304275633044e-05, "loss": 0.5235, "step": 4304 }, { "epoch": 2.599456685783278, "grad_norm": 0.1708984375, "learning_rate": 2.7521793275217934e-05, "loss": 0.4767, "step": 4305 }, { "epoch": 2.6000603682463024, "grad_norm": 0.2412109375, "learning_rate": 2.7480282274802827e-05, "loss": 0.3947, "step": 4306 }, { "epoch": 2.600664050709327, "grad_norm": 0.1904296875, "learning_rate": 2.7438771274387714e-05, "loss": 0.3915, "step": 4307 }, { "epoch": 2.6012677331723513, "grad_norm": 0.1953125, "learning_rate": 2.7397260273972603e-05, "loss": 0.3913, "step": 4308 }, { "epoch": 2.601871415635376, "grad_norm": 0.2001953125, "learning_rate": 2.7355749273557497e-05, "loss": 0.3407, "step": 4309 }, { "epoch": 2.6024750980984, "grad_norm": 0.205078125, "learning_rate": 2.7314238273142383e-05, "loss": 0.3446, "step": 4310 }, { "epoch": 2.603078780561425, "grad_norm": 0.2255859375, "learning_rate": 2.7272727272727273e-05, "loss": 0.3252, "step": 4311 }, { "epoch": 2.603682463024449, "grad_norm": 0.1962890625, "learning_rate": 2.7231216272312166e-05, "loss": 0.1949, "step": 4312 }, { "epoch": 2.6042861454874737, "grad_norm": 0.1533203125, "learning_rate": 2.7189705271897053e-05, "loss": 0.6236, "step": 4313 }, { "epoch": 2.604889827950498, "grad_norm": 0.1435546875, "learning_rate": 2.7148194271481943e-05, "loss": 0.5858, "step": 4314 }, { "epoch": 2.6054935104135226, "grad_norm": 0.1416015625, "learning_rate": 2.7106683271066836e-05, "loss": 0.6003, "step": 4315 }, { "epoch": 2.606097192876547, "grad_norm": 0.1513671875, "learning_rate": 2.7065172270651722e-05, "loss": 0.5897, "step": 4316 }, { "epoch": 2.6067008753395715, "grad_norm": 0.140625, "learning_rate": 2.7023661270236616e-05, "loss": 0.6271, "step": 4317 }, { "epoch": 2.6073045578025957, "grad_norm": 0.1513671875, "learning_rate": 2.6982150269821506e-05, "loss": 0.5715, "step": 4318 }, { "epoch": 2.6079082402656204, "grad_norm": 0.12890625, "learning_rate": 2.6940639269406392e-05, "loss": 0.5873, "step": 4319 }, { "epoch": 2.6085119227286446, "grad_norm": 0.1376953125, "learning_rate": 2.6899128268991285e-05, "loss": 0.5097, "step": 4320 }, { "epoch": 2.6091156051916693, "grad_norm": 0.1357421875, "learning_rate": 2.6857617268576175e-05, "loss": 0.8605, "step": 4321 }, { "epoch": 2.6097192876546935, "grad_norm": 0.154296875, "learning_rate": 2.6816106268161062e-05, "loss": 0.6028, "step": 4322 }, { "epoch": 2.610322970117718, "grad_norm": 0.1708984375, "learning_rate": 2.6774595267745955e-05, "loss": 0.6827, "step": 4323 }, { "epoch": 2.6109266525807424, "grad_norm": 0.15234375, "learning_rate": 2.6733084267330845e-05, "loss": 0.6564, "step": 4324 }, { "epoch": 2.611530335043767, "grad_norm": 0.1494140625, "learning_rate": 2.669157326691573e-05, "loss": 0.5687, "step": 4325 }, { "epoch": 2.6121340175067913, "grad_norm": 0.1796875, "learning_rate": 2.6650062266500625e-05, "loss": 0.856, "step": 4326 }, { "epoch": 2.612737699969816, "grad_norm": 0.1474609375, "learning_rate": 2.6608551266085514e-05, "loss": 0.6517, "step": 4327 }, { "epoch": 2.61334138243284, "grad_norm": 0.1484375, "learning_rate": 2.6567040265670408e-05, "loss": 0.6049, "step": 4328 }, { "epoch": 2.613945064895865, "grad_norm": 0.1455078125, "learning_rate": 2.6525529265255294e-05, "loss": 0.6425, "step": 4329 }, { "epoch": 2.614548747358889, "grad_norm": 0.1533203125, "learning_rate": 2.6484018264840184e-05, "loss": 0.6713, "step": 4330 }, { "epoch": 2.6151524298219138, "grad_norm": 0.16015625, "learning_rate": 2.6442507264425077e-05, "loss": 0.7973, "step": 4331 }, { "epoch": 2.615756112284938, "grad_norm": 0.158203125, "learning_rate": 2.6400996264009964e-05, "loss": 0.8084, "step": 4332 }, { "epoch": 2.6163597947479627, "grad_norm": 0.16015625, "learning_rate": 2.635948526359485e-05, "loss": 0.7568, "step": 4333 }, { "epoch": 2.616963477210987, "grad_norm": 0.142578125, "learning_rate": 2.6317974263179747e-05, "loss": 0.6364, "step": 4334 }, { "epoch": 2.6175671596740115, "grad_norm": 0.150390625, "learning_rate": 2.6276463262764633e-05, "loss": 0.7264, "step": 4335 }, { "epoch": 2.6181708421370358, "grad_norm": 0.1513671875, "learning_rate": 2.623495226234952e-05, "loss": 0.6437, "step": 4336 }, { "epoch": 2.6187745246000604, "grad_norm": 0.1884765625, "learning_rate": 2.6193441261934417e-05, "loss": 0.9275, "step": 4337 }, { "epoch": 2.6193782070630847, "grad_norm": 0.16015625, "learning_rate": 2.6151930261519303e-05, "loss": 0.6317, "step": 4338 }, { "epoch": 2.6199818895261093, "grad_norm": 0.1533203125, "learning_rate": 2.6110419261104196e-05, "loss": 0.5622, "step": 4339 }, { "epoch": 2.6205855719891336, "grad_norm": 0.1484375, "learning_rate": 2.6068908260689083e-05, "loss": 0.6879, "step": 4340 }, { "epoch": 2.6211892544521582, "grad_norm": 0.1494140625, "learning_rate": 2.6027397260273973e-05, "loss": 0.8795, "step": 4341 }, { "epoch": 2.6217929369151824, "grad_norm": 0.1416015625, "learning_rate": 2.5985886259858866e-05, "loss": 0.5918, "step": 4342 }, { "epoch": 2.622396619378207, "grad_norm": 0.13671875, "learning_rate": 2.5944375259443752e-05, "loss": 0.5913, "step": 4343 }, { "epoch": 2.6230003018412313, "grad_norm": 0.1435546875, "learning_rate": 2.5902864259028642e-05, "loss": 0.7423, "step": 4344 }, { "epoch": 2.623603984304256, "grad_norm": 0.14453125, "learning_rate": 2.5861353258613536e-05, "loss": 0.6315, "step": 4345 }, { "epoch": 2.6242076667672807, "grad_norm": 0.140625, "learning_rate": 2.5819842258198422e-05, "loss": 0.5559, "step": 4346 }, { "epoch": 2.624811349230305, "grad_norm": 0.142578125, "learning_rate": 2.5778331257783312e-05, "loss": 0.4504, "step": 4347 }, { "epoch": 2.625415031693329, "grad_norm": 0.150390625, "learning_rate": 2.5736820257368205e-05, "loss": 0.5707, "step": 4348 }, { "epoch": 2.626018714156354, "grad_norm": 0.1611328125, "learning_rate": 2.569530925695309e-05, "loss": 0.5686, "step": 4349 }, { "epoch": 2.6266223966193785, "grad_norm": 0.169921875, "learning_rate": 2.5653798256537985e-05, "loss": 0.5619, "step": 4350 }, { "epoch": 2.6272260790824027, "grad_norm": 0.16796875, "learning_rate": 2.5612287256122875e-05, "loss": 0.467, "step": 4351 }, { "epoch": 2.627829761545427, "grad_norm": 0.1748046875, "learning_rate": 2.557077625570776e-05, "loss": 0.5095, "step": 4352 }, { "epoch": 2.6284334440084516, "grad_norm": 0.169921875, "learning_rate": 2.5529265255292654e-05, "loss": 0.5222, "step": 4353 }, { "epoch": 2.6290371264714762, "grad_norm": 0.166015625, "learning_rate": 2.5487754254877544e-05, "loss": 0.5631, "step": 4354 }, { "epoch": 2.6296408089345005, "grad_norm": 0.1787109375, "learning_rate": 2.544624325446243e-05, "loss": 0.4773, "step": 4355 }, { "epoch": 2.6302444913975247, "grad_norm": 0.1953125, "learning_rate": 2.5404732254047324e-05, "loss": 0.5103, "step": 4356 }, { "epoch": 2.6308481738605494, "grad_norm": 0.1796875, "learning_rate": 2.5363221253632214e-05, "loss": 0.4466, "step": 4357 }, { "epoch": 2.631451856323574, "grad_norm": 0.205078125, "learning_rate": 2.53217102532171e-05, "loss": 0.41, "step": 4358 }, { "epoch": 2.6320555387865983, "grad_norm": 0.1962890625, "learning_rate": 2.5280199252801994e-05, "loss": 0.3226, "step": 4359 }, { "epoch": 2.6326592212496225, "grad_norm": 0.1904296875, "learning_rate": 2.5238688252386884e-05, "loss": 0.2631, "step": 4360 }, { "epoch": 2.633262903712647, "grad_norm": 0.21484375, "learning_rate": 2.5197177251971777e-05, "loss": 0.2883, "step": 4361 }, { "epoch": 2.633866586175672, "grad_norm": 0.224609375, "learning_rate": 2.5155666251556663e-05, "loss": 0.2396, "step": 4362 }, { "epoch": 2.634470268638696, "grad_norm": 0.14453125, "learning_rate": 2.5114155251141553e-05, "loss": 0.6584, "step": 4363 }, { "epoch": 2.6350739511017203, "grad_norm": 0.130859375, "learning_rate": 2.5072644250726446e-05, "loss": 0.5349, "step": 4364 }, { "epoch": 2.635677633564745, "grad_norm": 0.15234375, "learning_rate": 2.5031133250311333e-05, "loss": 0.6576, "step": 4365 }, { "epoch": 2.6362813160277696, "grad_norm": 0.140625, "learning_rate": 2.4989622249896226e-05, "loss": 0.61, "step": 4366 }, { "epoch": 2.636884998490794, "grad_norm": 0.1533203125, "learning_rate": 2.4948111249481113e-05, "loss": 0.7938, "step": 4367 }, { "epoch": 2.637488680953818, "grad_norm": 0.15234375, "learning_rate": 2.4906600249066003e-05, "loss": 0.5602, "step": 4368 }, { "epoch": 2.6380923634168427, "grad_norm": 0.158203125, "learning_rate": 2.4865089248650892e-05, "loss": 0.6619, "step": 4369 }, { "epoch": 2.6386960458798674, "grad_norm": 0.1552734375, "learning_rate": 2.4823578248235786e-05, "loss": 0.6877, "step": 4370 }, { "epoch": 2.6392997283428916, "grad_norm": 0.1357421875, "learning_rate": 2.4782067247820672e-05, "loss": 0.5548, "step": 4371 }, { "epoch": 2.639903410805916, "grad_norm": 0.1533203125, "learning_rate": 2.4740556247405562e-05, "loss": 0.5933, "step": 4372 }, { "epoch": 2.6405070932689405, "grad_norm": 0.1806640625, "learning_rate": 2.4699045246990455e-05, "loss": 0.6843, "step": 4373 }, { "epoch": 2.641110775731965, "grad_norm": 0.1474609375, "learning_rate": 2.4657534246575342e-05, "loss": 0.6951, "step": 4374 }, { "epoch": 2.6417144581949894, "grad_norm": 0.140625, "learning_rate": 2.461602324616023e-05, "loss": 0.6361, "step": 4375 }, { "epoch": 2.6423181406580136, "grad_norm": 0.15234375, "learning_rate": 2.4574512245745125e-05, "loss": 0.6971, "step": 4376 }, { "epoch": 2.6429218231210383, "grad_norm": 0.1494140625, "learning_rate": 2.4533001245330015e-05, "loss": 0.6643, "step": 4377 }, { "epoch": 2.643525505584063, "grad_norm": 0.1416015625, "learning_rate": 2.44914902449149e-05, "loss": 0.5958, "step": 4378 }, { "epoch": 2.644129188047087, "grad_norm": 0.1904296875, "learning_rate": 2.4449979244499795e-05, "loss": 0.5614, "step": 4379 }, { "epoch": 2.6447328705101114, "grad_norm": 0.1591796875, "learning_rate": 2.4408468244084684e-05, "loss": 0.6388, "step": 4380 }, { "epoch": 2.645336552973136, "grad_norm": 0.140625, "learning_rate": 2.4366957243669574e-05, "loss": 0.6615, "step": 4381 }, { "epoch": 2.6459402354361607, "grad_norm": 0.1533203125, "learning_rate": 2.4325446243254464e-05, "loss": 0.5931, "step": 4382 }, { "epoch": 2.646543917899185, "grad_norm": 0.134765625, "learning_rate": 2.4283935242839354e-05, "loss": 0.618, "step": 4383 }, { "epoch": 2.6471476003622096, "grad_norm": 0.150390625, "learning_rate": 2.4242424242424244e-05, "loss": 0.7889, "step": 4384 }, { "epoch": 2.647751282825234, "grad_norm": 0.158203125, "learning_rate": 2.4200913242009134e-05, "loss": 0.5992, "step": 4385 }, { "epoch": 2.6483549652882585, "grad_norm": 0.1552734375, "learning_rate": 2.4159402241594024e-05, "loss": 0.6226, "step": 4386 }, { "epoch": 2.6489586477512828, "grad_norm": 0.1650390625, "learning_rate": 2.4117891241178914e-05, "loss": 0.6265, "step": 4387 }, { "epoch": 2.6495623302143074, "grad_norm": 0.16015625, "learning_rate": 2.4076380240763803e-05, "loss": 0.5859, "step": 4388 }, { "epoch": 2.6501660126773317, "grad_norm": 0.1474609375, "learning_rate": 2.4034869240348693e-05, "loss": 0.6647, "step": 4389 }, { "epoch": 2.6507696951403563, "grad_norm": 0.1533203125, "learning_rate": 2.3993358239933583e-05, "loss": 0.6158, "step": 4390 }, { "epoch": 2.6513733776033805, "grad_norm": 0.1484375, "learning_rate": 2.3951847239518473e-05, "loss": 0.7374, "step": 4391 }, { "epoch": 2.651977060066405, "grad_norm": 0.1455078125, "learning_rate": 2.3910336239103366e-05, "loss": 0.7737, "step": 4392 }, { "epoch": 2.6525807425294294, "grad_norm": 0.181640625, "learning_rate": 2.3868825238688253e-05, "loss": 0.5964, "step": 4393 }, { "epoch": 2.653184424992454, "grad_norm": 0.1455078125, "learning_rate": 2.3827314238273143e-05, "loss": 0.7454, "step": 4394 }, { "epoch": 2.6537881074554783, "grad_norm": 0.15234375, "learning_rate": 2.3785803237858036e-05, "loss": 0.5554, "step": 4395 }, { "epoch": 2.654391789918503, "grad_norm": 0.1552734375, "learning_rate": 2.3744292237442922e-05, "loss": 0.6501, "step": 4396 }, { "epoch": 2.654995472381527, "grad_norm": 0.1474609375, "learning_rate": 2.3702781237027812e-05, "loss": 0.5483, "step": 4397 }, { "epoch": 2.655599154844552, "grad_norm": 0.146484375, "learning_rate": 2.3661270236612702e-05, "loss": 0.5345, "step": 4398 }, { "epoch": 2.656202837307576, "grad_norm": 0.142578125, "learning_rate": 2.3619759236197595e-05, "loss": 0.4837, "step": 4399 }, { "epoch": 2.656806519770601, "grad_norm": 0.150390625, "learning_rate": 2.3578248235782482e-05, "loss": 0.5075, "step": 4400 }, { "epoch": 2.657410202233625, "grad_norm": 0.158203125, "learning_rate": 2.3536737235367372e-05, "loss": 0.5213, "step": 4401 }, { "epoch": 2.6580138846966497, "grad_norm": 0.1572265625, "learning_rate": 2.3495226234952265e-05, "loss": 0.488, "step": 4402 }, { "epoch": 2.658617567159674, "grad_norm": 0.171875, "learning_rate": 2.3453715234537155e-05, "loss": 0.5311, "step": 4403 }, { "epoch": 2.6592212496226986, "grad_norm": 0.169921875, "learning_rate": 2.341220423412204e-05, "loss": 0.4742, "step": 4404 }, { "epoch": 2.659824932085723, "grad_norm": 0.1767578125, "learning_rate": 2.3370693233706935e-05, "loss": 0.498, "step": 4405 }, { "epoch": 2.6604286145487475, "grad_norm": 0.1796875, "learning_rate": 2.3329182233291824e-05, "loss": 0.4055, "step": 4406 }, { "epoch": 2.6610322970117717, "grad_norm": 0.18359375, "learning_rate": 2.328767123287671e-05, "loss": 0.3957, "step": 4407 }, { "epoch": 2.6616359794747964, "grad_norm": 0.193359375, "learning_rate": 2.3246160232461604e-05, "loss": 0.3804, "step": 4408 }, { "epoch": 2.6622396619378206, "grad_norm": 0.1845703125, "learning_rate": 2.3204649232046494e-05, "loss": 0.337, "step": 4409 }, { "epoch": 2.6628433444008452, "grad_norm": 0.197265625, "learning_rate": 2.3163138231631384e-05, "loss": 0.3131, "step": 4410 }, { "epoch": 2.6634470268638695, "grad_norm": 0.203125, "learning_rate": 2.3121627231216274e-05, "loss": 0.2671, "step": 4411 }, { "epoch": 2.664050709326894, "grad_norm": 0.201171875, "learning_rate": 2.3080116230801164e-05, "loss": 0.2049, "step": 4412 }, { "epoch": 2.6646543917899184, "grad_norm": 0.1435546875, "learning_rate": 2.3038605230386054e-05, "loss": 0.7446, "step": 4413 }, { "epoch": 2.665258074252943, "grad_norm": 0.1484375, "learning_rate": 2.2997094229970943e-05, "loss": 0.6451, "step": 4414 }, { "epoch": 2.6658617567159673, "grad_norm": 0.15234375, "learning_rate": 2.2955583229555833e-05, "loss": 0.6584, "step": 4415 }, { "epoch": 2.666465439178992, "grad_norm": 0.1474609375, "learning_rate": 2.2914072229140723e-05, "loss": 0.595, "step": 4416 }, { "epoch": 2.667069121642016, "grad_norm": 0.158203125, "learning_rate": 2.2872561228725613e-05, "loss": 0.7, "step": 4417 }, { "epoch": 2.667672804105041, "grad_norm": 0.2265625, "learning_rate": 2.2831050228310503e-05, "loss": 0.6295, "step": 4418 }, { "epoch": 2.668276486568065, "grad_norm": 0.1376953125, "learning_rate": 2.2789539227895393e-05, "loss": 0.5823, "step": 4419 }, { "epoch": 2.6688801690310897, "grad_norm": 0.138671875, "learning_rate": 2.2748028227480283e-05, "loss": 0.6256, "step": 4420 }, { "epoch": 2.669483851494114, "grad_norm": 0.1533203125, "learning_rate": 2.2706517227065176e-05, "loss": 0.6366, "step": 4421 }, { "epoch": 2.6700875339571386, "grad_norm": 0.1552734375, "learning_rate": 2.2665006226650062e-05, "loss": 0.6766, "step": 4422 }, { "epoch": 2.670691216420163, "grad_norm": 0.150390625, "learning_rate": 2.2623495226234952e-05, "loss": 0.6968, "step": 4423 }, { "epoch": 2.6712948988831875, "grad_norm": 0.146484375, "learning_rate": 2.2581984225819846e-05, "loss": 0.6291, "step": 4424 }, { "epoch": 2.6718985813462117, "grad_norm": 0.1396484375, "learning_rate": 2.2540473225404735e-05, "loss": 0.649, "step": 4425 }, { "epoch": 2.6725022638092364, "grad_norm": 0.1591796875, "learning_rate": 2.2498962224989622e-05, "loss": 0.6302, "step": 4426 }, { "epoch": 2.673105946272261, "grad_norm": 0.154296875, "learning_rate": 2.2457451224574512e-05, "loss": 0.6207, "step": 4427 }, { "epoch": 2.6737096287352853, "grad_norm": 0.1484375, "learning_rate": 2.2415940224159405e-05, "loss": 0.6337, "step": 4428 }, { "epoch": 2.6743133111983095, "grad_norm": 0.146484375, "learning_rate": 2.237442922374429e-05, "loss": 0.655, "step": 4429 }, { "epoch": 2.674916993661334, "grad_norm": 0.1396484375, "learning_rate": 2.233291822332918e-05, "loss": 0.6017, "step": 4430 }, { "epoch": 2.675520676124359, "grad_norm": 0.15625, "learning_rate": 2.2291407222914075e-05, "loss": 0.7195, "step": 4431 }, { "epoch": 2.676124358587383, "grad_norm": 0.1591796875, "learning_rate": 2.2249896222498965e-05, "loss": 0.4655, "step": 4432 }, { "epoch": 2.6767280410504073, "grad_norm": 0.154296875, "learning_rate": 2.220838522208385e-05, "loss": 0.6356, "step": 4433 }, { "epoch": 2.677331723513432, "grad_norm": 0.298828125, "learning_rate": 2.2166874221668744e-05, "loss": 0.6569, "step": 4434 }, { "epoch": 2.6779354059764566, "grad_norm": 0.1484375, "learning_rate": 2.2125363221253634e-05, "loss": 0.654, "step": 4435 }, { "epoch": 2.678539088439481, "grad_norm": 0.1396484375, "learning_rate": 2.2083852220838524e-05, "loss": 0.5911, "step": 4436 }, { "epoch": 2.679142770902505, "grad_norm": 0.140625, "learning_rate": 2.2042341220423414e-05, "loss": 0.6751, "step": 4437 }, { "epoch": 2.6797464533655297, "grad_norm": 0.146484375, "learning_rate": 2.2000830220008304e-05, "loss": 0.6674, "step": 4438 }, { "epoch": 2.6803501358285544, "grad_norm": 0.1396484375, "learning_rate": 2.1959319219593194e-05, "loss": 0.8804, "step": 4439 }, { "epoch": 2.6809538182915786, "grad_norm": 0.1474609375, "learning_rate": 2.1917808219178083e-05, "loss": 0.5718, "step": 4440 }, { "epoch": 2.681557500754603, "grad_norm": 0.1630859375, "learning_rate": 2.1876297218762973e-05, "loss": 0.6883, "step": 4441 }, { "epoch": 2.6821611832176275, "grad_norm": 0.1552734375, "learning_rate": 2.1834786218347863e-05, "loss": 0.7524, "step": 4442 }, { "epoch": 2.682764865680652, "grad_norm": 0.1279296875, "learning_rate": 2.1793275217932753e-05, "loss": 0.5331, "step": 4443 }, { "epoch": 2.6833685481436764, "grad_norm": 0.1396484375, "learning_rate": 2.1751764217517643e-05, "loss": 0.786, "step": 4444 }, { "epoch": 2.6839722306067006, "grad_norm": 0.146484375, "learning_rate": 2.1710253217102533e-05, "loss": 0.6069, "step": 4445 }, { "epoch": 2.6845759130697253, "grad_norm": 0.146484375, "learning_rate": 2.1668742216687423e-05, "loss": 0.6164, "step": 4446 }, { "epoch": 2.68517959553275, "grad_norm": 0.1494140625, "learning_rate": 2.1627231216272316e-05, "loss": 0.5618, "step": 4447 }, { "epoch": 2.685783277995774, "grad_norm": 0.1455078125, "learning_rate": 2.1585720215857202e-05, "loss": 0.535, "step": 4448 }, { "epoch": 2.6863869604587984, "grad_norm": 0.1611328125, "learning_rate": 2.1544209215442092e-05, "loss": 0.6048, "step": 4449 }, { "epoch": 2.686990642921823, "grad_norm": 0.1591796875, "learning_rate": 2.1502698215026986e-05, "loss": 0.5397, "step": 4450 }, { "epoch": 2.6875943253848478, "grad_norm": 0.1708984375, "learning_rate": 2.1461187214611872e-05, "loss": 0.5, "step": 4451 }, { "epoch": 2.688198007847872, "grad_norm": 0.1689453125, "learning_rate": 2.1419676214196762e-05, "loss": 0.514, "step": 4452 }, { "epoch": 2.688801690310896, "grad_norm": 0.1689453125, "learning_rate": 2.1378165213781652e-05, "loss": 0.4886, "step": 4453 }, { "epoch": 2.689405372773921, "grad_norm": 0.1689453125, "learning_rate": 2.1336654213366545e-05, "loss": 0.4477, "step": 4454 }, { "epoch": 2.6900090552369456, "grad_norm": 0.181640625, "learning_rate": 2.129514321295143e-05, "loss": 0.5355, "step": 4455 }, { "epoch": 2.69061273769997, "grad_norm": 0.185546875, "learning_rate": 2.125363221253632e-05, "loss": 0.414, "step": 4456 }, { "epoch": 2.691216420162994, "grad_norm": 0.1962890625, "learning_rate": 2.1212121212121215e-05, "loss": 0.4451, "step": 4457 }, { "epoch": 2.6918201026260187, "grad_norm": 0.212890625, "learning_rate": 2.1170610211706105e-05, "loss": 0.4357, "step": 4458 }, { "epoch": 2.6924237850890433, "grad_norm": 0.208984375, "learning_rate": 2.112909921129099e-05, "loss": 0.3608, "step": 4459 }, { "epoch": 2.6930274675520676, "grad_norm": 0.201171875, "learning_rate": 2.1087588210875884e-05, "loss": 0.2876, "step": 4460 }, { "epoch": 2.693631150015092, "grad_norm": 0.2001953125, "learning_rate": 2.1046077210460774e-05, "loss": 0.2317, "step": 4461 }, { "epoch": 2.6942348324781165, "grad_norm": 0.1953125, "learning_rate": 2.100456621004566e-05, "loss": 0.1918, "step": 4462 }, { "epoch": 2.694838514941141, "grad_norm": 0.15234375, "learning_rate": 2.0963055209630554e-05, "loss": 0.6023, "step": 4463 }, { "epoch": 2.6954421974041654, "grad_norm": 0.15234375, "learning_rate": 2.0921544209215444e-05, "loss": 0.6662, "step": 4464 }, { "epoch": 2.69604587986719, "grad_norm": 0.1416015625, "learning_rate": 2.0880033208800334e-05, "loss": 0.5831, "step": 4465 }, { "epoch": 2.6966495623302142, "grad_norm": 0.158203125, "learning_rate": 2.0838522208385224e-05, "loss": 0.6397, "step": 4466 }, { "epoch": 2.697253244793239, "grad_norm": 0.16796875, "learning_rate": 2.0797011207970113e-05, "loss": 0.6301, "step": 4467 }, { "epoch": 2.697856927256263, "grad_norm": 0.150390625, "learning_rate": 2.0755500207555003e-05, "loss": 0.7089, "step": 4468 }, { "epoch": 2.698460609719288, "grad_norm": 0.138671875, "learning_rate": 2.0713989207139893e-05, "loss": 0.519, "step": 4469 }, { "epoch": 2.699064292182312, "grad_norm": 0.1416015625, "learning_rate": 2.0672478206724783e-05, "loss": 0.6105, "step": 4470 }, { "epoch": 2.6996679746453367, "grad_norm": 0.1669921875, "learning_rate": 2.0630967206309673e-05, "loss": 0.7206, "step": 4471 }, { "epoch": 2.700271657108361, "grad_norm": 0.1708984375, "learning_rate": 2.0589456205894563e-05, "loss": 0.6736, "step": 4472 }, { "epoch": 2.7008753395713856, "grad_norm": 0.14453125, "learning_rate": 2.0547945205479453e-05, "loss": 0.6061, "step": 4473 }, { "epoch": 2.70147902203441, "grad_norm": 0.1533203125, "learning_rate": 2.0506434205064342e-05, "loss": 0.5616, "step": 4474 }, { "epoch": 2.7020827044974345, "grad_norm": 0.1513671875, "learning_rate": 2.0464923204649232e-05, "loss": 0.6123, "step": 4475 }, { "epoch": 2.7026863869604587, "grad_norm": 0.1474609375, "learning_rate": 2.0423412204234126e-05, "loss": 0.8862, "step": 4476 }, { "epoch": 2.7032900694234834, "grad_norm": 0.1513671875, "learning_rate": 2.0381901203819012e-05, "loss": 0.7349, "step": 4477 }, { "epoch": 2.7038937518865076, "grad_norm": 0.1533203125, "learning_rate": 2.0340390203403902e-05, "loss": 0.6979, "step": 4478 }, { "epoch": 2.7044974343495323, "grad_norm": 0.1728515625, "learning_rate": 2.0298879202988795e-05, "loss": 0.6383, "step": 4479 }, { "epoch": 2.7051011168125565, "grad_norm": 0.1474609375, "learning_rate": 2.0257368202573682e-05, "loss": 0.6267, "step": 4480 }, { "epoch": 2.705704799275581, "grad_norm": 0.146484375, "learning_rate": 2.021585720215857e-05, "loss": 0.6785, "step": 4481 }, { "epoch": 2.7063084817386054, "grad_norm": 0.16015625, "learning_rate": 2.017434620174346e-05, "loss": 0.753, "step": 4482 }, { "epoch": 2.70691216420163, "grad_norm": 0.1865234375, "learning_rate": 2.0132835201328355e-05, "loss": 0.5688, "step": 4483 }, { "epoch": 2.7075158466646543, "grad_norm": 0.142578125, "learning_rate": 2.009132420091324e-05, "loss": 0.7494, "step": 4484 }, { "epoch": 2.708119529127679, "grad_norm": 0.1611328125, "learning_rate": 2.004981320049813e-05, "loss": 0.74, "step": 4485 }, { "epoch": 2.708723211590703, "grad_norm": 0.142578125, "learning_rate": 2.0008302200083024e-05, "loss": 0.5672, "step": 4486 }, { "epoch": 2.709326894053728, "grad_norm": 0.15234375, "learning_rate": 1.9966791199667914e-05, "loss": 0.8192, "step": 4487 }, { "epoch": 2.709930576516752, "grad_norm": 0.1708984375, "learning_rate": 1.99252801992528e-05, "loss": 0.6851, "step": 4488 }, { "epoch": 2.7105342589797767, "grad_norm": 0.14453125, "learning_rate": 1.9883769198837694e-05, "loss": 0.6071, "step": 4489 }, { "epoch": 2.711137941442801, "grad_norm": 0.154296875, "learning_rate": 1.9842258198422584e-05, "loss": 0.6756, "step": 4490 }, { "epoch": 2.7117416239058256, "grad_norm": 0.1552734375, "learning_rate": 1.980074719800747e-05, "loss": 0.6658, "step": 4491 }, { "epoch": 2.71234530636885, "grad_norm": 0.140625, "learning_rate": 1.9759236197592364e-05, "loss": 0.5517, "step": 4492 }, { "epoch": 2.7129489888318745, "grad_norm": 0.173828125, "learning_rate": 1.9717725197177253e-05, "loss": 0.6189, "step": 4493 }, { "epoch": 2.7135526712948987, "grad_norm": 0.1455078125, "learning_rate": 1.9676214196762143e-05, "loss": 0.5697, "step": 4494 }, { "epoch": 2.7141563537579234, "grad_norm": 0.1416015625, "learning_rate": 1.9634703196347033e-05, "loss": 0.6535, "step": 4495 }, { "epoch": 2.7147600362209476, "grad_norm": 0.1416015625, "learning_rate": 1.9593192195931923e-05, "loss": 0.5543, "step": 4496 }, { "epoch": 2.7153637186839723, "grad_norm": 0.1572265625, "learning_rate": 1.9551681195516813e-05, "loss": 0.5723, "step": 4497 }, { "epoch": 2.7159674011469965, "grad_norm": 0.1474609375, "learning_rate": 1.9510170195101703e-05, "loss": 0.5723, "step": 4498 }, { "epoch": 2.716571083610021, "grad_norm": 0.1650390625, "learning_rate": 1.9468659194686593e-05, "loss": 0.5244, "step": 4499 }, { "epoch": 2.7171747660730454, "grad_norm": 0.1640625, "learning_rate": 1.9427148194271483e-05, "loss": 0.5962, "step": 4500 }, { "epoch": 2.7171747660730454, "eval_loss": 0.6135643124580383, "eval_runtime": 1059.4817, "eval_samples_per_second": 2.633, "eval_steps_per_second": 0.329, "step": 4500 }, { "epoch": 2.71777844853607, "grad_norm": 0.158203125, "learning_rate": 1.9385637193856372e-05, "loss": 0.5302, "step": 4501 }, { "epoch": 2.7183821309990943, "grad_norm": 0.1748046875, "learning_rate": 1.9344126193441262e-05, "loss": 0.5348, "step": 4502 }, { "epoch": 2.718985813462119, "grad_norm": 0.1923828125, "learning_rate": 1.9302615193026152e-05, "loss": 0.5221, "step": 4503 }, { "epoch": 2.7195894959251437, "grad_norm": 0.173828125, "learning_rate": 1.9261104192611042e-05, "loss": 0.4312, "step": 4504 }, { "epoch": 2.720193178388168, "grad_norm": 0.1796875, "learning_rate": 1.9219593192195935e-05, "loss": 0.4838, "step": 4505 }, { "epoch": 2.720796860851192, "grad_norm": 0.1953125, "learning_rate": 1.9178082191780822e-05, "loss": 0.471, "step": 4506 }, { "epoch": 2.7214005433142168, "grad_norm": 0.1865234375, "learning_rate": 1.913657119136571e-05, "loss": 0.4915, "step": 4507 }, { "epoch": 2.7220042257772414, "grad_norm": 0.173828125, "learning_rate": 1.9095060190950605e-05, "loss": 0.3882, "step": 4508 }, { "epoch": 2.7226079082402657, "grad_norm": 0.1962890625, "learning_rate": 1.9053549190535495e-05, "loss": 0.3019, "step": 4509 }, { "epoch": 2.72321159070329, "grad_norm": 0.2080078125, "learning_rate": 1.901203819012038e-05, "loss": 0.3022, "step": 4510 }, { "epoch": 2.7238152731663146, "grad_norm": 0.2158203125, "learning_rate": 1.897052718970527e-05, "loss": 0.3075, "step": 4511 }, { "epoch": 2.7244189556293392, "grad_norm": 0.205078125, "learning_rate": 1.8929016189290164e-05, "loss": 0.2109, "step": 4512 }, { "epoch": 2.7250226380923634, "grad_norm": 0.146484375, "learning_rate": 1.888750518887505e-05, "loss": 0.6597, "step": 4513 }, { "epoch": 2.7256263205553877, "grad_norm": 0.1318359375, "learning_rate": 1.884599418845994e-05, "loss": 0.5063, "step": 4514 }, { "epoch": 2.7262300030184123, "grad_norm": 0.171875, "learning_rate": 1.8804483188044834e-05, "loss": 0.722, "step": 4515 }, { "epoch": 2.726833685481437, "grad_norm": 0.14453125, "learning_rate": 1.8762972187629724e-05, "loss": 0.733, "step": 4516 }, { "epoch": 2.7274373679444612, "grad_norm": 0.1337890625, "learning_rate": 1.872146118721461e-05, "loss": 0.5692, "step": 4517 }, { "epoch": 2.7280410504074855, "grad_norm": 0.1572265625, "learning_rate": 1.8679950186799504e-05, "loss": 0.5854, "step": 4518 }, { "epoch": 2.72864473287051, "grad_norm": 0.1904296875, "learning_rate": 1.8638439186384393e-05, "loss": 0.4641, "step": 4519 }, { "epoch": 2.729248415333535, "grad_norm": 0.1435546875, "learning_rate": 1.8596928185969283e-05, "loss": 0.5903, "step": 4520 }, { "epoch": 2.729852097796559, "grad_norm": 0.1484375, "learning_rate": 1.8555417185554173e-05, "loss": 0.5939, "step": 4521 }, { "epoch": 2.7304557802595832, "grad_norm": 0.21484375, "learning_rate": 1.8513906185139063e-05, "loss": 0.6747, "step": 4522 }, { "epoch": 2.731059462722608, "grad_norm": 0.142578125, "learning_rate": 1.8472395184723953e-05, "loss": 0.6233, "step": 4523 }, { "epoch": 2.7316631451856326, "grad_norm": 0.142578125, "learning_rate": 1.8430884184308843e-05, "loss": 0.5499, "step": 4524 }, { "epoch": 2.732266827648657, "grad_norm": 0.15234375, "learning_rate": 1.8389373183893733e-05, "loss": 0.5983, "step": 4525 }, { "epoch": 2.732870510111681, "grad_norm": 0.154296875, "learning_rate": 1.8347862183478623e-05, "loss": 0.7181, "step": 4526 }, { "epoch": 2.7334741925747057, "grad_norm": 0.15625, "learning_rate": 1.8306351183063512e-05, "loss": 0.5598, "step": 4527 }, { "epoch": 2.7340778750377304, "grad_norm": 0.1494140625, "learning_rate": 1.8264840182648402e-05, "loss": 0.6007, "step": 4528 }, { "epoch": 2.7346815575007546, "grad_norm": 0.1455078125, "learning_rate": 1.8223329182233292e-05, "loss": 0.5836, "step": 4529 }, { "epoch": 2.735285239963779, "grad_norm": 0.1455078125, "learning_rate": 1.8181818181818182e-05, "loss": 0.5936, "step": 4530 }, { "epoch": 2.7358889224268035, "grad_norm": 0.1416015625, "learning_rate": 1.8140307181403075e-05, "loss": 0.8953, "step": 4531 }, { "epoch": 2.736492604889828, "grad_norm": 0.142578125, "learning_rate": 1.8098796180987962e-05, "loss": 0.5644, "step": 4532 }, { "epoch": 2.7370962873528524, "grad_norm": 0.1611328125, "learning_rate": 1.805728518057285e-05, "loss": 0.7689, "step": 4533 }, { "epoch": 2.7376999698158766, "grad_norm": 0.15625, "learning_rate": 1.8015774180157745e-05, "loss": 0.7984, "step": 4534 }, { "epoch": 2.7383036522789013, "grad_norm": 0.17578125, "learning_rate": 1.797426317974263e-05, "loss": 0.7707, "step": 4535 }, { "epoch": 2.738907334741926, "grad_norm": 0.1513671875, "learning_rate": 1.793275217932752e-05, "loss": 0.7229, "step": 4536 }, { "epoch": 2.73951101720495, "grad_norm": 0.1474609375, "learning_rate": 1.7891241178912415e-05, "loss": 0.9094, "step": 4537 }, { "epoch": 2.7401146996679744, "grad_norm": 0.1552734375, "learning_rate": 1.7849730178497304e-05, "loss": 0.6551, "step": 4538 }, { "epoch": 2.740718382130999, "grad_norm": 0.1611328125, "learning_rate": 1.780821917808219e-05, "loss": 0.696, "step": 4539 }, { "epoch": 2.7413220645940237, "grad_norm": 0.1552734375, "learning_rate": 1.776670817766708e-05, "loss": 0.7026, "step": 4540 }, { "epoch": 2.741925747057048, "grad_norm": 0.1552734375, "learning_rate": 1.7725197177251974e-05, "loss": 0.6059, "step": 4541 }, { "epoch": 2.742529429520072, "grad_norm": 0.1396484375, "learning_rate": 1.7683686176836864e-05, "loss": 0.6566, "step": 4542 }, { "epoch": 2.743133111983097, "grad_norm": 0.1455078125, "learning_rate": 1.764217517642175e-05, "loss": 0.6337, "step": 4543 }, { "epoch": 2.7437367944461215, "grad_norm": 0.1396484375, "learning_rate": 1.7600664176006644e-05, "loss": 0.512, "step": 4544 }, { "epoch": 2.7443404769091457, "grad_norm": 0.1357421875, "learning_rate": 1.7559153175591534e-05, "loss": 0.4839, "step": 4545 }, { "epoch": 2.7449441593721704, "grad_norm": 0.1787109375, "learning_rate": 1.751764217517642e-05, "loss": 0.5621, "step": 4546 }, { "epoch": 2.7455478418351946, "grad_norm": 0.1455078125, "learning_rate": 1.7476131174761313e-05, "loss": 0.5687, "step": 4547 }, { "epoch": 2.7461515242982193, "grad_norm": 0.142578125, "learning_rate": 1.7434620174346203e-05, "loss": 0.5606, "step": 4548 }, { "epoch": 2.7467552067612435, "grad_norm": 0.1455078125, "learning_rate": 1.7393109173931093e-05, "loss": 0.5483, "step": 4549 }, { "epoch": 2.747358889224268, "grad_norm": 0.150390625, "learning_rate": 1.7351598173515983e-05, "loss": 0.4612, "step": 4550 }, { "epoch": 2.7479625716872924, "grad_norm": 0.158203125, "learning_rate": 1.7310087173100873e-05, "loss": 0.5156, "step": 4551 }, { "epoch": 2.748566254150317, "grad_norm": 0.1630859375, "learning_rate": 1.7268576172685763e-05, "loss": 0.5005, "step": 4552 }, { "epoch": 2.7491699366133413, "grad_norm": 0.1630859375, "learning_rate": 1.7227065172270653e-05, "loss": 0.454, "step": 4553 }, { "epoch": 2.749773619076366, "grad_norm": 0.166015625, "learning_rate": 1.7185554171855542e-05, "loss": 0.5291, "step": 4554 }, { "epoch": 2.75037730153939, "grad_norm": 0.18359375, "learning_rate": 1.7144043171440432e-05, "loss": 0.5026, "step": 4555 }, { "epoch": 2.750980984002415, "grad_norm": 0.189453125, "learning_rate": 1.7102532171025322e-05, "loss": 0.4815, "step": 4556 }, { "epoch": 2.751584666465439, "grad_norm": 0.1904296875, "learning_rate": 1.7061021170610212e-05, "loss": 0.4753, "step": 4557 }, { "epoch": 2.7521883489284638, "grad_norm": 0.1806640625, "learning_rate": 1.7019510170195102e-05, "loss": 0.4151, "step": 4558 }, { "epoch": 2.752792031391488, "grad_norm": 0.2197265625, "learning_rate": 1.6977999169779992e-05, "loss": 0.3104, "step": 4559 }, { "epoch": 2.7533957138545126, "grad_norm": 0.203125, "learning_rate": 1.6936488169364885e-05, "loss": 0.3303, "step": 4560 }, { "epoch": 2.753999396317537, "grad_norm": 0.203125, "learning_rate": 1.689497716894977e-05, "loss": 0.308, "step": 4561 }, { "epoch": 2.7546030787805615, "grad_norm": 0.2197265625, "learning_rate": 1.685346616853466e-05, "loss": 0.2067, "step": 4562 }, { "epoch": 2.7552067612435858, "grad_norm": 0.1357421875, "learning_rate": 1.6811955168119555e-05, "loss": 1.0449, "step": 4563 }, { "epoch": 2.7558104437066104, "grad_norm": 0.1435546875, "learning_rate": 1.6770444167704444e-05, "loss": 0.6562, "step": 4564 }, { "epoch": 2.7564141261696347, "grad_norm": 0.13671875, "learning_rate": 1.672893316728933e-05, "loss": 0.5127, "step": 4565 }, { "epoch": 2.7570178086326593, "grad_norm": 0.146484375, "learning_rate": 1.6687422166874224e-05, "loss": 0.6374, "step": 4566 }, { "epoch": 2.7576214910956836, "grad_norm": 0.1513671875, "learning_rate": 1.6645911166459114e-05, "loss": 0.7662, "step": 4567 }, { "epoch": 2.758225173558708, "grad_norm": 0.1435546875, "learning_rate": 1.6604400166044e-05, "loss": 0.7541, "step": 4568 }, { "epoch": 2.7588288560217324, "grad_norm": 0.16796875, "learning_rate": 1.656288916562889e-05, "loss": 0.735, "step": 4569 }, { "epoch": 2.759432538484757, "grad_norm": 0.1591796875, "learning_rate": 1.6521378165213784e-05, "loss": 0.59, "step": 4570 }, { "epoch": 2.7600362209477813, "grad_norm": 0.138671875, "learning_rate": 1.6479867164798674e-05, "loss": 0.5174, "step": 4571 }, { "epoch": 2.760639903410806, "grad_norm": 0.140625, "learning_rate": 1.643835616438356e-05, "loss": 0.5364, "step": 4572 }, { "epoch": 2.7612435858738302, "grad_norm": 0.150390625, "learning_rate": 1.6396845163968453e-05, "loss": 0.6259, "step": 4573 }, { "epoch": 2.761847268336855, "grad_norm": 0.1455078125, "learning_rate": 1.6355334163553343e-05, "loss": 0.8076, "step": 4574 }, { "epoch": 2.762450950799879, "grad_norm": 0.1533203125, "learning_rate": 1.6313823163138233e-05, "loss": 0.5766, "step": 4575 }, { "epoch": 2.763054633262904, "grad_norm": 0.1357421875, "learning_rate": 1.6272312162723123e-05, "loss": 0.6713, "step": 4576 }, { "epoch": 2.763658315725928, "grad_norm": 0.1513671875, "learning_rate": 1.6230801162308013e-05, "loss": 0.6865, "step": 4577 }, { "epoch": 2.7642619981889527, "grad_norm": 0.1513671875, "learning_rate": 1.6189290161892903e-05, "loss": 0.8193, "step": 4578 }, { "epoch": 2.764865680651977, "grad_norm": 0.146484375, "learning_rate": 1.6147779161477793e-05, "loss": 0.5764, "step": 4579 }, { "epoch": 2.7654693631150016, "grad_norm": 0.1552734375, "learning_rate": 1.6106268161062682e-05, "loss": 0.7132, "step": 4580 }, { "epoch": 2.766073045578026, "grad_norm": 0.1416015625, "learning_rate": 1.6064757160647572e-05, "loss": 0.7006, "step": 4581 }, { "epoch": 2.7666767280410505, "grad_norm": 0.169921875, "learning_rate": 1.6023246160232462e-05, "loss": 0.62, "step": 4582 }, { "epoch": 2.7672804105040747, "grad_norm": 0.146484375, "learning_rate": 1.5981735159817352e-05, "loss": 0.5348, "step": 4583 }, { "epoch": 2.7678840929670994, "grad_norm": 0.14453125, "learning_rate": 1.5940224159402242e-05, "loss": 0.6188, "step": 4584 }, { "epoch": 2.768487775430124, "grad_norm": 0.15625, "learning_rate": 1.5898713158987132e-05, "loss": 0.6714, "step": 4585 }, { "epoch": 2.7690914578931483, "grad_norm": 0.1474609375, "learning_rate": 1.585720215857202e-05, "loss": 0.7338, "step": 4586 }, { "epoch": 2.7696951403561725, "grad_norm": 0.15625, "learning_rate": 1.581569115815691e-05, "loss": 0.5708, "step": 4587 }, { "epoch": 2.770298822819197, "grad_norm": 0.15234375, "learning_rate": 1.57741801577418e-05, "loss": 0.6864, "step": 4588 }, { "epoch": 2.770902505282222, "grad_norm": 0.1328125, "learning_rate": 1.5732669157326695e-05, "loss": 0.57, "step": 4589 }, { "epoch": 2.771506187745246, "grad_norm": 0.158203125, "learning_rate": 1.569115815691158e-05, "loss": 0.5914, "step": 4590 }, { "epoch": 2.7721098702082703, "grad_norm": 0.1591796875, "learning_rate": 1.564964715649647e-05, "loss": 0.9134, "step": 4591 }, { "epoch": 2.772713552671295, "grad_norm": 0.365234375, "learning_rate": 1.5608136156081364e-05, "loss": 0.6086, "step": 4592 }, { "epoch": 2.7733172351343196, "grad_norm": 0.154296875, "learning_rate": 1.5566625155666254e-05, "loss": 0.7351, "step": 4593 }, { "epoch": 2.773920917597344, "grad_norm": 0.1484375, "learning_rate": 1.552511415525114e-05, "loss": 0.8879, "step": 4594 }, { "epoch": 2.774524600060368, "grad_norm": 0.150390625, "learning_rate": 1.548360315483603e-05, "loss": 0.8547, "step": 4595 }, { "epoch": 2.7751282825233927, "grad_norm": 0.134765625, "learning_rate": 1.5442092154420924e-05, "loss": 0.5326, "step": 4596 }, { "epoch": 2.7757319649864174, "grad_norm": 0.1455078125, "learning_rate": 1.540058115400581e-05, "loss": 0.4954, "step": 4597 }, { "epoch": 2.7763356474494416, "grad_norm": 0.16015625, "learning_rate": 1.53590701535907e-05, "loss": 0.6014, "step": 4598 }, { "epoch": 2.776939329912466, "grad_norm": 0.158203125, "learning_rate": 1.5317559153175593e-05, "loss": 0.5491, "step": 4599 }, { "epoch": 2.7775430123754905, "grad_norm": 0.1650390625, "learning_rate": 1.5276048152760483e-05, "loss": 0.5466, "step": 4600 }, { "epoch": 2.778146694838515, "grad_norm": 0.1630859375, "learning_rate": 1.5234537152345371e-05, "loss": 0.508, "step": 4601 }, { "epoch": 2.7787503773015394, "grad_norm": 0.1728515625, "learning_rate": 1.5193026151930261e-05, "loss": 0.5131, "step": 4602 }, { "epoch": 2.7793540597645636, "grad_norm": 0.169921875, "learning_rate": 1.5151515151515153e-05, "loss": 0.4846, "step": 4603 }, { "epoch": 2.7799577422275883, "grad_norm": 0.1767578125, "learning_rate": 1.5110004151100043e-05, "loss": 0.5484, "step": 4604 }, { "epoch": 2.780561424690613, "grad_norm": 0.181640625, "learning_rate": 1.5068493150684931e-05, "loss": 0.4347, "step": 4605 }, { "epoch": 2.781165107153637, "grad_norm": 0.193359375, "learning_rate": 1.5026982150269822e-05, "loss": 0.4263, "step": 4606 }, { "epoch": 2.7817687896166614, "grad_norm": 0.19140625, "learning_rate": 1.4985471149854712e-05, "loss": 0.4077, "step": 4607 }, { "epoch": 2.782372472079686, "grad_norm": 0.201171875, "learning_rate": 1.49439601494396e-05, "loss": 0.4031, "step": 4608 }, { "epoch": 2.7829761545427107, "grad_norm": 0.2060546875, "learning_rate": 1.4902449149024492e-05, "loss": 0.3571, "step": 4609 }, { "epoch": 2.783579837005735, "grad_norm": 0.20703125, "learning_rate": 1.4860938148609382e-05, "loss": 0.2875, "step": 4610 }, { "epoch": 2.784183519468759, "grad_norm": 0.197265625, "learning_rate": 1.4819427148194274e-05, "loss": 0.257, "step": 4611 }, { "epoch": 2.784787201931784, "grad_norm": 0.2041015625, "learning_rate": 1.4777916147779162e-05, "loss": 0.2157, "step": 4612 }, { "epoch": 2.7853908843948085, "grad_norm": 0.1318359375, "learning_rate": 1.4736405147364052e-05, "loss": 0.7389, "step": 4613 }, { "epoch": 2.7859945668578328, "grad_norm": 0.1630859375, "learning_rate": 1.4694894146948943e-05, "loss": 0.7929, "step": 4614 }, { "epoch": 2.786598249320857, "grad_norm": 0.138671875, "learning_rate": 1.4653383146533833e-05, "loss": 0.6031, "step": 4615 }, { "epoch": 2.7872019317838816, "grad_norm": 0.1396484375, "learning_rate": 1.4611872146118721e-05, "loss": 0.5555, "step": 4616 }, { "epoch": 2.7878056142469063, "grad_norm": 0.1533203125, "learning_rate": 1.4570361145703613e-05, "loss": 0.6512, "step": 4617 }, { "epoch": 2.7884092967099305, "grad_norm": 0.1552734375, "learning_rate": 1.4528850145288503e-05, "loss": 0.6039, "step": 4618 }, { "epoch": 2.7890129791729548, "grad_norm": 0.1494140625, "learning_rate": 1.448733914487339e-05, "loss": 0.6583, "step": 4619 }, { "epoch": 2.7896166616359794, "grad_norm": 0.1455078125, "learning_rate": 1.4445828144458282e-05, "loss": 0.6774, "step": 4620 }, { "epoch": 2.790220344099004, "grad_norm": 0.1396484375, "learning_rate": 1.4404317144043172e-05, "loss": 0.5934, "step": 4621 }, { "epoch": 2.7908240265620283, "grad_norm": 0.1640625, "learning_rate": 1.4362806143628064e-05, "loss": 0.6648, "step": 4622 }, { "epoch": 2.7914277090250526, "grad_norm": 0.130859375, "learning_rate": 1.432129514321295e-05, "loss": 0.5695, "step": 4623 }, { "epoch": 2.792031391488077, "grad_norm": 0.1474609375, "learning_rate": 1.4279784142797842e-05, "loss": 0.9932, "step": 4624 }, { "epoch": 2.792635073951102, "grad_norm": 0.154296875, "learning_rate": 1.4238273142382733e-05, "loss": 0.6688, "step": 4625 }, { "epoch": 2.793238756414126, "grad_norm": 0.169921875, "learning_rate": 1.4196762141967623e-05, "loss": 0.8921, "step": 4626 }, { "epoch": 2.793842438877151, "grad_norm": 0.12890625, "learning_rate": 1.4155251141552511e-05, "loss": 0.8431, "step": 4627 }, { "epoch": 2.794446121340175, "grad_norm": 0.1357421875, "learning_rate": 1.4113740141137401e-05, "loss": 0.6138, "step": 4628 }, { "epoch": 2.7950498038031997, "grad_norm": 0.171875, "learning_rate": 1.4072229140722293e-05, "loss": 0.7836, "step": 4629 }, { "epoch": 2.795653486266224, "grad_norm": 0.14453125, "learning_rate": 1.4030718140307181e-05, "loss": 0.8925, "step": 4630 }, { "epoch": 2.7962571687292486, "grad_norm": 0.15234375, "learning_rate": 1.3989207139892071e-05, "loss": 0.6221, "step": 4631 }, { "epoch": 2.796860851192273, "grad_norm": 0.1513671875, "learning_rate": 1.3947696139476963e-05, "loss": 0.6567, "step": 4632 }, { "epoch": 2.7974645336552975, "grad_norm": 0.146484375, "learning_rate": 1.3906185139061852e-05, "loss": 0.5648, "step": 4633 }, { "epoch": 2.7980682161183217, "grad_norm": 0.158203125, "learning_rate": 1.386467413864674e-05, "loss": 0.6673, "step": 4634 }, { "epoch": 2.7986718985813464, "grad_norm": 0.1435546875, "learning_rate": 1.3823163138231632e-05, "loss": 0.605, "step": 4635 }, { "epoch": 2.7992755810443706, "grad_norm": 0.1396484375, "learning_rate": 1.3781652137816522e-05, "loss": 0.6171, "step": 4636 }, { "epoch": 2.7998792635073952, "grad_norm": 0.1494140625, "learning_rate": 1.3740141137401414e-05, "loss": 0.5335, "step": 4637 }, { "epoch": 2.8004829459704195, "grad_norm": 0.138671875, "learning_rate": 1.3698630136986302e-05, "loss": 0.7067, "step": 4638 }, { "epoch": 2.801086628433444, "grad_norm": 0.1484375, "learning_rate": 1.3657119136571192e-05, "loss": 0.655, "step": 4639 }, { "epoch": 2.8016903108964684, "grad_norm": 0.1435546875, "learning_rate": 1.3615608136156083e-05, "loss": 0.5433, "step": 4640 }, { "epoch": 2.802293993359493, "grad_norm": 0.1474609375, "learning_rate": 1.3574097135740971e-05, "loss": 0.7166, "step": 4641 }, { "epoch": 2.8028976758225173, "grad_norm": 0.171875, "learning_rate": 1.3532586135325861e-05, "loss": 0.6537, "step": 4642 }, { "epoch": 2.803501358285542, "grad_norm": 0.1494140625, "learning_rate": 1.3491075134910753e-05, "loss": 0.6526, "step": 4643 }, { "epoch": 2.804105040748566, "grad_norm": 0.158203125, "learning_rate": 1.3449564134495643e-05, "loss": 0.6646, "step": 4644 }, { "epoch": 2.804708723211591, "grad_norm": 0.1328125, "learning_rate": 1.3408053134080531e-05, "loss": 0.5491, "step": 4645 }, { "epoch": 2.805312405674615, "grad_norm": 0.146484375, "learning_rate": 1.3366542133665422e-05, "loss": 0.5573, "step": 4646 }, { "epoch": 2.8059160881376397, "grad_norm": 0.154296875, "learning_rate": 1.3325031133250312e-05, "loss": 0.5577, "step": 4647 }, { "epoch": 2.806519770600664, "grad_norm": 0.1533203125, "learning_rate": 1.3283520132835204e-05, "loss": 0.5257, "step": 4648 }, { "epoch": 2.8071234530636886, "grad_norm": 0.1630859375, "learning_rate": 1.3242009132420092e-05, "loss": 0.6006, "step": 4649 }, { "epoch": 2.807727135526713, "grad_norm": 0.1630859375, "learning_rate": 1.3200498132004982e-05, "loss": 0.5615, "step": 4650 }, { "epoch": 2.8083308179897375, "grad_norm": 0.15625, "learning_rate": 1.3158987131589873e-05, "loss": 0.5161, "step": 4651 }, { "epoch": 2.8089345004527617, "grad_norm": 0.1630859375, "learning_rate": 1.311747613117476e-05, "loss": 0.5035, "step": 4652 }, { "epoch": 2.8095381829157864, "grad_norm": 0.1650390625, "learning_rate": 1.3075965130759652e-05, "loss": 0.5236, "step": 4653 }, { "epoch": 2.8101418653788106, "grad_norm": 0.1787109375, "learning_rate": 1.3034454130344541e-05, "loss": 0.5802, "step": 4654 }, { "epoch": 2.8107455478418353, "grad_norm": 0.17578125, "learning_rate": 1.2992943129929433e-05, "loss": 0.4602, "step": 4655 }, { "epoch": 2.8113492303048595, "grad_norm": 0.181640625, "learning_rate": 1.2951432129514321e-05, "loss": 0.4859, "step": 4656 }, { "epoch": 2.811952912767884, "grad_norm": 0.1884765625, "learning_rate": 1.2909921129099211e-05, "loss": 0.4449, "step": 4657 }, { "epoch": 2.8125565952309084, "grad_norm": 0.1875, "learning_rate": 1.2868410128684103e-05, "loss": 0.3615, "step": 4658 }, { "epoch": 2.813160277693933, "grad_norm": 0.19921875, "learning_rate": 1.2826899128268992e-05, "loss": 0.325, "step": 4659 }, { "epoch": 2.8137639601569573, "grad_norm": 0.21875, "learning_rate": 1.278538812785388e-05, "loss": 0.3904, "step": 4660 }, { "epoch": 2.814367642619982, "grad_norm": 0.20703125, "learning_rate": 1.2743877127438772e-05, "loss": 0.271, "step": 4661 }, { "epoch": 2.814971325083006, "grad_norm": 0.1953125, "learning_rate": 1.2702366127023662e-05, "loss": 0.2315, "step": 4662 }, { "epoch": 2.815575007546031, "grad_norm": 0.13671875, "learning_rate": 1.266085512660855e-05, "loss": 0.7124, "step": 4663 }, { "epoch": 2.816178690009055, "grad_norm": 0.1533203125, "learning_rate": 1.2619344126193442e-05, "loss": 0.6489, "step": 4664 }, { "epoch": 2.8167823724720797, "grad_norm": 0.15625, "learning_rate": 1.2577833125778332e-05, "loss": 0.966, "step": 4665 }, { "epoch": 2.8173860549351044, "grad_norm": 0.1650390625, "learning_rate": 1.2536322125363223e-05, "loss": 0.7182, "step": 4666 }, { "epoch": 2.8179897373981286, "grad_norm": 0.166015625, "learning_rate": 1.2494811124948113e-05, "loss": 0.6668, "step": 4667 }, { "epoch": 2.818593419861153, "grad_norm": 0.1572265625, "learning_rate": 1.2453300124533001e-05, "loss": 0.6513, "step": 4668 }, { "epoch": 2.8191971023241775, "grad_norm": 0.13671875, "learning_rate": 1.2411789124117893e-05, "loss": 0.5954, "step": 4669 }, { "epoch": 2.819800784787202, "grad_norm": 0.171875, "learning_rate": 1.2370278123702781e-05, "loss": 0.7074, "step": 4670 }, { "epoch": 2.8204044672502264, "grad_norm": 0.1552734375, "learning_rate": 1.2328767123287671e-05, "loss": 0.5661, "step": 4671 }, { "epoch": 2.8210081497132506, "grad_norm": 0.1435546875, "learning_rate": 1.2287256122872562e-05, "loss": 0.6607, "step": 4672 }, { "epoch": 2.8216118321762753, "grad_norm": 0.1630859375, "learning_rate": 1.224574512245745e-05, "loss": 0.7987, "step": 4673 }, { "epoch": 2.8222155146393, "grad_norm": 0.14453125, "learning_rate": 1.2204234122042342e-05, "loss": 0.8645, "step": 4674 }, { "epoch": 2.822819197102324, "grad_norm": 0.140625, "learning_rate": 1.2162723121627232e-05, "loss": 0.4833, "step": 4675 }, { "epoch": 2.8234228795653484, "grad_norm": 0.1513671875, "learning_rate": 1.2121212121212122e-05, "loss": 0.6758, "step": 4676 }, { "epoch": 2.824026562028373, "grad_norm": 0.1484375, "learning_rate": 1.2079701120797012e-05, "loss": 0.6682, "step": 4677 }, { "epoch": 2.8246302444913978, "grad_norm": 0.1474609375, "learning_rate": 1.2038190120381902e-05, "loss": 0.753, "step": 4678 }, { "epoch": 2.825233926954422, "grad_norm": 0.1376953125, "learning_rate": 1.1996679119966792e-05, "loss": 0.4793, "step": 4679 }, { "epoch": 2.825837609417446, "grad_norm": 0.19140625, "learning_rate": 1.1955168119551683e-05, "loss": 0.6239, "step": 4680 }, { "epoch": 2.826441291880471, "grad_norm": 0.15625, "learning_rate": 1.1913657119136571e-05, "loss": 0.607, "step": 4681 }, { "epoch": 2.8270449743434956, "grad_norm": 0.150390625, "learning_rate": 1.1872146118721461e-05, "loss": 0.7043, "step": 4682 }, { "epoch": 2.82764865680652, "grad_norm": 0.1572265625, "learning_rate": 1.1830635118306351e-05, "loss": 0.7399, "step": 4683 }, { "epoch": 2.828252339269544, "grad_norm": 0.1513671875, "learning_rate": 1.1789124117891241e-05, "loss": 0.5987, "step": 4684 }, { "epoch": 2.8288560217325687, "grad_norm": 0.140625, "learning_rate": 1.1747613117476132e-05, "loss": 0.6033, "step": 4685 }, { "epoch": 2.8294597041955933, "grad_norm": 0.1552734375, "learning_rate": 1.170610211706102e-05, "loss": 0.5987, "step": 4686 }, { "epoch": 2.8300633866586176, "grad_norm": 0.1884765625, "learning_rate": 1.1664591116645912e-05, "loss": 0.6025, "step": 4687 }, { "epoch": 2.830667069121642, "grad_norm": 0.142578125, "learning_rate": 1.1623080116230802e-05, "loss": 0.588, "step": 4688 }, { "epoch": 2.8312707515846665, "grad_norm": 0.1572265625, "learning_rate": 1.1581569115815692e-05, "loss": 0.6295, "step": 4689 }, { "epoch": 2.831874434047691, "grad_norm": 0.1552734375, "learning_rate": 1.1540058115400582e-05, "loss": 0.7745, "step": 4690 }, { "epoch": 2.8324781165107153, "grad_norm": 0.1416015625, "learning_rate": 1.1498547114985472e-05, "loss": 0.6095, "step": 4691 }, { "epoch": 2.8330817989737396, "grad_norm": 0.14453125, "learning_rate": 1.1457036114570362e-05, "loss": 0.8522, "step": 4692 } ], "logging_steps": 1, "max_steps": 4968, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 276, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.661106930390131e+19, "train_batch_size": 16, "trial_name": null, "trial_params": null }