diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,24010 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9994890883877088, + "eval_steps": 500, + "global_step": 3424, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005838989854755127, + "grad_norm": 0.332582177741534, + "learning_rate": 5.830903790087464e-08, + "loss": 0.6999, + "step": 1 + }, + { + "epoch": 0.0011677979709510255, + "grad_norm": 0.3615315022509243, + "learning_rate": 1.1661807580174928e-07, + "loss": 0.7009, + "step": 2 + }, + { + "epoch": 0.0017516969564265383, + "grad_norm": 0.31196393628685787, + "learning_rate": 1.7492711370262392e-07, + "loss": 0.6576, + "step": 3 + }, + { + "epoch": 0.002335595941902051, + "grad_norm": 0.3520631421061603, + "learning_rate": 2.3323615160349856e-07, + "loss": 0.6998, + "step": 4 + }, + { + "epoch": 0.0029194949273775635, + "grad_norm": 0.3551115574819664, + "learning_rate": 2.915451895043732e-07, + "loss": 0.7596, + "step": 5 + }, + { + "epoch": 0.0035033939128530766, + "grad_norm": 0.35126782313779803, + "learning_rate": 3.4985422740524783e-07, + "loss": 0.7029, + "step": 6 + }, + { + "epoch": 0.004087292898328589, + "grad_norm": 0.3344061095274702, + "learning_rate": 4.0816326530612243e-07, + "loss": 0.7243, + "step": 7 + }, + { + "epoch": 0.004671191883804102, + "grad_norm": 0.36297819316778795, + "learning_rate": 4.6647230320699713e-07, + "loss": 0.7425, + "step": 8 + }, + { + "epoch": 0.005255090869279615, + "grad_norm": 0.3780309672814191, + "learning_rate": 5.247813411078718e-07, + "loss": 0.7984, + "step": 9 + }, + { + "epoch": 0.005838989854755127, + "grad_norm": 0.33840973852304723, + "learning_rate": 5.830903790087464e-07, + "loss": 0.719, + "step": 10 + }, + { + "epoch": 0.00642288884023064, + "grad_norm": 0.380962644753867, + "learning_rate": 6.413994169096211e-07, + "loss": 0.7446, + "step": 11 + }, + { + "epoch": 0.007006787825706153, + "grad_norm": 0.3345884213995324, + "learning_rate": 6.997084548104957e-07, + "loss": 0.7027, + "step": 12 + }, + { + "epoch": 0.007590686811181665, + "grad_norm": 0.3069686147999862, + "learning_rate": 7.580174927113704e-07, + "loss": 0.6898, + "step": 13 + }, + { + "epoch": 0.008174585796657178, + "grad_norm": 0.3135045316873362, + "learning_rate": 8.163265306122449e-07, + "loss": 0.7456, + "step": 14 + }, + { + "epoch": 0.00875848478213269, + "grad_norm": 0.2609074968081895, + "learning_rate": 8.746355685131196e-07, + "loss": 0.6614, + "step": 15 + }, + { + "epoch": 0.009342383767608204, + "grad_norm": 0.2799330424224384, + "learning_rate": 9.329446064139943e-07, + "loss": 0.6649, + "step": 16 + }, + { + "epoch": 0.009926282753083717, + "grad_norm": 0.25380300320984944, + "learning_rate": 9.91253644314869e-07, + "loss": 0.6715, + "step": 17 + }, + { + "epoch": 0.01051018173855923, + "grad_norm": 0.26485072905334656, + "learning_rate": 1.0495626822157436e-06, + "loss": 0.6852, + "step": 18 + }, + { + "epoch": 0.011094080724034743, + "grad_norm": 0.20548347422113858, + "learning_rate": 1.1078717201166181e-06, + "loss": 0.6713, + "step": 19 + }, + { + "epoch": 0.011677979709510254, + "grad_norm": 0.18057677561829577, + "learning_rate": 1.1661807580174927e-06, + "loss": 0.6136, + "step": 20 + }, + { + "epoch": 0.012261878694985767, + "grad_norm": 0.21671779345112255, + "learning_rate": 1.2244897959183673e-06, + "loss": 0.7028, + "step": 21 + }, + { + 
"epoch": 0.01284577768046128, + "grad_norm": 0.16929860513297237, + "learning_rate": 1.2827988338192421e-06, + "loss": 0.654, + "step": 22 + }, + { + "epoch": 0.013429676665936793, + "grad_norm": 0.17685563778580482, + "learning_rate": 1.3411078717201167e-06, + "loss": 0.6474, + "step": 23 + }, + { + "epoch": 0.014013575651412306, + "grad_norm": 0.1706611185931487, + "learning_rate": 1.3994169096209913e-06, + "loss": 0.636, + "step": 24 + }, + { + "epoch": 0.014597474636887818, + "grad_norm": 0.16620672438961032, + "learning_rate": 1.4577259475218661e-06, + "loss": 0.6209, + "step": 25 + }, + { + "epoch": 0.01518137362236333, + "grad_norm": 0.1602393725989506, + "learning_rate": 1.5160349854227407e-06, + "loss": 0.649, + "step": 26 + }, + { + "epoch": 0.015765272607838846, + "grad_norm": 0.2092349414661533, + "learning_rate": 1.5743440233236153e-06, + "loss": 0.6628, + "step": 27 + }, + { + "epoch": 0.016349171593314355, + "grad_norm": 0.2560487647611822, + "learning_rate": 1.6326530612244897e-06, + "loss": 0.6138, + "step": 28 + }, + { + "epoch": 0.016933070578789868, + "grad_norm": 0.239719439026104, + "learning_rate": 1.6909620991253645e-06, + "loss": 0.6251, + "step": 29 + }, + { + "epoch": 0.01751696956426538, + "grad_norm": 0.2491205523664377, + "learning_rate": 1.7492711370262391e-06, + "loss": 0.635, + "step": 30 + }, + { + "epoch": 0.018100868549740894, + "grad_norm": 0.19890611405577832, + "learning_rate": 1.8075801749271137e-06, + "loss": 0.6084, + "step": 31 + }, + { + "epoch": 0.018684767535216407, + "grad_norm": 0.20911262872446526, + "learning_rate": 1.8658892128279885e-06, + "loss": 0.6457, + "step": 32 + }, + { + "epoch": 0.01926866652069192, + "grad_norm": 0.16394915098725632, + "learning_rate": 1.9241982507288633e-06, + "loss": 0.627, + "step": 33 + }, + { + "epoch": 0.019852565506167433, + "grad_norm": 0.1572828858097562, + "learning_rate": 1.982507288629738e-06, + "loss": 0.6132, + "step": 34 + }, + { + "epoch": 0.020436464491642947, + "grad_norm": 0.1384117125623619, + "learning_rate": 2.0408163265306125e-06, + "loss": 0.5816, + "step": 35 + }, + { + "epoch": 0.02102036347711846, + "grad_norm": 0.1379251430163816, + "learning_rate": 2.099125364431487e-06, + "loss": 0.5639, + "step": 36 + }, + { + "epoch": 0.021604262462593973, + "grad_norm": 0.14531427780293601, + "learning_rate": 2.1574344023323617e-06, + "loss": 0.5934, + "step": 37 + }, + { + "epoch": 0.022188161448069486, + "grad_norm": 0.14198828932680072, + "learning_rate": 2.2157434402332363e-06, + "loss": 0.555, + "step": 38 + }, + { + "epoch": 0.022772060433544995, + "grad_norm": 0.14650974951276136, + "learning_rate": 2.274052478134111e-06, + "loss": 0.5683, + "step": 39 + }, + { + "epoch": 0.02335595941902051, + "grad_norm": 0.15459202863522678, + "learning_rate": 2.3323615160349855e-06, + "loss": 0.5872, + "step": 40 + }, + { + "epoch": 0.02393985840449602, + "grad_norm": 0.14486054702102946, + "learning_rate": 2.39067055393586e-06, + "loss": 0.5589, + "step": 41 + }, + { + "epoch": 0.024523757389971534, + "grad_norm": 0.1376287453256842, + "learning_rate": 2.4489795918367347e-06, + "loss": 0.5402, + "step": 42 + }, + { + "epoch": 0.025107656375447047, + "grad_norm": 0.14738744852971825, + "learning_rate": 2.5072886297376097e-06, + "loss": 0.6036, + "step": 43 + }, + { + "epoch": 0.02569155536092256, + "grad_norm": 0.14943073167071605, + "learning_rate": 2.5655976676384843e-06, + "loss": 0.6129, + "step": 44 + }, + { + "epoch": 0.026275454346398074, + "grad_norm": 0.13302700086928715, + "learning_rate": 
2.6239067055393585e-06, + "loss": 0.5501, + "step": 45 + }, + { + "epoch": 0.026859353331873587, + "grad_norm": 0.12725044493333262, + "learning_rate": 2.6822157434402335e-06, + "loss": 0.5489, + "step": 46 + }, + { + "epoch": 0.0274432523173491, + "grad_norm": 0.13789667607469522, + "learning_rate": 2.740524781341108e-06, + "loss": 0.626, + "step": 47 + }, + { + "epoch": 0.028027151302824613, + "grad_norm": 0.12155102584665793, + "learning_rate": 2.7988338192419827e-06, + "loss": 0.5471, + "step": 48 + }, + { + "epoch": 0.028611050288300126, + "grad_norm": 0.14372921401111807, + "learning_rate": 2.8571428571428573e-06, + "loss": 0.5512, + "step": 49 + }, + { + "epoch": 0.029194949273775635, + "grad_norm": 0.12362142367890944, + "learning_rate": 2.9154518950437323e-06, + "loss": 0.508, + "step": 50 + }, + { + "epoch": 0.02977884825925115, + "grad_norm": 0.11936106531504113, + "learning_rate": 2.9737609329446064e-06, + "loss": 0.5206, + "step": 51 + }, + { + "epoch": 0.03036274724472666, + "grad_norm": 0.1373011935034686, + "learning_rate": 3.0320699708454815e-06, + "loss": 0.6104, + "step": 52 + }, + { + "epoch": 0.030946646230202175, + "grad_norm": 0.13193134690672428, + "learning_rate": 3.090379008746356e-06, + "loss": 0.4985, + "step": 53 + }, + { + "epoch": 0.03153054521567769, + "grad_norm": 0.12304490916181232, + "learning_rate": 3.1486880466472307e-06, + "loss": 0.5745, + "step": 54 + }, + { + "epoch": 0.0321144442011532, + "grad_norm": 0.11616875298030363, + "learning_rate": 3.2069970845481052e-06, + "loss": 0.5546, + "step": 55 + }, + { + "epoch": 0.03269834318662871, + "grad_norm": 0.11681255765858213, + "learning_rate": 3.2653061224489794e-06, + "loss": 0.5208, + "step": 56 + }, + { + "epoch": 0.03328224217210422, + "grad_norm": 0.1178102443082031, + "learning_rate": 3.3236151603498544e-06, + "loss": 0.5262, + "step": 57 + }, + { + "epoch": 0.033866141157579736, + "grad_norm": 0.12023383250240183, + "learning_rate": 3.381924198250729e-06, + "loss": 0.5335, + "step": 58 + }, + { + "epoch": 0.03445004014305525, + "grad_norm": 0.12946145838575623, + "learning_rate": 3.440233236151604e-06, + "loss": 0.5776, + "step": 59 + }, + { + "epoch": 0.03503393912853076, + "grad_norm": 0.12554895642813893, + "learning_rate": 3.4985422740524782e-06, + "loss": 0.5561, + "step": 60 + }, + { + "epoch": 0.035617838114006276, + "grad_norm": 0.12262268728322226, + "learning_rate": 3.5568513119533532e-06, + "loss": 0.5492, + "step": 61 + }, + { + "epoch": 0.03620173709948179, + "grad_norm": 0.12471388952057888, + "learning_rate": 3.6151603498542274e-06, + "loss": 0.5974, + "step": 62 + }, + { + "epoch": 0.0367856360849573, + "grad_norm": 0.1279487568488648, + "learning_rate": 3.6734693877551024e-06, + "loss": 0.5931, + "step": 63 + }, + { + "epoch": 0.037369535070432815, + "grad_norm": 0.12410639472409644, + "learning_rate": 3.731778425655977e-06, + "loss": 0.5764, + "step": 64 + }, + { + "epoch": 0.03795343405590833, + "grad_norm": 0.13709798625298875, + "learning_rate": 3.790087463556852e-06, + "loss": 0.5405, + "step": 65 + }, + { + "epoch": 0.03853733304138384, + "grad_norm": 0.12178713376984483, + "learning_rate": 3.848396501457727e-06, + "loss": 0.5085, + "step": 66 + }, + { + "epoch": 0.039121232026859354, + "grad_norm": 0.13312657848575002, + "learning_rate": 3.906705539358601e-06, + "loss": 0.5353, + "step": 67 + }, + { + "epoch": 0.03970513101233487, + "grad_norm": 0.12493424985237252, + "learning_rate": 3.965014577259476e-06, + "loss": 0.5322, + "step": 68 + }, + { + "epoch": 
0.04028902999781038, + "grad_norm": 0.1205078113617299, + "learning_rate": 4.02332361516035e-06, + "loss": 0.5433, + "step": 69 + }, + { + "epoch": 0.04087292898328589, + "grad_norm": 0.12588846597963127, + "learning_rate": 4.081632653061225e-06, + "loss": 0.5223, + "step": 70 + }, + { + "epoch": 0.041456827968761406, + "grad_norm": 0.1192064498475329, + "learning_rate": 4.139941690962099e-06, + "loss": 0.5114, + "step": 71 + }, + { + "epoch": 0.04204072695423692, + "grad_norm": 0.11434727869286143, + "learning_rate": 4.198250728862974e-06, + "loss": 0.477, + "step": 72 + }, + { + "epoch": 0.04262462593971243, + "grad_norm": 0.12562116222538355, + "learning_rate": 4.256559766763848e-06, + "loss": 0.5215, + "step": 73 + }, + { + "epoch": 0.043208524925187945, + "grad_norm": 0.12566878145398752, + "learning_rate": 4.314868804664723e-06, + "loss": 0.5435, + "step": 74 + }, + { + "epoch": 0.04379242391066346, + "grad_norm": 0.12263132583356026, + "learning_rate": 4.3731778425655976e-06, + "loss": 0.5364, + "step": 75 + }, + { + "epoch": 0.04437632289613897, + "grad_norm": 0.11346603861235804, + "learning_rate": 4.431486880466473e-06, + "loss": 0.5006, + "step": 76 + }, + { + "epoch": 0.04496022188161448, + "grad_norm": 0.11195519797898033, + "learning_rate": 4.489795918367348e-06, + "loss": 0.5123, + "step": 77 + }, + { + "epoch": 0.04554412086708999, + "grad_norm": 0.12025685764890878, + "learning_rate": 4.548104956268222e-06, + "loss": 0.5142, + "step": 78 + }, + { + "epoch": 0.046128019852565504, + "grad_norm": 0.1224784663975255, + "learning_rate": 4.606413994169097e-06, + "loss": 0.4846, + "step": 79 + }, + { + "epoch": 0.04671191883804102, + "grad_norm": 0.12015130607330382, + "learning_rate": 4.664723032069971e-06, + "loss": 0.5692, + "step": 80 + }, + { + "epoch": 0.04729581782351653, + "grad_norm": 0.12406650199200392, + "learning_rate": 4.723032069970846e-06, + "loss": 0.6212, + "step": 81 + }, + { + "epoch": 0.04787971680899204, + "grad_norm": 0.11688273590591051, + "learning_rate": 4.78134110787172e-06, + "loss": 0.5409, + "step": 82 + }, + { + "epoch": 0.048463615794467556, + "grad_norm": 0.12230762563265353, + "learning_rate": 4.839650145772595e-06, + "loss": 0.5432, + "step": 83 + }, + { + "epoch": 0.04904751477994307, + "grad_norm": 0.1275313607712504, + "learning_rate": 4.897959183673469e-06, + "loss": 0.5699, + "step": 84 + }, + { + "epoch": 0.04963141376541858, + "grad_norm": 0.1204465221711168, + "learning_rate": 4.956268221574344e-06, + "loss": 0.5528, + "step": 85 + }, + { + "epoch": 0.050215312750894095, + "grad_norm": 0.12643827231830812, + "learning_rate": 5.014577259475219e-06, + "loss": 0.5979, + "step": 86 + }, + { + "epoch": 0.05079921173636961, + "grad_norm": 0.1176641285310529, + "learning_rate": 5.0728862973760935e-06, + "loss": 0.4808, + "step": 87 + }, + { + "epoch": 0.05138311072184512, + "grad_norm": 0.11361176460052108, + "learning_rate": 5.1311953352769686e-06, + "loss": 0.5, + "step": 88 + }, + { + "epoch": 0.051967009707320634, + "grad_norm": 0.11502890469580693, + "learning_rate": 5.189504373177843e-06, + "loss": 0.5322, + "step": 89 + }, + { + "epoch": 0.05255090869279615, + "grad_norm": 0.12885510850361562, + "learning_rate": 5.247813411078717e-06, + "loss": 0.5467, + "step": 90 + }, + { + "epoch": 0.05313480767827166, + "grad_norm": 0.12151012718224047, + "learning_rate": 5.306122448979593e-06, + "loss": 0.5169, + "step": 91 + }, + { + "epoch": 0.05371870666374717, + "grad_norm": 0.12758162352192182, + "learning_rate": 5.364431486880467e-06, + 
"loss": 0.5266, + "step": 92 + }, + { + "epoch": 0.054302605649222686, + "grad_norm": 0.12001742706805828, + "learning_rate": 5.422740524781341e-06, + "loss": 0.4928, + "step": 93 + }, + { + "epoch": 0.0548865046346982, + "grad_norm": 0.12225258857579692, + "learning_rate": 5.481049562682216e-06, + "loss": 0.5229, + "step": 94 + }, + { + "epoch": 0.05547040362017371, + "grad_norm": 0.1152421603670446, + "learning_rate": 5.539358600583091e-06, + "loss": 0.4899, + "step": 95 + }, + { + "epoch": 0.056054302605649226, + "grad_norm": 0.13106739527831543, + "learning_rate": 5.597667638483965e-06, + "loss": 0.49, + "step": 96 + }, + { + "epoch": 0.05663820159112474, + "grad_norm": 0.11876453588451588, + "learning_rate": 5.65597667638484e-06, + "loss": 0.5807, + "step": 97 + }, + { + "epoch": 0.05722210057660025, + "grad_norm": 0.1260609220976984, + "learning_rate": 5.7142857142857145e-06, + "loss": 0.5286, + "step": 98 + }, + { + "epoch": 0.05780599956207576, + "grad_norm": 0.12484182169112895, + "learning_rate": 5.7725947521865895e-06, + "loss": 0.517, + "step": 99 + }, + { + "epoch": 0.05838989854755127, + "grad_norm": 0.11886606751056505, + "learning_rate": 5.8309037900874645e-06, + "loss": 0.4703, + "step": 100 + }, + { + "epoch": 0.058973797533026784, + "grad_norm": 0.1305079203618463, + "learning_rate": 5.889212827988339e-06, + "loss": 0.5341, + "step": 101 + }, + { + "epoch": 0.0595576965185023, + "grad_norm": 0.11959851382087784, + "learning_rate": 5.947521865889213e-06, + "loss": 0.5235, + "step": 102 + }, + { + "epoch": 0.06014159550397781, + "grad_norm": 0.12343380355404973, + "learning_rate": 6.005830903790088e-06, + "loss": 0.4685, + "step": 103 + }, + { + "epoch": 0.06072549448945332, + "grad_norm": 0.12075598983303103, + "learning_rate": 6.064139941690963e-06, + "loss": 0.5033, + "step": 104 + }, + { + "epoch": 0.061309393474928836, + "grad_norm": 0.1167137230362061, + "learning_rate": 6.122448979591837e-06, + "loss": 0.5241, + "step": 105 + }, + { + "epoch": 0.06189329246040435, + "grad_norm": 0.11672844204405812, + "learning_rate": 6.180758017492712e-06, + "loss": 0.4872, + "step": 106 + }, + { + "epoch": 0.06247719144587986, + "grad_norm": 0.12760486809800367, + "learning_rate": 6.239067055393586e-06, + "loss": 0.5111, + "step": 107 + }, + { + "epoch": 0.06306109043135538, + "grad_norm": 0.12450897467876869, + "learning_rate": 6.297376093294461e-06, + "loss": 0.5063, + "step": 108 + }, + { + "epoch": 0.06364498941683089, + "grad_norm": 0.12309635552630531, + "learning_rate": 6.355685131195336e-06, + "loss": 0.5014, + "step": 109 + }, + { + "epoch": 0.0642288884023064, + "grad_norm": 0.12234066899553464, + "learning_rate": 6.4139941690962105e-06, + "loss": 0.4859, + "step": 110 + }, + { + "epoch": 0.06481278738778191, + "grad_norm": 0.13277130246339422, + "learning_rate": 6.472303206997085e-06, + "loss": 0.5136, + "step": 111 + }, + { + "epoch": 0.06539668637325742, + "grad_norm": 0.13016482225809503, + "learning_rate": 6.530612244897959e-06, + "loss": 0.5087, + "step": 112 + }, + { + "epoch": 0.06598058535873294, + "grad_norm": 0.1243907074015154, + "learning_rate": 6.588921282798835e-06, + "loss": 0.5005, + "step": 113 + }, + { + "epoch": 0.06656448434420845, + "grad_norm": 0.1192950552887921, + "learning_rate": 6.647230320699709e-06, + "loss": 0.5017, + "step": 114 + }, + { + "epoch": 0.06714838332968397, + "grad_norm": 0.1291611400972062, + "learning_rate": 6.705539358600584e-06, + "loss": 0.4999, + "step": 115 + }, + { + "epoch": 0.06773228231515947, + "grad_norm": 
0.12645224706155894, + "learning_rate": 6.763848396501458e-06, + "loss": 0.5065, + "step": 116 + }, + { + "epoch": 0.06831618130063499, + "grad_norm": 0.1184522539064496, + "learning_rate": 6.822157434402333e-06, + "loss": 0.5572, + "step": 117 + }, + { + "epoch": 0.0689000802861105, + "grad_norm": 0.14241104843035735, + "learning_rate": 6.880466472303208e-06, + "loss": 0.5268, + "step": 118 + }, + { + "epoch": 0.06948397927158602, + "grad_norm": 0.12389568827667287, + "learning_rate": 6.938775510204082e-06, + "loss": 0.4814, + "step": 119 + }, + { + "epoch": 0.07006787825706152, + "grad_norm": 0.12255659504852189, + "learning_rate": 6.9970845481049564e-06, + "loss": 0.4784, + "step": 120 + }, + { + "epoch": 0.07065177724253704, + "grad_norm": 0.12385767818111074, + "learning_rate": 7.055393586005832e-06, + "loss": 0.5114, + "step": 121 + }, + { + "epoch": 0.07123567622801255, + "grad_norm": 0.1242479410803753, + "learning_rate": 7.1137026239067065e-06, + "loss": 0.5286, + "step": 122 + }, + { + "epoch": 0.07181957521348807, + "grad_norm": 0.12069225013240867, + "learning_rate": 7.172011661807581e-06, + "loss": 0.4962, + "step": 123 + }, + { + "epoch": 0.07240347419896358, + "grad_norm": 0.13312820172082093, + "learning_rate": 7.230320699708455e-06, + "loss": 0.5188, + "step": 124 + }, + { + "epoch": 0.0729873731844391, + "grad_norm": 0.1711945728595143, + "learning_rate": 7.28862973760933e-06, + "loss": 0.57, + "step": 125 + }, + { + "epoch": 0.0735712721699146, + "grad_norm": 0.1194956898594109, + "learning_rate": 7.346938775510205e-06, + "loss": 0.473, + "step": 126 + }, + { + "epoch": 0.07415517115539012, + "grad_norm": 0.11652570987834152, + "learning_rate": 7.40524781341108e-06, + "loss": 0.5096, + "step": 127 + }, + { + "epoch": 0.07473907014086563, + "grad_norm": 0.1257804593504759, + "learning_rate": 7.463556851311954e-06, + "loss": 0.506, + "step": 128 + }, + { + "epoch": 0.07532296912634115, + "grad_norm": 0.13585415092568312, + "learning_rate": 7.521865889212828e-06, + "loss": 0.542, + "step": 129 + }, + { + "epoch": 0.07590686811181666, + "grad_norm": 0.13148882109410992, + "learning_rate": 7.580174927113704e-06, + "loss": 0.547, + "step": 130 + }, + { + "epoch": 0.07649076709729216, + "grad_norm": 0.13169721829496428, + "learning_rate": 7.638483965014577e-06, + "loss": 0.4995, + "step": 131 + }, + { + "epoch": 0.07707466608276768, + "grad_norm": 0.12365384850689345, + "learning_rate": 7.696793002915453e-06, + "loss": 0.5357, + "step": 132 + }, + { + "epoch": 0.07765856506824319, + "grad_norm": 0.12532078419615827, + "learning_rate": 7.755102040816327e-06, + "loss": 0.548, + "step": 133 + }, + { + "epoch": 0.07824246405371871, + "grad_norm": 0.12627432789135531, + "learning_rate": 7.813411078717202e-06, + "loss": 0.5003, + "step": 134 + }, + { + "epoch": 0.07882636303919421, + "grad_norm": 0.12600863655776395, + "learning_rate": 7.871720116618077e-06, + "loss": 0.5168, + "step": 135 + }, + { + "epoch": 0.07941026202466973, + "grad_norm": 0.12299139663110784, + "learning_rate": 7.930029154518952e-06, + "loss": 0.4521, + "step": 136 + }, + { + "epoch": 0.07999416101014524, + "grad_norm": 0.12730729199654625, + "learning_rate": 7.988338192419826e-06, + "loss": 0.528, + "step": 137 + }, + { + "epoch": 0.08057805999562076, + "grad_norm": 0.1240355289813488, + "learning_rate": 8.0466472303207e-06, + "loss": 0.5065, + "step": 138 + }, + { + "epoch": 0.08116195898109627, + "grad_norm": 0.1256897845028877, + "learning_rate": 8.104956268221576e-06, + "loss": 0.5532, + "step": 139 + }, + 
{ + "epoch": 0.08174585796657179, + "grad_norm": 0.12432521647787001, + "learning_rate": 8.16326530612245e-06, + "loss": 0.5206, + "step": 140 + }, + { + "epoch": 0.08232975695204729, + "grad_norm": 0.10943903042980331, + "learning_rate": 8.221574344023324e-06, + "loss": 0.4588, + "step": 141 + }, + { + "epoch": 0.08291365593752281, + "grad_norm": 0.11792429604633306, + "learning_rate": 8.279883381924198e-06, + "loss": 0.4474, + "step": 142 + }, + { + "epoch": 0.08349755492299832, + "grad_norm": 0.12879678321331278, + "learning_rate": 8.338192419825074e-06, + "loss": 0.5226, + "step": 143 + }, + { + "epoch": 0.08408145390847384, + "grad_norm": 0.11930723023319373, + "learning_rate": 8.396501457725948e-06, + "loss": 0.5217, + "step": 144 + }, + { + "epoch": 0.08466535289394934, + "grad_norm": 0.13145990621232267, + "learning_rate": 8.454810495626823e-06, + "loss": 0.4961, + "step": 145 + }, + { + "epoch": 0.08524925187942486, + "grad_norm": 0.11927667168830863, + "learning_rate": 8.513119533527697e-06, + "loss": 0.4434, + "step": 146 + }, + { + "epoch": 0.08583315086490037, + "grad_norm": 0.11899774147458363, + "learning_rate": 8.571428571428571e-06, + "loss": 0.4524, + "step": 147 + }, + { + "epoch": 0.08641704985037589, + "grad_norm": 0.11503714794308208, + "learning_rate": 8.629737609329447e-06, + "loss": 0.4522, + "step": 148 + }, + { + "epoch": 0.0870009488358514, + "grad_norm": 0.12335780537495883, + "learning_rate": 8.688046647230321e-06, + "loss": 0.4933, + "step": 149 + }, + { + "epoch": 0.08758484782132692, + "grad_norm": 0.11031905923722546, + "learning_rate": 8.746355685131195e-06, + "loss": 0.4447, + "step": 150 + }, + { + "epoch": 0.08816874680680242, + "grad_norm": 0.1203460815155038, + "learning_rate": 8.804664723032071e-06, + "loss": 0.4715, + "step": 151 + }, + { + "epoch": 0.08875264579227794, + "grad_norm": 0.12614464363737826, + "learning_rate": 8.862973760932945e-06, + "loss": 0.5163, + "step": 152 + }, + { + "epoch": 0.08933654477775345, + "grad_norm": 0.1164138441468125, + "learning_rate": 8.921282798833821e-06, + "loss": 0.4632, + "step": 153 + }, + { + "epoch": 0.08992044376322895, + "grad_norm": 0.1274066859341455, + "learning_rate": 8.979591836734695e-06, + "loss": 0.4815, + "step": 154 + }, + { + "epoch": 0.09050434274870447, + "grad_norm": 0.12157008390725106, + "learning_rate": 9.03790087463557e-06, + "loss": 0.4649, + "step": 155 + }, + { + "epoch": 0.09108824173417998, + "grad_norm": 0.12547292058721102, + "learning_rate": 9.096209912536444e-06, + "loss": 0.5295, + "step": 156 + }, + { + "epoch": 0.0916721407196555, + "grad_norm": 0.1212897637510307, + "learning_rate": 9.15451895043732e-06, + "loss": 0.5135, + "step": 157 + }, + { + "epoch": 0.09225603970513101, + "grad_norm": 0.12602734677910782, + "learning_rate": 9.212827988338194e-06, + "loss": 0.5111, + "step": 158 + }, + { + "epoch": 0.09283993869060653, + "grad_norm": 0.11963674242486431, + "learning_rate": 9.271137026239068e-06, + "loss": 0.4987, + "step": 159 + }, + { + "epoch": 0.09342383767608203, + "grad_norm": 0.1202530949130935, + "learning_rate": 9.329446064139942e-06, + "loss": 0.531, + "step": 160 + }, + { + "epoch": 0.09400773666155755, + "grad_norm": 0.12741373530457503, + "learning_rate": 9.387755102040818e-06, + "loss": 0.5138, + "step": 161 + }, + { + "epoch": 0.09459163564703306, + "grad_norm": 0.12418060663421622, + "learning_rate": 9.446064139941692e-06, + "loss": 0.4746, + "step": 162 + }, + { + "epoch": 0.09517553463250858, + "grad_norm": 0.12718782867754488, + "learning_rate": 
9.504373177842566e-06, + "loss": 0.4979, + "step": 163 + }, + { + "epoch": 0.09575943361798409, + "grad_norm": 0.12055792674404836, + "learning_rate": 9.56268221574344e-06, + "loss": 0.4646, + "step": 164 + }, + { + "epoch": 0.0963433326034596, + "grad_norm": 0.1216872873766708, + "learning_rate": 9.620991253644316e-06, + "loss": 0.5, + "step": 165 + }, + { + "epoch": 0.09692723158893511, + "grad_norm": 0.11088464824772838, + "learning_rate": 9.67930029154519e-06, + "loss": 0.4559, + "step": 166 + }, + { + "epoch": 0.09751113057441063, + "grad_norm": 0.12316328782632133, + "learning_rate": 9.737609329446065e-06, + "loss": 0.495, + "step": 167 + }, + { + "epoch": 0.09809502955988614, + "grad_norm": 0.12812328828989114, + "learning_rate": 9.795918367346939e-06, + "loss": 0.4707, + "step": 168 + }, + { + "epoch": 0.09867892854536166, + "grad_norm": 0.12830826520307553, + "learning_rate": 9.854227405247815e-06, + "loss": 0.4676, + "step": 169 + }, + { + "epoch": 0.09926282753083716, + "grad_norm": 0.12410111194585906, + "learning_rate": 9.912536443148689e-06, + "loss": 0.4732, + "step": 170 + }, + { + "epoch": 0.09984672651631268, + "grad_norm": 0.12475191624470297, + "learning_rate": 9.970845481049563e-06, + "loss": 0.4392, + "step": 171 + }, + { + "epoch": 0.10043062550178819, + "grad_norm": 0.1241260083240558, + "learning_rate": 1.0029154518950439e-05, + "loss": 0.5403, + "step": 172 + }, + { + "epoch": 0.10101452448726371, + "grad_norm": 0.12507984731684313, + "learning_rate": 1.0087463556851313e-05, + "loss": 0.4753, + "step": 173 + }, + { + "epoch": 0.10159842347273922, + "grad_norm": 0.12814100486730318, + "learning_rate": 1.0145772594752187e-05, + "loss": 0.4905, + "step": 174 + }, + { + "epoch": 0.10218232245821472, + "grad_norm": 0.12469588332707274, + "learning_rate": 1.0204081632653063e-05, + "loss": 0.4778, + "step": 175 + }, + { + "epoch": 0.10276622144369024, + "grad_norm": 0.12232925422843359, + "learning_rate": 1.0262390670553937e-05, + "loss": 0.4787, + "step": 176 + }, + { + "epoch": 0.10335012042916575, + "grad_norm": 0.12269692779635057, + "learning_rate": 1.0320699708454811e-05, + "loss": 0.5052, + "step": 177 + }, + { + "epoch": 0.10393401941464127, + "grad_norm": 0.11916532467809526, + "learning_rate": 1.0379008746355685e-05, + "loss": 0.4885, + "step": 178 + }, + { + "epoch": 0.10451791840011677, + "grad_norm": 0.12092964578474148, + "learning_rate": 1.043731778425656e-05, + "loss": 0.4657, + "step": 179 + }, + { + "epoch": 0.1051018173855923, + "grad_norm": 0.11439204716514362, + "learning_rate": 1.0495626822157434e-05, + "loss": 0.5042, + "step": 180 + }, + { + "epoch": 0.1056857163710678, + "grad_norm": 0.12354388252423189, + "learning_rate": 1.0553935860058311e-05, + "loss": 0.5233, + "step": 181 + }, + { + "epoch": 0.10626961535654332, + "grad_norm": 0.12043570737854031, + "learning_rate": 1.0612244897959186e-05, + "loss": 0.519, + "step": 182 + }, + { + "epoch": 0.10685351434201883, + "grad_norm": 0.11834237063550702, + "learning_rate": 1.067055393586006e-05, + "loss": 0.4795, + "step": 183 + }, + { + "epoch": 0.10743741332749435, + "grad_norm": 0.11705709857814825, + "learning_rate": 1.0728862973760934e-05, + "loss": 0.4823, + "step": 184 + }, + { + "epoch": 0.10802131231296985, + "grad_norm": 0.12509083351745862, + "learning_rate": 1.0787172011661808e-05, + "loss": 0.4914, + "step": 185 + }, + { + "epoch": 0.10860521129844537, + "grad_norm": 0.10786115579153421, + "learning_rate": 1.0845481049562682e-05, + "loss": 0.5058, + "step": 186 + }, + { + "epoch": 
0.10918911028392088, + "grad_norm": 0.11299638826020518, + "learning_rate": 1.0903790087463556e-05, + "loss": 0.4717, + "step": 187 + }, + { + "epoch": 0.1097730092693964, + "grad_norm": 0.11688095277211268, + "learning_rate": 1.0962099125364432e-05, + "loss": 0.5093, + "step": 188 + }, + { + "epoch": 0.1103569082548719, + "grad_norm": 0.12473021090764483, + "learning_rate": 1.1020408163265306e-05, + "loss": 0.4408, + "step": 189 + }, + { + "epoch": 0.11094080724034742, + "grad_norm": 0.11413721579271932, + "learning_rate": 1.1078717201166182e-05, + "loss": 0.4851, + "step": 190 + }, + { + "epoch": 0.11152470622582293, + "grad_norm": 0.12517361613874525, + "learning_rate": 1.1137026239067056e-05, + "loss": 0.5422, + "step": 191 + }, + { + "epoch": 0.11210860521129845, + "grad_norm": 0.11820773964475485, + "learning_rate": 1.119533527696793e-05, + "loss": 0.4897, + "step": 192 + }, + { + "epoch": 0.11269250419677396, + "grad_norm": 0.11803250458290135, + "learning_rate": 1.1253644314868807e-05, + "loss": 0.5076, + "step": 193 + }, + { + "epoch": 0.11327640318224948, + "grad_norm": 0.11607060814198182, + "learning_rate": 1.131195335276968e-05, + "loss": 0.4784, + "step": 194 + }, + { + "epoch": 0.11386030216772498, + "grad_norm": 0.11211128345308143, + "learning_rate": 1.1370262390670555e-05, + "loss": 0.4902, + "step": 195 + }, + { + "epoch": 0.1144442011532005, + "grad_norm": 0.11141384489238289, + "learning_rate": 1.1428571428571429e-05, + "loss": 0.4436, + "step": 196 + }, + { + "epoch": 0.11502810013867601, + "grad_norm": 0.11215596624335802, + "learning_rate": 1.1486880466472303e-05, + "loss": 0.4432, + "step": 197 + }, + { + "epoch": 0.11561199912415152, + "grad_norm": 0.12399274896294848, + "learning_rate": 1.1545189504373179e-05, + "loss": 0.5312, + "step": 198 + }, + { + "epoch": 0.11619589810962704, + "grad_norm": 0.11293237079501135, + "learning_rate": 1.1603498542274055e-05, + "loss": 0.447, + "step": 199 + }, + { + "epoch": 0.11677979709510254, + "grad_norm": 0.1215012642583709, + "learning_rate": 1.1661807580174929e-05, + "loss": 0.5012, + "step": 200 + }, + { + "epoch": 0.11736369608057806, + "grad_norm": 0.12427299685002438, + "learning_rate": 1.1720116618075803e-05, + "loss": 0.4864, + "step": 201 + }, + { + "epoch": 0.11794759506605357, + "grad_norm": 0.12100393884045363, + "learning_rate": 1.1778425655976677e-05, + "loss": 0.451, + "step": 202 + }, + { + "epoch": 0.11853149405152909, + "grad_norm": 0.13024816718208015, + "learning_rate": 1.1836734693877552e-05, + "loss": 0.4804, + "step": 203 + }, + { + "epoch": 0.1191153930370046, + "grad_norm": 0.1361916016281245, + "learning_rate": 1.1895043731778426e-05, + "loss": 0.4728, + "step": 204 + }, + { + "epoch": 0.11969929202248011, + "grad_norm": 0.11804249910974884, + "learning_rate": 1.19533527696793e-05, + "loss": 0.4482, + "step": 205 + }, + { + "epoch": 0.12028319100795562, + "grad_norm": 0.12405592212995777, + "learning_rate": 1.2011661807580176e-05, + "loss": 0.4181, + "step": 206 + }, + { + "epoch": 0.12086708999343114, + "grad_norm": 0.11766180068909124, + "learning_rate": 1.2069970845481052e-05, + "loss": 0.4547, + "step": 207 + }, + { + "epoch": 0.12145098897890665, + "grad_norm": 0.12026405888532894, + "learning_rate": 1.2128279883381926e-05, + "loss": 0.4721, + "step": 208 + }, + { + "epoch": 0.12203488796438217, + "grad_norm": 0.12990354634558993, + "learning_rate": 1.21865889212828e-05, + "loss": 0.4995, + "step": 209 + }, + { + "epoch": 0.12261878694985767, + "grad_norm": 0.1246359916200624, + 
"learning_rate": 1.2244897959183674e-05, + "loss": 0.4749, + "step": 210 + }, + { + "epoch": 0.12320268593533319, + "grad_norm": 0.11976384962406618, + "learning_rate": 1.2303206997084548e-05, + "loss": 0.4499, + "step": 211 + }, + { + "epoch": 0.1237865849208087, + "grad_norm": 0.11933506479333074, + "learning_rate": 1.2361516034985424e-05, + "loss": 0.4764, + "step": 212 + }, + { + "epoch": 0.12437048390628422, + "grad_norm": 0.12542827966265213, + "learning_rate": 1.2419825072886298e-05, + "loss": 0.4605, + "step": 213 + }, + { + "epoch": 0.12495438289175972, + "grad_norm": 0.1150593280405559, + "learning_rate": 1.2478134110787173e-05, + "loss": 0.5055, + "step": 214 + }, + { + "epoch": 0.12553828187723523, + "grad_norm": 0.12343762040898296, + "learning_rate": 1.2536443148688047e-05, + "loss": 0.5234, + "step": 215 + }, + { + "epoch": 0.12612218086271076, + "grad_norm": 0.12445165491494599, + "learning_rate": 1.2594752186588923e-05, + "loss": 0.521, + "step": 216 + }, + { + "epoch": 0.12670607984818627, + "grad_norm": 0.12117234717727392, + "learning_rate": 1.2653061224489798e-05, + "loss": 0.5548, + "step": 217 + }, + { + "epoch": 0.12728997883366178, + "grad_norm": 0.11954033838949486, + "learning_rate": 1.2711370262390673e-05, + "loss": 0.4817, + "step": 218 + }, + { + "epoch": 0.12787387781913728, + "grad_norm": 0.12276715542710782, + "learning_rate": 1.2769679300291547e-05, + "loss": 0.5034, + "step": 219 + }, + { + "epoch": 0.1284577768046128, + "grad_norm": 0.1298628689318294, + "learning_rate": 1.2827988338192421e-05, + "loss": 0.514, + "step": 220 + }, + { + "epoch": 0.12904167579008832, + "grad_norm": 0.1266951544845043, + "learning_rate": 1.2886297376093295e-05, + "loss": 0.517, + "step": 221 + }, + { + "epoch": 0.12962557477556383, + "grad_norm": 0.1186973436485451, + "learning_rate": 1.294460641399417e-05, + "loss": 0.5038, + "step": 222 + }, + { + "epoch": 0.13020947376103933, + "grad_norm": 0.12484191464621862, + "learning_rate": 1.3002915451895044e-05, + "loss": 0.4575, + "step": 223 + }, + { + "epoch": 0.13079337274651484, + "grad_norm": 0.11994680057458637, + "learning_rate": 1.3061224489795918e-05, + "loss": 0.4744, + "step": 224 + }, + { + "epoch": 0.13137727173199037, + "grad_norm": 0.1108910363461259, + "learning_rate": 1.3119533527696795e-05, + "loss": 0.4618, + "step": 225 + }, + { + "epoch": 0.13196117071746588, + "grad_norm": 0.12156708934933559, + "learning_rate": 1.317784256559767e-05, + "loss": 0.5182, + "step": 226 + }, + { + "epoch": 0.1325450697029414, + "grad_norm": 0.12551779767427704, + "learning_rate": 1.3236151603498544e-05, + "loss": 0.4982, + "step": 227 + }, + { + "epoch": 0.1331289686884169, + "grad_norm": 0.11225517035511849, + "learning_rate": 1.3294460641399418e-05, + "loss": 0.459, + "step": 228 + }, + { + "epoch": 0.13371286767389243, + "grad_norm": 0.1195753060113868, + "learning_rate": 1.3352769679300292e-05, + "loss": 0.5366, + "step": 229 + }, + { + "epoch": 0.13429676665936793, + "grad_norm": 0.1311397152775005, + "learning_rate": 1.3411078717201168e-05, + "loss": 0.503, + "step": 230 + }, + { + "epoch": 0.13488066564484344, + "grad_norm": 0.1211510768312088, + "learning_rate": 1.3469387755102042e-05, + "loss": 0.4898, + "step": 231 + }, + { + "epoch": 0.13546456463031895, + "grad_norm": 0.12600913942872044, + "learning_rate": 1.3527696793002916e-05, + "loss": 0.4942, + "step": 232 + }, + { + "epoch": 0.13604846361579448, + "grad_norm": 0.12787422527968467, + "learning_rate": 1.358600583090379e-05, + "loss": 0.5281, + "step": 233 + }, 
+ { + "epoch": 0.13663236260126999, + "grad_norm": 0.12776466871580142, + "learning_rate": 1.3644314868804666e-05, + "loss": 0.48, + "step": 234 + }, + { + "epoch": 0.1372162615867455, + "grad_norm": 0.12166931506499784, + "learning_rate": 1.370262390670554e-05, + "loss": 0.5034, + "step": 235 + }, + { + "epoch": 0.137800160572221, + "grad_norm": 0.11638765866070645, + "learning_rate": 1.3760932944606416e-05, + "loss": 0.4434, + "step": 236 + }, + { + "epoch": 0.13838405955769653, + "grad_norm": 0.12004747340609112, + "learning_rate": 1.381924198250729e-05, + "loss": 0.5035, + "step": 237 + }, + { + "epoch": 0.13896795854317204, + "grad_norm": 0.12419290968448686, + "learning_rate": 1.3877551020408165e-05, + "loss": 0.4752, + "step": 238 + }, + { + "epoch": 0.13955185752864754, + "grad_norm": 0.12141595054280938, + "learning_rate": 1.3935860058309039e-05, + "loss": 0.4491, + "step": 239 + }, + { + "epoch": 0.14013575651412305, + "grad_norm": 0.12329688383714835, + "learning_rate": 1.3994169096209913e-05, + "loss": 0.495, + "step": 240 + }, + { + "epoch": 0.14071965549959856, + "grad_norm": 0.11510920906734046, + "learning_rate": 1.4052478134110787e-05, + "loss": 0.504, + "step": 241 + }, + { + "epoch": 0.1413035544850741, + "grad_norm": 0.12941891219110963, + "learning_rate": 1.4110787172011665e-05, + "loss": 0.4928, + "step": 242 + }, + { + "epoch": 0.1418874534705496, + "grad_norm": 0.12337045751874921, + "learning_rate": 1.4169096209912539e-05, + "loss": 0.4724, + "step": 243 + }, + { + "epoch": 0.1424713524560251, + "grad_norm": 0.1235778334164274, + "learning_rate": 1.4227405247813413e-05, + "loss": 0.4577, + "step": 244 + }, + { + "epoch": 0.1430552514415006, + "grad_norm": 0.12628375572385392, + "learning_rate": 1.4285714285714287e-05, + "loss": 0.4861, + "step": 245 + }, + { + "epoch": 0.14363915042697614, + "grad_norm": 0.12799670378479905, + "learning_rate": 1.4344023323615161e-05, + "loss": 0.462, + "step": 246 + }, + { + "epoch": 0.14422304941245165, + "grad_norm": 0.1321155184287993, + "learning_rate": 1.4402332361516035e-05, + "loss": 0.5225, + "step": 247 + }, + { + "epoch": 0.14480694839792715, + "grad_norm": 0.11630499300519878, + "learning_rate": 1.446064139941691e-05, + "loss": 0.4518, + "step": 248 + }, + { + "epoch": 0.14539084738340266, + "grad_norm": 0.11490870582188943, + "learning_rate": 1.4518950437317786e-05, + "loss": 0.4495, + "step": 249 + }, + { + "epoch": 0.1459747463688782, + "grad_norm": 0.11987467106840151, + "learning_rate": 1.457725947521866e-05, + "loss": 0.4776, + "step": 250 + }, + { + "epoch": 0.1465586453543537, + "grad_norm": 0.11727098503092016, + "learning_rate": 1.4635568513119536e-05, + "loss": 0.4826, + "step": 251 + }, + { + "epoch": 0.1471425443398292, + "grad_norm": 0.12156378266134499, + "learning_rate": 1.469387755102041e-05, + "loss": 0.497, + "step": 252 + }, + { + "epoch": 0.1477264433253047, + "grad_norm": 0.11894652307870147, + "learning_rate": 1.4752186588921284e-05, + "loss": 0.5003, + "step": 253 + }, + { + "epoch": 0.14831034231078025, + "grad_norm": 0.12090016742619325, + "learning_rate": 1.481049562682216e-05, + "loss": 0.4711, + "step": 254 + }, + { + "epoch": 0.14889424129625575, + "grad_norm": 0.11842338577830101, + "learning_rate": 1.4868804664723034e-05, + "loss": 0.4837, + "step": 255 + }, + { + "epoch": 0.14947814028173126, + "grad_norm": 0.12243088835607291, + "learning_rate": 1.4927113702623908e-05, + "loss": 0.4589, + "step": 256 + }, + { + "epoch": 0.15006203926720676, + "grad_norm": 0.11810611612034028, + 
"learning_rate": 1.4985422740524782e-05, + "loss": 0.5004, + "step": 257 + }, + { + "epoch": 0.1506459382526823, + "grad_norm": 0.12227905879531428, + "learning_rate": 1.5043731778425656e-05, + "loss": 0.4911, + "step": 258 + }, + { + "epoch": 0.1512298372381578, + "grad_norm": 0.11317655294575424, + "learning_rate": 1.510204081632653e-05, + "loss": 0.4459, + "step": 259 + }, + { + "epoch": 0.1518137362236333, + "grad_norm": 0.12193313554591659, + "learning_rate": 1.5160349854227408e-05, + "loss": 0.4904, + "step": 260 + }, + { + "epoch": 0.15239763520910882, + "grad_norm": 0.11957668594350246, + "learning_rate": 1.5218658892128282e-05, + "loss": 0.4905, + "step": 261 + }, + { + "epoch": 0.15298153419458432, + "grad_norm": 0.11803153116343822, + "learning_rate": 1.5276967930029155e-05, + "loss": 0.448, + "step": 262 + }, + { + "epoch": 0.15356543318005986, + "grad_norm": 0.11628128188102277, + "learning_rate": 1.533527696793003e-05, + "loss": 0.4973, + "step": 263 + }, + { + "epoch": 0.15414933216553536, + "grad_norm": 0.11803334288345642, + "learning_rate": 1.5393586005830907e-05, + "loss": 0.4545, + "step": 264 + }, + { + "epoch": 0.15473323115101087, + "grad_norm": 0.12052510278848581, + "learning_rate": 1.545189504373178e-05, + "loss": 0.5061, + "step": 265 + }, + { + "epoch": 0.15531713013648638, + "grad_norm": 0.1102794232001335, + "learning_rate": 1.5510204081632655e-05, + "loss": 0.4529, + "step": 266 + }, + { + "epoch": 0.1559010291219619, + "grad_norm": 0.11938592890818091, + "learning_rate": 1.5568513119533527e-05, + "loss": 0.4988, + "step": 267 + }, + { + "epoch": 0.15648492810743742, + "grad_norm": 0.1183307439198149, + "learning_rate": 1.5626822157434403e-05, + "loss": 0.4836, + "step": 268 + }, + { + "epoch": 0.15706882709291292, + "grad_norm": 0.11094454673987766, + "learning_rate": 1.568513119533528e-05, + "loss": 0.4796, + "step": 269 + }, + { + "epoch": 0.15765272607838843, + "grad_norm": 0.1353776474271034, + "learning_rate": 1.5743440233236155e-05, + "loss": 0.4871, + "step": 270 + }, + { + "epoch": 0.15823662506386396, + "grad_norm": 0.12069533414841309, + "learning_rate": 1.5801749271137027e-05, + "loss": 0.49, + "step": 271 + }, + { + "epoch": 0.15882052404933947, + "grad_norm": 0.11240351189350616, + "learning_rate": 1.5860058309037903e-05, + "loss": 0.4696, + "step": 272 + }, + { + "epoch": 0.15940442303481497, + "grad_norm": 0.1274612444666027, + "learning_rate": 1.5918367346938776e-05, + "loss": 0.5028, + "step": 273 + }, + { + "epoch": 0.15998832202029048, + "grad_norm": 0.12123438557049905, + "learning_rate": 1.597667638483965e-05, + "loss": 0.4412, + "step": 274 + }, + { + "epoch": 0.160572221005766, + "grad_norm": 0.11903881286285385, + "learning_rate": 1.6034985422740524e-05, + "loss": 0.4793, + "step": 275 + }, + { + "epoch": 0.16115611999124152, + "grad_norm": 0.1262521558434459, + "learning_rate": 1.60932944606414e-05, + "loss": 0.4865, + "step": 276 + }, + { + "epoch": 0.16174001897671703, + "grad_norm": 0.11511852637776115, + "learning_rate": 1.6151603498542276e-05, + "loss": 0.4413, + "step": 277 + }, + { + "epoch": 0.16232391796219253, + "grad_norm": 0.1244482953527967, + "learning_rate": 1.6209912536443152e-05, + "loss": 0.4708, + "step": 278 + }, + { + "epoch": 0.16290781694766807, + "grad_norm": 0.12716212415464379, + "learning_rate": 1.6268221574344024e-05, + "loss": 0.4898, + "step": 279 + }, + { + "epoch": 0.16349171593314357, + "grad_norm": 0.12236530185363777, + "learning_rate": 1.63265306122449e-05, + "loss": 0.484, + "step": 280 + }, + { + 
"epoch": 0.16407561491861908, + "grad_norm": 0.1362358709832902, + "learning_rate": 1.6384839650145773e-05, + "loss": 0.4912, + "step": 281 + }, + { + "epoch": 0.16465951390409458, + "grad_norm": 0.11138105994059525, + "learning_rate": 1.644314868804665e-05, + "loss": 0.4372, + "step": 282 + }, + { + "epoch": 0.16524341288957012, + "grad_norm": 0.12329780529929299, + "learning_rate": 1.6501457725947524e-05, + "loss": 0.5292, + "step": 283 + }, + { + "epoch": 0.16582731187504562, + "grad_norm": 0.12194849368180884, + "learning_rate": 1.6559766763848397e-05, + "loss": 0.4632, + "step": 284 + }, + { + "epoch": 0.16641121086052113, + "grad_norm": 0.11270527031672307, + "learning_rate": 1.6618075801749273e-05, + "loss": 0.4462, + "step": 285 + }, + { + "epoch": 0.16699510984599664, + "grad_norm": 0.11715428168507913, + "learning_rate": 1.667638483965015e-05, + "loss": 0.4891, + "step": 286 + }, + { + "epoch": 0.16757900883147214, + "grad_norm": 0.11743682195071353, + "learning_rate": 1.673469387755102e-05, + "loss": 0.4669, + "step": 287 + }, + { + "epoch": 0.16816290781694768, + "grad_norm": 0.1216006852816552, + "learning_rate": 1.6793002915451897e-05, + "loss": 0.514, + "step": 288 + }, + { + "epoch": 0.16874680680242318, + "grad_norm": 0.12099122720838222, + "learning_rate": 1.6851311953352773e-05, + "loss": 0.5035, + "step": 289 + }, + { + "epoch": 0.1693307057878987, + "grad_norm": 0.12167588398751764, + "learning_rate": 1.6909620991253645e-05, + "loss": 0.4564, + "step": 290 + }, + { + "epoch": 0.1699146047733742, + "grad_norm": 0.12067892194410414, + "learning_rate": 1.696793002915452e-05, + "loss": 0.5302, + "step": 291 + }, + { + "epoch": 0.17049850375884973, + "grad_norm": 0.11695743043568546, + "learning_rate": 1.7026239067055393e-05, + "loss": 0.5114, + "step": 292 + }, + { + "epoch": 0.17108240274432523, + "grad_norm": 0.12448655713952463, + "learning_rate": 1.708454810495627e-05, + "loss": 0.4541, + "step": 293 + }, + { + "epoch": 0.17166630172980074, + "grad_norm": 0.11173382041495981, + "learning_rate": 1.7142857142857142e-05, + "loss": 0.4898, + "step": 294 + }, + { + "epoch": 0.17225020071527625, + "grad_norm": 0.11735312684143906, + "learning_rate": 1.720116618075802e-05, + "loss": 0.4904, + "step": 295 + }, + { + "epoch": 0.17283409970075178, + "grad_norm": 0.11367685720580717, + "learning_rate": 1.7259475218658894e-05, + "loss": 0.4646, + "step": 296 + }, + { + "epoch": 0.1734179986862273, + "grad_norm": 0.11800488805567871, + "learning_rate": 1.731778425655977e-05, + "loss": 0.4637, + "step": 297 + }, + { + "epoch": 0.1740018976717028, + "grad_norm": 0.11094577163806059, + "learning_rate": 1.7376093294460642e-05, + "loss": 0.4189, + "step": 298 + }, + { + "epoch": 0.1745857966571783, + "grad_norm": 0.11491439051096573, + "learning_rate": 1.7434402332361518e-05, + "loss": 0.4941, + "step": 299 + }, + { + "epoch": 0.17516969564265383, + "grad_norm": 0.1183359437283957, + "learning_rate": 1.749271137026239e-05, + "loss": 0.4749, + "step": 300 + }, + { + "epoch": 0.17575359462812934, + "grad_norm": 0.11196201537981638, + "learning_rate": 1.7551020408163266e-05, + "loss": 0.4658, + "step": 301 + }, + { + "epoch": 0.17633749361360485, + "grad_norm": 0.11667553275114739, + "learning_rate": 1.7609329446064142e-05, + "loss": 0.4992, + "step": 302 + }, + { + "epoch": 0.17692139259908035, + "grad_norm": 0.1258157006901112, + "learning_rate": 1.7667638483965014e-05, + "loss": 0.5154, + "step": 303 + }, + { + "epoch": 0.17750529158455589, + "grad_norm": 0.11538138945235941, + 
"learning_rate": 1.772594752186589e-05, + "loss": 0.4526, + "step": 304 + }, + { + "epoch": 0.1780891905700314, + "grad_norm": 0.12161460779940975, + "learning_rate": 1.7784256559766766e-05, + "loss": 0.4907, + "step": 305 + }, + { + "epoch": 0.1786730895555069, + "grad_norm": 0.12679989651132298, + "learning_rate": 1.7842565597667642e-05, + "loss": 0.4855, + "step": 306 + }, + { + "epoch": 0.1792569885409824, + "grad_norm": 0.1175605722224771, + "learning_rate": 1.7900874635568515e-05, + "loss": 0.4769, + "step": 307 + }, + { + "epoch": 0.1798408875264579, + "grad_norm": 0.12578890295317222, + "learning_rate": 1.795918367346939e-05, + "loss": 0.472, + "step": 308 + }, + { + "epoch": 0.18042478651193344, + "grad_norm": 0.11355992877346005, + "learning_rate": 1.8017492711370263e-05, + "loss": 0.4198, + "step": 309 + }, + { + "epoch": 0.18100868549740895, + "grad_norm": 0.11205771506126262, + "learning_rate": 1.807580174927114e-05, + "loss": 0.4077, + "step": 310 + }, + { + "epoch": 0.18159258448288446, + "grad_norm": 0.11928790422692327, + "learning_rate": 1.813411078717201e-05, + "loss": 0.4559, + "step": 311 + }, + { + "epoch": 0.18217648346835996, + "grad_norm": 0.1363020411321485, + "learning_rate": 1.8192419825072887e-05, + "loss": 0.4929, + "step": 312 + }, + { + "epoch": 0.1827603824538355, + "grad_norm": 0.12504128655440488, + "learning_rate": 1.8250728862973763e-05, + "loss": 0.5503, + "step": 313 + }, + { + "epoch": 0.183344281439311, + "grad_norm": 0.12176631272575653, + "learning_rate": 1.830903790087464e-05, + "loss": 0.47, + "step": 314 + }, + { + "epoch": 0.1839281804247865, + "grad_norm": 0.11896202910345562, + "learning_rate": 1.836734693877551e-05, + "loss": 0.4656, + "step": 315 + }, + { + "epoch": 0.18451207941026201, + "grad_norm": 0.12571601312079406, + "learning_rate": 1.8425655976676387e-05, + "loss": 0.4752, + "step": 316 + }, + { + "epoch": 0.18509597839573755, + "grad_norm": 0.12053499290064615, + "learning_rate": 1.848396501457726e-05, + "loss": 0.4899, + "step": 317 + }, + { + "epoch": 0.18567987738121305, + "grad_norm": 0.11360764518192895, + "learning_rate": 1.8542274052478135e-05, + "loss": 0.4588, + "step": 318 + }, + { + "epoch": 0.18626377636668856, + "grad_norm": 0.12492128658751882, + "learning_rate": 1.8600583090379008e-05, + "loss": 0.4905, + "step": 319 + }, + { + "epoch": 0.18684767535216407, + "grad_norm": 0.11349093418075651, + "learning_rate": 1.8658892128279884e-05, + "loss": 0.5256, + "step": 320 + }, + { + "epoch": 0.1874315743376396, + "grad_norm": 0.11014312698562714, + "learning_rate": 1.871720116618076e-05, + "loss": 0.477, + "step": 321 + }, + { + "epoch": 0.1880154733231151, + "grad_norm": 0.1196931338687533, + "learning_rate": 1.8775510204081636e-05, + "loss": 0.4286, + "step": 322 + }, + { + "epoch": 0.1885993723085906, + "grad_norm": 0.11763470911320446, + "learning_rate": 1.8833819241982508e-05, + "loss": 0.4847, + "step": 323 + }, + { + "epoch": 0.18918327129406612, + "grad_norm": 0.11258413884466385, + "learning_rate": 1.8892128279883384e-05, + "loss": 0.4219, + "step": 324 + }, + { + "epoch": 0.18976717027954165, + "grad_norm": 0.12135774540948996, + "learning_rate": 1.895043731778426e-05, + "loss": 0.4949, + "step": 325 + }, + { + "epoch": 0.19035106926501716, + "grad_norm": 0.11962105480583143, + "learning_rate": 1.9008746355685132e-05, + "loss": 0.4941, + "step": 326 + }, + { + "epoch": 0.19093496825049266, + "grad_norm": 0.10253626999099455, + "learning_rate": 1.9067055393586008e-05, + "loss": 0.4511, + "step": 327 + }, + { + 
"epoch": 0.19151886723596817, + "grad_norm": 0.11521055399860232, + "learning_rate": 1.912536443148688e-05, + "loss": 0.477, + "step": 328 + }, + { + "epoch": 0.19210276622144368, + "grad_norm": 0.12268661705044029, + "learning_rate": 1.9183673469387756e-05, + "loss": 0.4794, + "step": 329 + }, + { + "epoch": 0.1926866652069192, + "grad_norm": 0.11650332005009081, + "learning_rate": 1.9241982507288632e-05, + "loss": 0.5121, + "step": 330 + }, + { + "epoch": 0.19327056419239472, + "grad_norm": 0.11780446457911503, + "learning_rate": 1.9300291545189508e-05, + "loss": 0.4977, + "step": 331 + }, + { + "epoch": 0.19385446317787022, + "grad_norm": 0.11289820521132098, + "learning_rate": 1.935860058309038e-05, + "loss": 0.4715, + "step": 332 + }, + { + "epoch": 0.19443836216334573, + "grad_norm": 0.11578534883937343, + "learning_rate": 1.9416909620991257e-05, + "loss": 0.5386, + "step": 333 + }, + { + "epoch": 0.19502226114882126, + "grad_norm": 0.13662211300686103, + "learning_rate": 1.947521865889213e-05, + "loss": 0.4992, + "step": 334 + }, + { + "epoch": 0.19560616013429677, + "grad_norm": 0.1083410667087245, + "learning_rate": 1.9533527696793005e-05, + "loss": 0.4554, + "step": 335 + }, + { + "epoch": 0.19619005911977228, + "grad_norm": 0.12318881489128079, + "learning_rate": 1.9591836734693877e-05, + "loss": 0.4416, + "step": 336 + }, + { + "epoch": 0.19677395810524778, + "grad_norm": 0.11591742917556982, + "learning_rate": 1.9650145772594753e-05, + "loss": 0.5053, + "step": 337 + }, + { + "epoch": 0.19735785709072332, + "grad_norm": 0.11041722941177642, + "learning_rate": 1.970845481049563e-05, + "loss": 0.4866, + "step": 338 + }, + { + "epoch": 0.19794175607619882, + "grad_norm": 0.10922202977087775, + "learning_rate": 1.9766763848396505e-05, + "loss": 0.4755, + "step": 339 + }, + { + "epoch": 0.19852565506167433, + "grad_norm": 0.13275809674593222, + "learning_rate": 1.9825072886297377e-05, + "loss": 0.4842, + "step": 340 + }, + { + "epoch": 0.19910955404714983, + "grad_norm": 0.11178604969598391, + "learning_rate": 1.9883381924198253e-05, + "loss": 0.4763, + "step": 341 + }, + { + "epoch": 0.19969345303262537, + "grad_norm": 0.13468051083127913, + "learning_rate": 1.9941690962099126e-05, + "loss": 0.5462, + "step": 342 + }, + { + "epoch": 0.20027735201810087, + "grad_norm": 0.11402720069772254, + "learning_rate": 2e-05, + "loss": 0.4893, + "step": 343 + }, + { + "epoch": 0.20086125100357638, + "grad_norm": 0.13385178798699593, + "learning_rate": 1.999999480140104e-05, + "loss": 0.5028, + "step": 344 + }, + { + "epoch": 0.20144514998905189, + "grad_norm": 0.11573574615782269, + "learning_rate": 1.999997920560957e-05, + "loss": 0.471, + "step": 345 + }, + { + "epoch": 0.20202904897452742, + "grad_norm": 0.12176600620571904, + "learning_rate": 1.9999953212641804e-05, + "loss": 0.5231, + "step": 346 + }, + { + "epoch": 0.20261294796000293, + "grad_norm": 0.11924205465014313, + "learning_rate": 1.9999916822524766e-05, + "loss": 0.4579, + "step": 347 + }, + { + "epoch": 0.20319684694547843, + "grad_norm": 0.110272816846901, + "learning_rate": 1.999987003529629e-05, + "loss": 0.4789, + "step": 348 + }, + { + "epoch": 0.20378074593095394, + "grad_norm": 0.12451484762220308, + "learning_rate": 1.9999812851005024e-05, + "loss": 0.4555, + "step": 349 + }, + { + "epoch": 0.20436464491642944, + "grad_norm": 0.11141320064911672, + "learning_rate": 1.9999745269710423e-05, + "loss": 0.4871, + "step": 350 + }, + { + "epoch": 0.20494854390190498, + "grad_norm": 0.11208852207407757, + "learning_rate": 
1.999966729148275e-05, + "loss": 0.4628, + "step": 351 + }, + { + "epoch": 0.20553244288738048, + "grad_norm": 0.11395327123440692, + "learning_rate": 1.9999578916403086e-05, + "loss": 0.4682, + "step": 352 + }, + { + "epoch": 0.206116341872856, + "grad_norm": 0.1138581517529346, + "learning_rate": 1.9999480144563316e-05, + "loss": 0.4865, + "step": 353 + }, + { + "epoch": 0.2067002408583315, + "grad_norm": 0.10875287627701856, + "learning_rate": 1.999937097606613e-05, + "loss": 0.4759, + "step": 354 + }, + { + "epoch": 0.20728413984380703, + "grad_norm": 0.12606806825186004, + "learning_rate": 1.9999251411025034e-05, + "loss": 0.4817, + "step": 355 + }, + { + "epoch": 0.20786803882928254, + "grad_norm": 0.11371291093373521, + "learning_rate": 1.9999121449564347e-05, + "loss": 0.4635, + "step": 356 + }, + { + "epoch": 0.20845193781475804, + "grad_norm": 0.11105426795758348, + "learning_rate": 1.999898109181919e-05, + "loss": 0.4761, + "step": 357 + }, + { + "epoch": 0.20903583680023355, + "grad_norm": 0.10608609499215514, + "learning_rate": 1.9998830337935488e-05, + "loss": 0.4769, + "step": 358 + }, + { + "epoch": 0.20961973578570908, + "grad_norm": 0.12043751030774777, + "learning_rate": 1.9998669188069992e-05, + "loss": 0.4775, + "step": 359 + }, + { + "epoch": 0.2102036347711846, + "grad_norm": 0.1097960892482818, + "learning_rate": 1.9998497642390255e-05, + "loss": 0.4182, + "step": 360 + }, + { + "epoch": 0.2107875337566601, + "grad_norm": 0.11290225191879377, + "learning_rate": 1.9998315701074624e-05, + "loss": 0.5097, + "step": 361 + }, + { + "epoch": 0.2113714327421356, + "grad_norm": 0.11447406186365836, + "learning_rate": 1.999812336431228e-05, + "loss": 0.4848, + "step": 362 + }, + { + "epoch": 0.21195533172761113, + "grad_norm": 0.10758137015850869, + "learning_rate": 1.9997920632303192e-05, + "loss": 0.446, + "step": 363 + }, + { + "epoch": 0.21253923071308664, + "grad_norm": 0.11326391945771376, + "learning_rate": 1.9997707505258147e-05, + "loss": 0.4477, + "step": 364 + }, + { + "epoch": 0.21312312969856215, + "grad_norm": 0.11280485253358473, + "learning_rate": 1.9997483983398736e-05, + "loss": 0.4823, + "step": 365 + }, + { + "epoch": 0.21370702868403765, + "grad_norm": 0.1115188744102502, + "learning_rate": 1.9997250066957357e-05, + "loss": 0.4989, + "step": 366 + }, + { + "epoch": 0.2142909276695132, + "grad_norm": 0.11820873260115417, + "learning_rate": 1.9997005756177228e-05, + "loss": 0.4866, + "step": 367 + }, + { + "epoch": 0.2148748266549887, + "grad_norm": 0.1168252086230445, + "learning_rate": 1.9996751051312352e-05, + "loss": 0.4531, + "step": 368 + }, + { + "epoch": 0.2154587256404642, + "grad_norm": 0.1219636803488188, + "learning_rate": 1.9996485952627554e-05, + "loss": 0.527, + "step": 369 + }, + { + "epoch": 0.2160426246259397, + "grad_norm": 0.1053933030498993, + "learning_rate": 1.9996210460398464e-05, + "loss": 0.4639, + "step": 370 + }, + { + "epoch": 0.2166265236114152, + "grad_norm": 0.10633683394898225, + "learning_rate": 1.9995924574911516e-05, + "loss": 0.4502, + "step": 371 + }, + { + "epoch": 0.21721042259689075, + "grad_norm": 0.10967566168357494, + "learning_rate": 1.9995628296463953e-05, + "loss": 0.4995, + "step": 372 + }, + { + "epoch": 0.21779432158236625, + "grad_norm": 0.10259060957477167, + "learning_rate": 1.9995321625363814e-05, + "loss": 0.4504, + "step": 373 + }, + { + "epoch": 0.21837822056784176, + "grad_norm": 0.12072276785745267, + "learning_rate": 1.999500456192996e-05, + "loss": 0.5039, + "step": 374 + }, + { + "epoch": 
0.21896211955331726, + "grad_norm": 0.11156929318275795, + "learning_rate": 1.9994677106492046e-05, + "loss": 0.465, + "step": 375 + }, + { + "epoch": 0.2195460185387928, + "grad_norm": 0.1138505284001503, + "learning_rate": 1.999433925939053e-05, + "loss": 0.4605, + "step": 376 + }, + { + "epoch": 0.2201299175242683, + "grad_norm": 0.09771179912935149, + "learning_rate": 1.999399102097668e-05, + "loss": 0.4345, + "step": 377 + }, + { + "epoch": 0.2207138165097438, + "grad_norm": 0.11690518618551832, + "learning_rate": 1.999363239161257e-05, + "loss": 0.5176, + "step": 378 + }, + { + "epoch": 0.22129771549521932, + "grad_norm": 0.11064238354403713, + "learning_rate": 1.9993263371671067e-05, + "loss": 0.4721, + "step": 379 + }, + { + "epoch": 0.22188161448069485, + "grad_norm": 0.11495676364860458, + "learning_rate": 1.9992883961535857e-05, + "loss": 0.4934, + "step": 380 + }, + { + "epoch": 0.22246551346617036, + "grad_norm": 0.11260666473433781, + "learning_rate": 1.9992494161601414e-05, + "loss": 0.4553, + "step": 381 + }, + { + "epoch": 0.22304941245164586, + "grad_norm": 0.10545729270035774, + "learning_rate": 1.999209397227302e-05, + "loss": 0.4413, + "step": 382 + }, + { + "epoch": 0.22363331143712137, + "grad_norm": 0.11742699007503096, + "learning_rate": 1.9991683393966764e-05, + "loss": 0.4848, + "step": 383 + }, + { + "epoch": 0.2242172104225969, + "grad_norm": 0.11395492114796957, + "learning_rate": 1.9991262427109532e-05, + "loss": 0.4826, + "step": 384 + }, + { + "epoch": 0.2248011094080724, + "grad_norm": 0.10897991467610253, + "learning_rate": 1.9990831072139008e-05, + "loss": 0.4444, + "step": 385 + }, + { + "epoch": 0.22538500839354791, + "grad_norm": 0.1135984796408721, + "learning_rate": 1.9990389329503685e-05, + "loss": 0.4452, + "step": 386 + }, + { + "epoch": 0.22596890737902342, + "grad_norm": 0.11240631369597227, + "learning_rate": 1.9989937199662845e-05, + "loss": 0.4651, + "step": 387 + }, + { + "epoch": 0.22655280636449895, + "grad_norm": 0.11080305072695718, + "learning_rate": 1.998947468308658e-05, + "loss": 0.426, + "step": 388 + }, + { + "epoch": 0.22713670534997446, + "grad_norm": 0.12114584413220336, + "learning_rate": 1.9989001780255784e-05, + "loss": 0.4555, + "step": 389 + }, + { + "epoch": 0.22772060433544997, + "grad_norm": 0.11908184163633564, + "learning_rate": 1.9988518491662134e-05, + "loss": 0.5247, + "step": 390 + }, + { + "epoch": 0.22830450332092547, + "grad_norm": 0.11545209570574225, + "learning_rate": 1.9988024817808116e-05, + "loss": 0.528, + "step": 391 + }, + { + "epoch": 0.228888402306401, + "grad_norm": 0.09882156821142817, + "learning_rate": 1.9987520759207014e-05, + "loss": 0.4409, + "step": 392 + }, + { + "epoch": 0.2294723012918765, + "grad_norm": 0.11382455217004005, + "learning_rate": 1.9987006316382913e-05, + "loss": 0.4598, + "step": 393 + }, + { + "epoch": 0.23005620027735202, + "grad_norm": 0.11205172389299375, + "learning_rate": 1.9986481489870684e-05, + "loss": 0.473, + "step": 394 + }, + { + "epoch": 0.23064009926282752, + "grad_norm": 0.10840908608226256, + "learning_rate": 1.9985946280215996e-05, + "loss": 0.4889, + "step": 395 + }, + { + "epoch": 0.23122399824830303, + "grad_norm": 0.11446418542270366, + "learning_rate": 1.9985400687975325e-05, + "loss": 0.4751, + "step": 396 + }, + { + "epoch": 0.23180789723377856, + "grad_norm": 0.11068298126289605, + "learning_rate": 1.998484471371593e-05, + "loss": 0.4652, + "step": 397 + }, + { + "epoch": 0.23239179621925407, + "grad_norm": 0.11599122241762827, + "learning_rate": 
1.9984278358015867e-05, + "loss": 0.4965, + "step": 398 + }, + { + "epoch": 0.23297569520472958, + "grad_norm": 0.10731265036151659, + "learning_rate": 1.998370162146399e-05, + "loss": 0.4454, + "step": 399 + }, + { + "epoch": 0.23355959419020508, + "grad_norm": 0.1137584720615305, + "learning_rate": 1.9983114504659943e-05, + "loss": 0.4819, + "step": 400 + }, + { + "epoch": 0.23414349317568062, + "grad_norm": 0.1153264593643977, + "learning_rate": 1.998251700821416e-05, + "loss": 0.5144, + "step": 401 + }, + { + "epoch": 0.23472739216115612, + "grad_norm": 0.10473788358147003, + "learning_rate": 1.9981909132747875e-05, + "loss": 0.4719, + "step": 402 + }, + { + "epoch": 0.23531129114663163, + "grad_norm": 0.11945085908838624, + "learning_rate": 1.9981290878893103e-05, + "loss": 0.4723, + "step": 403 + }, + { + "epoch": 0.23589519013210714, + "grad_norm": 0.11757931782634197, + "learning_rate": 1.9980662247292657e-05, + "loss": 0.5219, + "step": 404 + }, + { + "epoch": 0.23647908911758267, + "grad_norm": 0.11725111667932307, + "learning_rate": 1.998002323860014e-05, + "loss": 0.4807, + "step": 405 + }, + { + "epoch": 0.23706298810305818, + "grad_norm": 0.10486780279010022, + "learning_rate": 1.997937385347994e-05, + "loss": 0.4536, + "step": 406 + }, + { + "epoch": 0.23764688708853368, + "grad_norm": 0.11715478846642553, + "learning_rate": 1.9978714092607234e-05, + "loss": 0.5145, + "step": 407 + }, + { + "epoch": 0.2382307860740092, + "grad_norm": 0.11614830730165444, + "learning_rate": 1.997804395666799e-05, + "loss": 0.5627, + "step": 408 + }, + { + "epoch": 0.23881468505948472, + "grad_norm": 0.10427713206602503, + "learning_rate": 1.997736344635896e-05, + "loss": 0.445, + "step": 409 + }, + { + "epoch": 0.23939858404496023, + "grad_norm": 0.10362555259177765, + "learning_rate": 1.997667256238769e-05, + "loss": 0.5209, + "step": 410 + }, + { + "epoch": 0.23998248303043573, + "grad_norm": 0.10718110595136525, + "learning_rate": 1.99759713054725e-05, + "loss": 0.4832, + "step": 411 + }, + { + "epoch": 0.24056638201591124, + "grad_norm": 0.12688033021748277, + "learning_rate": 1.99752596763425e-05, + "loss": 0.5121, + "step": 412 + }, + { + "epoch": 0.24115028100138677, + "grad_norm": 0.2197419532053834, + "learning_rate": 1.9974537675737587e-05, + "loss": 0.4731, + "step": 413 + }, + { + "epoch": 0.24173417998686228, + "grad_norm": 0.11331641347140724, + "learning_rate": 1.9973805304408437e-05, + "loss": 0.4664, + "step": 414 + }, + { + "epoch": 0.24231807897233779, + "grad_norm": 0.10718573856808986, + "learning_rate": 1.9973062563116515e-05, + "loss": 0.435, + "step": 415 + }, + { + "epoch": 0.2429019779578133, + "grad_norm": 0.1426525318749901, + "learning_rate": 1.9972309452634064e-05, + "loss": 0.4364, + "step": 416 + }, + { + "epoch": 0.2434858769432888, + "grad_norm": 0.10888079971106551, + "learning_rate": 1.9971545973744102e-05, + "loss": 0.4282, + "step": 417 + }, + { + "epoch": 0.24406977592876433, + "grad_norm": 0.11675154499002265, + "learning_rate": 1.997077212724044e-05, + "loss": 0.5238, + "step": 418 + }, + { + "epoch": 0.24465367491423984, + "grad_norm": 0.11105288734732757, + "learning_rate": 1.9969987913927657e-05, + "loss": 0.4311, + "step": 419 + }, + { + "epoch": 0.24523757389971534, + "grad_norm": 0.12643643277484065, + "learning_rate": 1.9969193334621117e-05, + "loss": 0.4508, + "step": 420 + }, + { + "epoch": 0.24582147288519085, + "grad_norm": 0.12211119562240913, + "learning_rate": 1.996838839014696e-05, + "loss": 0.5028, + "step": 421 + }, + { + "epoch": 
0.24640537187066638, + "grad_norm": 0.10247716470117967, + "learning_rate": 1.9967573081342103e-05, + "loss": 0.4305, + "step": 422 + }, + { + "epoch": 0.2469892708561419, + "grad_norm": 0.11294442051242815, + "learning_rate": 1.9966747409054235e-05, + "loss": 0.4587, + "step": 423 + }, + { + "epoch": 0.2475731698416174, + "grad_norm": 0.16340175317473582, + "learning_rate": 1.996591137414183e-05, + "loss": 0.5042, + "step": 424 + }, + { + "epoch": 0.2481570688270929, + "grad_norm": 0.11494412709480582, + "learning_rate": 1.996506497747412e-05, + "loss": 0.4815, + "step": 425 + }, + { + "epoch": 0.24874096781256844, + "grad_norm": 0.10806120623244819, + "learning_rate": 1.9964208219931135e-05, + "loss": 0.4777, + "step": 426 + }, + { + "epoch": 0.24932486679804394, + "grad_norm": 0.3289994532754349, + "learning_rate": 1.9963341102403652e-05, + "loss": 0.4614, + "step": 427 + }, + { + "epoch": 0.24990876578351945, + "grad_norm": 0.14967391271931915, + "learning_rate": 1.996246362579323e-05, + "loss": 0.5981, + "step": 428 + }, + { + "epoch": 0.25049266476899495, + "grad_norm": 0.11234236590879111, + "learning_rate": 1.99615757910122e-05, + "loss": 0.5061, + "step": 429 + }, + { + "epoch": 0.25107656375447046, + "grad_norm": 0.1158364246132873, + "learning_rate": 1.9960677598983672e-05, + "loss": 0.5063, + "step": 430 + }, + { + "epoch": 0.25166046273994597, + "grad_norm": 0.10670542376965202, + "learning_rate": 1.9959769050641498e-05, + "loss": 0.4282, + "step": 431 + }, + { + "epoch": 0.25224436172542153, + "grad_norm": 0.10504803378825164, + "learning_rate": 1.9958850146930326e-05, + "loss": 0.4831, + "step": 432 + }, + { + "epoch": 0.25282826071089703, + "grad_norm": 0.11255367638886131, + "learning_rate": 1.9957920888805548e-05, + "loss": 0.4442, + "step": 433 + }, + { + "epoch": 0.25341215969637254, + "grad_norm": 0.106937043943414, + "learning_rate": 1.9956981277233342e-05, + "loss": 0.4518, + "step": 434 + }, + { + "epoch": 0.25399605868184805, + "grad_norm": 0.10938332954018594, + "learning_rate": 1.9956031313190634e-05, + "loss": 0.4509, + "step": 435 + }, + { + "epoch": 0.25457995766732355, + "grad_norm": 0.10755006100854324, + "learning_rate": 1.9955070997665122e-05, + "loss": 0.4341, + "step": 436 + }, + { + "epoch": 0.25516385665279906, + "grad_norm": 0.11933126407325542, + "learning_rate": 1.9954100331655265e-05, + "loss": 0.4838, + "step": 437 + }, + { + "epoch": 0.25574775563827457, + "grad_norm": 0.11162898791498366, + "learning_rate": 1.9953119316170286e-05, + "loss": 0.5128, + "step": 438 + }, + { + "epoch": 0.25633165462375007, + "grad_norm": 0.10198685994169777, + "learning_rate": 1.9952127952230166e-05, + "loss": 0.4383, + "step": 439 + }, + { + "epoch": 0.2569155536092256, + "grad_norm": 0.10512253449310441, + "learning_rate": 1.995112624086564e-05, + "loss": 0.47, + "step": 440 + }, + { + "epoch": 0.25749945259470114, + "grad_norm": 0.1072880972084344, + "learning_rate": 1.9950114183118215e-05, + "loss": 0.5105, + "step": 441 + }, + { + "epoch": 0.25808335158017665, + "grad_norm": 0.1166177623033071, + "learning_rate": 1.9949091780040143e-05, + "loss": 0.4538, + "step": 442 + }, + { + "epoch": 0.25866725056565215, + "grad_norm": 0.10151070492519414, + "learning_rate": 1.9948059032694432e-05, + "loss": 0.4619, + "step": 443 + }, + { + "epoch": 0.25925114955112766, + "grad_norm": 0.12575622280811238, + "learning_rate": 1.9947015942154864e-05, + "loss": 0.5266, + "step": 444 + }, + { + "epoch": 0.25983504853660316, + "grad_norm": 0.10891654996784639, + "learning_rate": 
1.994596250950595e-05, + "loss": 0.4464, + "step": 445 + }, + { + "epoch": 0.26041894752207867, + "grad_norm": 0.10467012062916746, + "learning_rate": 1.9944898735842963e-05, + "loss": 0.5343, + "step": 446 + }, + { + "epoch": 0.2610028465075542, + "grad_norm": 0.118650580619336, + "learning_rate": 1.9943824622271934e-05, + "loss": 0.5079, + "step": 447 + }, + { + "epoch": 0.2615867454930297, + "grad_norm": 0.10287388165049094, + "learning_rate": 1.9942740169909643e-05, + "loss": 0.4823, + "step": 448 + }, + { + "epoch": 0.26217064447850524, + "grad_norm": 0.11401455862572256, + "learning_rate": 1.9941645379883613e-05, + "loss": 0.4691, + "step": 449 + }, + { + "epoch": 0.26275454346398075, + "grad_norm": 0.11050764328075006, + "learning_rate": 1.9940540253332118e-05, + "loss": 0.4575, + "step": 450 + }, + { + "epoch": 0.26333844244945626, + "grad_norm": 0.12526617118077785, + "learning_rate": 1.993942479140418e-05, + "loss": 0.5475, + "step": 451 + }, + { + "epoch": 0.26392234143493176, + "grad_norm": 0.10854021139701223, + "learning_rate": 1.993829899525957e-05, + "loss": 0.4738, + "step": 452 + }, + { + "epoch": 0.26450624042040727, + "grad_norm": 0.11486946363013296, + "learning_rate": 1.99371628660688e-05, + "loss": 0.5016, + "step": 453 + }, + { + "epoch": 0.2650901394058828, + "grad_norm": 0.10646918515431725, + "learning_rate": 1.9936016405013117e-05, + "loss": 0.4079, + "step": 454 + }, + { + "epoch": 0.2656740383913583, + "grad_norm": 0.10783869746297771, + "learning_rate": 1.9934859613284535e-05, + "loss": 0.4728, + "step": 455 + }, + { + "epoch": 0.2662579373768338, + "grad_norm": 0.1021109873270093, + "learning_rate": 1.993369249208578e-05, + "loss": 0.4425, + "step": 456 + }, + { + "epoch": 0.26684183636230935, + "grad_norm": 0.17928307762489282, + "learning_rate": 1.9932515042630335e-05, + "loss": 0.5065, + "step": 457 + }, + { + "epoch": 0.26742573534778485, + "grad_norm": 0.09594239919439491, + "learning_rate": 1.9931327266142425e-05, + "loss": 0.4449, + "step": 458 + }, + { + "epoch": 0.26800963433326036, + "grad_norm": 0.10233352139166572, + "learning_rate": 1.9930129163856992e-05, + "loss": 0.4372, + "step": 459 + }, + { + "epoch": 0.26859353331873587, + "grad_norm": 0.10856843435305369, + "learning_rate": 1.9928920737019735e-05, + "loss": 0.4962, + "step": 460 + }, + { + "epoch": 0.2691774323042114, + "grad_norm": 0.10354318251001918, + "learning_rate": 1.9927701986887077e-05, + "loss": 0.468, + "step": 461 + }, + { + "epoch": 0.2697613312896869, + "grad_norm": 0.10172446956623674, + "learning_rate": 1.9926472914726177e-05, + "loss": 0.416, + "step": 462 + }, + { + "epoch": 0.2703452302751624, + "grad_norm": 0.1226515684597648, + "learning_rate": 1.9925233521814926e-05, + "loss": 0.4468, + "step": 463 + }, + { + "epoch": 0.2709291292606379, + "grad_norm": 0.09854167255841996, + "learning_rate": 1.9923983809441945e-05, + "loss": 0.4799, + "step": 464 + }, + { + "epoch": 0.2715130282461134, + "grad_norm": 0.10910776478279463, + "learning_rate": 1.9922723778906583e-05, + "loss": 0.47, + "step": 465 + }, + { + "epoch": 0.27209692723158896, + "grad_norm": 0.09951411612066836, + "learning_rate": 1.9921453431518923e-05, + "loss": 0.4867, + "step": 466 + }, + { + "epoch": 0.27268082621706446, + "grad_norm": 0.1081222494861953, + "learning_rate": 1.9920172768599763e-05, + "loss": 0.4666, + "step": 467 + }, + { + "epoch": 0.27326472520253997, + "grad_norm": 0.11991744718172768, + "learning_rate": 1.991888179148064e-05, + "loss": 0.4806, + "step": 468 + }, + { + "epoch": 
0.2738486241880155, + "grad_norm": 0.10607042770569018, + "learning_rate": 1.991758050150381e-05, + "loss": 0.4914, + "step": 469 + }, + { + "epoch": 0.274432523173491, + "grad_norm": 0.10712483139884131, + "learning_rate": 1.991626890002224e-05, + "loss": 0.4986, + "step": 470 + }, + { + "epoch": 0.2750164221589665, + "grad_norm": 0.104637323365252, + "learning_rate": 1.9914946988399636e-05, + "loss": 0.4575, + "step": 471 + }, + { + "epoch": 0.275600321144442, + "grad_norm": 0.11332881067169139, + "learning_rate": 1.9913614768010418e-05, + "loss": 0.4839, + "step": 472 + }, + { + "epoch": 0.2761842201299175, + "grad_norm": 0.09975675467797741, + "learning_rate": 1.9912272240239715e-05, + "loss": 0.3971, + "step": 473 + }, + { + "epoch": 0.27676811911539306, + "grad_norm": 0.1075994043547606, + "learning_rate": 1.9910919406483384e-05, + "loss": 0.4625, + "step": 474 + }, + { + "epoch": 0.27735201810086857, + "grad_norm": 0.1189653971380939, + "learning_rate": 1.9909556268147995e-05, + "loss": 0.4623, + "step": 475 + }, + { + "epoch": 0.2779359170863441, + "grad_norm": 0.09696989808206406, + "learning_rate": 1.990818282665082e-05, + "loss": 0.4291, + "step": 476 + }, + { + "epoch": 0.2785198160718196, + "grad_norm": 0.10680752977157684, + "learning_rate": 1.9906799083419865e-05, + "loss": 0.468, + "step": 477 + }, + { + "epoch": 0.2791037150572951, + "grad_norm": 0.11658418933554979, + "learning_rate": 1.9905405039893827e-05, + "loss": 0.4825, + "step": 478 + }, + { + "epoch": 0.2796876140427706, + "grad_norm": 0.1047619297725638, + "learning_rate": 1.9904000697522126e-05, + "loss": 0.4261, + "step": 479 + }, + { + "epoch": 0.2802715130282461, + "grad_norm": 0.11102487265632709, + "learning_rate": 1.9902586057764882e-05, + "loss": 0.4834, + "step": 480 + }, + { + "epoch": 0.2808554120137216, + "grad_norm": 0.11484092653949689, + "learning_rate": 1.9901161122092923e-05, + "loss": 0.4678, + "step": 481 + }, + { + "epoch": 0.2814393109991971, + "grad_norm": 0.10209204150598852, + "learning_rate": 1.9899725891987788e-05, + "loss": 0.4866, + "step": 482 + }, + { + "epoch": 0.2820232099846727, + "grad_norm": 0.12177389514212382, + "learning_rate": 1.9898280368941708e-05, + "loss": 0.4112, + "step": 483 + }, + { + "epoch": 0.2826071089701482, + "grad_norm": 0.10456275252049713, + "learning_rate": 1.989682455445762e-05, + "loss": 0.4233, + "step": 484 + }, + { + "epoch": 0.2831910079556237, + "grad_norm": 0.1103185228542731, + "learning_rate": 1.9895358450049175e-05, + "loss": 0.4915, + "step": 485 + }, + { + "epoch": 0.2837749069410992, + "grad_norm": 0.11547777501147886, + "learning_rate": 1.9893882057240698e-05, + "loss": 0.4845, + "step": 486 + }, + { + "epoch": 0.2843588059265747, + "grad_norm": 0.10599350717543674, + "learning_rate": 1.989239537756723e-05, + "loss": 0.4295, + "step": 487 + }, + { + "epoch": 0.2849427049120502, + "grad_norm": 0.10449204350067366, + "learning_rate": 1.98908984125745e-05, + "loss": 0.4329, + "step": 488 + }, + { + "epoch": 0.2855266038975257, + "grad_norm": 0.11251355479212671, + "learning_rate": 1.9889391163818935e-05, + "loss": 0.4898, + "step": 489 + }, + { + "epoch": 0.2861105028830012, + "grad_norm": 0.1112436204386576, + "learning_rate": 1.9887873632867645e-05, + "loss": 0.4846, + "step": 490 + }, + { + "epoch": 0.2866944018684768, + "grad_norm": 0.10681261038113704, + "learning_rate": 1.9886345821298442e-05, + "loss": 0.4527, + "step": 491 + }, + { + "epoch": 0.2872783008539523, + "grad_norm": 0.09895133685235381, + "learning_rate": 
1.988480773069982e-05, + "loss": 0.4623, + "step": 492 + }, + { + "epoch": 0.2878621998394278, + "grad_norm": 0.1094214550505383, + "learning_rate": 1.9883259362670967e-05, + "loss": 0.4727, + "step": 493 + }, + { + "epoch": 0.2884460988249033, + "grad_norm": 0.1097042335972487, + "learning_rate": 1.9881700718821744e-05, + "loss": 0.4691, + "step": 494 + }, + { + "epoch": 0.2890299978103788, + "grad_norm": 0.10315430143103399, + "learning_rate": 1.988013180077271e-05, + "loss": 0.4655, + "step": 495 + }, + { + "epoch": 0.2896138967958543, + "grad_norm": 0.09761758873831035, + "learning_rate": 1.9878552610155096e-05, + "loss": 0.4541, + "step": 496 + }, + { + "epoch": 0.2901977957813298, + "grad_norm": 0.11382870899976318, + "learning_rate": 1.987696314861082e-05, + "loss": 0.4672, + "step": 497 + }, + { + "epoch": 0.2907816947668053, + "grad_norm": 0.1143040794765024, + "learning_rate": 1.9875363417792477e-05, + "loss": 0.5102, + "step": 498 + }, + { + "epoch": 0.2913655937522809, + "grad_norm": 0.10372466482071581, + "learning_rate": 1.9873753419363336e-05, + "loss": 0.4492, + "step": 499 + }, + { + "epoch": 0.2919494927377564, + "grad_norm": 0.10365676819610492, + "learning_rate": 1.9872133154997345e-05, + "loss": 0.4821, + "step": 500 + }, + { + "epoch": 0.2925333917232319, + "grad_norm": 0.11238374432479349, + "learning_rate": 1.9870502626379127e-05, + "loss": 0.4702, + "step": 501 + }, + { + "epoch": 0.2931172907087074, + "grad_norm": 0.11527790542184983, + "learning_rate": 1.986886183520398e-05, + "loss": 0.4611, + "step": 502 + }, + { + "epoch": 0.2937011896941829, + "grad_norm": 0.0998504549823008, + "learning_rate": 1.9867210783177857e-05, + "loss": 0.4864, + "step": 503 + }, + { + "epoch": 0.2942850886796584, + "grad_norm": 0.10112504246090578, + "learning_rate": 1.986554947201739e-05, + "loss": 0.4373, + "step": 504 + }, + { + "epoch": 0.2948689876651339, + "grad_norm": 0.10168325752296033, + "learning_rate": 1.9863877903449883e-05, + "loss": 0.4334, + "step": 505 + }, + { + "epoch": 0.2954528866506094, + "grad_norm": 0.10107280777215895, + "learning_rate": 1.9862196079213298e-05, + "loss": 0.4764, + "step": 506 + }, + { + "epoch": 0.29603678563608493, + "grad_norm": 0.11318127800011997, + "learning_rate": 1.986050400105626e-05, + "loss": 0.4723, + "step": 507 + }, + { + "epoch": 0.2966206846215605, + "grad_norm": 0.10120376064881256, + "learning_rate": 1.9858801670738052e-05, + "loss": 0.4443, + "step": 508 + }, + { + "epoch": 0.297204583607036, + "grad_norm": 0.10921253174734938, + "learning_rate": 1.9857089090028628e-05, + "loss": 0.5274, + "step": 509 + }, + { + "epoch": 0.2977884825925115, + "grad_norm": 0.10219430671654223, + "learning_rate": 1.9855366260708586e-05, + "loss": 0.4623, + "step": 510 + }, + { + "epoch": 0.298372381577987, + "grad_norm": 0.09708101542013702, + "learning_rate": 1.9853633184569187e-05, + "loss": 0.4714, + "step": 511 + }, + { + "epoch": 0.2989562805634625, + "grad_norm": 0.10282560205873797, + "learning_rate": 1.9851889863412347e-05, + "loss": 0.5062, + "step": 512 + }, + { + "epoch": 0.299540179548938, + "grad_norm": 0.1037546383120784, + "learning_rate": 1.985013629905063e-05, + "loss": 0.4341, + "step": 513 + }, + { + "epoch": 0.30012407853441353, + "grad_norm": 0.10234879743776315, + "learning_rate": 1.9848372493307253e-05, + "loss": 0.4169, + "step": 514 + }, + { + "epoch": 0.30070797751988904, + "grad_norm": 0.1090808179800267, + "learning_rate": 1.9846598448016077e-05, + "loss": 0.472, + "step": 515 + }, + { + "epoch": 0.3012918765053646, 
+ "grad_norm": 0.11013404915549675, + "learning_rate": 1.984481416502161e-05, + "loss": 0.4778, + "step": 516 + }, + { + "epoch": 0.3018757754908401, + "grad_norm": 0.11631008887217607, + "learning_rate": 1.9843019646179014e-05, + "loss": 0.4698, + "step": 517 + }, + { + "epoch": 0.3024596744763156, + "grad_norm": 0.10855736383096663, + "learning_rate": 1.984121489335408e-05, + "loss": 0.4833, + "step": 518 + }, + { + "epoch": 0.3030435734617911, + "grad_norm": 0.10036005611807718, + "learning_rate": 1.9839399908423248e-05, + "loss": 0.4676, + "step": 519 + }, + { + "epoch": 0.3036274724472666, + "grad_norm": 0.10939789107517071, + "learning_rate": 1.983757469327359e-05, + "loss": 0.4768, + "step": 520 + }, + { + "epoch": 0.30421137143274213, + "grad_norm": 0.10054975819185766, + "learning_rate": 1.983573924980282e-05, + "loss": 0.4571, + "step": 521 + }, + { + "epoch": 0.30479527041821763, + "grad_norm": 0.10774824376987564, + "learning_rate": 1.983389357991929e-05, + "loss": 0.491, + "step": 522 + }, + { + "epoch": 0.30537916940369314, + "grad_norm": 0.09132951395705934, + "learning_rate": 1.9832037685541973e-05, + "loss": 0.4181, + "step": 523 + }, + { + "epoch": 0.30596306838916865, + "grad_norm": 0.10291767810873917, + "learning_rate": 1.983017156860048e-05, + "loss": 0.4666, + "step": 524 + }, + { + "epoch": 0.3065469673746442, + "grad_norm": 0.09906330521744948, + "learning_rate": 1.9828295231035054e-05, + "loss": 0.4682, + "step": 525 + }, + { + "epoch": 0.3071308663601197, + "grad_norm": 0.10751743790311827, + "learning_rate": 1.9826408674796552e-05, + "loss": 0.4871, + "step": 526 + }, + { + "epoch": 0.3077147653455952, + "grad_norm": 0.10354489464988675, + "learning_rate": 1.9824511901846475e-05, + "loss": 0.4868, + "step": 527 + }, + { + "epoch": 0.3082986643310707, + "grad_norm": 0.10140666223433537, + "learning_rate": 1.9822604914156927e-05, + "loss": 0.4568, + "step": 528 + }, + { + "epoch": 0.30888256331654623, + "grad_norm": 0.10299165773616481, + "learning_rate": 1.982068771371064e-05, + "loss": 0.5024, + "step": 529 + }, + { + "epoch": 0.30946646230202174, + "grad_norm": 0.10444983323638574, + "learning_rate": 1.9818760302500976e-05, + "loss": 0.4589, + "step": 530 + }, + { + "epoch": 0.31005036128749724, + "grad_norm": 0.10500363635096446, + "learning_rate": 1.9816822682531888e-05, + "loss": 0.4517, + "step": 531 + }, + { + "epoch": 0.31063426027297275, + "grad_norm": 0.10383240498193995, + "learning_rate": 1.981487485581797e-05, + "loss": 0.4251, + "step": 532 + }, + { + "epoch": 0.3112181592584483, + "grad_norm": 0.10960108733004714, + "learning_rate": 1.9812916824384406e-05, + "loss": 0.4251, + "step": 533 + }, + { + "epoch": 0.3118020582439238, + "grad_norm": 0.10668352247965693, + "learning_rate": 1.9810948590267013e-05, + "loss": 0.4728, + "step": 534 + }, + { + "epoch": 0.3123859572293993, + "grad_norm": 0.10628873382183247, + "learning_rate": 1.9808970155512187e-05, + "loss": 0.4844, + "step": 535 + }, + { + "epoch": 0.31296985621487483, + "grad_norm": 0.10209005657757413, + "learning_rate": 1.9806981522176957e-05, + "loss": 0.4565, + "step": 536 + }, + { + "epoch": 0.31355375520035034, + "grad_norm": 0.09808423761265042, + "learning_rate": 1.9804982692328944e-05, + "loss": 0.4154, + "step": 537 + }, + { + "epoch": 0.31413765418582584, + "grad_norm": 0.10401046049481014, + "learning_rate": 1.9802973668046364e-05, + "loss": 0.4825, + "step": 538 + }, + { + "epoch": 0.31472155317130135, + "grad_norm": 0.10169200204142519, + "learning_rate": 1.9800954451418044e-05, 
+ "loss": 0.4994, + "step": 539 + }, + { + "epoch": 0.31530545215677686, + "grad_norm": 0.11497179984506595, + "learning_rate": 1.9798925044543402e-05, + "loss": 0.5039, + "step": 540 + }, + { + "epoch": 0.3158893511422524, + "grad_norm": 0.10026905929410926, + "learning_rate": 1.9796885449532456e-05, + "loss": 0.477, + "step": 541 + }, + { + "epoch": 0.3164732501277279, + "grad_norm": 0.09819218219422444, + "learning_rate": 1.979483566850581e-05, + "loss": 0.4565, + "step": 542 + }, + { + "epoch": 0.31705714911320343, + "grad_norm": 0.10416567523507561, + "learning_rate": 1.9792775703594663e-05, + "loss": 0.4564, + "step": 543 + }, + { + "epoch": 0.31764104809867894, + "grad_norm": 0.0945812755985712, + "learning_rate": 1.97907055569408e-05, + "loss": 0.4356, + "step": 544 + }, + { + "epoch": 0.31822494708415444, + "grad_norm": 0.1041201776002029, + "learning_rate": 1.9788625230696596e-05, + "loss": 0.4395, + "step": 545 + }, + { + "epoch": 0.31880884606962995, + "grad_norm": 0.10948756670359895, + "learning_rate": 1.9786534727025005e-05, + "loss": 0.4902, + "step": 546 + }, + { + "epoch": 0.31939274505510545, + "grad_norm": 0.11099839118478609, + "learning_rate": 1.9784434048099565e-05, + "loss": 0.4719, + "step": 547 + }, + { + "epoch": 0.31997664404058096, + "grad_norm": 0.09625296069968811, + "learning_rate": 1.9782323196104395e-05, + "loss": 0.4308, + "step": 548 + }, + { + "epoch": 0.32056054302605647, + "grad_norm": 0.09612625805332019, + "learning_rate": 1.978020217323419e-05, + "loss": 0.4469, + "step": 549 + }, + { + "epoch": 0.321144442011532, + "grad_norm": 0.10728672960393655, + "learning_rate": 1.9778070981694216e-05, + "loss": 0.4736, + "step": 550 + }, + { + "epoch": 0.32172834099700753, + "grad_norm": 0.10410137897803999, + "learning_rate": 1.9775929623700318e-05, + "loss": 0.4425, + "step": 551 + }, + { + "epoch": 0.32231223998248304, + "grad_norm": 0.09851154038711088, + "learning_rate": 1.9773778101478908e-05, + "loss": 0.4777, + "step": 552 + }, + { + "epoch": 0.32289613896795855, + "grad_norm": 0.09670431703182338, + "learning_rate": 1.9771616417266966e-05, + "loss": 0.4331, + "step": 553 + }, + { + "epoch": 0.32348003795343405, + "grad_norm": 0.10908299442653899, + "learning_rate": 1.976944457331204e-05, + "loss": 0.5283, + "step": 554 + }, + { + "epoch": 0.32406393693890956, + "grad_norm": 0.09994989351258005, + "learning_rate": 1.976726257187223e-05, + "loss": 0.4604, + "step": 555 + }, + { + "epoch": 0.32464783592438506, + "grad_norm": 0.09504489457483642, + "learning_rate": 1.9765070415216218e-05, + "loss": 0.4387, + "step": 556 + }, + { + "epoch": 0.32523173490986057, + "grad_norm": 0.09447734920520308, + "learning_rate": 1.976286810562323e-05, + "loss": 0.4394, + "step": 557 + }, + { + "epoch": 0.32581563389533613, + "grad_norm": 0.10627007557458634, + "learning_rate": 1.976065564538304e-05, + "loss": 0.4786, + "step": 558 + }, + { + "epoch": 0.32639953288081164, + "grad_norm": 0.10650188134195858, + "learning_rate": 1.9758433036796003e-05, + "loss": 0.4789, + "step": 559 + }, + { + "epoch": 0.32698343186628714, + "grad_norm": 0.10507976460241705, + "learning_rate": 1.9756200282173e-05, + "loss": 0.4339, + "step": 560 + }, + { + "epoch": 0.32756733085176265, + "grad_norm": 0.09474684354285225, + "learning_rate": 1.975395738383547e-05, + "loss": 0.4421, + "step": 561 + }, + { + "epoch": 0.32815122983723816, + "grad_norm": 0.11471985616587957, + "learning_rate": 1.9751704344115402e-05, + "loss": 0.475, + "step": 562 + }, + { + "epoch": 0.32873512882271366, + 
"grad_norm": 0.10534234925463819, + "learning_rate": 1.9749441165355322e-05, + "loss": 0.4256, + "step": 563 + }, + { + "epoch": 0.32931902780818917, + "grad_norm": 0.0914463914730369, + "learning_rate": 1.9747167849908305e-05, + "loss": 0.4306, + "step": 564 + }, + { + "epoch": 0.3299029267936647, + "grad_norm": 0.09759413277847973, + "learning_rate": 1.974488440013796e-05, + "loss": 0.4682, + "step": 565 + }, + { + "epoch": 0.33048682577914024, + "grad_norm": 0.1025472899909111, + "learning_rate": 1.9742590818418435e-05, + "loss": 0.4318, + "step": 566 + }, + { + "epoch": 0.33107072476461574, + "grad_norm": 0.10357676176201945, + "learning_rate": 1.9740287107134417e-05, + "loss": 0.4634, + "step": 567 + }, + { + "epoch": 0.33165462375009125, + "grad_norm": 0.10238074058901285, + "learning_rate": 1.9737973268681117e-05, + "loss": 0.4561, + "step": 568 + }, + { + "epoch": 0.33223852273556675, + "grad_norm": 0.10368577669742425, + "learning_rate": 1.9735649305464274e-05, + "loss": 0.4876, + "step": 569 + }, + { + "epoch": 0.33282242172104226, + "grad_norm": 0.10002264783977753, + "learning_rate": 1.9733315219900165e-05, + "loss": 0.4645, + "step": 570 + }, + { + "epoch": 0.33340632070651777, + "grad_norm": 0.09908471632148969, + "learning_rate": 1.9730971014415585e-05, + "loss": 0.4282, + "step": 571 + }, + { + "epoch": 0.3339902196919933, + "grad_norm": 0.09880008018399945, + "learning_rate": 1.9728616691447845e-05, + "loss": 0.431, + "step": 572 + }, + { + "epoch": 0.3345741186774688, + "grad_norm": 0.11054284432994925, + "learning_rate": 1.9726252253444784e-05, + "loss": 0.4687, + "step": 573 + }, + { + "epoch": 0.3351580176629443, + "grad_norm": 0.10238060883074551, + "learning_rate": 1.9723877702864758e-05, + "loss": 0.4968, + "step": 574 + }, + { + "epoch": 0.33574191664841985, + "grad_norm": 0.10298732911451251, + "learning_rate": 1.9721493042176632e-05, + "loss": 0.4691, + "step": 575 + }, + { + "epoch": 0.33632581563389535, + "grad_norm": 0.10823253299886151, + "learning_rate": 1.9719098273859782e-05, + "loss": 0.4858, + "step": 576 + }, + { + "epoch": 0.33690971461937086, + "grad_norm": 0.10223885626729912, + "learning_rate": 1.97166934004041e-05, + "loss": 0.4834, + "step": 577 + }, + { + "epoch": 0.33749361360484637, + "grad_norm": 0.10158986450834483, + "learning_rate": 1.9714278424309983e-05, + "loss": 0.4516, + "step": 578 + }, + { + "epoch": 0.33807751259032187, + "grad_norm": 0.10504882456337442, + "learning_rate": 1.971185334808832e-05, + "loss": 0.4498, + "step": 579 + }, + { + "epoch": 0.3386614115757974, + "grad_norm": 0.09955173818506927, + "learning_rate": 1.9709418174260523e-05, + "loss": 0.4691, + "step": 580 + }, + { + "epoch": 0.3392453105612729, + "grad_norm": 0.11031495532322617, + "learning_rate": 1.970697290535848e-05, + "loss": 0.4605, + "step": 581 + }, + { + "epoch": 0.3398292095467484, + "grad_norm": 0.10911487031480178, + "learning_rate": 1.970451754392459e-05, + "loss": 0.4234, + "step": 582 + }, + { + "epoch": 0.34041310853222395, + "grad_norm": 0.1001063300456224, + "learning_rate": 1.970205209251174e-05, + "loss": 0.4345, + "step": 583 + }, + { + "epoch": 0.34099700751769946, + "grad_norm": 0.1085890356250462, + "learning_rate": 1.9699576553683308e-05, + "loss": 0.4558, + "step": 584 + }, + { + "epoch": 0.34158090650317496, + "grad_norm": 0.10927102800910049, + "learning_rate": 1.9697090930013164e-05, + "loss": 0.4502, + "step": 585 + }, + { + "epoch": 0.34216480548865047, + "grad_norm": 0.09682454620380922, + "learning_rate": 1.969459522408566e-05, + 
"loss": 0.3985, + "step": 586 + }, + { + "epoch": 0.342748704474126, + "grad_norm": 0.10800155250201722, + "learning_rate": 1.9692089438495622e-05, + "loss": 0.4154, + "step": 587 + }, + { + "epoch": 0.3433326034596015, + "grad_norm": 0.10804813541400918, + "learning_rate": 1.9689573575848375e-05, + "loss": 0.4751, + "step": 588 + }, + { + "epoch": 0.343916502445077, + "grad_norm": 0.1030256803199937, + "learning_rate": 1.9687047638759707e-05, + "loss": 0.4463, + "step": 589 + }, + { + "epoch": 0.3445004014305525, + "grad_norm": 0.09838912236393378, + "learning_rate": 1.968451162985589e-05, + "loss": 0.4397, + "step": 590 + }, + { + "epoch": 0.345084300416028, + "grad_norm": 0.09661087195414314, + "learning_rate": 1.9681965551773653e-05, + "loss": 0.4429, + "step": 591 + }, + { + "epoch": 0.34566819940150356, + "grad_norm": 0.0907611569074992, + "learning_rate": 1.967940940716021e-05, + "loss": 0.4569, + "step": 592 + }, + { + "epoch": 0.34625209838697907, + "grad_norm": 0.10824473768049658, + "learning_rate": 1.9676843198673237e-05, + "loss": 0.4359, + "step": 593 + }, + { + "epoch": 0.3468359973724546, + "grad_norm": 0.094711012062842, + "learning_rate": 1.9674266928980863e-05, + "loss": 0.4892, + "step": 594 + }, + { + "epoch": 0.3474198963579301, + "grad_norm": 0.10199979372792325, + "learning_rate": 1.9671680600761694e-05, + "loss": 0.4251, + "step": 595 + }, + { + "epoch": 0.3480037953434056, + "grad_norm": 0.108425219268512, + "learning_rate": 1.966908421670479e-05, + "loss": 0.4797, + "step": 596 + }, + { + "epoch": 0.3485876943288811, + "grad_norm": 0.09790748493464309, + "learning_rate": 1.9666477779509655e-05, + "loss": 0.4214, + "step": 597 + }, + { + "epoch": 0.3491715933143566, + "grad_norm": 0.11241964099720271, + "learning_rate": 1.9663861291886256e-05, + "loss": 0.4733, + "step": 598 + }, + { + "epoch": 0.3497554922998321, + "grad_norm": 0.09161949042216751, + "learning_rate": 1.9661234756555006e-05, + "loss": 0.4342, + "step": 599 + }, + { + "epoch": 0.35033939128530767, + "grad_norm": 0.10090676399244018, + "learning_rate": 1.965859817624677e-05, + "loss": 0.4453, + "step": 600 + }, + { + "epoch": 0.3509232902707832, + "grad_norm": 0.10530934436807371, + "learning_rate": 1.9655951553702848e-05, + "loss": 0.4602, + "step": 601 + }, + { + "epoch": 0.3515071892562587, + "grad_norm": 0.10444480122750918, + "learning_rate": 1.965329489167499e-05, + "loss": 0.463, + "step": 602 + }, + { + "epoch": 0.3520910882417342, + "grad_norm": 0.09883252403570524, + "learning_rate": 1.9650628192925372e-05, + "loss": 0.4533, + "step": 603 + }, + { + "epoch": 0.3526749872272097, + "grad_norm": 0.10646205043498647, + "learning_rate": 1.9647951460226622e-05, + "loss": 0.4619, + "step": 604 + }, + { + "epoch": 0.3532588862126852, + "grad_norm": 0.10015931966734541, + "learning_rate": 1.964526469636179e-05, + "loss": 0.4631, + "step": 605 + }, + { + "epoch": 0.3538427851981607, + "grad_norm": 0.10049485435134459, + "learning_rate": 1.9642567904124354e-05, + "loss": 0.4503, + "step": 606 + }, + { + "epoch": 0.3544266841836362, + "grad_norm": 0.10536753319659906, + "learning_rate": 1.963986108631823e-05, + "loss": 0.5195, + "step": 607 + }, + { + "epoch": 0.35501058316911177, + "grad_norm": 0.1014901513656249, + "learning_rate": 1.9637144245757742e-05, + "loss": 0.4326, + "step": 608 + }, + { + "epoch": 0.3555944821545873, + "grad_norm": 0.09316392306397735, + "learning_rate": 1.9634417385267643e-05, + "loss": 0.4286, + "step": 609 + }, + { + "epoch": 0.3561783811400628, + "grad_norm": 
0.10755130074403955, + "learning_rate": 1.963168050768311e-05, + "loss": 0.4722, + "step": 610 + }, + { + "epoch": 0.3567622801255383, + "grad_norm": 0.10020872270463323, + "learning_rate": 1.9628933615849726e-05, + "loss": 0.494, + "step": 611 + }, + { + "epoch": 0.3573461791110138, + "grad_norm": 0.09677021627155821, + "learning_rate": 1.962617671262349e-05, + "loss": 0.4648, + "step": 612 + }, + { + "epoch": 0.3579300780964893, + "grad_norm": 0.11202516433605784, + "learning_rate": 1.9623409800870804e-05, + "loss": 0.4389, + "step": 613 + }, + { + "epoch": 0.3585139770819648, + "grad_norm": 0.1036523318595974, + "learning_rate": 1.9620632883468484e-05, + "loss": 0.4706, + "step": 614 + }, + { + "epoch": 0.3590978760674403, + "grad_norm": 0.10578789605033635, + "learning_rate": 1.9617845963303744e-05, + "loss": 0.4417, + "step": 615 + }, + { + "epoch": 0.3596817750529158, + "grad_norm": 0.10015311051317556, + "learning_rate": 1.9615049043274207e-05, + "loss": 0.4432, + "step": 616 + }, + { + "epoch": 0.3602656740383914, + "grad_norm": 0.10155397379492057, + "learning_rate": 1.9612242126287876e-05, + "loss": 0.4503, + "step": 617 + }, + { + "epoch": 0.3608495730238669, + "grad_norm": 0.10189830783113861, + "learning_rate": 1.960942521526317e-05, + "loss": 0.4689, + "step": 618 + }, + { + "epoch": 0.3614334720093424, + "grad_norm": 0.10321201682570888, + "learning_rate": 1.9606598313128874e-05, + "loss": 0.4189, + "step": 619 + }, + { + "epoch": 0.3620173709948179, + "grad_norm": 0.09600652580905998, + "learning_rate": 1.9603761422824187e-05, + "loss": 0.4293, + "step": 620 + }, + { + "epoch": 0.3626012699802934, + "grad_norm": 0.10459699676170506, + "learning_rate": 1.9600914547298666e-05, + "loss": 0.4624, + "step": 621 + }, + { + "epoch": 0.3631851689657689, + "grad_norm": 0.10308020736146781, + "learning_rate": 1.9598057689512277e-05, + "loss": 0.4398, + "step": 622 + }, + { + "epoch": 0.3637690679512444, + "grad_norm": 0.10158288712961176, + "learning_rate": 1.9595190852435345e-05, + "loss": 0.4705, + "step": 623 + }, + { + "epoch": 0.3643529669367199, + "grad_norm": 0.09030341218064625, + "learning_rate": 1.9592314039048575e-05, + "loss": 0.4263, + "step": 624 + }, + { + "epoch": 0.3649368659221955, + "grad_norm": 0.10591427082269132, + "learning_rate": 1.9589427252343054e-05, + "loss": 0.4894, + "step": 625 + }, + { + "epoch": 0.365520764907671, + "grad_norm": 0.09687465545172563, + "learning_rate": 1.9586530495320227e-05, + "loss": 0.4692, + "step": 626 + }, + { + "epoch": 0.3661046638931465, + "grad_norm": 0.09889797291417558, + "learning_rate": 1.958362377099191e-05, + "loss": 0.479, + "step": 627 + }, + { + "epoch": 0.366688562878622, + "grad_norm": 0.09893350388389607, + "learning_rate": 1.958070708238028e-05, + "loss": 0.5121, + "step": 628 + }, + { + "epoch": 0.3672724618640975, + "grad_norm": 0.0972781118942622, + "learning_rate": 1.957778043251788e-05, + "loss": 0.4207, + "step": 629 + }, + { + "epoch": 0.367856360849573, + "grad_norm": 0.08780321577589092, + "learning_rate": 1.9574843824447602e-05, + "loss": 0.4686, + "step": 630 + }, + { + "epoch": 0.3684402598350485, + "grad_norm": 0.10279732403969223, + "learning_rate": 1.9571897261222694e-05, + "loss": 0.4445, + "step": 631 + }, + { + "epoch": 0.36902415882052403, + "grad_norm": 0.09688834758406975, + "learning_rate": 1.9568940745906762e-05, + "loss": 0.4788, + "step": 632 + }, + { + "epoch": 0.36960805780599953, + "grad_norm": 3.6867229109677657, + "learning_rate": 1.956597428157375e-05, + "loss": 0.5054, + "step": 
633 + }, + { + "epoch": 0.3701919567914751, + "grad_norm": 0.10762659066050867, + "learning_rate": 1.956299787130795e-05, + "loss": 0.4586, + "step": 634 + }, + { + "epoch": 0.3707758557769506, + "grad_norm": 0.1046460815361308, + "learning_rate": 1.9560011518203996e-05, + "loss": 0.5178, + "step": 635 + }, + { + "epoch": 0.3713597547624261, + "grad_norm": 0.10484560569021949, + "learning_rate": 1.9557015225366855e-05, + "loss": 0.4738, + "step": 636 + }, + { + "epoch": 0.3719436537479016, + "grad_norm": 0.10193259788696873, + "learning_rate": 1.9554008995911837e-05, + "loss": 0.4345, + "step": 637 + }, + { + "epoch": 0.3725275527333771, + "grad_norm": 0.10043361009770983, + "learning_rate": 1.9550992832964575e-05, + "loss": 0.4223, + "step": 638 + }, + { + "epoch": 0.3731114517188526, + "grad_norm": 0.1012107500381748, + "learning_rate": 1.9547966739661032e-05, + "loss": 0.4671, + "step": 639 + }, + { + "epoch": 0.37369535070432813, + "grad_norm": 0.10391247156770918, + "learning_rate": 1.95449307191475e-05, + "loss": 0.4674, + "step": 640 + }, + { + "epoch": 0.37427924968980364, + "grad_norm": 0.09774840801969574, + "learning_rate": 1.9541884774580588e-05, + "loss": 0.4631, + "step": 641 + }, + { + "epoch": 0.3748631486752792, + "grad_norm": 0.10412032893722169, + "learning_rate": 1.953882890912723e-05, + "loss": 0.3847, + "step": 642 + }, + { + "epoch": 0.3754470476607547, + "grad_norm": 0.09370908098037004, + "learning_rate": 1.953576312596466e-05, + "loss": 0.4692, + "step": 643 + }, + { + "epoch": 0.3760309466462302, + "grad_norm": 0.09613130109461863, + "learning_rate": 1.9532687428280442e-05, + "loss": 0.4298, + "step": 644 + }, + { + "epoch": 0.3766148456317057, + "grad_norm": 0.09563205208635914, + "learning_rate": 1.952960181927243e-05, + "loss": 0.456, + "step": 645 + }, + { + "epoch": 0.3771987446171812, + "grad_norm": 0.10639058262627189, + "learning_rate": 1.9526506302148805e-05, + "loss": 0.3895, + "step": 646 + }, + { + "epoch": 0.37778264360265673, + "grad_norm": 0.09827245234484716, + "learning_rate": 1.9523400880128032e-05, + "loss": 0.449, + "step": 647 + }, + { + "epoch": 0.37836654258813224, + "grad_norm": 0.09836421588891571, + "learning_rate": 1.952028555643888e-05, + "loss": 0.4949, + "step": 648 + }, + { + "epoch": 0.37895044157360774, + "grad_norm": 0.10642022971664636, + "learning_rate": 1.9517160334320405e-05, + "loss": 0.4554, + "step": 649 + }, + { + "epoch": 0.3795343405590833, + "grad_norm": 0.10030767700185018, + "learning_rate": 1.9514025217021976e-05, + "loss": 0.4837, + "step": 650 + }, + { + "epoch": 0.3801182395445588, + "grad_norm": 0.09840376329861864, + "learning_rate": 1.951088020780323e-05, + "loss": 0.4595, + "step": 651 + }, + { + "epoch": 0.3807021385300343, + "grad_norm": 0.10658249276685958, + "learning_rate": 1.950772530993409e-05, + "loss": 0.4628, + "step": 652 + }, + { + "epoch": 0.3812860375155098, + "grad_norm": 0.10145735153329775, + "learning_rate": 1.9504560526694773e-05, + "loss": 0.4576, + "step": 653 + }, + { + "epoch": 0.38186993650098533, + "grad_norm": 0.09954608013864548, + "learning_rate": 1.9501385861375765e-05, + "loss": 0.4769, + "step": 654 + }, + { + "epoch": 0.38245383548646084, + "grad_norm": 0.0990907137448538, + "learning_rate": 1.949820131727783e-05, + "loss": 0.4408, + "step": 655 + }, + { + "epoch": 0.38303773447193634, + "grad_norm": 0.10045038643779156, + "learning_rate": 1.9495006897711994e-05, + "loss": 0.4308, + "step": 656 + }, + { + "epoch": 0.38362163345741185, + "grad_norm": 0.09667887499903267, + 
"learning_rate": 1.949180260599957e-05, + "loss": 0.4654, + "step": 657 + }, + { + "epoch": 0.38420553244288735, + "grad_norm": 0.09768183762998903, + "learning_rate": 1.9488588445472115e-05, + "loss": 0.4544, + "step": 658 + }, + { + "epoch": 0.3847894314283629, + "grad_norm": 0.09984200418128063, + "learning_rate": 1.9485364419471454e-05, + "loss": 0.4702, + "step": 659 + }, + { + "epoch": 0.3853733304138384, + "grad_norm": 0.10093801957243512, + "learning_rate": 1.948213053134968e-05, + "loss": 0.4686, + "step": 660 + }, + { + "epoch": 0.38595722939931393, + "grad_norm": 0.09897565318157636, + "learning_rate": 1.9478886784469124e-05, + "loss": 0.472, + "step": 661 + }, + { + "epoch": 0.38654112838478943, + "grad_norm": 0.10699233599065264, + "learning_rate": 1.947563318220237e-05, + "loss": 0.4368, + "step": 662 + }, + { + "epoch": 0.38712502737026494, + "grad_norm": 0.10781206350109068, + "learning_rate": 1.9472369727932263e-05, + "loss": 0.46, + "step": 663 + }, + { + "epoch": 0.38770892635574045, + "grad_norm": 0.10435536443178586, + "learning_rate": 1.9469096425051872e-05, + "loss": 0.4253, + "step": 664 + }, + { + "epoch": 0.38829282534121595, + "grad_norm": 0.09688322219090957, + "learning_rate": 1.946581327696452e-05, + "loss": 0.4281, + "step": 665 + }, + { + "epoch": 0.38887672432669146, + "grad_norm": 0.10205761526375948, + "learning_rate": 1.9462520287083755e-05, + "loss": 0.4924, + "step": 666 + }, + { + "epoch": 0.389460623312167, + "grad_norm": 0.0970286670687602, + "learning_rate": 1.945921745883337e-05, + "loss": 0.4484, + "step": 667 + }, + { + "epoch": 0.3900445222976425, + "grad_norm": 0.10233701353420592, + "learning_rate": 1.945590479564738e-05, + "loss": 0.49, + "step": 668 + }, + { + "epoch": 0.39062842128311803, + "grad_norm": 0.09978525319021925, + "learning_rate": 1.9452582300970025e-05, + "loss": 0.4804, + "step": 669 + }, + { + "epoch": 0.39121232026859354, + "grad_norm": 0.09697418489968354, + "learning_rate": 1.944924997825577e-05, + "loss": 0.471, + "step": 670 + }, + { + "epoch": 0.39179621925406904, + "grad_norm": 0.09705879067733492, + "learning_rate": 1.944590783096929e-05, + "loss": 0.4266, + "step": 671 + }, + { + "epoch": 0.39238011823954455, + "grad_norm": 0.09597661671448982, + "learning_rate": 1.944255586258549e-05, + "loss": 0.4244, + "step": 672 + }, + { + "epoch": 0.39296401722502006, + "grad_norm": 0.09497916020340623, + "learning_rate": 1.9439194076589477e-05, + "loss": 0.4307, + "step": 673 + }, + { + "epoch": 0.39354791621049556, + "grad_norm": 0.09717915115278374, + "learning_rate": 1.9435822476476566e-05, + "loss": 0.443, + "step": 674 + }, + { + "epoch": 0.3941318151959711, + "grad_norm": 0.10565813792177095, + "learning_rate": 1.943244106575227e-05, + "loss": 0.4645, + "step": 675 + }, + { + "epoch": 0.39471571418144663, + "grad_norm": 0.09373042351576927, + "learning_rate": 1.9429049847932317e-05, + "loss": 0.4454, + "step": 676 + }, + { + "epoch": 0.39529961316692214, + "grad_norm": 0.09885897494181131, + "learning_rate": 1.9425648826542618e-05, + "loss": 0.4384, + "step": 677 + }, + { + "epoch": 0.39588351215239764, + "grad_norm": 0.093869503572077, + "learning_rate": 1.9422238005119287e-05, + "loss": 0.4281, + "step": 678 + }, + { + "epoch": 0.39646741113787315, + "grad_norm": 0.09924049775400841, + "learning_rate": 1.9418817387208614e-05, + "loss": 0.4597, + "step": 679 + }, + { + "epoch": 0.39705131012334866, + "grad_norm": 0.09295686092794682, + "learning_rate": 1.9415386976367095e-05, + "loss": 0.4255, + "step": 680 + }, + { + 
"epoch": 0.39763520910882416, + "grad_norm": 0.09661711776551775, + "learning_rate": 1.9411946776161388e-05, + "loss": 0.5134, + "step": 681 + }, + { + "epoch": 0.39821910809429967, + "grad_norm": 0.09477171972380122, + "learning_rate": 1.9408496790168337e-05, + "loss": 0.4339, + "step": 682 + }, + { + "epoch": 0.3988030070797752, + "grad_norm": 0.0928958908048635, + "learning_rate": 1.9405037021974965e-05, + "loss": 0.4615, + "step": 683 + }, + { + "epoch": 0.39938690606525074, + "grad_norm": 0.09048156906012363, + "learning_rate": 1.9401567475178457e-05, + "loss": 0.4918, + "step": 684 + }, + { + "epoch": 0.39997080505072624, + "grad_norm": 0.09839260946206091, + "learning_rate": 1.9398088153386175e-05, + "loss": 0.4814, + "step": 685 + }, + { + "epoch": 0.40055470403620175, + "grad_norm": 0.10040481632526985, + "learning_rate": 1.939459906021563e-05, + "loss": 0.4692, + "step": 686 + }, + { + "epoch": 0.40113860302167725, + "grad_norm": 0.09548070788858064, + "learning_rate": 1.939110019929451e-05, + "loss": 0.4588, + "step": 687 + }, + { + "epoch": 0.40172250200715276, + "grad_norm": 0.09625281883938391, + "learning_rate": 1.938759157426065e-05, + "loss": 0.4602, + "step": 688 + }, + { + "epoch": 0.40230640099262827, + "grad_norm": 0.1000920478665631, + "learning_rate": 1.9384073188762027e-05, + "loss": 0.4637, + "step": 689 + }, + { + "epoch": 0.40289029997810377, + "grad_norm": 0.10106930226982075, + "learning_rate": 1.9380545046456787e-05, + "loss": 0.4532, + "step": 690 + }, + { + "epoch": 0.4034741989635793, + "grad_norm": 0.08987324829035546, + "learning_rate": 1.9377007151013205e-05, + "loss": 0.4645, + "step": 691 + }, + { + "epoch": 0.40405809794905484, + "grad_norm": 0.09788513847447915, + "learning_rate": 1.93734595061097e-05, + "loss": 0.4653, + "step": 692 + }, + { + "epoch": 0.40464199693453035, + "grad_norm": 0.10005111382121191, + "learning_rate": 1.9369902115434827e-05, + "loss": 0.4788, + "step": 693 + }, + { + "epoch": 0.40522589592000585, + "grad_norm": 0.09646110576131113, + "learning_rate": 1.936633498268728e-05, + "loss": 0.4737, + "step": 694 + }, + { + "epoch": 0.40580979490548136, + "grad_norm": 0.09288001382859375, + "learning_rate": 1.9362758111575878e-05, + "loss": 0.4271, + "step": 695 + }, + { + "epoch": 0.40639369389095686, + "grad_norm": 0.10319622398679855, + "learning_rate": 1.9359171505819558e-05, + "loss": 0.4905, + "step": 696 + }, + { + "epoch": 0.40697759287643237, + "grad_norm": 0.09211418276517859, + "learning_rate": 1.935557516914739e-05, + "loss": 0.4668, + "step": 697 + }, + { + "epoch": 0.4075614918619079, + "grad_norm": 0.0990968175825858, + "learning_rate": 1.9351969105298558e-05, + "loss": 0.4191, + "step": 698 + }, + { + "epoch": 0.4081453908473834, + "grad_norm": 0.10817527451618268, + "learning_rate": 1.9348353318022353e-05, + "loss": 0.4486, + "step": 699 + }, + { + "epoch": 0.4087292898328589, + "grad_norm": 0.13775649791077152, + "learning_rate": 1.9344727811078183e-05, + "loss": 0.4603, + "step": 700 + }, + { + "epoch": 0.40931318881833445, + "grad_norm": 0.09553573721994327, + "learning_rate": 1.934109258823556e-05, + "loss": 0.4834, + "step": 701 + }, + { + "epoch": 0.40989708780380996, + "grad_norm": 0.08930175596774133, + "learning_rate": 1.9337447653274097e-05, + "loss": 0.418, + "step": 702 + }, + { + "epoch": 0.41048098678928546, + "grad_norm": 0.09552610297965453, + "learning_rate": 1.9333793009983505e-05, + "loss": 0.4781, + "step": 703 + }, + { + "epoch": 0.41106488577476097, + "grad_norm": 0.09733341430385385, + 
"learning_rate": 1.9330128662163588e-05, + "loss": 0.4536, + "step": 704 + }, + { + "epoch": 0.4116487847602365, + "grad_norm": 0.09876621596264634, + "learning_rate": 1.9326454613624243e-05, + "loss": 0.4143, + "step": 705 + }, + { + "epoch": 0.412232683745712, + "grad_norm": 0.12778465525451385, + "learning_rate": 1.932277086818545e-05, + "loss": 0.4923, + "step": 706 + }, + { + "epoch": 0.4128165827311875, + "grad_norm": 0.09131161134469687, + "learning_rate": 1.931907742967727e-05, + "loss": 0.4176, + "step": 707 + }, + { + "epoch": 0.413400481716663, + "grad_norm": 0.0913546060107367, + "learning_rate": 1.9315374301939843e-05, + "loss": 0.4171, + "step": 708 + }, + { + "epoch": 0.41398438070213855, + "grad_norm": 0.09354507617139513, + "learning_rate": 1.9311661488823388e-05, + "loss": 0.4873, + "step": 709 + }, + { + "epoch": 0.41456827968761406, + "grad_norm": 0.09342122476068508, + "learning_rate": 1.930793899418819e-05, + "loss": 0.4487, + "step": 710 + }, + { + "epoch": 0.41515217867308957, + "grad_norm": 0.09355660177573726, + "learning_rate": 1.93042068219046e-05, + "loss": 0.4657, + "step": 711 + }, + { + "epoch": 0.4157360776585651, + "grad_norm": 0.10601045005945436, + "learning_rate": 1.9300464975853032e-05, + "loss": 0.4404, + "step": 712 + }, + { + "epoch": 0.4163199766440406, + "grad_norm": 0.09317304195392487, + "learning_rate": 1.9296713459923955e-05, + "loss": 0.4432, + "step": 713 + }, + { + "epoch": 0.4169038756295161, + "grad_norm": 0.09372091573414124, + "learning_rate": 1.9292952278017892e-05, + "loss": 0.4674, + "step": 714 + }, + { + "epoch": 0.4174877746149916, + "grad_norm": 0.10133374795262069, + "learning_rate": 1.9289181434045428e-05, + "loss": 0.4811, + "step": 715 + }, + { + "epoch": 0.4180716736004671, + "grad_norm": 0.09418650938199447, + "learning_rate": 1.9285400931927177e-05, + "loss": 0.4135, + "step": 716 + }, + { + "epoch": 0.41865557258594266, + "grad_norm": 0.10487908876100785, + "learning_rate": 1.92816107755938e-05, + "loss": 0.4354, + "step": 717 + }, + { + "epoch": 0.41923947157141817, + "grad_norm": 0.10388081024856194, + "learning_rate": 1.9277810968986004e-05, + "loss": 0.4818, + "step": 718 + }, + { + "epoch": 0.41982337055689367, + "grad_norm": 0.08793986910208784, + "learning_rate": 1.9274001516054513e-05, + "loss": 0.4145, + "step": 719 + }, + { + "epoch": 0.4204072695423692, + "grad_norm": 0.09372756450094956, + "learning_rate": 1.9270182420760104e-05, + "loss": 0.4754, + "step": 720 + }, + { + "epoch": 0.4209911685278447, + "grad_norm": 0.09868490196013983, + "learning_rate": 1.9266353687073557e-05, + "loss": 0.4404, + "step": 721 + }, + { + "epoch": 0.4215750675133202, + "grad_norm": 0.09849272631829611, + "learning_rate": 1.9262515318975686e-05, + "loss": 0.4769, + "step": 722 + }, + { + "epoch": 0.4221589664987957, + "grad_norm": 0.10640092382068086, + "learning_rate": 1.9258667320457313e-05, + "loss": 0.4621, + "step": 723 + }, + { + "epoch": 0.4227428654842712, + "grad_norm": 0.11444620066809462, + "learning_rate": 1.9254809695519284e-05, + "loss": 0.4199, + "step": 724 + }, + { + "epoch": 0.4233267644697467, + "grad_norm": 0.10003495575714573, + "learning_rate": 1.9250942448172444e-05, + "loss": 0.4888, + "step": 725 + }, + { + "epoch": 0.42391066345522227, + "grad_norm": 0.095672997974565, + "learning_rate": 1.924706558243765e-05, + "loss": 0.4426, + "step": 726 + }, + { + "epoch": 0.4244945624406978, + "grad_norm": 0.11681466195385821, + "learning_rate": 1.9243179102345753e-05, + "loss": 0.52, + "step": 727 + }, + { + 
"epoch": 0.4250784614261733, + "grad_norm": 0.09941896599778058, + "learning_rate": 1.923928301193761e-05, + "loss": 0.4702, + "step": 728 + }, + { + "epoch": 0.4256623604116488, + "grad_norm": 0.09538982224042505, + "learning_rate": 1.923537731526405e-05, + "loss": 0.4358, + "step": 729 + }, + { + "epoch": 0.4262462593971243, + "grad_norm": 0.09692536187546909, + "learning_rate": 1.9231462016385917e-05, + "loss": 0.4465, + "step": 730 + }, + { + "epoch": 0.4268301583825998, + "grad_norm": 0.09988657334876043, + "learning_rate": 1.9227537119374017e-05, + "loss": 0.4428, + "step": 731 + }, + { + "epoch": 0.4274140573680753, + "grad_norm": 0.10070480954309978, + "learning_rate": 1.9223602628309144e-05, + "loss": 0.4497, + "step": 732 + }, + { + "epoch": 0.4279979563535508, + "grad_norm": 0.094438293830413, + "learning_rate": 1.921965854728207e-05, + "loss": 0.428, + "step": 733 + }, + { + "epoch": 0.4285818553390264, + "grad_norm": 0.09512830445985353, + "learning_rate": 1.9215704880393527e-05, + "loss": 0.4697, + "step": 734 + }, + { + "epoch": 0.4291657543245019, + "grad_norm": 0.10397602796910944, + "learning_rate": 1.9211741631754228e-05, + "loss": 0.4666, + "step": 735 + }, + { + "epoch": 0.4297496533099774, + "grad_norm": 0.10157654880623344, + "learning_rate": 1.9207768805484838e-05, + "loss": 0.4488, + "step": 736 + }, + { + "epoch": 0.4303335522954529, + "grad_norm": 0.10442492716963263, + "learning_rate": 1.9203786405715984e-05, + "loss": 0.4374, + "step": 737 + }, + { + "epoch": 0.4309174512809284, + "grad_norm": 0.09491166868996115, + "learning_rate": 1.9199794436588244e-05, + "loss": 0.459, + "step": 738 + }, + { + "epoch": 0.4315013502664039, + "grad_norm": 0.09697070669621846, + "learning_rate": 1.9195792902252148e-05, + "loss": 0.4602, + "step": 739 + }, + { + "epoch": 0.4320852492518794, + "grad_norm": 0.09580292345539558, + "learning_rate": 1.9191781806868172e-05, + "loss": 0.4042, + "step": 740 + }, + { + "epoch": 0.4326691482373549, + "grad_norm": 0.10478926722822438, + "learning_rate": 1.918776115460673e-05, + "loss": 0.4733, + "step": 741 + }, + { + "epoch": 0.4332530472228304, + "grad_norm": 0.09521309242328896, + "learning_rate": 1.9183730949648173e-05, + "loss": 0.4539, + "step": 742 + }, + { + "epoch": 0.433836946208306, + "grad_norm": 0.09934889659646114, + "learning_rate": 1.9179691196182782e-05, + "loss": 0.4365, + "step": 743 + }, + { + "epoch": 0.4344208451937815, + "grad_norm": 0.09526955026157252, + "learning_rate": 1.917564189841078e-05, + "loss": 0.424, + "step": 744 + }, + { + "epoch": 0.435004744179257, + "grad_norm": 0.09665847531023757, + "learning_rate": 1.9171583060542288e-05, + "loss": 0.4059, + "step": 745 + }, + { + "epoch": 0.4355886431647325, + "grad_norm": 0.09128068320586188, + "learning_rate": 1.916751468679737e-05, + "loss": 0.4009, + "step": 746 + }, + { + "epoch": 0.436172542150208, + "grad_norm": 0.10266081664365868, + "learning_rate": 1.9163436781405992e-05, + "loss": 0.4477, + "step": 747 + }, + { + "epoch": 0.4367564411356835, + "grad_norm": 0.09485085147489529, + "learning_rate": 1.915934934860803e-05, + "loss": 0.3975, + "step": 748 + }, + { + "epoch": 0.437340340121159, + "grad_norm": 0.09268599026618018, + "learning_rate": 1.915525239265327e-05, + "loss": 0.4297, + "step": 749 + }, + { + "epoch": 0.4379242391066345, + "grad_norm": 0.0955940890234051, + "learning_rate": 1.91511459178014e-05, + "loss": 0.4079, + "step": 750 + }, + { + "epoch": 0.4385081380921101, + "grad_norm": 0.0999066344135719, + "learning_rate": 
1.9147029928322002e-05, + "loss": 0.4602, + "step": 751 + }, + { + "epoch": 0.4390920370775856, + "grad_norm": 0.108840179231072, + "learning_rate": 1.9142904428494554e-05, + "loss": 0.4596, + "step": 752 + }, + { + "epoch": 0.4396759360630611, + "grad_norm": 0.09450134539811168, + "learning_rate": 1.9138769422608413e-05, + "loss": 0.4076, + "step": 753 + }, + { + "epoch": 0.4402598350485366, + "grad_norm": 0.09725152650972217, + "learning_rate": 1.9134624914962835e-05, + "loss": 0.4768, + "step": 754 + }, + { + "epoch": 0.4408437340340121, + "grad_norm": 0.09904196751235762, + "learning_rate": 1.9130470909866943e-05, + "loss": 0.4515, + "step": 755 + }, + { + "epoch": 0.4414276330194876, + "grad_norm": 0.09754395257490471, + "learning_rate": 1.9126307411639736e-05, + "loss": 0.4917, + "step": 756 + }, + { + "epoch": 0.4420115320049631, + "grad_norm": 0.09860352056716974, + "learning_rate": 1.912213442461009e-05, + "loss": 0.4353, + "step": 757 + }, + { + "epoch": 0.44259543099043863, + "grad_norm": 0.09351758172934979, + "learning_rate": 1.9117951953116737e-05, + "loss": 0.4567, + "step": 758 + }, + { + "epoch": 0.4431793299759142, + "grad_norm": 0.09339342513240347, + "learning_rate": 1.911376000150828e-05, + "loss": 0.4865, + "step": 759 + }, + { + "epoch": 0.4437632289613897, + "grad_norm": 0.09857546531315137, + "learning_rate": 1.9109558574143173e-05, + "loss": 0.4667, + "step": 760 + }, + { + "epoch": 0.4443471279468652, + "grad_norm": 0.09456625837571232, + "learning_rate": 1.9105347675389723e-05, + "loss": 0.4572, + "step": 761 + }, + { + "epoch": 0.4449310269323407, + "grad_norm": 0.09781427079978897, + "learning_rate": 1.9101127309626083e-05, + "loss": 0.4521, + "step": 762 + }, + { + "epoch": 0.4455149259178162, + "grad_norm": 0.10351096056630595, + "learning_rate": 1.909689748124025e-05, + "loss": 0.4317, + "step": 763 + }, + { + "epoch": 0.4460988249032917, + "grad_norm": 0.09069666931275955, + "learning_rate": 1.9092658194630065e-05, + "loss": 0.4425, + "step": 764 + }, + { + "epoch": 0.44668272388876723, + "grad_norm": 0.09930501640724729, + "learning_rate": 1.9088409454203196e-05, + "loss": 0.4525, + "step": 765 + }, + { + "epoch": 0.44726662287424274, + "grad_norm": 0.09500969914942309, + "learning_rate": 1.908415126437714e-05, + "loss": 0.4867, + "step": 766 + }, + { + "epoch": 0.44785052185971824, + "grad_norm": 0.09830280650537278, + "learning_rate": 1.9079883629579224e-05, + "loss": 0.4448, + "step": 767 + }, + { + "epoch": 0.4484344208451938, + "grad_norm": 0.10539902733210592, + "learning_rate": 1.9075606554246594e-05, + "loss": 0.5181, + "step": 768 + }, + { + "epoch": 0.4490183198306693, + "grad_norm": 0.08726744385978374, + "learning_rate": 1.9071320042826206e-05, + "loss": 0.4384, + "step": 769 + }, + { + "epoch": 0.4496022188161448, + "grad_norm": 0.08824860026681157, + "learning_rate": 1.9067024099774828e-05, + "loss": 0.4203, + "step": 770 + }, + { + "epoch": 0.4501861178016203, + "grad_norm": 0.09306389581325199, + "learning_rate": 1.9062718729559048e-05, + "loss": 0.4541, + "step": 771 + }, + { + "epoch": 0.45077001678709583, + "grad_norm": 0.09541178654048854, + "learning_rate": 1.9058403936655235e-05, + "loss": 0.4733, + "step": 772 + }, + { + "epoch": 0.45135391577257133, + "grad_norm": 0.09168184341027187, + "learning_rate": 1.9054079725549565e-05, + "loss": 0.4618, + "step": 773 + }, + { + "epoch": 0.45193781475804684, + "grad_norm": 0.09831086017469631, + "learning_rate": 1.9049746100738012e-05, + "loss": 0.4994, + "step": 774 + }, + { + "epoch": 
0.45252171374352235, + "grad_norm": 0.09910512012020897, + "learning_rate": 1.9045403066726325e-05, + "loss": 0.5232, + "step": 775 + }, + { + "epoch": 0.4531056127289979, + "grad_norm": 0.10107850752068585, + "learning_rate": 1.904105062803005e-05, + "loss": 0.5046, + "step": 776 + }, + { + "epoch": 0.4536895117144734, + "grad_norm": 0.11248055006576568, + "learning_rate": 1.9036688789174496e-05, + "loss": 0.4835, + "step": 777 + }, + { + "epoch": 0.4542734106999489, + "grad_norm": 0.10274280324682473, + "learning_rate": 1.9032317554694756e-05, + "loss": 0.4692, + "step": 778 + }, + { + "epoch": 0.4548573096854244, + "grad_norm": 0.0990597993917765, + "learning_rate": 1.9027936929135688e-05, + "loss": 0.4626, + "step": 779 + }, + { + "epoch": 0.45544120867089993, + "grad_norm": 0.0863769981161385, + "learning_rate": 1.9023546917051917e-05, + "loss": 0.4037, + "step": 780 + }, + { + "epoch": 0.45602510765637544, + "grad_norm": 0.10098434652813407, + "learning_rate": 1.901914752300783e-05, + "loss": 0.4624, + "step": 781 + }, + { + "epoch": 0.45660900664185095, + "grad_norm": 0.10303601537163731, + "learning_rate": 1.9014738751577552e-05, + "loss": 0.4223, + "step": 782 + }, + { + "epoch": 0.45719290562732645, + "grad_norm": 0.09606279938348049, + "learning_rate": 1.901032060734498e-05, + "loss": 0.4288, + "step": 783 + }, + { + "epoch": 0.457776804612802, + "grad_norm": 0.09331468009129633, + "learning_rate": 1.900589309490374e-05, + "loss": 0.4236, + "step": 784 + }, + { + "epoch": 0.4583607035982775, + "grad_norm": 0.09128091180221701, + "learning_rate": 1.9001456218857207e-05, + "loss": 0.4488, + "step": 785 + }, + { + "epoch": 0.458944602583753, + "grad_norm": 0.10964172278733782, + "learning_rate": 1.899700998381849e-05, + "loss": 0.4482, + "step": 786 + }, + { + "epoch": 0.45952850156922853, + "grad_norm": 0.10145108447945543, + "learning_rate": 1.899255439441043e-05, + "loss": 0.5295, + "step": 787 + }, + { + "epoch": 0.46011240055470404, + "grad_norm": 0.10667656069915707, + "learning_rate": 1.8988089455265585e-05, + "loss": 0.4595, + "step": 788 + }, + { + "epoch": 0.46069629954017954, + "grad_norm": 0.09316394917203134, + "learning_rate": 1.898361517102624e-05, + "loss": 0.422, + "step": 789 + }, + { + "epoch": 0.46128019852565505, + "grad_norm": 0.09205649606114237, + "learning_rate": 1.8979131546344404e-05, + "loss": 0.4428, + "step": 790 + }, + { + "epoch": 0.46186409751113056, + "grad_norm": 0.10973649097427049, + "learning_rate": 1.8974638585881787e-05, + "loss": 0.4328, + "step": 791 + }, + { + "epoch": 0.46244799649660606, + "grad_norm": 0.09702581545370792, + "learning_rate": 1.8970136294309805e-05, + "loss": 0.4277, + "step": 792 + }, + { + "epoch": 0.4630318954820816, + "grad_norm": 0.0908639843975569, + "learning_rate": 1.896562467630959e-05, + "loss": 0.4671, + "step": 793 + }, + { + "epoch": 0.46361579446755713, + "grad_norm": 0.10401738168718568, + "learning_rate": 1.896110373657195e-05, + "loss": 0.4286, + "step": 794 + }, + { + "epoch": 0.46419969345303264, + "grad_norm": 0.09576436261440777, + "learning_rate": 1.89565734797974e-05, + "loss": 0.4304, + "step": 795 + }, + { + "epoch": 0.46478359243850814, + "grad_norm": 0.10520625686783411, + "learning_rate": 1.895203391069613e-05, + "loss": 0.5034, + "step": 796 + }, + { + "epoch": 0.46536749142398365, + "grad_norm": 0.09725166900757243, + "learning_rate": 1.8947485033988034e-05, + "loss": 0.5036, + "step": 797 + }, + { + "epoch": 0.46595139040945915, + "grad_norm": 0.09588856314473586, + "learning_rate": 
1.894292685440266e-05, + "loss": 0.47, + "step": 798 + }, + { + "epoch": 0.46653528939493466, + "grad_norm": 0.08783262599617325, + "learning_rate": 1.893835937667924e-05, + "loss": 0.4314, + "step": 799 + }, + { + "epoch": 0.46711918838041017, + "grad_norm": 0.0989983091517398, + "learning_rate": 1.8933782605566672e-05, + "loss": 0.442, + "step": 800 + }, + { + "epoch": 0.46770308736588573, + "grad_norm": 0.09986990503908583, + "learning_rate": 1.8929196545823512e-05, + "loss": 0.4551, + "step": 801 + }, + { + "epoch": 0.46828698635136123, + "grad_norm": 0.08639402620932013, + "learning_rate": 1.8924601202217977e-05, + "loss": 0.4424, + "step": 802 + }, + { + "epoch": 0.46887088533683674, + "grad_norm": 0.09328657057227002, + "learning_rate": 1.8919996579527943e-05, + "loss": 0.4229, + "step": 803 + }, + { + "epoch": 0.46945478432231225, + "grad_norm": 0.0996999194207526, + "learning_rate": 1.891538268254092e-05, + "loss": 0.4655, + "step": 804 + }, + { + "epoch": 0.47003868330778775, + "grad_norm": 0.10139776524902651, + "learning_rate": 1.8910759516054074e-05, + "loss": 0.4429, + "step": 805 + }, + { + "epoch": 0.47062258229326326, + "grad_norm": 0.08729940755179776, + "learning_rate": 1.8906127084874198e-05, + "loss": 0.4128, + "step": 806 + }, + { + "epoch": 0.47120648127873876, + "grad_norm": 0.11019483509843539, + "learning_rate": 1.8901485393817724e-05, + "loss": 0.4109, + "step": 807 + }, + { + "epoch": 0.47179038026421427, + "grad_norm": 0.09595661886534966, + "learning_rate": 1.889683444771071e-05, + "loss": 0.4162, + "step": 808 + }, + { + "epoch": 0.4723742792496898, + "grad_norm": 0.10914351625202208, + "learning_rate": 1.889217425138884e-05, + "loss": 0.4994, + "step": 809 + }, + { + "epoch": 0.47295817823516534, + "grad_norm": 0.09199812817845891, + "learning_rate": 1.8887504809697405e-05, + "loss": 0.4323, + "step": 810 + }, + { + "epoch": 0.47354207722064084, + "grad_norm": 0.09493748090415777, + "learning_rate": 1.888282612749132e-05, + "loss": 0.4471, + "step": 811 + }, + { + "epoch": 0.47412597620611635, + "grad_norm": 0.10049256829520699, + "learning_rate": 1.8878138209635107e-05, + "loss": 0.4103, + "step": 812 + }, + { + "epoch": 0.47470987519159186, + "grad_norm": 0.08586050294583081, + "learning_rate": 1.887344106100288e-05, + "loss": 0.4263, + "step": 813 + }, + { + "epoch": 0.47529377417706736, + "grad_norm": 0.09594398276522334, + "learning_rate": 1.886873468647836e-05, + "loss": 0.4009, + "step": 814 + }, + { + "epoch": 0.47587767316254287, + "grad_norm": 0.09473995617452265, + "learning_rate": 1.8864019090954865e-05, + "loss": 0.4322, + "step": 815 + }, + { + "epoch": 0.4764615721480184, + "grad_norm": 0.09092151394473023, + "learning_rate": 1.8859294279335285e-05, + "loss": 0.3982, + "step": 816 + }, + { + "epoch": 0.4770454711334939, + "grad_norm": 0.08794154229363474, + "learning_rate": 1.8854560256532098e-05, + "loss": 0.4095, + "step": 817 + }, + { + "epoch": 0.47762937011896944, + "grad_norm": 0.10038720798021021, + "learning_rate": 1.884981702746737e-05, + "loss": 0.4844, + "step": 818 + }, + { + "epoch": 0.47821326910444495, + "grad_norm": 0.10502662115201077, + "learning_rate": 1.8845064597072723e-05, + "loss": 0.4748, + "step": 819 + }, + { + "epoch": 0.47879716808992046, + "grad_norm": 0.08936674951098275, + "learning_rate": 1.884030297028936e-05, + "loss": 0.412, + "step": 820 + }, + { + "epoch": 0.47938106707539596, + "grad_norm": 0.09565542779038012, + "learning_rate": 1.8835532152068025e-05, + "loss": 0.4413, + "step": 821 + }, + { + "epoch": 
0.47996496606087147, + "grad_norm": 0.09409742372758004, + "learning_rate": 1.8830752147369047e-05, + "loss": 0.4466, + "step": 822 + }, + { + "epoch": 0.480548865046347, + "grad_norm": 0.09292940897064067, + "learning_rate": 1.8825962961162284e-05, + "loss": 0.4317, + "step": 823 + }, + { + "epoch": 0.4811327640318225, + "grad_norm": 0.0978081546786304, + "learning_rate": 1.8821164598427148e-05, + "loss": 0.4344, + "step": 824 + }, + { + "epoch": 0.481716663017298, + "grad_norm": 0.09492272965047195, + "learning_rate": 1.8816357064152596e-05, + "loss": 0.4457, + "step": 825 + }, + { + "epoch": 0.48230056200277355, + "grad_norm": 0.09962596144744926, + "learning_rate": 1.8811540363337107e-05, + "loss": 0.4241, + "step": 826 + }, + { + "epoch": 0.48288446098824905, + "grad_norm": 0.09325491585637402, + "learning_rate": 1.880671450098871e-05, + "loss": 0.4647, + "step": 827 + }, + { + "epoch": 0.48346835997372456, + "grad_norm": 0.09390164933958882, + "learning_rate": 1.880187948212495e-05, + "loss": 0.4615, + "step": 828 + }, + { + "epoch": 0.48405225895920007, + "grad_norm": 0.09700968353064397, + "learning_rate": 1.8797035311772884e-05, + "loss": 0.4503, + "step": 829 + }, + { + "epoch": 0.48463615794467557, + "grad_norm": 0.0947314877984535, + "learning_rate": 1.8792181994969095e-05, + "loss": 0.5247, + "step": 830 + }, + { + "epoch": 0.4852200569301511, + "grad_norm": 0.09108793271116974, + "learning_rate": 1.8787319536759677e-05, + "loss": 0.4342, + "step": 831 + }, + { + "epoch": 0.4858039559156266, + "grad_norm": 0.10424302038632871, + "learning_rate": 1.878244794220022e-05, + "loss": 0.4685, + "step": 832 + }, + { + "epoch": 0.4863878549011021, + "grad_norm": 0.08739832377571537, + "learning_rate": 1.8777567216355814e-05, + "loss": 0.4509, + "step": 833 + }, + { + "epoch": 0.4869717538865776, + "grad_norm": 0.09469309839438637, + "learning_rate": 1.8772677364301052e-05, + "loss": 0.4163, + "step": 834 + }, + { + "epoch": 0.48755565287205316, + "grad_norm": 0.0969524111151948, + "learning_rate": 1.8767778391120008e-05, + "loss": 0.4612, + "step": 835 + }, + { + "epoch": 0.48813955185752866, + "grad_norm": 0.09919332520042368, + "learning_rate": 1.876287030190624e-05, + "loss": 0.4365, + "step": 836 + }, + { + "epoch": 0.48872345084300417, + "grad_norm": 0.09558178633461559, + "learning_rate": 1.8757953101762786e-05, + "loss": 0.4628, + "step": 837 + }, + { + "epoch": 0.4893073498284797, + "grad_norm": 0.09188574314722465, + "learning_rate": 1.8753026795802158e-05, + "loss": 0.4172, + "step": 838 + }, + { + "epoch": 0.4898912488139552, + "grad_norm": 0.08944150796585457, + "learning_rate": 1.8748091389146336e-05, + "loss": 0.4561, + "step": 839 + }, + { + "epoch": 0.4904751477994307, + "grad_norm": 0.092647283822532, + "learning_rate": 1.8743146886926755e-05, + "loss": 0.4682, + "step": 840 + }, + { + "epoch": 0.4910590467849062, + "grad_norm": 0.09479368487136343, + "learning_rate": 1.8738193294284312e-05, + "loss": 0.4868, + "step": 841 + }, + { + "epoch": 0.4916429457703817, + "grad_norm": 0.10067915412662097, + "learning_rate": 1.873323061636936e-05, + "loss": 0.448, + "step": 842 + }, + { + "epoch": 0.49222684475585726, + "grad_norm": 0.10217056355758754, + "learning_rate": 1.8728258858341684e-05, + "loss": 0.4551, + "step": 843 + }, + { + "epoch": 0.49281074374133277, + "grad_norm": 0.08777078897945947, + "learning_rate": 1.872327802537053e-05, + "loss": 0.3954, + "step": 844 + }, + { + "epoch": 0.4933946427268083, + "grad_norm": 0.08951786314423488, + "learning_rate": 
1.8718288122634566e-05, + "loss": 0.3754, + "step": 845 + }, + { + "epoch": 0.4939785417122838, + "grad_norm": 0.10835266223695356, + "learning_rate": 1.8713289155321888e-05, + "loss": 0.441, + "step": 846 + }, + { + "epoch": 0.4945624406977593, + "grad_norm": 0.08863517861824605, + "learning_rate": 1.8708281128630023e-05, + "loss": 0.4052, + "step": 847 + }, + { + "epoch": 0.4951463396832348, + "grad_norm": 0.09538672588663484, + "learning_rate": 1.870326404776592e-05, + "loss": 0.4853, + "step": 848 + }, + { + "epoch": 0.4957302386687103, + "grad_norm": 0.10508205007328283, + "learning_rate": 1.869823791794593e-05, + "loss": 0.4458, + "step": 849 + }, + { + "epoch": 0.4963141376541858, + "grad_norm": 0.09217583907139751, + "learning_rate": 1.869320274439583e-05, + "loss": 0.4327, + "step": 850 + }, + { + "epoch": 0.4968980366396613, + "grad_norm": 0.09742589692727338, + "learning_rate": 1.8688158532350775e-05, + "loss": 0.4794, + "step": 851 + }, + { + "epoch": 0.4974819356251369, + "grad_norm": 0.09830267021456901, + "learning_rate": 1.8683105287055344e-05, + "loss": 0.4665, + "step": 852 + }, + { + "epoch": 0.4980658346106124, + "grad_norm": 0.09304090842138145, + "learning_rate": 1.8678043013763493e-05, + "loss": 0.4524, + "step": 853 + }, + { + "epoch": 0.4986497335960879, + "grad_norm": 0.0998780399501032, + "learning_rate": 1.8672971717738565e-05, + "loss": 0.4283, + "step": 854 + }, + { + "epoch": 0.4992336325815634, + "grad_norm": 0.09871843624787562, + "learning_rate": 1.866789140425329e-05, + "loss": 0.4414, + "step": 855 + }, + { + "epoch": 0.4998175315670389, + "grad_norm": 0.09812237483231633, + "learning_rate": 1.866280207858977e-05, + "loss": 0.4652, + "step": 856 + }, + { + "epoch": 0.5004014305525144, + "grad_norm": 0.0894014548843526, + "learning_rate": 1.865770374603948e-05, + "loss": 0.4363, + "step": 857 + }, + { + "epoch": 0.5009853295379899, + "grad_norm": 0.09383480200830926, + "learning_rate": 1.865259641190325e-05, + "loss": 0.4209, + "step": 858 + }, + { + "epoch": 0.5015692285234654, + "grad_norm": 0.09395991016473568, + "learning_rate": 1.864748008149128e-05, + "loss": 0.4558, + "step": 859 + }, + { + "epoch": 0.5021531275089409, + "grad_norm": 0.09306038910751134, + "learning_rate": 1.8642354760123122e-05, + "loss": 0.4272, + "step": 860 + }, + { + "epoch": 0.5027370264944164, + "grad_norm": 0.09291112046450577, + "learning_rate": 1.8637220453127675e-05, + "loss": 0.4206, + "step": 861 + }, + { + "epoch": 0.5033209254798919, + "grad_norm": 0.0950121715969986, + "learning_rate": 1.8632077165843174e-05, + "loss": 0.4829, + "step": 862 + }, + { + "epoch": 0.5039048244653674, + "grad_norm": 0.09311282837534227, + "learning_rate": 1.86269249036172e-05, + "loss": 0.3982, + "step": 863 + }, + { + "epoch": 0.5044887234508431, + "grad_norm": 0.09684473748106996, + "learning_rate": 1.8621763671806663e-05, + "loss": 0.4514, + "step": 864 + }, + { + "epoch": 0.5050726224363186, + "grad_norm": 0.09168294598881577, + "learning_rate": 1.8616593475777795e-05, + "loss": 0.4683, + "step": 865 + }, + { + "epoch": 0.5056565214217941, + "grad_norm": 0.09396052328996955, + "learning_rate": 1.8611414320906155e-05, + "loss": 0.4635, + "step": 866 + }, + { + "epoch": 0.5062404204072696, + "grad_norm": 0.09398323067821267, + "learning_rate": 1.8606226212576612e-05, + "loss": 0.4492, + "step": 867 + }, + { + "epoch": 0.5068243193927451, + "grad_norm": 0.0986539793001123, + "learning_rate": 1.860102915618334e-05, + "loss": 0.4157, + "step": 868 + }, + { + "epoch": 0.5074082183782206, + 
"grad_norm": 0.09216328742482369, + "learning_rate": 1.8595823157129828e-05, + "loss": 0.4501, + "step": 869 + }, + { + "epoch": 0.5079921173636961, + "grad_norm": 0.08990220419948014, + "learning_rate": 1.8590608220828855e-05, + "loss": 0.4793, + "step": 870 + }, + { + "epoch": 0.5085760163491716, + "grad_norm": 0.0890322235187184, + "learning_rate": 1.8585384352702486e-05, + "loss": 0.4582, + "step": 871 + }, + { + "epoch": 0.5091599153346471, + "grad_norm": 0.08971247017594339, + "learning_rate": 1.8580151558182093e-05, + "loss": 0.416, + "step": 872 + }, + { + "epoch": 0.5097438143201226, + "grad_norm": 0.10329395181564563, + "learning_rate": 1.85749098427083e-05, + "loss": 0.5178, + "step": 873 + }, + { + "epoch": 0.5103277133055981, + "grad_norm": 0.09251641752580829, + "learning_rate": 1.856965921173104e-05, + "loss": 0.4371, + "step": 874 + }, + { + "epoch": 0.5109116122910736, + "grad_norm": 0.09691671971111523, + "learning_rate": 1.8564399670709482e-05, + "loss": 0.413, + "step": 875 + }, + { + "epoch": 0.5114955112765491, + "grad_norm": 0.0946128227766421, + "learning_rate": 1.8559131225112085e-05, + "loss": 0.4484, + "step": 876 + }, + { + "epoch": 0.5120794102620246, + "grad_norm": 0.10068933012258086, + "learning_rate": 1.8553853880416555e-05, + "loss": 0.4325, + "step": 877 + }, + { + "epoch": 0.5126633092475001, + "grad_norm": 0.10174874033959455, + "learning_rate": 1.8548567642109847e-05, + "loss": 0.4135, + "step": 878 + }, + { + "epoch": 0.5132472082329756, + "grad_norm": 0.09637203481248677, + "learning_rate": 1.8543272515688172e-05, + "loss": 0.4322, + "step": 879 + }, + { + "epoch": 0.5138311072184512, + "grad_norm": 0.1085211799009884, + "learning_rate": 1.8537968506656976e-05, + "loss": 0.4182, + "step": 880 + }, + { + "epoch": 0.5144150062039268, + "grad_norm": 0.0883541702695197, + "learning_rate": 1.8532655620530943e-05, + "loss": 0.4178, + "step": 881 + }, + { + "epoch": 0.5149989051894023, + "grad_norm": 0.09594198370080147, + "learning_rate": 1.8527333862833986e-05, + "loss": 0.443, + "step": 882 + }, + { + "epoch": 0.5155828041748778, + "grad_norm": 0.11559309326568142, + "learning_rate": 1.852200323909924e-05, + "loss": 0.5149, + "step": 883 + }, + { + "epoch": 0.5161667031603533, + "grad_norm": 0.09940779772291125, + "learning_rate": 1.851666375486906e-05, + "loss": 0.4443, + "step": 884 + }, + { + "epoch": 0.5167506021458288, + "grad_norm": 0.09640186075990573, + "learning_rate": 1.8511315415695013e-05, + "loss": 0.4805, + "step": 885 + }, + { + "epoch": 0.5173345011313043, + "grad_norm": 0.10391848429363157, + "learning_rate": 1.8505958227137875e-05, + "loss": 0.4284, + "step": 886 + }, + { + "epoch": 0.5179184001167798, + "grad_norm": 0.10264742504985584, + "learning_rate": 1.8500592194767625e-05, + "loss": 0.3861, + "step": 887 + }, + { + "epoch": 0.5185022991022553, + "grad_norm": 0.09195989684971762, + "learning_rate": 1.8495217324163428e-05, + "loss": 0.4475, + "step": 888 + }, + { + "epoch": 0.5190861980877308, + "grad_norm": 0.09171936819965204, + "learning_rate": 1.8489833620913644e-05, + "loss": 0.3863, + "step": 889 + }, + { + "epoch": 0.5196700970732063, + "grad_norm": 0.10765154824517782, + "learning_rate": 1.848444109061581e-05, + "loss": 0.4417, + "step": 890 + }, + { + "epoch": 0.5202539960586818, + "grad_norm": 0.10156051843533291, + "learning_rate": 1.847903973887666e-05, + "loss": 0.5237, + "step": 891 + }, + { + "epoch": 0.5208378950441573, + "grad_norm": 0.08789742839500561, + "learning_rate": 1.8473629571312073e-05, + "loss": 0.4425, 
+ "step": 892 + }, + { + "epoch": 0.5214217940296328, + "grad_norm": 0.09868925186168293, + "learning_rate": 1.8468210593547114e-05, + "loss": 0.5198, + "step": 893 + }, + { + "epoch": 0.5220056930151084, + "grad_norm": 0.10792775235207878, + "learning_rate": 1.8462782811216e-05, + "loss": 0.4504, + "step": 894 + }, + { + "epoch": 0.5225895920005839, + "grad_norm": 0.09321962851542848, + "learning_rate": 1.8457346229962106e-05, + "loss": 0.46, + "step": 895 + }, + { + "epoch": 0.5231734909860594, + "grad_norm": 0.09799367392850776, + "learning_rate": 1.845190085543795e-05, + "loss": 0.4473, + "step": 896 + }, + { + "epoch": 0.5237573899715349, + "grad_norm": 0.1035309144024498, + "learning_rate": 1.8446446693305194e-05, + "loss": 0.4839, + "step": 897 + }, + { + "epoch": 0.5243412889570105, + "grad_norm": 0.09555828583416338, + "learning_rate": 1.8440983749234647e-05, + "loss": 0.4452, + "step": 898 + }, + { + "epoch": 0.524925187942486, + "grad_norm": 0.08898770247296253, + "learning_rate": 1.8435512028906232e-05, + "loss": 0.4284, + "step": 899 + }, + { + "epoch": 0.5255090869279615, + "grad_norm": 0.101394241076554, + "learning_rate": 1.8430031538009005e-05, + "loss": 0.4836, + "step": 900 + }, + { + "epoch": 0.526092985913437, + "grad_norm": 0.09903464838669773, + "learning_rate": 1.8424542282241144e-05, + "loss": 0.4719, + "step": 901 + }, + { + "epoch": 0.5266768848989125, + "grad_norm": 0.10665270294247976, + "learning_rate": 1.841904426730994e-05, + "loss": 0.4746, + "step": 902 + }, + { + "epoch": 0.527260783884388, + "grad_norm": 0.09295494144019728, + "learning_rate": 1.8413537498931778e-05, + "loss": 0.4312, + "step": 903 + }, + { + "epoch": 0.5278446828698635, + "grad_norm": 0.09387289582592735, + "learning_rate": 1.840802198283216e-05, + "loss": 0.4724, + "step": 904 + }, + { + "epoch": 0.528428581855339, + "grad_norm": 0.09737652017065004, + "learning_rate": 1.840249772474568e-05, + "loss": 0.4473, + "step": 905 + }, + { + "epoch": 0.5290124808408145, + "grad_norm": 0.08520653011805002, + "learning_rate": 1.8396964730416014e-05, + "loss": 0.4057, + "step": 906 + }, + { + "epoch": 0.52959637982629, + "grad_norm": 0.0886228219166744, + "learning_rate": 1.8391423005595928e-05, + "loss": 0.4474, + "step": 907 + }, + { + "epoch": 0.5301802788117655, + "grad_norm": 0.08572397208069563, + "learning_rate": 1.8385872556047263e-05, + "loss": 0.418, + "step": 908 + }, + { + "epoch": 0.530764177797241, + "grad_norm": 0.08676847212474795, + "learning_rate": 1.8380313387540928e-05, + "loss": 0.4319, + "step": 909 + }, + { + "epoch": 0.5313480767827166, + "grad_norm": 0.09436905277512657, + "learning_rate": 1.8374745505856904e-05, + "loss": 0.497, + "step": 910 + }, + { + "epoch": 0.5319319757681921, + "grad_norm": 0.09630929313091194, + "learning_rate": 1.836916891678423e-05, + "loss": 0.4715, + "step": 911 + }, + { + "epoch": 0.5325158747536676, + "grad_norm": 0.08819563249112496, + "learning_rate": 1.836358362612099e-05, + "loss": 0.4433, + "step": 912 + }, + { + "epoch": 0.5330997737391431, + "grad_norm": 0.09454788949095562, + "learning_rate": 1.8357989639674324e-05, + "loss": 0.4474, + "step": 913 + }, + { + "epoch": 0.5336836727246187, + "grad_norm": 0.10095262807108653, + "learning_rate": 1.835238696326041e-05, + "loss": 0.4452, + "step": 914 + }, + { + "epoch": 0.5342675717100942, + "grad_norm": 0.09843298763645125, + "learning_rate": 1.8346775602704464e-05, + "loss": 0.4598, + "step": 915 + }, + { + "epoch": 0.5348514706955697, + "grad_norm": 0.09776005188657978, + 
"learning_rate": 1.8341155563840726e-05, + "loss": 0.4916, + "step": 916 + }, + { + "epoch": 0.5354353696810452, + "grad_norm": 0.10296063608760868, + "learning_rate": 1.833552685251246e-05, + "loss": 0.4246, + "step": 917 + }, + { + "epoch": 0.5360192686665207, + "grad_norm": 0.09262410402296078, + "learning_rate": 1.8329889474571952e-05, + "loss": 0.4136, + "step": 918 + }, + { + "epoch": 0.5366031676519962, + "grad_norm": 0.08695029397624086, + "learning_rate": 1.83242434358805e-05, + "loss": 0.4115, + "step": 919 + }, + { + "epoch": 0.5371870666374717, + "grad_norm": 0.1045086785161924, + "learning_rate": 1.8318588742308387e-05, + "loss": 0.4472, + "step": 920 + }, + { + "epoch": 0.5377709656229472, + "grad_norm": 0.09789402286681481, + "learning_rate": 1.8312925399734923e-05, + "loss": 0.4761, + "step": 921 + }, + { + "epoch": 0.5383548646084227, + "grad_norm": 0.09671300651217099, + "learning_rate": 1.8307253414048395e-05, + "loss": 0.4579, + "step": 922 + }, + { + "epoch": 0.5389387635938983, + "grad_norm": 0.10251436493752156, + "learning_rate": 1.8301572791146077e-05, + "loss": 0.4498, + "step": 923 + }, + { + "epoch": 0.5395226625793738, + "grad_norm": 0.09325975796522497, + "learning_rate": 1.8295883536934228e-05, + "loss": 0.4384, + "step": 924 + }, + { + "epoch": 0.5401065615648493, + "grad_norm": 0.08838810195093905, + "learning_rate": 1.8290185657328073e-05, + "loss": 0.4925, + "step": 925 + }, + { + "epoch": 0.5406904605503248, + "grad_norm": 0.09433343110016502, + "learning_rate": 1.8284479158251813e-05, + "loss": 0.4299, + "step": 926 + }, + { + "epoch": 0.5412743595358003, + "grad_norm": 0.09233149842332497, + "learning_rate": 1.827876404563861e-05, + "loss": 0.4146, + "step": 927 + }, + { + "epoch": 0.5418582585212758, + "grad_norm": 0.09005366024688333, + "learning_rate": 1.8273040325430575e-05, + "loss": 0.4382, + "step": 928 + }, + { + "epoch": 0.5424421575067513, + "grad_norm": 0.09060804734202627, + "learning_rate": 1.8267308003578774e-05, + "loss": 0.4159, + "step": 929 + }, + { + "epoch": 0.5430260564922268, + "grad_norm": 0.09480055885323524, + "learning_rate": 1.826156708604322e-05, + "loss": 0.4166, + "step": 930 + }, + { + "epoch": 0.5436099554777024, + "grad_norm": 0.08404018697315614, + "learning_rate": 1.8255817578792858e-05, + "loss": 0.3696, + "step": 931 + }, + { + "epoch": 0.5441938544631779, + "grad_norm": 0.09326093699737327, + "learning_rate": 1.825005948780556e-05, + "loss": 0.4336, + "step": 932 + }, + { + "epoch": 0.5447777534486534, + "grad_norm": 0.08851471141468747, + "learning_rate": 1.824429281906813e-05, + "loss": 0.4574, + "step": 933 + }, + { + "epoch": 0.5453616524341289, + "grad_norm": 0.0938496281712297, + "learning_rate": 1.8238517578576288e-05, + "loss": 0.427, + "step": 934 + }, + { + "epoch": 0.5459455514196044, + "grad_norm": 0.09473660148188785, + "learning_rate": 1.8232733772334663e-05, + "loss": 0.4032, + "step": 935 + }, + { + "epoch": 0.5465294504050799, + "grad_norm": 0.08937319155438347, + "learning_rate": 1.8226941406356794e-05, + "loss": 0.4433, + "step": 936 + }, + { + "epoch": 0.5471133493905554, + "grad_norm": 0.09781014323382994, + "learning_rate": 1.8221140486665125e-05, + "loss": 0.4657, + "step": 937 + }, + { + "epoch": 0.547697248376031, + "grad_norm": 0.0935563862727287, + "learning_rate": 1.8215331019290975e-05, + "loss": 0.4404, + "step": 938 + }, + { + "epoch": 0.5482811473615065, + "grad_norm": 0.10752705428222993, + "learning_rate": 1.8209513010274572e-05, + "loss": 0.4717, + "step": 939 + }, + { + "epoch": 
0.548865046346982, + "grad_norm": 0.09197306173138178, + "learning_rate": 1.820368646566501e-05, + "loss": 0.4714, + "step": 940 + }, + { + "epoch": 0.5494489453324575, + "grad_norm": 0.09291270132257323, + "learning_rate": 1.8197851391520265e-05, + "loss": 0.4816, + "step": 941 + }, + { + "epoch": 0.550032844317933, + "grad_norm": 0.09610441348951204, + "learning_rate": 1.8192007793907177e-05, + "loss": 0.4494, + "step": 942 + }, + { + "epoch": 0.5506167433034085, + "grad_norm": 0.1033218946003561, + "learning_rate": 1.8186155678901457e-05, + "loss": 0.4759, + "step": 943 + }, + { + "epoch": 0.551200642288884, + "grad_norm": 0.08919050475365557, + "learning_rate": 1.8180295052587653e-05, + "loss": 0.4179, + "step": 944 + }, + { + "epoch": 0.5517845412743595, + "grad_norm": 0.08667645303763409, + "learning_rate": 1.8174425921059183e-05, + "loss": 0.4023, + "step": 945 + }, + { + "epoch": 0.552368440259835, + "grad_norm": 0.0887195469020464, + "learning_rate": 1.81685482904183e-05, + "loss": 0.4474, + "step": 946 + }, + { + "epoch": 0.5529523392453105, + "grad_norm": 0.10106778434913423, + "learning_rate": 1.8162662166776085e-05, + "loss": 0.4835, + "step": 947 + }, + { + "epoch": 0.5535362382307861, + "grad_norm": 0.09685501708157448, + "learning_rate": 1.8156767556252464e-05, + "loss": 0.4129, + "step": 948 + }, + { + "epoch": 0.5541201372162616, + "grad_norm": 0.092676685307622, + "learning_rate": 1.815086446497618e-05, + "loss": 0.3985, + "step": 949 + }, + { + "epoch": 0.5547040362017371, + "grad_norm": 0.0919707063401775, + "learning_rate": 1.8144952899084787e-05, + "loss": 0.4225, + "step": 950 + }, + { + "epoch": 0.5552879351872126, + "grad_norm": 0.107061234908121, + "learning_rate": 1.8139032864724665e-05, + "loss": 0.554, + "step": 951 + }, + { + "epoch": 0.5558718341726882, + "grad_norm": 0.10044775621225685, + "learning_rate": 1.813310436805099e-05, + "loss": 0.4987, + "step": 952 + }, + { + "epoch": 0.5564557331581637, + "grad_norm": 0.08867589266875235, + "learning_rate": 1.8127167415227736e-05, + "loss": 0.3935, + "step": 953 + }, + { + "epoch": 0.5570396321436392, + "grad_norm": 0.09437021473292602, + "learning_rate": 1.8121222012427666e-05, + "loss": 0.4578, + "step": 954 + }, + { + "epoch": 0.5576235311291147, + "grad_norm": 0.0887645543803271, + "learning_rate": 1.8115268165832336e-05, + "loss": 0.4467, + "step": 955 + }, + { + "epoch": 0.5582074301145902, + "grad_norm": 0.09897137092624282, + "learning_rate": 1.810930588163208e-05, + "loss": 0.4386, + "step": 956 + }, + { + "epoch": 0.5587913291000657, + "grad_norm": 0.10499292376316478, + "learning_rate": 1.8103335166026002e-05, + "loss": 0.4513, + "step": 957 + }, + { + "epoch": 0.5593752280855412, + "grad_norm": 0.08463326810425786, + "learning_rate": 1.8097356025221975e-05, + "loss": 0.4158, + "step": 958 + }, + { + "epoch": 0.5599591270710167, + "grad_norm": 0.09551114619473185, + "learning_rate": 1.8091368465436626e-05, + "loss": 0.4694, + "step": 959 + }, + { + "epoch": 0.5605430260564922, + "grad_norm": 0.09495323054801798, + "learning_rate": 1.8085372492895338e-05, + "loss": 0.4325, + "step": 960 + }, + { + "epoch": 0.5611269250419677, + "grad_norm": 0.09594048139789328, + "learning_rate": 1.807936811383225e-05, + "loss": 0.4707, + "step": 961 + }, + { + "epoch": 0.5617108240274432, + "grad_norm": 0.09247203135669485, + "learning_rate": 1.8073355334490227e-05, + "loss": 0.471, + "step": 962 + }, + { + "epoch": 0.5622947230129187, + "grad_norm": 0.08633199812874957, + "learning_rate": 1.806733416112088e-05, + 
"loss": 0.4212, + "step": 963 + }, + { + "epoch": 0.5628786219983942, + "grad_norm": 0.09251922288342454, + "learning_rate": 1.8061304599984537e-05, + "loss": 0.4255, + "step": 964 + }, + { + "epoch": 0.5634625209838698, + "grad_norm": 0.09923380580896518, + "learning_rate": 1.8055266657350256e-05, + "loss": 0.4776, + "step": 965 + }, + { + "epoch": 0.5640464199693453, + "grad_norm": 0.09287667447435466, + "learning_rate": 1.8049220339495797e-05, + "loss": 0.4169, + "step": 966 + }, + { + "epoch": 0.5646303189548209, + "grad_norm": 0.08887458968062588, + "learning_rate": 1.804316565270765e-05, + "loss": 0.401, + "step": 967 + }, + { + "epoch": 0.5652142179402964, + "grad_norm": 0.10262789593286849, + "learning_rate": 1.8037102603280984e-05, + "loss": 0.4671, + "step": 968 + }, + { + "epoch": 0.5657981169257719, + "grad_norm": 0.08682160745152644, + "learning_rate": 1.8031031197519673e-05, + "loss": 0.4237, + "step": 969 + }, + { + "epoch": 0.5663820159112474, + "grad_norm": 0.09579424164575238, + "learning_rate": 1.8024951441736275e-05, + "loss": 0.4151, + "step": 970 + }, + { + "epoch": 0.5669659148967229, + "grad_norm": 0.09111583572344727, + "learning_rate": 1.8018863342252038e-05, + "loss": 0.4591, + "step": 971 + }, + { + "epoch": 0.5675498138821984, + "grad_norm": 0.08188673386735772, + "learning_rate": 1.801276690539688e-05, + "loss": 0.3766, + "step": 972 + }, + { + "epoch": 0.5681337128676739, + "grad_norm": 0.09274174941322617, + "learning_rate": 1.800666213750938e-05, + "loss": 0.4978, + "step": 973 + }, + { + "epoch": 0.5687176118531494, + "grad_norm": 0.08152205889003643, + "learning_rate": 1.800054904493679e-05, + "loss": 0.3865, + "step": 974 + }, + { + "epoch": 0.5693015108386249, + "grad_norm": 0.09438032160462145, + "learning_rate": 1.7994427634035016e-05, + "loss": 0.4302, + "step": 975 + }, + { + "epoch": 0.5698854098241004, + "grad_norm": 0.09506885598483442, + "learning_rate": 1.7988297911168602e-05, + "loss": 0.4005, + "step": 976 + }, + { + "epoch": 0.5704693088095759, + "grad_norm": 0.08562616866793664, + "learning_rate": 1.798215988271075e-05, + "loss": 0.4032, + "step": 977 + }, + { + "epoch": 0.5710532077950514, + "grad_norm": 0.08977560050501772, + "learning_rate": 1.7976013555043286e-05, + "loss": 0.4106, + "step": 978 + }, + { + "epoch": 0.5716371067805269, + "grad_norm": 0.08973808106323344, + "learning_rate": 1.7969858934556676e-05, + "loss": 0.416, + "step": 979 + }, + { + "epoch": 0.5722210057660024, + "grad_norm": 0.09181287258714561, + "learning_rate": 1.796369602764999e-05, + "loss": 0.448, + "step": 980 + }, + { + "epoch": 0.5728049047514779, + "grad_norm": 0.09119847002314112, + "learning_rate": 1.7957524840730925e-05, + "loss": 0.4386, + "step": 981 + }, + { + "epoch": 0.5733888037369536, + "grad_norm": 0.09521167710462089, + "learning_rate": 1.7951345380215795e-05, + "loss": 0.4219, + "step": 982 + }, + { + "epoch": 0.5739727027224291, + "grad_norm": 0.0989402430788166, + "learning_rate": 1.79451576525295e-05, + "loss": 0.4656, + "step": 983 + }, + { + "epoch": 0.5745566017079046, + "grad_norm": 0.1005920705472971, + "learning_rate": 1.7938961664105546e-05, + "loss": 0.4977, + "step": 984 + }, + { + "epoch": 0.5751405006933801, + "grad_norm": 0.0860530739132807, + "learning_rate": 1.793275742138602e-05, + "loss": 0.4432, + "step": 985 + }, + { + "epoch": 0.5757243996788556, + "grad_norm": 0.09251576507555148, + "learning_rate": 1.7926544930821608e-05, + "loss": 0.4155, + "step": 986 + }, + { + "epoch": 0.5763082986643311, + "grad_norm": 
0.10395767337240244, + "learning_rate": 1.7920324198871546e-05, + "loss": 0.5087, + "step": 987 + }, + { + "epoch": 0.5768921976498066, + "grad_norm": 0.09178056251217832, + "learning_rate": 1.791409523200366e-05, + "loss": 0.4408, + "step": 988 + }, + { + "epoch": 0.5774760966352821, + "grad_norm": 0.09819491674637948, + "learning_rate": 1.7907858036694325e-05, + "loss": 0.4266, + "step": 989 + }, + { + "epoch": 0.5780599956207576, + "grad_norm": 0.09534488990140029, + "learning_rate": 1.790161261942848e-05, + "loss": 0.4662, + "step": 990 + }, + { + "epoch": 0.5786438946062331, + "grad_norm": 0.09005150190638692, + "learning_rate": 1.7895358986699607e-05, + "loss": 0.4212, + "step": 991 + }, + { + "epoch": 0.5792277935917086, + "grad_norm": 0.0946658441311317, + "learning_rate": 1.7889097145009736e-05, + "loss": 0.4781, + "step": 992 + }, + { + "epoch": 0.5798116925771841, + "grad_norm": 0.09154622471280416, + "learning_rate": 1.788282710086942e-05, + "loss": 0.4372, + "step": 993 + }, + { + "epoch": 0.5803955915626596, + "grad_norm": 0.10118040549187084, + "learning_rate": 1.7876548860797756e-05, + "loss": 0.4397, + "step": 994 + }, + { + "epoch": 0.5809794905481351, + "grad_norm": 0.08688754038581874, + "learning_rate": 1.787026243132235e-05, + "loss": 0.406, + "step": 995 + }, + { + "epoch": 0.5815633895336106, + "grad_norm": 0.09182395292703004, + "learning_rate": 1.7863967818979328e-05, + "loss": 0.45, + "step": 996 + }, + { + "epoch": 0.5821472885190861, + "grad_norm": 0.09275192214871388, + "learning_rate": 1.785766503031332e-05, + "loss": 0.4612, + "step": 997 + }, + { + "epoch": 0.5827311875045618, + "grad_norm": 0.10268377108046198, + "learning_rate": 1.785135407187747e-05, + "loss": 0.436, + "step": 998 + }, + { + "epoch": 0.5833150864900373, + "grad_norm": 0.09550855543744852, + "learning_rate": 1.7845034950233394e-05, + "loss": 0.4953, + "step": 999 + }, + { + "epoch": 0.5838989854755128, + "grad_norm": 0.08729501527735156, + "learning_rate": 1.7838707671951215e-05, + "loss": 0.4198, + "step": 1000 + }, + { + "epoch": 0.5844828844609883, + "grad_norm": 0.09300738367985945, + "learning_rate": 1.7832372243609527e-05, + "loss": 0.4647, + "step": 1001 + }, + { + "epoch": 0.5850667834464638, + "grad_norm": 0.09854866779069027, + "learning_rate": 1.78260286717954e-05, + "loss": 0.4896, + "step": 1002 + }, + { + "epoch": 0.5856506824319393, + "grad_norm": 0.0924779092815472, + "learning_rate": 1.781967696310437e-05, + "loss": 0.4367, + "step": 1003 + }, + { + "epoch": 0.5862345814174148, + "grad_norm": 0.0967889057070036, + "learning_rate": 1.7813317124140445e-05, + "loss": 0.4456, + "step": 1004 + }, + { + "epoch": 0.5868184804028903, + "grad_norm": 0.0897600662432182, + "learning_rate": 1.7806949161516062e-05, + "loss": 0.4477, + "step": 1005 + }, + { + "epoch": 0.5874023793883658, + "grad_norm": 0.10048027290038623, + "learning_rate": 1.7800573081852124e-05, + "loss": 0.4281, + "step": 1006 + }, + { + "epoch": 0.5879862783738413, + "grad_norm": 0.09316365328827741, + "learning_rate": 1.7794188891777964e-05, + "loss": 0.4217, + "step": 1007 + }, + { + "epoch": 0.5885701773593168, + "grad_norm": 0.08477431725287295, + "learning_rate": 1.7787796597931354e-05, + "loss": 0.3876, + "step": 1008 + }, + { + "epoch": 0.5891540763447923, + "grad_norm": 0.09201264827776867, + "learning_rate": 1.7781396206958485e-05, + "loss": 0.4295, + "step": 1009 + }, + { + "epoch": 0.5897379753302678, + "grad_norm": 0.10068814575254426, + "learning_rate": 1.7774987725513975e-05, + "loss": 0.4245, + 
"step": 1010 + }, + { + "epoch": 0.5903218743157433, + "grad_norm": 0.09866396452718652, + "learning_rate": 1.7768571160260845e-05, + "loss": 0.421, + "step": 1011 + }, + { + "epoch": 0.5909057733012189, + "grad_norm": 0.11546115045819015, + "learning_rate": 1.7762146517870526e-05, + "loss": 0.4997, + "step": 1012 + }, + { + "epoch": 0.5914896722866944, + "grad_norm": 0.09448609353498712, + "learning_rate": 1.7755713805022846e-05, + "loss": 0.4729, + "step": 1013 + }, + { + "epoch": 0.5920735712721699, + "grad_norm": 0.09668634129415987, + "learning_rate": 1.7749273028406025e-05, + "loss": 0.3923, + "step": 1014 + }, + { + "epoch": 0.5926574702576455, + "grad_norm": 0.09767768585418288, + "learning_rate": 1.7742824194716664e-05, + "loss": 0.4744, + "step": 1015 + }, + { + "epoch": 0.593241369243121, + "grad_norm": 0.08479542894008153, + "learning_rate": 1.7736367310659743e-05, + "loss": 0.4361, + "step": 1016 + }, + { + "epoch": 0.5938252682285965, + "grad_norm": 0.09113235502544996, + "learning_rate": 1.7729902382948617e-05, + "loss": 0.4191, + "step": 1017 + }, + { + "epoch": 0.594409167214072, + "grad_norm": 0.10464245862198668, + "learning_rate": 1.772342941830499e-05, + "loss": 0.4469, + "step": 1018 + }, + { + "epoch": 0.5949930661995475, + "grad_norm": 0.0884530211249895, + "learning_rate": 1.771694842345894e-05, + "loss": 0.4382, + "step": 1019 + }, + { + "epoch": 0.595576965185023, + "grad_norm": 0.10491796090893739, + "learning_rate": 1.771045940514888e-05, + "loss": 0.4312, + "step": 1020 + }, + { + "epoch": 0.5961608641704985, + "grad_norm": 0.092341394630284, + "learning_rate": 1.7703962370121575e-05, + "loss": 0.4211, + "step": 1021 + }, + { + "epoch": 0.596744763155974, + "grad_norm": 0.10088333257954919, + "learning_rate": 1.769745732513212e-05, + "loss": 0.4854, + "step": 1022 + }, + { + "epoch": 0.5973286621414495, + "grad_norm": 0.1012389035317393, + "learning_rate": 1.7690944276943935e-05, + "loss": 0.4173, + "step": 1023 + }, + { + "epoch": 0.597912561126925, + "grad_norm": 0.10003942009393488, + "learning_rate": 1.768442323232877e-05, + "loss": 0.4627, + "step": 1024 + }, + { + "epoch": 0.5984964601124005, + "grad_norm": 0.08762942285011394, + "learning_rate": 1.767789419806668e-05, + "loss": 0.4522, + "step": 1025 + }, + { + "epoch": 0.599080359097876, + "grad_norm": 0.09173039483607474, + "learning_rate": 1.7671357180946035e-05, + "loss": 0.4068, + "step": 1026 + }, + { + "epoch": 0.5996642580833516, + "grad_norm": 0.0911305308295463, + "learning_rate": 1.76648121877635e-05, + "loss": 0.4177, + "step": 1027 + }, + { + "epoch": 0.6002481570688271, + "grad_norm": 0.08582861913236603, + "learning_rate": 1.7658259225324036e-05, + "loss": 0.4294, + "step": 1028 + }, + { + "epoch": 0.6008320560543026, + "grad_norm": 0.10422934936302668, + "learning_rate": 1.765169830044088e-05, + "loss": 0.4736, + "step": 1029 + }, + { + "epoch": 0.6014159550397781, + "grad_norm": 0.10030182317270714, + "learning_rate": 1.7645129419935565e-05, + "loss": 0.4081, + "step": 1030 + }, + { + "epoch": 0.6019998540252536, + "grad_norm": 0.0855729853323428, + "learning_rate": 1.763855259063788e-05, + "loss": 0.3963, + "step": 1031 + }, + { + "epoch": 0.6025837530107292, + "grad_norm": 0.0910340708499548, + "learning_rate": 1.7631967819385883e-05, + "loss": 0.4128, + "step": 1032 + }, + { + "epoch": 0.6031676519962047, + "grad_norm": 0.08845103989201504, + "learning_rate": 1.76253751130259e-05, + "loss": 0.4092, + "step": 1033 + }, + { + "epoch": 0.6037515509816802, + "grad_norm": 
0.09271583451087083, + "learning_rate": 1.761877447841249e-05, + "loss": 0.4469, + "step": 1034 + }, + { + "epoch": 0.6043354499671557, + "grad_norm": 0.09447872832066602, + "learning_rate": 1.7612165922408463e-05, + "loss": 0.4338, + "step": 1035 + }, + { + "epoch": 0.6049193489526312, + "grad_norm": 0.09511521677601188, + "learning_rate": 1.760554945188487e-05, + "loss": 0.4538, + "step": 1036 + }, + { + "epoch": 0.6055032479381067, + "grad_norm": 0.09729963866854865, + "learning_rate": 1.759892507372099e-05, + "loss": 0.4611, + "step": 1037 + }, + { + "epoch": 0.6060871469235822, + "grad_norm": 0.09424240387400062, + "learning_rate": 1.759229279480431e-05, + "loss": 0.4072, + "step": 1038 + }, + { + "epoch": 0.6066710459090577, + "grad_norm": 0.09416339446884527, + "learning_rate": 1.758565262203055e-05, + "loss": 0.4593, + "step": 1039 + }, + { + "epoch": 0.6072549448945332, + "grad_norm": 0.08975844325047844, + "learning_rate": 1.757900456230362e-05, + "loss": 0.4291, + "step": 1040 + }, + { + "epoch": 0.6078388438800088, + "grad_norm": 0.09004685899208928, + "learning_rate": 1.757234862253565e-05, + "loss": 0.4383, + "step": 1041 + }, + { + "epoch": 0.6084227428654843, + "grad_norm": 0.09511844575341194, + "learning_rate": 1.7565684809646946e-05, + "loss": 0.4421, + "step": 1042 + }, + { + "epoch": 0.6090066418509598, + "grad_norm": 0.0919737175309237, + "learning_rate": 1.7559013130566003e-05, + "loss": 0.4022, + "step": 1043 + }, + { + "epoch": 0.6095905408364353, + "grad_norm": 0.09649971398460819, + "learning_rate": 1.755233359222951e-05, + "loss": 0.4236, + "step": 1044 + }, + { + "epoch": 0.6101744398219108, + "grad_norm": 0.09158564494451435, + "learning_rate": 1.7545646201582304e-05, + "loss": 0.4217, + "step": 1045 + }, + { + "epoch": 0.6107583388073863, + "grad_norm": 0.09511816034188841, + "learning_rate": 1.75389509655774e-05, + "loss": 0.4344, + "step": 1046 + }, + { + "epoch": 0.6113422377928618, + "grad_norm": 0.09113291591159842, + "learning_rate": 1.7532247891175968e-05, + "loss": 0.4141, + "step": 1047 + }, + { + "epoch": 0.6119261367783373, + "grad_norm": 0.09286963558417387, + "learning_rate": 1.7525536985347328e-05, + "loss": 0.51, + "step": 1048 + }, + { + "epoch": 0.6125100357638129, + "grad_norm": 0.09986349516606924, + "learning_rate": 1.751881825506894e-05, + "loss": 0.5281, + "step": 1049 + }, + { + "epoch": 0.6130939347492884, + "grad_norm": 0.1004574009258164, + "learning_rate": 1.7512091707326403e-05, + "loss": 0.4238, + "step": 1050 + }, + { + "epoch": 0.6136778337347639, + "grad_norm": 0.0905132175422742, + "learning_rate": 1.750535734911344e-05, + "loss": 0.4118, + "step": 1051 + }, + { + "epoch": 0.6142617327202394, + "grad_norm": 0.08925144937183616, + "learning_rate": 1.7498615187431894e-05, + "loss": 0.4558, + "step": 1052 + }, + { + "epoch": 0.6148456317057149, + "grad_norm": 0.09576091943873984, + "learning_rate": 1.7491865229291733e-05, + "loss": 0.4622, + "step": 1053 + }, + { + "epoch": 0.6154295306911904, + "grad_norm": 0.09617966005930463, + "learning_rate": 1.7485107481711014e-05, + "loss": 0.4903, + "step": 1054 + }, + { + "epoch": 0.616013429676666, + "grad_norm": 0.09074829890428297, + "learning_rate": 1.74783419517159e-05, + "loss": 0.4213, + "step": 1055 + }, + { + "epoch": 0.6165973286621415, + "grad_norm": 0.09160086943055429, + "learning_rate": 1.7471568646340653e-05, + "loss": 0.4028, + "step": 1056 + }, + { + "epoch": 0.617181227647617, + "grad_norm": 0.09668496262755956, + "learning_rate": 1.746478757262761e-05, + "loss": 
0.4786, + "step": 1057 + }, + { + "epoch": 0.6177651266330925, + "grad_norm": 0.09172806226938145, + "learning_rate": 1.7457998737627183e-05, + "loss": 0.464, + "step": 1058 + }, + { + "epoch": 0.618349025618568, + "grad_norm": 0.09555527117311402, + "learning_rate": 1.745120214839786e-05, + "loss": 0.417, + "step": 1059 + }, + { + "epoch": 0.6189329246040435, + "grad_norm": 0.09303359767117025, + "learning_rate": 1.7444397812006194e-05, + "loss": 0.4622, + "step": 1060 + }, + { + "epoch": 0.619516823589519, + "grad_norm": 0.10092929540024043, + "learning_rate": 1.7437585735526785e-05, + "loss": 0.4482, + "step": 1061 + }, + { + "epoch": 0.6201007225749945, + "grad_norm": 0.09522912198016714, + "learning_rate": 1.7430765926042287e-05, + "loss": 0.4866, + "step": 1062 + }, + { + "epoch": 0.62068462156047, + "grad_norm": 0.0929947034334768, + "learning_rate": 1.7423938390643384e-05, + "loss": 0.4421, + "step": 1063 + }, + { + "epoch": 0.6212685205459455, + "grad_norm": 0.09482994413019931, + "learning_rate": 1.7417103136428806e-05, + "loss": 0.4396, + "step": 1064 + }, + { + "epoch": 0.6218524195314211, + "grad_norm": 0.08714984905730136, + "learning_rate": 1.74102601705053e-05, + "loss": 0.4441, + "step": 1065 + }, + { + "epoch": 0.6224363185168966, + "grad_norm": 0.0924792252479541, + "learning_rate": 1.7403409499987633e-05, + "loss": 0.4161, + "step": 1066 + }, + { + "epoch": 0.6230202175023721, + "grad_norm": 0.08980341850239634, + "learning_rate": 1.739655113199858e-05, + "loss": 0.3923, + "step": 1067 + }, + { + "epoch": 0.6236041164878476, + "grad_norm": 0.09460638292227179, + "learning_rate": 1.7389685073668925e-05, + "loss": 0.4247, + "step": 1068 + }, + { + "epoch": 0.6241880154733231, + "grad_norm": 0.08960313595863191, + "learning_rate": 1.7382811332137444e-05, + "loss": 0.3778, + "step": 1069 + }, + { + "epoch": 0.6247719144587986, + "grad_norm": 0.0966818573924528, + "learning_rate": 1.7375929914550906e-05, + "loss": 0.4748, + "step": 1070 + }, + { + "epoch": 0.6253558134442742, + "grad_norm": 0.08503147854315851, + "learning_rate": 1.7369040828064046e-05, + "loss": 0.3997, + "step": 1071 + }, + { + "epoch": 0.6259397124297497, + "grad_norm": 0.08841680174374698, + "learning_rate": 1.73621440798396e-05, + "loss": 0.4397, + "step": 1072 + }, + { + "epoch": 0.6265236114152252, + "grad_norm": 0.0923157605383763, + "learning_rate": 1.7355239677048237e-05, + "loss": 0.4699, + "step": 1073 + }, + { + "epoch": 0.6271075104007007, + "grad_norm": 0.08659566449358148, + "learning_rate": 1.734832762686861e-05, + "loss": 0.4567, + "step": 1074 + }, + { + "epoch": 0.6276914093861762, + "grad_norm": 0.08544771168355118, + "learning_rate": 1.7341407936487316e-05, + "loss": 0.4303, + "step": 1075 + }, + { + "epoch": 0.6282753083716517, + "grad_norm": 0.09484712484096319, + "learning_rate": 1.7334480613098893e-05, + "loss": 0.4632, + "step": 1076 + }, + { + "epoch": 0.6288592073571272, + "grad_norm": 0.0859589741344882, + "learning_rate": 1.7327545663905813e-05, + "loss": 0.391, + "step": 1077 + }, + { + "epoch": 0.6294431063426027, + "grad_norm": 0.08709869297310086, + "learning_rate": 1.7320603096118476e-05, + "loss": 0.4288, + "step": 1078 + }, + { + "epoch": 0.6300270053280782, + "grad_norm": 0.09090981837244683, + "learning_rate": 1.731365291695522e-05, + "loss": 0.4288, + "step": 1079 + }, + { + "epoch": 0.6306109043135537, + "grad_norm": 0.09040325685888743, + "learning_rate": 1.730669513364227e-05, + "loss": 0.4159, + "step": 1080 + }, + { + "epoch": 0.6311948032990292, + "grad_norm": 
0.08778488629604453, + "learning_rate": 1.7299729753413783e-05, + "loss": 0.4643, + "step": 1081 + }, + { + "epoch": 0.6317787022845048, + "grad_norm": 0.08949196250424887, + "learning_rate": 1.7292756783511793e-05, + "loss": 0.4638, + "step": 1082 + }, + { + "epoch": 0.6323626012699803, + "grad_norm": 0.08651513131791888, + "learning_rate": 1.728577623118624e-05, + "loss": 0.399, + "step": 1083 + }, + { + "epoch": 0.6329465002554558, + "grad_norm": 0.09005582045162003, + "learning_rate": 1.7278788103694944e-05, + "loss": 0.3881, + "step": 1084 + }, + { + "epoch": 0.6335303992409314, + "grad_norm": 0.09485288229314275, + "learning_rate": 1.7271792408303593e-05, + "loss": 0.4753, + "step": 1085 + }, + { + "epoch": 0.6341142982264069, + "grad_norm": 0.08900741072775287, + "learning_rate": 1.7264789152285754e-05, + "loss": 0.3987, + "step": 1086 + }, + { + "epoch": 0.6346981972118824, + "grad_norm": 0.09116732520961288, + "learning_rate": 1.7257778342922853e-05, + "loss": 0.4427, + "step": 1087 + }, + { + "epoch": 0.6352820961973579, + "grad_norm": 0.09303822050070606, + "learning_rate": 1.7250759987504165e-05, + "loss": 0.4269, + "step": 1088 + }, + { + "epoch": 0.6358659951828334, + "grad_norm": 0.08844625897320774, + "learning_rate": 1.724373409332681e-05, + "loss": 0.4497, + "step": 1089 + }, + { + "epoch": 0.6364498941683089, + "grad_norm": 0.0918542186800055, + "learning_rate": 1.7236700667695754e-05, + "loss": 0.4711, + "step": 1090 + }, + { + "epoch": 0.6370337931537844, + "grad_norm": 0.09282032557399952, + "learning_rate": 1.7229659717923784e-05, + "loss": 0.4746, + "step": 1091 + }, + { + "epoch": 0.6376176921392599, + "grad_norm": 0.09189651103418493, + "learning_rate": 1.722261125133152e-05, + "loss": 0.4257, + "step": 1092 + }, + { + "epoch": 0.6382015911247354, + "grad_norm": 0.09328906266929847, + "learning_rate": 1.721555527524739e-05, + "loss": 0.4103, + "step": 1093 + }, + { + "epoch": 0.6387854901102109, + "grad_norm": 0.08939908394267948, + "learning_rate": 1.7208491797007634e-05, + "loss": 0.447, + "step": 1094 + }, + { + "epoch": 0.6393693890956864, + "grad_norm": 0.09039804075326334, + "learning_rate": 1.7201420823956286e-05, + "loss": 0.4331, + "step": 1095 + }, + { + "epoch": 0.6399532880811619, + "grad_norm": 0.09024471481601266, + "learning_rate": 1.719434236344518e-05, + "loss": 0.4143, + "step": 1096 + }, + { + "epoch": 0.6405371870666374, + "grad_norm": 0.09248907230090099, + "learning_rate": 1.7187256422833928e-05, + "loss": 0.4572, + "step": 1097 + }, + { + "epoch": 0.6411210860521129, + "grad_norm": 0.10754471151877978, + "learning_rate": 1.7180163009489924e-05, + "loss": 0.505, + "step": 1098 + }, + { + "epoch": 0.6417049850375885, + "grad_norm": 0.08806311245222992, + "learning_rate": 1.7173062130788337e-05, + "loss": 0.3833, + "step": 1099 + }, + { + "epoch": 0.642288884023064, + "grad_norm": 0.09826677038221497, + "learning_rate": 1.716595379411208e-05, + "loss": 0.494, + "step": 1100 + }, + { + "epoch": 0.6428727830085396, + "grad_norm": 0.09409963899915887, + "learning_rate": 1.715883800685184e-05, + "loss": 0.4153, + "step": 1101 + }, + { + "epoch": 0.6434566819940151, + "grad_norm": 0.09037735567839263, + "learning_rate": 1.7151714776406034e-05, + "loss": 0.41, + "step": 1102 + }, + { + "epoch": 0.6440405809794906, + "grad_norm": 0.09422038489223211, + "learning_rate": 1.7144584110180834e-05, + "loss": 0.4456, + "step": 1103 + }, + { + "epoch": 0.6446244799649661, + "grad_norm": 0.09755953584139025, + "learning_rate": 1.7137446015590128e-05, + "loss": 
0.435, + "step": 1104 + }, + { + "epoch": 0.6452083789504416, + "grad_norm": 0.08816393569404342, + "learning_rate": 1.7130300500055537e-05, + "loss": 0.4021, + "step": 1105 + }, + { + "epoch": 0.6457922779359171, + "grad_norm": 0.10527599946439106, + "learning_rate": 1.7123147571006398e-05, + "loss": 0.4695, + "step": 1106 + }, + { + "epoch": 0.6463761769213926, + "grad_norm": 0.09235535451921445, + "learning_rate": 1.711598723587975e-05, + "loss": 0.4513, + "step": 1107 + }, + { + "epoch": 0.6469600759068681, + "grad_norm": 0.09354321728558446, + "learning_rate": 1.710881950212033e-05, + "loss": 0.4776, + "step": 1108 + }, + { + "epoch": 0.6475439748923436, + "grad_norm": 0.09867934848614315, + "learning_rate": 1.7101644377180586e-05, + "loss": 0.4219, + "step": 1109 + }, + { + "epoch": 0.6481278738778191, + "grad_norm": 0.09555440942956286, + "learning_rate": 1.7094461868520625e-05, + "loss": 0.444, + "step": 1110 + }, + { + "epoch": 0.6487117728632946, + "grad_norm": 0.09569576707931939, + "learning_rate": 1.708727198360825e-05, + "loss": 0.4282, + "step": 1111 + }, + { + "epoch": 0.6492956718487701, + "grad_norm": 0.093120158736734, + "learning_rate": 1.7080074729918918e-05, + "loss": 0.4555, + "step": 1112 + }, + { + "epoch": 0.6498795708342456, + "grad_norm": 0.08753974599742118, + "learning_rate": 1.7072870114935766e-05, + "loss": 0.4196, + "step": 1113 + }, + { + "epoch": 0.6504634698197211, + "grad_norm": 0.09461624569741206, + "learning_rate": 1.7065658146149572e-05, + "loss": 0.3844, + "step": 1114 + }, + { + "epoch": 0.6510473688051966, + "grad_norm": 0.08860515602951763, + "learning_rate": 1.7058438831058763e-05, + "loss": 0.4416, + "step": 1115 + }, + { + "epoch": 0.6516312677906723, + "grad_norm": 0.09152625267014462, + "learning_rate": 1.70512121771694e-05, + "loss": 0.4518, + "step": 1116 + }, + { + "epoch": 0.6522151667761478, + "grad_norm": 0.0940680112965581, + "learning_rate": 1.7043978191995177e-05, + "loss": 0.4292, + "step": 1117 + }, + { + "epoch": 0.6527990657616233, + "grad_norm": 0.10794032267491846, + "learning_rate": 1.7036736883057422e-05, + "loss": 0.4358, + "step": 1118 + }, + { + "epoch": 0.6533829647470988, + "grad_norm": 0.09194057780039627, + "learning_rate": 1.702948825788506e-05, + "loss": 0.4382, + "step": 1119 + }, + { + "epoch": 0.6539668637325743, + "grad_norm": 0.10291805174351656, + "learning_rate": 1.7022232324014628e-05, + "loss": 0.4391, + "step": 1120 + }, + { + "epoch": 0.6545507627180498, + "grad_norm": 0.08855339328582391, + "learning_rate": 1.7014969088990265e-05, + "loss": 0.3773, + "step": 1121 + }, + { + "epoch": 0.6551346617035253, + "grad_norm": 0.08422699153651776, + "learning_rate": 1.7007698560363704e-05, + "loss": 0.3948, + "step": 1122 + }, + { + "epoch": 0.6557185606890008, + "grad_norm": 0.09236446869233887, + "learning_rate": 1.7000420745694256e-05, + "loss": 0.4238, + "step": 1123 + }, + { + "epoch": 0.6563024596744763, + "grad_norm": 0.09946613955873425, + "learning_rate": 1.6993135652548803e-05, + "loss": 0.4161, + "step": 1124 + }, + { + "epoch": 0.6568863586599518, + "grad_norm": 0.09672673613646379, + "learning_rate": 1.6985843288501814e-05, + "loss": 0.4162, + "step": 1125 + }, + { + "epoch": 0.6574702576454273, + "grad_norm": 0.08630257083770781, + "learning_rate": 1.697854366113529e-05, + "loss": 0.3998, + "step": 1126 + }, + { + "epoch": 0.6580541566309028, + "grad_norm": 0.09543728262506625, + "learning_rate": 1.6971236778038806e-05, + "loss": 0.4446, + "step": 1127 + }, + { + "epoch": 0.6586380556163783, + 
"grad_norm": 0.08289152130256408, + "learning_rate": 1.6963922646809475e-05, + "loss": 0.414, + "step": 1128 + }, + { + "epoch": 0.6592219546018538, + "grad_norm": 0.08625703919327876, + "learning_rate": 1.6956601275051933e-05, + "loss": 0.4691, + "step": 1129 + }, + { + "epoch": 0.6598058535873293, + "grad_norm": 0.0894099317026028, + "learning_rate": 1.694927267037837e-05, + "loss": 0.4403, + "step": 1130 + }, + { + "epoch": 0.6603897525728049, + "grad_norm": 0.08706000022427855, + "learning_rate": 1.6941936840408465e-05, + "loss": 0.4143, + "step": 1131 + }, + { + "epoch": 0.6609736515582805, + "grad_norm": 0.09854227380192054, + "learning_rate": 1.6934593792769435e-05, + "loss": 0.4324, + "step": 1132 + }, + { + "epoch": 0.661557550543756, + "grad_norm": 0.09630656743836603, + "learning_rate": 1.6927243535095995e-05, + "loss": 0.487, + "step": 1133 + }, + { + "epoch": 0.6621414495292315, + "grad_norm": 0.08593139890790943, + "learning_rate": 1.691988607503035e-05, + "loss": 0.4304, + "step": 1134 + }, + { + "epoch": 0.662725348514707, + "grad_norm": 0.08987772253820427, + "learning_rate": 1.691252142022219e-05, + "loss": 0.4312, + "step": 1135 + }, + { + "epoch": 0.6633092475001825, + "grad_norm": 0.08997533645928246, + "learning_rate": 1.6905149578328705e-05, + "loss": 0.3874, + "step": 1136 + }, + { + "epoch": 0.663893146485658, + "grad_norm": 0.09613967499616509, + "learning_rate": 1.6897770557014535e-05, + "loss": 0.473, + "step": 1137 + }, + { + "epoch": 0.6644770454711335, + "grad_norm": 0.0968027383667893, + "learning_rate": 1.6890384363951802e-05, + "loss": 0.4056, + "step": 1138 + }, + { + "epoch": 0.665060944456609, + "grad_norm": 0.09340410947672406, + "learning_rate": 1.688299100682007e-05, + "loss": 0.4554, + "step": 1139 + }, + { + "epoch": 0.6656448434420845, + "grad_norm": 0.08955184478155377, + "learning_rate": 1.687559049330636e-05, + "loss": 0.411, + "step": 1140 + }, + { + "epoch": 0.66622874242756, + "grad_norm": 0.09219446005576475, + "learning_rate": 1.686818283110514e-05, + "loss": 0.4462, + "step": 1141 + }, + { + "epoch": 0.6668126414130355, + "grad_norm": 0.09474791276402085, + "learning_rate": 1.6860768027918293e-05, + "loss": 0.4422, + "step": 1142 + }, + { + "epoch": 0.667396540398511, + "grad_norm": 0.08914956158102033, + "learning_rate": 1.6853346091455143e-05, + "loss": 0.391, + "step": 1143 + }, + { + "epoch": 0.6679804393839865, + "grad_norm": 0.08058631022710519, + "learning_rate": 1.684591702943242e-05, + "loss": 0.4228, + "step": 1144 + }, + { + "epoch": 0.668564338369462, + "grad_norm": 0.08819357483690463, + "learning_rate": 1.683848084957427e-05, + "loss": 0.4083, + "step": 1145 + }, + { + "epoch": 0.6691482373549376, + "grad_norm": 0.09193303414954906, + "learning_rate": 1.6831037559612235e-05, + "loss": 0.4143, + "step": 1146 + }, + { + "epoch": 0.6697321363404131, + "grad_norm": 0.08611835632338681, + "learning_rate": 1.682358716728525e-05, + "loss": 0.4265, + "step": 1147 + }, + { + "epoch": 0.6703160353258886, + "grad_norm": 0.08986672931092672, + "learning_rate": 1.681612968033964e-05, + "loss": 0.437, + "step": 1148 + }, + { + "epoch": 0.6708999343113642, + "grad_norm": 0.07973262204429679, + "learning_rate": 1.6808665106529096e-05, + "loss": 0.446, + "step": 1149 + }, + { + "epoch": 0.6714838332968397, + "grad_norm": 0.09092820457709504, + "learning_rate": 1.6801193453614683e-05, + "loss": 0.4464, + "step": 1150 + }, + { + "epoch": 0.6720677322823152, + "grad_norm": 0.08739635508296026, + "learning_rate": 1.679371472936483e-05, + "loss": 
0.4697, + "step": 1151 + }, + { + "epoch": 0.6726516312677907, + "grad_norm": 0.09163663913033902, + "learning_rate": 1.6786228941555318e-05, + "loss": 0.4462, + "step": 1152 + }, + { + "epoch": 0.6732355302532662, + "grad_norm": 0.09400629109279371, + "learning_rate": 1.6778736097969258e-05, + "loss": 0.4795, + "step": 1153 + }, + { + "epoch": 0.6738194292387417, + "grad_norm": 0.08319671795869193, + "learning_rate": 1.6771236206397123e-05, + "loss": 0.4294, + "step": 1154 + }, + { + "epoch": 0.6744033282242172, + "grad_norm": 0.08216176570533429, + "learning_rate": 1.676372927463668e-05, + "loss": 0.4284, + "step": 1155 + }, + { + "epoch": 0.6749872272096927, + "grad_norm": 0.09640874020081243, + "learning_rate": 1.675621531049305e-05, + "loss": 0.4535, + "step": 1156 + }, + { + "epoch": 0.6755711261951682, + "grad_norm": 0.08842909034275206, + "learning_rate": 1.674869432177864e-05, + "loss": 0.4684, + "step": 1157 + }, + { + "epoch": 0.6761550251806437, + "grad_norm": 0.08134191682177092, + "learning_rate": 1.674116631631318e-05, + "loss": 0.4289, + "step": 1158 + }, + { + "epoch": 0.6767389241661192, + "grad_norm": 0.08224859248420505, + "learning_rate": 1.6733631301923678e-05, + "loss": 0.3711, + "step": 1159 + }, + { + "epoch": 0.6773228231515948, + "grad_norm": 0.08570911368639061, + "learning_rate": 1.672608928644444e-05, + "loss": 0.4555, + "step": 1160 + }, + { + "epoch": 0.6779067221370703, + "grad_norm": 0.08638295721330393, + "learning_rate": 1.6718540277717057e-05, + "loss": 0.4214, + "step": 1161 + }, + { + "epoch": 0.6784906211225458, + "grad_norm": 0.09004091631384817, + "learning_rate": 1.671098428359037e-05, + "loss": 0.474, + "step": 1162 + }, + { + "epoch": 0.6790745201080213, + "grad_norm": 0.08986301379592666, + "learning_rate": 1.67034213119205e-05, + "loss": 0.4352, + "step": 1163 + }, + { + "epoch": 0.6796584190934968, + "grad_norm": 0.08979286574025694, + "learning_rate": 1.6695851370570822e-05, + "loss": 0.4392, + "step": 1164 + }, + { + "epoch": 0.6802423180789723, + "grad_norm": 0.08448528259031549, + "learning_rate": 1.6688274467411953e-05, + "loss": 0.3784, + "step": 1165 + }, + { + "epoch": 0.6808262170644479, + "grad_norm": 0.09095550569986459, + "learning_rate": 1.6680690610321747e-05, + "loss": 0.465, + "step": 1166 + }, + { + "epoch": 0.6814101160499234, + "grad_norm": 0.09210929219268231, + "learning_rate": 1.667309980718529e-05, + "loss": 0.4401, + "step": 1167 + }, + { + "epoch": 0.6819940150353989, + "grad_norm": 0.09040512054466067, + "learning_rate": 1.666550206589489e-05, + "loss": 0.4512, + "step": 1168 + }, + { + "epoch": 0.6825779140208744, + "grad_norm": 0.0888337264364902, + "learning_rate": 1.6657897394350073e-05, + "loss": 0.4373, + "step": 1169 + }, + { + "epoch": 0.6831618130063499, + "grad_norm": 0.08726455198905898, + "learning_rate": 1.665028580045756e-05, + "loss": 0.4195, + "step": 1170 + }, + { + "epoch": 0.6837457119918254, + "grad_norm": 0.08923700865174085, + "learning_rate": 1.664266729213128e-05, + "loss": 0.4151, + "step": 1171 + }, + { + "epoch": 0.6843296109773009, + "grad_norm": 0.09459936004671267, + "learning_rate": 1.6635041877292354e-05, + "loss": 0.415, + "step": 1172 + }, + { + "epoch": 0.6849135099627764, + "grad_norm": 0.0937594378954357, + "learning_rate": 1.662740956386906e-05, + "loss": 0.4662, + "step": 1173 + }, + { + "epoch": 0.685497408948252, + "grad_norm": 0.08890291694688138, + "learning_rate": 1.661977035979688e-05, + "loss": 0.4171, + "step": 1174 + }, + { + "epoch": 0.6860813079337275, + "grad_norm": 
0.09096834108186448, + "learning_rate": 1.661212427301844e-05, + "loss": 0.4207, + "step": 1175 + }, + { + "epoch": 0.686665206919203, + "grad_norm": 0.09538130731044497, + "learning_rate": 1.6604471311483526e-05, + "loss": 0.4468, + "step": 1176 + }, + { + "epoch": 0.6872491059046785, + "grad_norm": 0.08716215680584576, + "learning_rate": 1.6596811483149077e-05, + "loss": 0.4088, + "step": 1177 + }, + { + "epoch": 0.687833004890154, + "grad_norm": 0.07604977389100905, + "learning_rate": 1.6589144795979165e-05, + "loss": 0.3727, + "step": 1178 + }, + { + "epoch": 0.6884169038756295, + "grad_norm": 0.08629855699425337, + "learning_rate": 1.6581471257944996e-05, + "loss": 0.3816, + "step": 1179 + }, + { + "epoch": 0.689000802861105, + "grad_norm": 0.09843717946195421, + "learning_rate": 1.6573790877024903e-05, + "loss": 0.4503, + "step": 1180 + }, + { + "epoch": 0.6895847018465805, + "grad_norm": 0.08170737274329572, + "learning_rate": 1.656610366120433e-05, + "loss": 0.4044, + "step": 1181 + }, + { + "epoch": 0.690168600832056, + "grad_norm": 0.09178649349392434, + "learning_rate": 1.6558409618475826e-05, + "loss": 0.4447, + "step": 1182 + }, + { + "epoch": 0.6907524998175316, + "grad_norm": 0.08993879170413502, + "learning_rate": 1.655070875683904e-05, + "loss": 0.4375, + "step": 1183 + }, + { + "epoch": 0.6913363988030071, + "grad_norm": 0.08761544518209897, + "learning_rate": 1.6543001084300703e-05, + "loss": 0.4333, + "step": 1184 + }, + { + "epoch": 0.6919202977884826, + "grad_norm": 0.08753365590260968, + "learning_rate": 1.653528660887465e-05, + "loss": 0.4424, + "step": 1185 + }, + { + "epoch": 0.6925041967739581, + "grad_norm": 0.08783193942183658, + "learning_rate": 1.652756533858176e-05, + "loss": 0.3863, + "step": 1186 + }, + { + "epoch": 0.6930880957594336, + "grad_norm": 0.08717488494583298, + "learning_rate": 1.651983728145e-05, + "loss": 0.4475, + "step": 1187 + }, + { + "epoch": 0.6936719947449091, + "grad_norm": 0.08819289137806886, + "learning_rate": 1.6512102445514376e-05, + "loss": 0.4221, + "step": 1188 + }, + { + "epoch": 0.6942558937303847, + "grad_norm": 0.08819307240033131, + "learning_rate": 1.6504360838816956e-05, + "loss": 0.4191, + "step": 1189 + }, + { + "epoch": 0.6948397927158602, + "grad_norm": 0.0927147357030515, + "learning_rate": 1.6496612469406835e-05, + "loss": 0.4625, + "step": 1190 + }, + { + "epoch": 0.6954236917013357, + "grad_norm": 0.0877477638797805, + "learning_rate": 1.648885734534015e-05, + "loss": 0.4121, + "step": 1191 + }, + { + "epoch": 0.6960075906868112, + "grad_norm": 0.0852483747473004, + "learning_rate": 1.6481095474680062e-05, + "loss": 0.4165, + "step": 1192 + }, + { + "epoch": 0.6965914896722867, + "grad_norm": 0.10578122777688247, + "learning_rate": 1.6473326865496736e-05, + "loss": 0.4337, + "step": 1193 + }, + { + "epoch": 0.6971753886577622, + "grad_norm": 0.0947143924593682, + "learning_rate": 1.6465551525867347e-05, + "loss": 0.4363, + "step": 1194 + }, + { + "epoch": 0.6977592876432377, + "grad_norm": 0.0848918234512545, + "learning_rate": 1.6457769463876078e-05, + "loss": 0.413, + "step": 1195 + }, + { + "epoch": 0.6983431866287132, + "grad_norm": 0.10471604545465725, + "learning_rate": 1.644998068761408e-05, + "loss": 0.472, + "step": 1196 + }, + { + "epoch": 0.6989270856141887, + "grad_norm": 0.08504891596407778, + "learning_rate": 1.6442185205179507e-05, + "loss": 0.4, + "step": 1197 + }, + { + "epoch": 0.6995109845996642, + "grad_norm": 0.09032905745826944, + "learning_rate": 1.6434383024677475e-05, + "loss": 0.419, + 
"step": 1198 + }, + { + "epoch": 0.7000948835851397, + "grad_norm": 0.09426478967538103, + "learning_rate": 1.6426574154220066e-05, + "loss": 0.4673, + "step": 1199 + }, + { + "epoch": 0.7006787825706153, + "grad_norm": 0.08470408831430835, + "learning_rate": 1.6418758601926313e-05, + "loss": 0.4111, + "step": 1200 + }, + { + "epoch": 0.7012626815560908, + "grad_norm": 0.10497814696262345, + "learning_rate": 1.64109363759222e-05, + "loss": 0.4208, + "step": 1201 + }, + { + "epoch": 0.7018465805415663, + "grad_norm": 0.09440212077890532, + "learning_rate": 1.640310748434066e-05, + "loss": 0.4163, + "step": 1202 + }, + { + "epoch": 0.7024304795270419, + "grad_norm": 0.08115280681269577, + "learning_rate": 1.639527193532154e-05, + "loss": 0.4507, + "step": 1203 + }, + { + "epoch": 0.7030143785125174, + "grad_norm": 0.10258708340599992, + "learning_rate": 1.6387429737011612e-05, + "loss": 0.4482, + "step": 1204 + }, + { + "epoch": 0.7035982774979929, + "grad_norm": 0.08247246949003272, + "learning_rate": 1.6379580897564568e-05, + "loss": 0.4165, + "step": 1205 + }, + { + "epoch": 0.7041821764834684, + "grad_norm": 0.08822487541813892, + "learning_rate": 1.6371725425141e-05, + "loss": 0.4165, + "step": 1206 + }, + { + "epoch": 0.7047660754689439, + "grad_norm": 0.09628446107468998, + "learning_rate": 1.6363863327908405e-05, + "loss": 0.48, + "step": 1207 + }, + { + "epoch": 0.7053499744544194, + "grad_norm": 0.09022420267372654, + "learning_rate": 1.6355994614041154e-05, + "loss": 0.4575, + "step": 1208 + }, + { + "epoch": 0.7059338734398949, + "grad_norm": 0.08551722802105703, + "learning_rate": 1.6348119291720504e-05, + "loss": 0.4391, + "step": 1209 + }, + { + "epoch": 0.7065177724253704, + "grad_norm": 0.09581479325655787, + "learning_rate": 1.634023736913459e-05, + "loss": 0.4383, + "step": 1210 + }, + { + "epoch": 0.7071016714108459, + "grad_norm": 0.10075555966408425, + "learning_rate": 1.6332348854478398e-05, + "loss": 0.4734, + "step": 1211 + }, + { + "epoch": 0.7076855703963214, + "grad_norm": 0.0857645015744296, + "learning_rate": 1.6324453755953772e-05, + "loss": 0.4272, + "step": 1212 + }, + { + "epoch": 0.7082694693817969, + "grad_norm": 0.09040367043391744, + "learning_rate": 1.6316552081769404e-05, + "loss": 0.4393, + "step": 1213 + }, + { + "epoch": 0.7088533683672724, + "grad_norm": 0.10229309103612072, + "learning_rate": 1.630864384014083e-05, + "loss": 0.4769, + "step": 1214 + }, + { + "epoch": 0.7094372673527479, + "grad_norm": 0.08933039105571446, + "learning_rate": 1.6300729039290386e-05, + "loss": 0.4159, + "step": 1215 + }, + { + "epoch": 0.7100211663382235, + "grad_norm": 0.08530940071061638, + "learning_rate": 1.6292807687447258e-05, + "loss": 0.4071, + "step": 1216 + }, + { + "epoch": 0.710605065323699, + "grad_norm": 0.08630546249077091, + "learning_rate": 1.6284879792847433e-05, + "loss": 0.3924, + "step": 1217 + }, + { + "epoch": 0.7111889643091746, + "grad_norm": 0.09901760717543366, + "learning_rate": 1.62769453637337e-05, + "loss": 0.4929, + "step": 1218 + }, + { + "epoch": 0.7117728632946501, + "grad_norm": 0.08907217191441917, + "learning_rate": 1.626900440835564e-05, + "loss": 0.4652, + "step": 1219 + }, + { + "epoch": 0.7123567622801256, + "grad_norm": 0.09111013922376361, + "learning_rate": 1.6261056934969626e-05, + "loss": 0.4276, + "step": 1220 + }, + { + "epoch": 0.7129406612656011, + "grad_norm": 0.09263735598097972, + "learning_rate": 1.6253102951838794e-05, + "loss": 0.42, + "step": 1221 + }, + { + "epoch": 0.7135245602510766, + "grad_norm": 
0.08871531865701, + "learning_rate": 1.6245142467233067e-05, + "loss": 0.4206, + "step": 1222 + }, + { + "epoch": 0.7141084592365521, + "grad_norm": 0.08314280282703464, + "learning_rate": 1.6237175489429114e-05, + "loss": 0.4094, + "step": 1223 + }, + { + "epoch": 0.7146923582220276, + "grad_norm": 0.08543186713144597, + "learning_rate": 1.6229202026710356e-05, + "loss": 0.4186, + "step": 1224 + }, + { + "epoch": 0.7152762572075031, + "grad_norm": 0.08464334506486154, + "learning_rate": 1.622122208736697e-05, + "loss": 0.3672, + "step": 1225 + }, + { + "epoch": 0.7158601561929786, + "grad_norm": 0.08475675726446479, + "learning_rate": 1.6213235679695847e-05, + "loss": 0.4211, + "step": 1226 + }, + { + "epoch": 0.7164440551784541, + "grad_norm": 0.08918359508660942, + "learning_rate": 1.620524281200062e-05, + "loss": 0.407, + "step": 1227 + }, + { + "epoch": 0.7170279541639296, + "grad_norm": 0.0841365729437497, + "learning_rate": 1.6197243492591627e-05, + "loss": 0.4286, + "step": 1228 + }, + { + "epoch": 0.7176118531494051, + "grad_norm": 0.09692060314570868, + "learning_rate": 1.618923772978592e-05, + "loss": 0.4462, + "step": 1229 + }, + { + "epoch": 0.7181957521348806, + "grad_norm": 0.07785638307126284, + "learning_rate": 1.618122553190725e-05, + "loss": 0.376, + "step": 1230 + }, + { + "epoch": 0.7187796511203561, + "grad_norm": 0.08970265083624229, + "learning_rate": 1.617320690728606e-05, + "loss": 0.4178, + "step": 1231 + }, + { + "epoch": 0.7193635501058316, + "grad_norm": 0.08833127145548246, + "learning_rate": 1.6165181864259463e-05, + "loss": 0.4489, + "step": 1232 + }, + { + "epoch": 0.7199474490913073, + "grad_norm": 0.08800187500655653, + "learning_rate": 1.6157150411171268e-05, + "loss": 0.4456, + "step": 1233 + }, + { + "epoch": 0.7205313480767828, + "grad_norm": 0.09760789406943039, + "learning_rate": 1.614911255637193e-05, + "loss": 0.4379, + "step": 1234 + }, + { + "epoch": 0.7211152470622583, + "grad_norm": 0.09294331752203233, + "learning_rate": 1.6141068308218565e-05, + "loss": 0.4089, + "step": 1235 + }, + { + "epoch": 0.7216991460477338, + "grad_norm": 0.09496275113432627, + "learning_rate": 1.6133017675074935e-05, + "loss": 0.4433, + "step": 1236 + }, + { + "epoch": 0.7222830450332093, + "grad_norm": 0.08699353112395906, + "learning_rate": 1.6124960665311447e-05, + "loss": 0.4365, + "step": 1237 + }, + { + "epoch": 0.7228669440186848, + "grad_norm": 0.08224555018489191, + "learning_rate": 1.6116897287305132e-05, + "loss": 0.3832, + "step": 1238 + }, + { + "epoch": 0.7234508430041603, + "grad_norm": 0.10353135174510854, + "learning_rate": 1.6108827549439642e-05, + "loss": 0.4914, + "step": 1239 + }, + { + "epoch": 0.7240347419896358, + "grad_norm": 0.090457849592765, + "learning_rate": 1.6100751460105244e-05, + "loss": 0.4444, + "step": 1240 + }, + { + "epoch": 0.7246186409751113, + "grad_norm": 0.08896872389060739, + "learning_rate": 1.6092669027698812e-05, + "loss": 0.4269, + "step": 1241 + }, + { + "epoch": 0.7252025399605868, + "grad_norm": 0.08279284689040584, + "learning_rate": 1.6084580260623805e-05, + "loss": 0.4374, + "step": 1242 + }, + { + "epoch": 0.7257864389460623, + "grad_norm": 0.08745709263717832, + "learning_rate": 1.6076485167290278e-05, + "loss": 0.414, + "step": 1243 + }, + { + "epoch": 0.7263703379315378, + "grad_norm": 0.08457157087848796, + "learning_rate": 1.6068383756114857e-05, + "loss": 0.4338, + "step": 1244 + }, + { + "epoch": 0.7269542369170133, + "grad_norm": 0.08938025460073754, + "learning_rate": 1.606027603552074e-05, + "loss": 
0.4377, + "step": 1245 + }, + { + "epoch": 0.7275381359024888, + "grad_norm": 0.0878335839140125, + "learning_rate": 1.6052162013937688e-05, + "loss": 0.4409, + "step": 1246 + }, + { + "epoch": 0.7281220348879643, + "grad_norm": 0.08844181174520396, + "learning_rate": 1.6044041699802005e-05, + "loss": 0.4291, + "step": 1247 + }, + { + "epoch": 0.7287059338734398, + "grad_norm": 0.08953500055302316, + "learning_rate": 1.6035915101556544e-05, + "loss": 0.4264, + "step": 1248 + }, + { + "epoch": 0.7292898328589154, + "grad_norm": 0.08708263184414979, + "learning_rate": 1.6027782227650696e-05, + "loss": 0.4521, + "step": 1249 + }, + { + "epoch": 0.729873731844391, + "grad_norm": 0.08546823620303289, + "learning_rate": 1.601964308654036e-05, + "loss": 0.429, + "step": 1250 + }, + { + "epoch": 0.7304576308298665, + "grad_norm": 0.0843786323019938, + "learning_rate": 1.601149768668797e-05, + "loss": 0.4207, + "step": 1251 + }, + { + "epoch": 0.731041529815342, + "grad_norm": 0.09140473117865465, + "learning_rate": 1.6003346036562457e-05, + "loss": 0.5135, + "step": 1252 + }, + { + "epoch": 0.7316254288008175, + "grad_norm": 0.098557988521247, + "learning_rate": 1.599518814463925e-05, + "loss": 0.4679, + "step": 1253 + }, + { + "epoch": 0.732209327786293, + "grad_norm": 0.09296224977421759, + "learning_rate": 1.598702401940028e-05, + "loss": 0.4637, + "step": 1254 + }, + { + "epoch": 0.7327932267717685, + "grad_norm": 0.09169957923278857, + "learning_rate": 1.5978853669333938e-05, + "loss": 0.4381, + "step": 1255 + }, + { + "epoch": 0.733377125757244, + "grad_norm": 0.09913620224195051, + "learning_rate": 1.597067710293511e-05, + "loss": 0.4437, + "step": 1256 + }, + { + "epoch": 0.7339610247427195, + "grad_norm": 0.08734547734783811, + "learning_rate": 1.5962494328705123e-05, + "loss": 0.3621, + "step": 1257 + }, + { + "epoch": 0.734544923728195, + "grad_norm": 0.09382538622897467, + "learning_rate": 1.5954305355151775e-05, + "loss": 0.435, + "step": 1258 + }, + { + "epoch": 0.7351288227136705, + "grad_norm": 0.09001907530979908, + "learning_rate": 1.5946110190789306e-05, + "loss": 0.4655, + "step": 1259 + }, + { + "epoch": 0.735712721699146, + "grad_norm": 0.09874749117020298, + "learning_rate": 1.5937908844138386e-05, + "loss": 0.4783, + "step": 1260 + }, + { + "epoch": 0.7362966206846215, + "grad_norm": 0.08978967354479706, + "learning_rate": 1.5929701323726113e-05, + "loss": 0.4436, + "step": 1261 + }, + { + "epoch": 0.736880519670097, + "grad_norm": 0.08570848594128827, + "learning_rate": 1.5921487638086024e-05, + "loss": 0.449, + "step": 1262 + }, + { + "epoch": 0.7374644186555726, + "grad_norm": 0.08270192553326293, + "learning_rate": 1.5913267795758037e-05, + "loss": 0.4268, + "step": 1263 + }, + { + "epoch": 0.7380483176410481, + "grad_norm": 0.09281566394683134, + "learning_rate": 1.590504180528849e-05, + "loss": 0.4339, + "step": 1264 + }, + { + "epoch": 0.7386322166265236, + "grad_norm": 0.08806286647334757, + "learning_rate": 1.5896809675230106e-05, + "loss": 0.4131, + "step": 1265 + }, + { + "epoch": 0.7392161156119991, + "grad_norm": 0.08060328288592548, + "learning_rate": 1.5888571414141997e-05, + "loss": 0.3893, + "step": 1266 + }, + { + "epoch": 0.7398000145974747, + "grad_norm": 0.08970809081221105, + "learning_rate": 1.588032703058964e-05, + "loss": 0.4256, + "step": 1267 + }, + { + "epoch": 0.7403839135829502, + "grad_norm": 0.08727080153795602, + "learning_rate": 1.587207653314489e-05, + "loss": 0.4231, + "step": 1268 + }, + { + "epoch": 0.7409678125684257, + "grad_norm": 
0.09016985547966007, + "learning_rate": 1.586381993038595e-05, + "loss": 0.4436, + "step": 1269 + }, + { + "epoch": 0.7415517115539012, + "grad_norm": 0.08749433172156437, + "learning_rate": 1.5855557230897373e-05, + "loss": 0.429, + "step": 1270 + }, + { + "epoch": 0.7421356105393767, + "grad_norm": 0.11324204763571995, + "learning_rate": 1.584728844327005e-05, + "loss": 0.4346, + "step": 1271 + }, + { + "epoch": 0.7427195095248522, + "grad_norm": 0.09001375981980829, + "learning_rate": 1.5839013576101206e-05, + "loss": 0.431, + "step": 1272 + }, + { + "epoch": 0.7433034085103277, + "grad_norm": 0.08363708962705085, + "learning_rate": 1.5830732637994382e-05, + "loss": 0.4206, + "step": 1273 + }, + { + "epoch": 0.7438873074958032, + "grad_norm": 0.0864935087729451, + "learning_rate": 1.5822445637559435e-05, + "loss": 0.4392, + "step": 1274 + }, + { + "epoch": 0.7444712064812787, + "grad_norm": 0.0875151650779983, + "learning_rate": 1.581415258341252e-05, + "loss": 0.4168, + "step": 1275 + }, + { + "epoch": 0.7450551054667542, + "grad_norm": 0.08350899228306129, + "learning_rate": 1.5805853484176093e-05, + "loss": 0.4223, + "step": 1276 + }, + { + "epoch": 0.7456390044522297, + "grad_norm": 0.08867158399903902, + "learning_rate": 1.5797548348478893e-05, + "loss": 0.4343, + "step": 1277 + }, + { + "epoch": 0.7462229034377053, + "grad_norm": 0.08787331260262844, + "learning_rate": 1.578923718495593e-05, + "loss": 0.4472, + "step": 1278 + }, + { + "epoch": 0.7468068024231808, + "grad_norm": 0.09492878200301828, + "learning_rate": 1.5780920002248484e-05, + "loss": 0.4596, + "step": 1279 + }, + { + "epoch": 0.7473907014086563, + "grad_norm": 0.08021225574192688, + "learning_rate": 1.5772596809004103e-05, + "loss": 0.4121, + "step": 1280 + }, + { + "epoch": 0.7479746003941318, + "grad_norm": 0.09033049742837673, + "learning_rate": 1.5764267613876565e-05, + "loss": 0.4242, + "step": 1281 + }, + { + "epoch": 0.7485584993796073, + "grad_norm": 0.08486274982331889, + "learning_rate": 1.5755932425525907e-05, + "loss": 0.4316, + "step": 1282 + }, + { + "epoch": 0.7491423983650829, + "grad_norm": 0.08720314079511025, + "learning_rate": 1.574759125261838e-05, + "loss": 0.4294, + "step": 1283 + }, + { + "epoch": 0.7497262973505584, + "grad_norm": 0.0862571845226195, + "learning_rate": 1.573924410382648e-05, + "loss": 0.4394, + "step": 1284 + }, + { + "epoch": 0.7503101963360339, + "grad_norm": 0.09328802337685337, + "learning_rate": 1.5730890987828893e-05, + "loss": 0.4556, + "step": 1285 + }, + { + "epoch": 0.7508940953215094, + "grad_norm": 0.09296372323282448, + "learning_rate": 1.5722531913310523e-05, + "loss": 0.455, + "step": 1286 + }, + { + "epoch": 0.7514779943069849, + "grad_norm": 0.08192549285717567, + "learning_rate": 1.571416688896246e-05, + "loss": 0.3777, + "step": 1287 + }, + { + "epoch": 0.7520618932924604, + "grad_norm": 0.08850136552590634, + "learning_rate": 1.5705795923481995e-05, + "loss": 0.3624, + "step": 1288 + }, + { + "epoch": 0.7526457922779359, + "grad_norm": 0.09185938280083386, + "learning_rate": 1.5697419025572577e-05, + "loss": 0.4462, + "step": 1289 + }, + { + "epoch": 0.7532296912634114, + "grad_norm": 0.09007489464689655, + "learning_rate": 1.5689036203943836e-05, + "loss": 0.4421, + "step": 1290 + }, + { + "epoch": 0.7538135902488869, + "grad_norm": 0.08828062642673987, + "learning_rate": 1.568064746731156e-05, + "loss": 0.4083, + "step": 1291 + }, + { + "epoch": 0.7543974892343625, + "grad_norm": 0.09410962507968422, + "learning_rate": 1.5672252824397683e-05, + 
"loss": 0.4182, + "step": 1292 + }, + { + "epoch": 0.754981388219838, + "grad_norm": 0.10477742739820071, + "learning_rate": 1.5663852283930275e-05, + "loss": 0.4461, + "step": 1293 + }, + { + "epoch": 0.7555652872053135, + "grad_norm": 0.09589326710828623, + "learning_rate": 1.5655445854643554e-05, + "loss": 0.5114, + "step": 1294 + }, + { + "epoch": 0.756149186190789, + "grad_norm": 0.0886982320174348, + "learning_rate": 1.5647033545277847e-05, + "loss": 0.4276, + "step": 1295 + }, + { + "epoch": 0.7567330851762645, + "grad_norm": 0.08665379739718508, + "learning_rate": 1.56386153645796e-05, + "loss": 0.4176, + "step": 1296 + }, + { + "epoch": 0.75731698416174, + "grad_norm": 0.08760782439381894, + "learning_rate": 1.563019132130136e-05, + "loss": 0.4051, + "step": 1297 + }, + { + "epoch": 0.7579008831472155, + "grad_norm": 0.08912573255879198, + "learning_rate": 1.562176142420177e-05, + "loss": 0.4353, + "step": 1298 + }, + { + "epoch": 0.758484782132691, + "grad_norm": 0.08956971711186638, + "learning_rate": 1.5613325682045563e-05, + "loss": 0.4374, + "step": 1299 + }, + { + "epoch": 0.7590686811181666, + "grad_norm": 0.08444998934665847, + "learning_rate": 1.5604884103603547e-05, + "loss": 0.4246, + "step": 1300 + }, + { + "epoch": 0.7596525801036421, + "grad_norm": 0.07911115617473842, + "learning_rate": 1.55964366976526e-05, + "loss": 0.4188, + "step": 1301 + }, + { + "epoch": 0.7602364790891176, + "grad_norm": 0.0885375540667988, + "learning_rate": 1.5587983472975653e-05, + "loss": 0.4353, + "step": 1302 + }, + { + "epoch": 0.7608203780745931, + "grad_norm": 0.08595217408594741, + "learning_rate": 1.5579524438361693e-05, + "loss": 0.3878, + "step": 1303 + }, + { + "epoch": 0.7614042770600686, + "grad_norm": 0.0927975221847443, + "learning_rate": 1.5571059602605746e-05, + "loss": 0.4539, + "step": 1304 + }, + { + "epoch": 0.7619881760455441, + "grad_norm": 0.09742033951208727, + "learning_rate": 1.556258897450887e-05, + "loss": 0.4402, + "step": 1305 + }, + { + "epoch": 0.7625720750310196, + "grad_norm": 0.08300099236562629, + "learning_rate": 1.5554112562878144e-05, + "loss": 0.4012, + "step": 1306 + }, + { + "epoch": 0.7631559740164952, + "grad_norm": 0.09195540527066631, + "learning_rate": 1.5545630376526665e-05, + "loss": 0.4333, + "step": 1307 + }, + { + "epoch": 0.7637398730019707, + "grad_norm": 0.0917328051425079, + "learning_rate": 1.553714242427352e-05, + "loss": 0.4227, + "step": 1308 + }, + { + "epoch": 0.7643237719874462, + "grad_norm": 0.09100570839970747, + "learning_rate": 1.5528648714943807e-05, + "loss": 0.4661, + "step": 1309 + }, + { + "epoch": 0.7649076709729217, + "grad_norm": 0.08933543497082827, + "learning_rate": 1.5520149257368608e-05, + "loss": 0.4398, + "step": 1310 + }, + { + "epoch": 0.7654915699583972, + "grad_norm": 0.08472000298570648, + "learning_rate": 1.5511644060384968e-05, + "loss": 0.4253, + "step": 1311 + }, + { + "epoch": 0.7660754689438727, + "grad_norm": 0.08576960946266672, + "learning_rate": 1.5503133132835916e-05, + "loss": 0.4345, + "step": 1312 + }, + { + "epoch": 0.7666593679293482, + "grad_norm": 0.08367397062264997, + "learning_rate": 1.5494616483570428e-05, + "loss": 0.4131, + "step": 1313 + }, + { + "epoch": 0.7672432669148237, + "grad_norm": 0.0831255952625161, + "learning_rate": 1.5486094121443434e-05, + "loss": 0.3912, + "step": 1314 + }, + { + "epoch": 0.7678271659002992, + "grad_norm": 0.08498139131137031, + "learning_rate": 1.5477566055315808e-05, + "loss": 0.4193, + "step": 1315 + }, + { + "epoch": 0.7684110648857747, + 
"grad_norm": 0.08222735987606651, + "learning_rate": 1.5469032294054336e-05, + "loss": 0.423, + "step": 1316 + }, + { + "epoch": 0.7689949638712503, + "grad_norm": 0.0840768968733875, + "learning_rate": 1.5460492846531748e-05, + "loss": 0.4248, + "step": 1317 + }, + { + "epoch": 0.7695788628567258, + "grad_norm": 0.0832851204743825, + "learning_rate": 1.5451947721626676e-05, + "loss": 0.4368, + "step": 1318 + }, + { + "epoch": 0.7701627618422013, + "grad_norm": 0.08688821774405606, + "learning_rate": 1.5443396928223655e-05, + "loss": 0.4358, + "step": 1319 + }, + { + "epoch": 0.7707466608276768, + "grad_norm": 0.0882140016791304, + "learning_rate": 1.5434840475213113e-05, + "loss": 0.4089, + "step": 1320 + }, + { + "epoch": 0.7713305598131523, + "grad_norm": 0.08969628293290516, + "learning_rate": 1.5426278371491363e-05, + "loss": 0.4212, + "step": 1321 + }, + { + "epoch": 0.7719144587986279, + "grad_norm": 0.0873576021814547, + "learning_rate": 1.5417710625960598e-05, + "loss": 0.4323, + "step": 1322 + }, + { + "epoch": 0.7724983577841034, + "grad_norm": 0.08933675428680202, + "learning_rate": 1.5409137247528868e-05, + "loss": 0.4166, + "step": 1323 + }, + { + "epoch": 0.7730822567695789, + "grad_norm": 0.09068594113990115, + "learning_rate": 1.5400558245110083e-05, + "loss": 0.4106, + "step": 1324 + }, + { + "epoch": 0.7736661557550544, + "grad_norm": 0.08883704730934641, + "learning_rate": 1.5391973627624004e-05, + "loss": 0.4271, + "step": 1325 + }, + { + "epoch": 0.7742500547405299, + "grad_norm": 0.08817814165455336, + "learning_rate": 1.538338340399623e-05, + "loss": 0.4124, + "step": 1326 + }, + { + "epoch": 0.7748339537260054, + "grad_norm": 0.09531433636073972, + "learning_rate": 1.5374787583158188e-05, + "loss": 0.4574, + "step": 1327 + }, + { + "epoch": 0.7754178527114809, + "grad_norm": 0.09506266034951548, + "learning_rate": 1.5366186174047114e-05, + "loss": 0.4619, + "step": 1328 + }, + { + "epoch": 0.7760017516969564, + "grad_norm": 0.09075039175759929, + "learning_rate": 1.535757918560607e-05, + "loss": 0.4607, + "step": 1329 + }, + { + "epoch": 0.7765856506824319, + "grad_norm": 0.14411107177615975, + "learning_rate": 1.534896662678391e-05, + "loss": 0.4914, + "step": 1330 + }, + { + "epoch": 0.7771695496679074, + "grad_norm": 0.08691551452569209, + "learning_rate": 1.534034850653528e-05, + "loss": 0.4395, + "step": 1331 + }, + { + "epoch": 0.7777534486533829, + "grad_norm": 0.10414137733149162, + "learning_rate": 1.533172483382062e-05, + "loss": 0.4039, + "step": 1332 + }, + { + "epoch": 0.7783373476388584, + "grad_norm": 0.08692928095940348, + "learning_rate": 1.532309561760612e-05, + "loss": 0.4374, + "step": 1333 + }, + { + "epoch": 0.778921246624334, + "grad_norm": 0.08972606783410254, + "learning_rate": 1.5314460866863758e-05, + "loss": 0.479, + "step": 1334 + }, + { + "epoch": 0.7795051456098095, + "grad_norm": 0.078611643777947, + "learning_rate": 1.530582059057125e-05, + "loss": 0.3822, + "step": 1335 + }, + { + "epoch": 0.780089044595285, + "grad_norm": 0.09733500574005255, + "learning_rate": 1.5297174797712057e-05, + "loss": 0.4725, + "step": 1336 + }, + { + "epoch": 0.7806729435807606, + "grad_norm": 0.10013790356773264, + "learning_rate": 1.5288523497275392e-05, + "loss": 0.4168, + "step": 1337 + }, + { + "epoch": 0.7812568425662361, + "grad_norm": 0.0940818053373547, + "learning_rate": 1.5279866698256177e-05, + "loss": 0.4591, + "step": 1338 + }, + { + "epoch": 0.7818407415517116, + "grad_norm": 0.08219905060192716, + "learning_rate": 1.5271204409655055e-05, 
+ "loss": 0.3747, + "step": 1339 + }, + { + "epoch": 0.7824246405371871, + "grad_norm": 0.08971097687174456, + "learning_rate": 1.5262536640478386e-05, + "loss": 0.4178, + "step": 1340 + }, + { + "epoch": 0.7830085395226626, + "grad_norm": 0.08854599668297773, + "learning_rate": 1.5253863399738218e-05, + "loss": 0.4276, + "step": 1341 + }, + { + "epoch": 0.7835924385081381, + "grad_norm": 0.08693941039757878, + "learning_rate": 1.5245184696452286e-05, + "loss": 0.419, + "step": 1342 + }, + { + "epoch": 0.7841763374936136, + "grad_norm": 0.0907392409577035, + "learning_rate": 1.5236500539644015e-05, + "loss": 0.4846, + "step": 1343 + }, + { + "epoch": 0.7847602364790891, + "grad_norm": 0.08257019193407462, + "learning_rate": 1.5227810938342493e-05, + "loss": 0.4502, + "step": 1344 + }, + { + "epoch": 0.7853441354645646, + "grad_norm": 0.08990360999674157, + "learning_rate": 1.5219115901582471e-05, + "loss": 0.4427, + "step": 1345 + }, + { + "epoch": 0.7859280344500401, + "grad_norm": 0.09151262350104217, + "learning_rate": 1.5210415438404354e-05, + "loss": 0.4506, + "step": 1346 + }, + { + "epoch": 0.7865119334355156, + "grad_norm": 0.08526337387025708, + "learning_rate": 1.5201709557854178e-05, + "loss": 0.4367, + "step": 1347 + }, + { + "epoch": 0.7870958324209911, + "grad_norm": 0.08945182213995881, + "learning_rate": 1.5192998268983625e-05, + "loss": 0.4363, + "step": 1348 + }, + { + "epoch": 0.7876797314064666, + "grad_norm": 0.08676262278749829, + "learning_rate": 1.5184281580849999e-05, + "loss": 0.4024, + "step": 1349 + }, + { + "epoch": 0.7882636303919422, + "grad_norm": 0.08405646415424177, + "learning_rate": 1.51755595025162e-05, + "loss": 0.4297, + "step": 1350 + }, + { + "epoch": 0.7888475293774178, + "grad_norm": 0.08048245052903868, + "learning_rate": 1.5166832043050757e-05, + "loss": 0.3828, + "step": 1351 + }, + { + "epoch": 0.7894314283628933, + "grad_norm": 0.08680176192525417, + "learning_rate": 1.5158099211527776e-05, + "loss": 0.4133, + "step": 1352 + }, + { + "epoch": 0.7900153273483688, + "grad_norm": 0.08198262291002213, + "learning_rate": 1.5149361017026957e-05, + "loss": 0.4175, + "step": 1353 + }, + { + "epoch": 0.7905992263338443, + "grad_norm": 0.08938194056457902, + "learning_rate": 1.5140617468633579e-05, + "loss": 0.4232, + "step": 1354 + }, + { + "epoch": 0.7911831253193198, + "grad_norm": 0.09315861796281333, + "learning_rate": 1.513186857543847e-05, + "loss": 0.4528, + "step": 1355 + }, + { + "epoch": 0.7917670243047953, + "grad_norm": 0.09026431450982407, + "learning_rate": 1.5123114346538037e-05, + "loss": 0.4194, + "step": 1356 + }, + { + "epoch": 0.7923509232902708, + "grad_norm": 0.08235814984120672, + "learning_rate": 1.5114354791034225e-05, + "loss": 0.4079, + "step": 1357 + }, + { + "epoch": 0.7929348222757463, + "grad_norm": 0.0939647778517026, + "learning_rate": 1.5105589918034511e-05, + "loss": 0.396, + "step": 1358 + }, + { + "epoch": 0.7935187212612218, + "grad_norm": 0.09672719463403046, + "learning_rate": 1.5096819736651913e-05, + "loss": 0.4383, + "step": 1359 + }, + { + "epoch": 0.7941026202466973, + "grad_norm": 0.08677194533463, + "learning_rate": 1.5088044256004958e-05, + "loss": 0.4328, + "step": 1360 + }, + { + "epoch": 0.7946865192321728, + "grad_norm": 0.08794424672727218, + "learning_rate": 1.5079263485217693e-05, + "loss": 0.442, + "step": 1361 + }, + { + "epoch": 0.7952704182176483, + "grad_norm": 0.08576463844196482, + "learning_rate": 1.507047743341965e-05, + "loss": 0.3859, + "step": 1362 + }, + { + "epoch": 
0.7958543172031238, + "grad_norm": 0.08811529005607563, + "learning_rate": 1.506168610974587e-05, + "loss": 0.4444, + "step": 1363 + }, + { + "epoch": 0.7964382161885993, + "grad_norm": 0.08739369695646533, + "learning_rate": 1.505288952333686e-05, + "loss": 0.4102, + "step": 1364 + }, + { + "epoch": 0.7970221151740748, + "grad_norm": 0.08721885258858723, + "learning_rate": 1.5044087683338609e-05, + "loss": 0.4305, + "step": 1365 + }, + { + "epoch": 0.7976060141595503, + "grad_norm": 0.08691349111282216, + "learning_rate": 1.5035280598902557e-05, + "loss": 0.3984, + "step": 1366 + }, + { + "epoch": 0.798189913145026, + "grad_norm": 0.08338766248298338, + "learning_rate": 1.5026468279185615e-05, + "loss": 0.4013, + "step": 1367 + }, + { + "epoch": 0.7987738121305015, + "grad_norm": 0.08927628042792697, + "learning_rate": 1.5017650733350121e-05, + "loss": 0.4337, + "step": 1368 + }, + { + "epoch": 0.799357711115977, + "grad_norm": 0.09623162242290464, + "learning_rate": 1.5008827970563848e-05, + "loss": 0.5136, + "step": 1369 + }, + { + "epoch": 0.7999416101014525, + "grad_norm": 0.08649308403043339, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.4515, + "step": 1370 + }, + { + "epoch": 0.800525509086928, + "grad_norm": 0.08741496607304533, + "learning_rate": 1.4991166830837198e-05, + "loss": 0.4038, + "step": 1371 + }, + { + "epoch": 0.8011094080724035, + "grad_norm": 0.08457288530479583, + "learning_rate": 1.4982328472259453e-05, + "loss": 0.3917, + "step": 1372 + }, + { + "epoch": 0.801693307057879, + "grad_norm": 0.0949345996428081, + "learning_rate": 1.4973484933456191e-05, + "loss": 0.4135, + "step": 1373 + }, + { + "epoch": 0.8022772060433545, + "grad_norm": 0.0948515743445431, + "learning_rate": 1.4964636223622206e-05, + "loss": 0.4838, + "step": 1374 + }, + { + "epoch": 0.80286110502883, + "grad_norm": 0.08441804121170958, + "learning_rate": 1.4955782351957681e-05, + "loss": 0.3819, + "step": 1375 + }, + { + "epoch": 0.8034450040143055, + "grad_norm": 0.09358074983383914, + "learning_rate": 1.4946923327668164e-05, + "loss": 0.4227, + "step": 1376 + }, + { + "epoch": 0.804028902999781, + "grad_norm": 0.0869519899918237, + "learning_rate": 1.4938059159964555e-05, + "loss": 0.4267, + "step": 1377 + }, + { + "epoch": 0.8046128019852565, + "grad_norm": 0.08426099489376895, + "learning_rate": 1.4929189858063103e-05, + "loss": 0.3805, + "step": 1378 + }, + { + "epoch": 0.805196700970732, + "grad_norm": 0.09100906273228264, + "learning_rate": 1.4920315431185398e-05, + "loss": 0.4049, + "step": 1379 + }, + { + "epoch": 0.8057805999562075, + "grad_norm": 0.08898692446138388, + "learning_rate": 1.4911435888558356e-05, + "loss": 0.4532, + "step": 1380 + }, + { + "epoch": 0.806364498941683, + "grad_norm": 0.08347311748075427, + "learning_rate": 1.4902551239414218e-05, + "loss": 0.4303, + "step": 1381 + }, + { + "epoch": 0.8069483979271586, + "grad_norm": 0.08553354262610621, + "learning_rate": 1.4893661492990527e-05, + "loss": 0.4462, + "step": 1382 + }, + { + "epoch": 0.8075322969126341, + "grad_norm": 0.08908836082478563, + "learning_rate": 1.4884766658530126e-05, + "loss": 0.4117, + "step": 1383 + }, + { + "epoch": 0.8081161958981097, + "grad_norm": 0.08341714800691159, + "learning_rate": 1.487586674528115e-05, + "loss": 0.3765, + "step": 1384 + }, + { + "epoch": 0.8087000948835852, + "grad_norm": 0.09756953271569536, + "learning_rate": 1.4866961762497018e-05, + "loss": 0.4286, + "step": 1385 + }, + { + "epoch": 0.8092839938690607, + "grad_norm": 0.08353154022864129, + 
"learning_rate": 1.4858051719436418e-05, + "loss": 0.3859, + "step": 1386 + }, + { + "epoch": 0.8098678928545362, + "grad_norm": 0.08445758773880473, + "learning_rate": 1.4849136625363297e-05, + "loss": 0.3903, + "step": 1387 + }, + { + "epoch": 0.8104517918400117, + "grad_norm": 0.09371855279917882, + "learning_rate": 1.484021648954685e-05, + "loss": 0.4356, + "step": 1388 + }, + { + "epoch": 0.8110356908254872, + "grad_norm": 0.0984257565225866, + "learning_rate": 1.4831291321261523e-05, + "loss": 0.4843, + "step": 1389 + }, + { + "epoch": 0.8116195898109627, + "grad_norm": 0.08878965118518649, + "learning_rate": 1.4822361129786992e-05, + "loss": 0.4207, + "step": 1390 + }, + { + "epoch": 0.8122034887964382, + "grad_norm": 0.08802699597727595, + "learning_rate": 1.4813425924408151e-05, + "loss": 0.4189, + "step": 1391 + }, + { + "epoch": 0.8127873877819137, + "grad_norm": 0.08550268666206581, + "learning_rate": 1.480448571441511e-05, + "loss": 0.4375, + "step": 1392 + }, + { + "epoch": 0.8133712867673892, + "grad_norm": 0.0865321792867951, + "learning_rate": 1.4795540509103182e-05, + "loss": 0.4546, + "step": 1393 + }, + { + "epoch": 0.8139551857528647, + "grad_norm": 0.0829195018321343, + "learning_rate": 1.4786590317772875e-05, + "loss": 0.3894, + "step": 1394 + }, + { + "epoch": 0.8145390847383402, + "grad_norm": 0.08525448149791623, + "learning_rate": 1.4777635149729878e-05, + "loss": 0.4214, + "step": 1395 + }, + { + "epoch": 0.8151229837238158, + "grad_norm": 0.0857253167818637, + "learning_rate": 1.4768675014285063e-05, + "loss": 0.4163, + "step": 1396 + }, + { + "epoch": 0.8157068827092913, + "grad_norm": 0.08104362862052306, + "learning_rate": 1.4759709920754453e-05, + "loss": 0.4373, + "step": 1397 + }, + { + "epoch": 0.8162907816947668, + "grad_norm": 0.08418748063638393, + "learning_rate": 1.4750739878459233e-05, + "loss": 0.4506, + "step": 1398 + }, + { + "epoch": 0.8168746806802423, + "grad_norm": 0.08359343580727807, + "learning_rate": 1.4741764896725736e-05, + "loss": 0.4154, + "step": 1399 + }, + { + "epoch": 0.8174585796657178, + "grad_norm": 0.08497476501621447, + "learning_rate": 1.473278498488543e-05, + "loss": 0.4361, + "step": 1400 + }, + { + "epoch": 0.8180424786511934, + "grad_norm": 0.08725902592329478, + "learning_rate": 1.4723800152274905e-05, + "loss": 0.4232, + "step": 1401 + }, + { + "epoch": 0.8186263776366689, + "grad_norm": 0.07982829795502507, + "learning_rate": 1.471481040823587e-05, + "loss": 0.4145, + "step": 1402 + }, + { + "epoch": 0.8192102766221444, + "grad_norm": 0.08254828194439041, + "learning_rate": 1.4705815762115138e-05, + "loss": 0.4342, + "step": 1403 + }, + { + "epoch": 0.8197941756076199, + "grad_norm": 0.07784782125425614, + "learning_rate": 1.4696816223264622e-05, + "loss": 0.3773, + "step": 1404 + }, + { + "epoch": 0.8203780745930954, + "grad_norm": 0.08559965706338227, + "learning_rate": 1.4687811801041323e-05, + "loss": 0.3981, + "step": 1405 + }, + { + "epoch": 0.8209619735785709, + "grad_norm": 0.08376924157650358, + "learning_rate": 1.4678802504807313e-05, + "loss": 0.3916, + "step": 1406 + }, + { + "epoch": 0.8215458725640464, + "grad_norm": 0.08400265146672667, + "learning_rate": 1.4669788343929736e-05, + "loss": 0.4338, + "step": 1407 + }, + { + "epoch": 0.8221297715495219, + "grad_norm": 0.07927374863188365, + "learning_rate": 1.4660769327780796e-05, + "loss": 0.4, + "step": 1408 + }, + { + "epoch": 0.8227136705349974, + "grad_norm": 0.08921189904009948, + "learning_rate": 1.465174546573774e-05, + "loss": 0.4516, + "step": 
1409 + }, + { + "epoch": 0.823297569520473, + "grad_norm": 0.08514832895598788, + "learning_rate": 1.4642716767182858e-05, + "loss": 0.3979, + "step": 1410 + }, + { + "epoch": 0.8238814685059485, + "grad_norm": 0.08563758866247223, + "learning_rate": 1.4633683241503464e-05, + "loss": 0.4556, + "step": 1411 + }, + { + "epoch": 0.824465367491424, + "grad_norm": 0.08704493784897764, + "learning_rate": 1.4624644898091898e-05, + "loss": 0.4021, + "step": 1412 + }, + { + "epoch": 0.8250492664768995, + "grad_norm": 0.08306763015677239, + "learning_rate": 1.4615601746345501e-05, + "loss": 0.4051, + "step": 1413 + }, + { + "epoch": 0.825633165462375, + "grad_norm": 0.08382663139072355, + "learning_rate": 1.4606553795666616e-05, + "loss": 0.3872, + "step": 1414 + }, + { + "epoch": 0.8262170644478505, + "grad_norm": 0.08774470980459309, + "learning_rate": 1.4597501055462577e-05, + "loss": 0.4402, + "step": 1415 + }, + { + "epoch": 0.826800963433326, + "grad_norm": 0.0849742521985299, + "learning_rate": 1.45884435351457e-05, + "loss": 0.384, + "step": 1416 + }, + { + "epoch": 0.8273848624188015, + "grad_norm": 0.09535451213720403, + "learning_rate": 1.4579381244133265e-05, + "loss": 0.4351, + "step": 1417 + }, + { + "epoch": 0.8279687614042771, + "grad_norm": 0.0930457821558439, + "learning_rate": 1.457031419184752e-05, + "loss": 0.469, + "step": 1418 + }, + { + "epoch": 0.8285526603897526, + "grad_norm": 0.08148377954983087, + "learning_rate": 1.4561242387715652e-05, + "loss": 0.3893, + "step": 1419 + }, + { + "epoch": 0.8291365593752281, + "grad_norm": 0.08372416889157165, + "learning_rate": 1.45521658411698e-05, + "loss": 0.4331, + "step": 1420 + }, + { + "epoch": 0.8297204583607036, + "grad_norm": 0.09120487405462163, + "learning_rate": 1.4543084561647028e-05, + "loss": 0.4506, + "step": 1421 + }, + { + "epoch": 0.8303043573461791, + "grad_norm": 0.08727906670842314, + "learning_rate": 1.4533998558589319e-05, + "loss": 0.387, + "step": 1422 + }, + { + "epoch": 0.8308882563316546, + "grad_norm": 0.08093921923468177, + "learning_rate": 1.4524907841443576e-05, + "loss": 0.4134, + "step": 1423 + }, + { + "epoch": 0.8314721553171301, + "grad_norm": 0.08465365095453808, + "learning_rate": 1.4515812419661595e-05, + "loss": 0.3828, + "step": 1424 + }, + { + "epoch": 0.8320560543026057, + "grad_norm": 0.08434847298354427, + "learning_rate": 1.4506712302700064e-05, + "loss": 0.4194, + "step": 1425 + }, + { + "epoch": 0.8326399532880812, + "grad_norm": 0.10213354078796132, + "learning_rate": 1.4497607500020556e-05, + "loss": 0.4133, + "step": 1426 + }, + { + "epoch": 0.8332238522735567, + "grad_norm": 0.10570374599572276, + "learning_rate": 1.4488498021089514e-05, + "loss": 0.4095, + "step": 1427 + }, + { + "epoch": 0.8338077512590322, + "grad_norm": 0.07862099092594257, + "learning_rate": 1.4479383875378245e-05, + "loss": 0.405, + "step": 1428 + }, + { + "epoch": 0.8343916502445077, + "grad_norm": 0.08521245026399711, + "learning_rate": 1.4470265072362906e-05, + "loss": 0.4264, + "step": 1429 + }, + { + "epoch": 0.8349755492299832, + "grad_norm": 0.08477594459925808, + "learning_rate": 1.4461141621524498e-05, + "loss": 0.4358, + "step": 1430 + }, + { + "epoch": 0.8355594482154587, + "grad_norm": 0.08583769995139516, + "learning_rate": 1.4452013532348849e-05, + "loss": 0.4817, + "step": 1431 + }, + { + "epoch": 0.8361433472009342, + "grad_norm": 0.08509424096018782, + "learning_rate": 1.444288081432662e-05, + "loss": 0.3982, + "step": 1432 + }, + { + "epoch": 0.8367272461864097, + "grad_norm": 
0.08774052925352913, + "learning_rate": 1.443374347695328e-05, + "loss": 0.4228, + "step": 1433 + }, + { + "epoch": 0.8373111451718853, + "grad_norm": 0.08782953692489166, + "learning_rate": 1.442460152972909e-05, + "loss": 0.4253, + "step": 1434 + }, + { + "epoch": 0.8378950441573608, + "grad_norm": 0.08191628478036919, + "learning_rate": 1.4415454982159121e-05, + "loss": 0.4234, + "step": 1435 + }, + { + "epoch": 0.8384789431428363, + "grad_norm": 0.08421800481572464, + "learning_rate": 1.4406303843753215e-05, + "loss": 0.4367, + "step": 1436 + }, + { + "epoch": 0.8390628421283118, + "grad_norm": 0.0858773834166193, + "learning_rate": 1.4397148124025997e-05, + "loss": 0.4442, + "step": 1437 + }, + { + "epoch": 0.8396467411137873, + "grad_norm": 0.08682633963992839, + "learning_rate": 1.4387987832496848e-05, + "loss": 0.4122, + "step": 1438 + }, + { + "epoch": 0.8402306400992628, + "grad_norm": 0.08660306643709208, + "learning_rate": 1.4378822978689901e-05, + "loss": 0.3945, + "step": 1439 + }, + { + "epoch": 0.8408145390847384, + "grad_norm": 0.08959508599750565, + "learning_rate": 1.436965357213404e-05, + "loss": 0.4129, + "step": 1440 + }, + { + "epoch": 0.8413984380702139, + "grad_norm": 0.0888136519713605, + "learning_rate": 1.4360479622362877e-05, + "loss": 0.4278, + "step": 1441 + }, + { + "epoch": 0.8419823370556894, + "grad_norm": 0.09010149044286533, + "learning_rate": 1.4351301138914749e-05, + "loss": 0.4273, + "step": 1442 + }, + { + "epoch": 0.8425662360411649, + "grad_norm": 0.09325272168569715, + "learning_rate": 1.4342118131332704e-05, + "loss": 0.3861, + "step": 1443 + }, + { + "epoch": 0.8431501350266404, + "grad_norm": 0.0875109817446603, + "learning_rate": 1.4332930609164503e-05, + "loss": 0.3917, + "step": 1444 + }, + { + "epoch": 0.8437340340121159, + "grad_norm": 0.08287898400866792, + "learning_rate": 1.4323738581962593e-05, + "loss": 0.4083, + "step": 1445 + }, + { + "epoch": 0.8443179329975914, + "grad_norm": 0.08445051095594383, + "learning_rate": 1.4314542059284102e-05, + "loss": 0.4222, + "step": 1446 + }, + { + "epoch": 0.8449018319830669, + "grad_norm": 0.08697744626239587, + "learning_rate": 1.4305341050690845e-05, + "loss": 0.4844, + "step": 1447 + }, + { + "epoch": 0.8454857309685424, + "grad_norm": 0.10530850373930135, + "learning_rate": 1.429613556574928e-05, + "loss": 0.5045, + "step": 1448 + }, + { + "epoch": 0.8460696299540179, + "grad_norm": 0.09188925680308502, + "learning_rate": 1.4286925614030542e-05, + "loss": 0.4556, + "step": 1449 + }, + { + "epoch": 0.8466535289394934, + "grad_norm": 0.08697914303787044, + "learning_rate": 1.4277711205110398e-05, + "loss": 0.4252, + "step": 1450 + }, + { + "epoch": 0.847237427924969, + "grad_norm": 0.0842751040468012, + "learning_rate": 1.4268492348569252e-05, + "loss": 0.4161, + "step": 1451 + }, + { + "epoch": 0.8478213269104445, + "grad_norm": 0.09385694788447634, + "learning_rate": 1.425926905399213e-05, + "loss": 0.4233, + "step": 1452 + }, + { + "epoch": 0.84840522589592, + "grad_norm": 0.08566509672001346, + "learning_rate": 1.4250041330968674e-05, + "loss": 0.4377, + "step": 1453 + }, + { + "epoch": 0.8489891248813956, + "grad_norm": 0.09067484937538668, + "learning_rate": 1.424080918909313e-05, + "loss": 0.4394, + "step": 1454 + }, + { + "epoch": 0.8495730238668711, + "grad_norm": 0.08233011710356745, + "learning_rate": 1.4231572637964338e-05, + "loss": 0.4001, + "step": 1455 + }, + { + "epoch": 0.8501569228523466, + "grad_norm": 0.08966522660614556, + "learning_rate": 1.4222331687185723e-05, + 
"loss": 0.4323, + "step": 1456 + }, + { + "epoch": 0.8507408218378221, + "grad_norm": 0.08578450331793387, + "learning_rate": 1.421308634636529e-05, + "loss": 0.4121, + "step": 1457 + }, + { + "epoch": 0.8513247208232976, + "grad_norm": 0.0956093503473396, + "learning_rate": 1.4203836625115595e-05, + "loss": 0.4565, + "step": 1458 + }, + { + "epoch": 0.8519086198087731, + "grad_norm": 0.08528895306872959, + "learning_rate": 1.419458253305376e-05, + "loss": 0.4211, + "step": 1459 + }, + { + "epoch": 0.8524925187942486, + "grad_norm": 0.08577558587230055, + "learning_rate": 1.4185324079801447e-05, + "loss": 0.4305, + "step": 1460 + }, + { + "epoch": 0.8530764177797241, + "grad_norm": 0.08769460161134433, + "learning_rate": 1.4176061274984858e-05, + "loss": 0.4519, + "step": 1461 + }, + { + "epoch": 0.8536603167651996, + "grad_norm": 0.08436324642562373, + "learning_rate": 1.4166794128234705e-05, + "loss": 0.4078, + "step": 1462 + }, + { + "epoch": 0.8542442157506751, + "grad_norm": 0.08253800690837028, + "learning_rate": 1.415752264918623e-05, + "loss": 0.4049, + "step": 1463 + }, + { + "epoch": 0.8548281147361506, + "grad_norm": 0.08750289845288964, + "learning_rate": 1.4148246847479173e-05, + "loss": 0.4329, + "step": 1464 + }, + { + "epoch": 0.8554120137216261, + "grad_norm": 0.0865752600418391, + "learning_rate": 1.4138966732757766e-05, + "loss": 0.415, + "step": 1465 + }, + { + "epoch": 0.8559959127071016, + "grad_norm": 0.08235814318886843, + "learning_rate": 1.4129682314670731e-05, + "loss": 0.4299, + "step": 1466 + }, + { + "epoch": 0.8565798116925771, + "grad_norm": 0.08465543426810686, + "learning_rate": 1.412039360287126e-05, + "loss": 0.4002, + "step": 1467 + }, + { + "epoch": 0.8571637106780527, + "grad_norm": 0.08555435217129786, + "learning_rate": 1.411110060701701e-05, + "loss": 0.4307, + "step": 1468 + }, + { + "epoch": 0.8577476096635283, + "grad_norm": 0.0882503357109783, + "learning_rate": 1.4101803336770092e-05, + "loss": 0.4033, + "step": 1469 + }, + { + "epoch": 0.8583315086490038, + "grad_norm": 0.08784722097168893, + "learning_rate": 1.4092501801797063e-05, + "loss": 0.3898, + "step": 1470 + }, + { + "epoch": 0.8589154076344793, + "grad_norm": 0.08934590389336634, + "learning_rate": 1.4083196011768913e-05, + "loss": 0.4143, + "step": 1471 + }, + { + "epoch": 0.8594993066199548, + "grad_norm": 0.08946491401334677, + "learning_rate": 1.4073885976361056e-05, + "loss": 0.4251, + "step": 1472 + }, + { + "epoch": 0.8600832056054303, + "grad_norm": 0.08395350953512487, + "learning_rate": 1.4064571705253323e-05, + "loss": 0.3885, + "step": 1473 + }, + { + "epoch": 0.8606671045909058, + "grad_norm": 0.08030836277564558, + "learning_rate": 1.405525320812994e-05, + "loss": 0.4183, + "step": 1474 + }, + { + "epoch": 0.8612510035763813, + "grad_norm": 0.07955413398676385, + "learning_rate": 1.4045930494679538e-05, + "loss": 0.3767, + "step": 1475 + }, + { + "epoch": 0.8618349025618568, + "grad_norm": 0.09637253020036447, + "learning_rate": 1.4036603574595122e-05, + "loss": 0.4402, + "step": 1476 + }, + { + "epoch": 0.8624188015473323, + "grad_norm": 0.08129002657756883, + "learning_rate": 1.4027272457574082e-05, + "loss": 0.3898, + "step": 1477 + }, + { + "epoch": 0.8630027005328078, + "grad_norm": 0.08844066647217767, + "learning_rate": 1.4017937153318157e-05, + "loss": 0.4602, + "step": 1478 + }, + { + "epoch": 0.8635865995182833, + "grad_norm": 0.08220542215399776, + "learning_rate": 1.4008597671533455e-05, + "loss": 0.3988, + "step": 1479 + }, + { + "epoch": 
0.8641704985037588, + "grad_norm": 0.08773716959372321, + "learning_rate": 1.3999254021930416e-05, + "loss": 0.3939, + "step": 1480 + }, + { + "epoch": 0.8647543974892343, + "grad_norm": 0.09608229485044872, + "learning_rate": 1.3989906214223817e-05, + "loss": 0.4125, + "step": 1481 + }, + { + "epoch": 0.8653382964747098, + "grad_norm": 0.08629619718530061, + "learning_rate": 1.3980554258132761e-05, + "loss": 0.4194, + "step": 1482 + }, + { + "epoch": 0.8659221954601853, + "grad_norm": 0.09041891379497835, + "learning_rate": 1.3971198163380659e-05, + "loss": 0.4761, + "step": 1483 + }, + { + "epoch": 0.8665060944456608, + "grad_norm": 0.08436021841408944, + "learning_rate": 1.3961837939695231e-05, + "loss": 0.417, + "step": 1484 + }, + { + "epoch": 0.8670899934311365, + "grad_norm": 0.08289531112040509, + "learning_rate": 1.3952473596808485e-05, + "loss": 0.4251, + "step": 1485 + }, + { + "epoch": 0.867673892416612, + "grad_norm": 0.0803287286256555, + "learning_rate": 1.3943105144456715e-05, + "loss": 0.4125, + "step": 1486 + }, + { + "epoch": 0.8682577914020875, + "grad_norm": 0.08592363060467897, + "learning_rate": 1.3933732592380485e-05, + "loss": 0.4153, + "step": 1487 + }, + { + "epoch": 0.868841690387563, + "grad_norm": 0.08596192823446483, + "learning_rate": 1.3924355950324623e-05, + "loss": 0.4277, + "step": 1488 + }, + { + "epoch": 0.8694255893730385, + "grad_norm": 0.08453315695265243, + "learning_rate": 1.391497522803821e-05, + "loss": 0.3938, + "step": 1489 + }, + { + "epoch": 0.870009488358514, + "grad_norm": 0.09060669708021528, + "learning_rate": 1.390559043527457e-05, + "loss": 0.4141, + "step": 1490 + }, + { + "epoch": 0.8705933873439895, + "grad_norm": 0.10082843539788652, + "learning_rate": 1.3896201581791253e-05, + "loss": 0.4351, + "step": 1491 + }, + { + "epoch": 0.871177286329465, + "grad_norm": 0.08769100493016807, + "learning_rate": 1.388680867735004e-05, + "loss": 0.4541, + "step": 1492 + }, + { + "epoch": 0.8717611853149405, + "grad_norm": 0.08456244282831253, + "learning_rate": 1.3877411731716917e-05, + "loss": 0.4219, + "step": 1493 + }, + { + "epoch": 0.872345084300416, + "grad_norm": 0.09300799785051299, + "learning_rate": 1.3868010754662077e-05, + "loss": 0.435, + "step": 1494 + }, + { + "epoch": 0.8729289832858915, + "grad_norm": 0.08118916584500656, + "learning_rate": 1.3858605755959902e-05, + "loss": 0.4459, + "step": 1495 + }, + { + "epoch": 0.873512882271367, + "grad_norm": 0.08564990310561041, + "learning_rate": 1.3849196745388953e-05, + "loss": 0.4228, + "step": 1496 + }, + { + "epoch": 0.8740967812568425, + "grad_norm": 0.0816083641768999, + "learning_rate": 1.3839783732731966e-05, + "loss": 0.4189, + "step": 1497 + }, + { + "epoch": 0.874680680242318, + "grad_norm": 0.07818766994373033, + "learning_rate": 1.3830366727775835e-05, + "loss": 0.3965, + "step": 1498 + }, + { + "epoch": 0.8752645792277935, + "grad_norm": 0.08893443677818608, + "learning_rate": 1.3820945740311609e-05, + "loss": 0.409, + "step": 1499 + }, + { + "epoch": 0.875848478213269, + "grad_norm": 0.08754612690058106, + "learning_rate": 1.3811520780134471e-05, + "loss": 0.3961, + "step": 1500 + }, + { + "epoch": 0.8764323771987447, + "grad_norm": 0.09170113405976463, + "learning_rate": 1.3802091857043745e-05, + "loss": 0.4679, + "step": 1501 + }, + { + "epoch": 0.8770162761842202, + "grad_norm": 0.08016048535665014, + "learning_rate": 1.3792658980842861e-05, + "loss": 0.4053, + "step": 1502 + }, + { + "epoch": 0.8776001751696957, + "grad_norm": 0.09186720929355792, + "learning_rate": 
1.3783222161339375e-05, + "loss": 0.475, + "step": 1503 + }, + { + "epoch": 0.8781840741551712, + "grad_norm": 0.08629188546979366, + "learning_rate": 1.3773781408344931e-05, + "loss": 0.3951, + "step": 1504 + }, + { + "epoch": 0.8787679731406467, + "grad_norm": 0.09751230564618096, + "learning_rate": 1.3764336731675266e-05, + "loss": 0.457, + "step": 1505 + }, + { + "epoch": 0.8793518721261222, + "grad_norm": 0.09120774679759135, + "learning_rate": 1.3754888141150197e-05, + "loss": 0.43, + "step": 1506 + }, + { + "epoch": 0.8799357711115977, + "grad_norm": 0.0800207140941132, + "learning_rate": 1.3745435646593613e-05, + "loss": 0.423, + "step": 1507 + }, + { + "epoch": 0.8805196700970732, + "grad_norm": 0.08351105044450488, + "learning_rate": 1.373597925783346e-05, + "loss": 0.3996, + "step": 1508 + }, + { + "epoch": 0.8811035690825487, + "grad_norm": 0.08868054175291099, + "learning_rate": 1.3726518984701731e-05, + "loss": 0.4188, + "step": 1509 + }, + { + "epoch": 0.8816874680680242, + "grad_norm": 0.08904265223364413, + "learning_rate": 1.3717054837034459e-05, + "loss": 0.434, + "step": 1510 + }, + { + "epoch": 0.8822713670534997, + "grad_norm": 0.08609265685199129, + "learning_rate": 1.3707586824671703e-05, + "loss": 0.4004, + "step": 1511 + }, + { + "epoch": 0.8828552660389752, + "grad_norm": 0.08392005466270408, + "learning_rate": 1.369811495745755e-05, + "loss": 0.3809, + "step": 1512 + }, + { + "epoch": 0.8834391650244507, + "grad_norm": 0.08084757854133912, + "learning_rate": 1.3688639245240078e-05, + "loss": 0.4425, + "step": 1513 + }, + { + "epoch": 0.8840230640099263, + "grad_norm": 0.09249434721429599, + "learning_rate": 1.3679159697871383e-05, + "loss": 0.5311, + "step": 1514 + }, + { + "epoch": 0.8846069629954018, + "grad_norm": 0.09240790630834922, + "learning_rate": 1.3669676325207531e-05, + "loss": 0.4265, + "step": 1515 + }, + { + "epoch": 0.8851908619808773, + "grad_norm": 0.08399397664961844, + "learning_rate": 1.3660189137108578e-05, + "loss": 0.3997, + "step": 1516 + }, + { + "epoch": 0.8857747609663528, + "grad_norm": 0.08312681794463819, + "learning_rate": 1.3650698143438534e-05, + "loss": 0.4042, + "step": 1517 + }, + { + "epoch": 0.8863586599518284, + "grad_norm": 0.09123531155032032, + "learning_rate": 1.3641203354065378e-05, + "loss": 0.4088, + "step": 1518 + }, + { + "epoch": 0.8869425589373039, + "grad_norm": 0.0907495891243876, + "learning_rate": 1.3631704778861028e-05, + "loss": 0.4681, + "step": 1519 + }, + { + "epoch": 0.8875264579227794, + "grad_norm": 0.08259584753542377, + "learning_rate": 1.3622202427701344e-05, + "loss": 0.3927, + "step": 1520 + }, + { + "epoch": 0.8881103569082549, + "grad_norm": 0.08160064406414035, + "learning_rate": 1.3612696310466103e-05, + "loss": 0.427, + "step": 1521 + }, + { + "epoch": 0.8886942558937304, + "grad_norm": 0.08075374658515486, + "learning_rate": 1.360318643703901e-05, + "loss": 0.395, + "step": 1522 + }, + { + "epoch": 0.8892781548792059, + "grad_norm": 0.08985409802103089, + "learning_rate": 1.3593672817307661e-05, + "loss": 0.4085, + "step": 1523 + }, + { + "epoch": 0.8898620538646814, + "grad_norm": 0.09298746378061125, + "learning_rate": 1.3584155461163562e-05, + "loss": 0.4675, + "step": 1524 + }, + { + "epoch": 0.8904459528501569, + "grad_norm": 0.07877047374685818, + "learning_rate": 1.3574634378502092e-05, + "loss": 0.4495, + "step": 1525 + }, + { + "epoch": 0.8910298518356324, + "grad_norm": 0.0784304462790551, + "learning_rate": 1.3565109579222511e-05, + "loss": 0.4055, + "step": 1526 + }, + { + 
"epoch": 0.8916137508211079, + "grad_norm": 0.08215095104440405, + "learning_rate": 1.3555581073227942e-05, + "loss": 0.3972, + "step": 1527 + }, + { + "epoch": 0.8921976498065834, + "grad_norm": 0.08357551676328764, + "learning_rate": 1.3546048870425356e-05, + "loss": 0.4249, + "step": 1528 + }, + { + "epoch": 0.892781548792059, + "grad_norm": 0.0769121250463885, + "learning_rate": 1.353651298072558e-05, + "loss": 0.3541, + "step": 1529 + }, + { + "epoch": 0.8933654477775345, + "grad_norm": 0.0807198466285613, + "learning_rate": 1.3526973414043263e-05, + "loss": 0.3913, + "step": 1530 + }, + { + "epoch": 0.89394934676301, + "grad_norm": 0.09236762819713512, + "learning_rate": 1.3517430180296886e-05, + "loss": 0.4058, + "step": 1531 + }, + { + "epoch": 0.8945332457484855, + "grad_norm": 0.08958412088239824, + "learning_rate": 1.3507883289408732e-05, + "loss": 0.4359, + "step": 1532 + }, + { + "epoch": 0.895117144733961, + "grad_norm": 0.07939706962398332, + "learning_rate": 1.3498332751304895e-05, + "loss": 0.3846, + "step": 1533 + }, + { + "epoch": 0.8957010437194365, + "grad_norm": 0.08019549278462072, + "learning_rate": 1.3488778575915258e-05, + "loss": 0.408, + "step": 1534 + }, + { + "epoch": 0.8962849427049121, + "grad_norm": 0.0803823502297556, + "learning_rate": 1.3479220773173485e-05, + "loss": 0.4148, + "step": 1535 + }, + { + "epoch": 0.8968688416903876, + "grad_norm": 0.07842346163655932, + "learning_rate": 1.3469659353017019e-05, + "loss": 0.3845, + "step": 1536 + }, + { + "epoch": 0.8974527406758631, + "grad_norm": 0.08737954424250452, + "learning_rate": 1.346009432538705e-05, + "loss": 0.4691, + "step": 1537 + }, + { + "epoch": 0.8980366396613386, + "grad_norm": 0.07585950875928935, + "learning_rate": 1.345052570022853e-05, + "loss": 0.3489, + "step": 1538 + }, + { + "epoch": 0.8986205386468141, + "grad_norm": 0.07936556841663144, + "learning_rate": 1.3440953487490145e-05, + "loss": 0.4033, + "step": 1539 + }, + { + "epoch": 0.8992044376322896, + "grad_norm": 0.077294153998054, + "learning_rate": 1.343137769712432e-05, + "loss": 0.3921, + "step": 1540 + }, + { + "epoch": 0.8997883366177651, + "grad_norm": 0.08496343829142769, + "learning_rate": 1.342179833908719e-05, + "loss": 0.3609, + "step": 1541 + }, + { + "epoch": 0.9003722356032406, + "grad_norm": 0.0799394797595773, + "learning_rate": 1.3412215423338601e-05, + "loss": 0.3511, + "step": 1542 + }, + { + "epoch": 0.9009561345887162, + "grad_norm": 0.082407363888288, + "learning_rate": 1.3402628959842106e-05, + "loss": 0.4182, + "step": 1543 + }, + { + "epoch": 0.9015400335741917, + "grad_norm": 0.07939289902494467, + "learning_rate": 1.3393038958564934e-05, + "loss": 0.3818, + "step": 1544 + }, + { + "epoch": 0.9021239325596672, + "grad_norm": 0.09084137223727427, + "learning_rate": 1.3383445429478008e-05, + "loss": 0.4878, + "step": 1545 + }, + { + "epoch": 0.9027078315451427, + "grad_norm": 0.08369821797715353, + "learning_rate": 1.33738483825559e-05, + "loss": 0.4241, + "step": 1546 + }, + { + "epoch": 0.9032917305306182, + "grad_norm": 0.09530919413007277, + "learning_rate": 1.3364247827776854e-05, + "loss": 0.5088, + "step": 1547 + }, + { + "epoch": 0.9038756295160937, + "grad_norm": 0.08340205261326063, + "learning_rate": 1.3354643775122762e-05, + "loss": 0.4057, + "step": 1548 + }, + { + "epoch": 0.9044595285015692, + "grad_norm": 0.09013328870579924, + "learning_rate": 1.3345036234579138e-05, + "loss": 0.4254, + "step": 1549 + }, + { + "epoch": 0.9050434274870447, + "grad_norm": 0.0825303286297938, + 
"learning_rate": 1.333542521613514e-05, + "loss": 0.4062, + "step": 1550 + }, + { + "epoch": 0.9056273264725202, + "grad_norm": 0.08926593182937041, + "learning_rate": 1.332581072978353e-05, + "loss": 0.4047, + "step": 1551 + }, + { + "epoch": 0.9062112254579958, + "grad_norm": 0.08003134098715958, + "learning_rate": 1.331619278552068e-05, + "loss": 0.4089, + "step": 1552 + }, + { + "epoch": 0.9067951244434713, + "grad_norm": 0.08861415952636946, + "learning_rate": 1.3306571393346557e-05, + "loss": 0.437, + "step": 1553 + }, + { + "epoch": 0.9073790234289468, + "grad_norm": 0.0878746593672077, + "learning_rate": 1.3296946563264715e-05, + "loss": 0.416, + "step": 1554 + }, + { + "epoch": 0.9079629224144223, + "grad_norm": 0.09647110704984431, + "learning_rate": 1.3287318305282277e-05, + "loss": 0.4474, + "step": 1555 + }, + { + "epoch": 0.9085468213998978, + "grad_norm": 0.08412129410456158, + "learning_rate": 1.3277686629409936e-05, + "loss": 0.4187, + "step": 1556 + }, + { + "epoch": 0.9091307203853733, + "grad_norm": 0.08429332778894305, + "learning_rate": 1.3268051545661937e-05, + "loss": 0.4346, + "step": 1557 + }, + { + "epoch": 0.9097146193708489, + "grad_norm": 0.08837139707677354, + "learning_rate": 1.3258413064056066e-05, + "loss": 0.4672, + "step": 1558 + }, + { + "epoch": 0.9102985183563244, + "grad_norm": 0.08402789784662845, + "learning_rate": 1.3248771194613641e-05, + "loss": 0.3851, + "step": 1559 + }, + { + "epoch": 0.9108824173417999, + "grad_norm": 0.07925606723263202, + "learning_rate": 1.3239125947359506e-05, + "loss": 0.4296, + "step": 1560 + }, + { + "epoch": 0.9114663163272754, + "grad_norm": 0.08717252479328202, + "learning_rate": 1.3229477332322016e-05, + "loss": 0.4128, + "step": 1561 + }, + { + "epoch": 0.9120502153127509, + "grad_norm": 0.08219906142726761, + "learning_rate": 1.3219825359533025e-05, + "loss": 0.3941, + "step": 1562 + }, + { + "epoch": 0.9126341142982264, + "grad_norm": 0.09046643140951549, + "learning_rate": 1.3210170039027886e-05, + "loss": 0.4214, + "step": 1563 + }, + { + "epoch": 0.9132180132837019, + "grad_norm": 0.08163773524901452, + "learning_rate": 1.320051138084542e-05, + "loss": 0.4495, + "step": 1564 + }, + { + "epoch": 0.9138019122691774, + "grad_norm": 0.0827957166651148, + "learning_rate": 1.3190849395027926e-05, + "loss": 0.421, + "step": 1565 + }, + { + "epoch": 0.9143858112546529, + "grad_norm": 0.08645654765263071, + "learning_rate": 1.3181184091621165e-05, + "loss": 0.4241, + "step": 1566 + }, + { + "epoch": 0.9149697102401284, + "grad_norm": 0.08999810007711327, + "learning_rate": 1.3171515480674342e-05, + "loss": 0.4418, + "step": 1567 + }, + { + "epoch": 0.915553609225604, + "grad_norm": 0.09238833656373456, + "learning_rate": 1.3161843572240107e-05, + "loss": 0.4234, + "step": 1568 + }, + { + "epoch": 0.9161375082110795, + "grad_norm": 0.09073313986873574, + "learning_rate": 1.3152168376374528e-05, + "loss": 0.4012, + "step": 1569 + }, + { + "epoch": 0.916721407196555, + "grad_norm": 0.08918923856041877, + "learning_rate": 1.3142489903137101e-05, + "loss": 0.4622, + "step": 1570 + }, + { + "epoch": 0.9173053061820305, + "grad_norm": 0.09241025914018566, + "learning_rate": 1.313280816259073e-05, + "loss": 0.4245, + "step": 1571 + }, + { + "epoch": 0.917889205167506, + "grad_norm": 0.08401437724027508, + "learning_rate": 1.3123123164801706e-05, + "loss": 0.4007, + "step": 1572 + }, + { + "epoch": 0.9184731041529816, + "grad_norm": 0.08503236094155447, + "learning_rate": 1.3113434919839715e-05, + "loss": 0.4196, + "step": 
1573 + }, + { + "epoch": 0.9190570031384571, + "grad_norm": 0.08426491664808199, + "learning_rate": 1.310374343777782e-05, + "loss": 0.4113, + "step": 1574 + }, + { + "epoch": 0.9196409021239326, + "grad_norm": 0.0901564859587459, + "learning_rate": 1.3094048728692443e-05, + "loss": 0.4174, + "step": 1575 + }, + { + "epoch": 0.9202248011094081, + "grad_norm": 0.08521020691385993, + "learning_rate": 1.3084350802663365e-05, + "loss": 0.4053, + "step": 1576 + }, + { + "epoch": 0.9208087000948836, + "grad_norm": 0.08464901871933438, + "learning_rate": 1.3074649669773716e-05, + "loss": 0.4126, + "step": 1577 + }, + { + "epoch": 0.9213925990803591, + "grad_norm": 0.093995739837578, + "learning_rate": 1.306494534010995e-05, + "loss": 0.4356, + "step": 1578 + }, + { + "epoch": 0.9219764980658346, + "grad_norm": 0.0847266904287278, + "learning_rate": 1.3055237823761855e-05, + "loss": 0.4258, + "step": 1579 + }, + { + "epoch": 0.9225603970513101, + "grad_norm": 0.0836067695614812, + "learning_rate": 1.3045527130822524e-05, + "loss": 0.423, + "step": 1580 + }, + { + "epoch": 0.9231442960367856, + "grad_norm": 0.08538160776256865, + "learning_rate": 1.303581327138836e-05, + "loss": 0.4001, + "step": 1581 + }, + { + "epoch": 0.9237281950222611, + "grad_norm": 0.08117812181426626, + "learning_rate": 1.3026096255559055e-05, + "loss": 0.4507, + "step": 1582 + }, + { + "epoch": 0.9243120940077366, + "grad_norm": 0.08043609882661064, + "learning_rate": 1.3016376093437577e-05, + "loss": 0.3764, + "step": 1583 + }, + { + "epoch": 0.9248959929932121, + "grad_norm": 0.08646547339452423, + "learning_rate": 1.3006652795130179e-05, + "loss": 0.4359, + "step": 1584 + }, + { + "epoch": 0.9254798919786877, + "grad_norm": 0.07759926191193921, + "learning_rate": 1.299692637074636e-05, + "loss": 0.4211, + "step": 1585 + }, + { + "epoch": 0.9260637909641632, + "grad_norm": 0.08008051422454983, + "learning_rate": 1.2987196830398884e-05, + "loss": 0.3921, + "step": 1586 + }, + { + "epoch": 0.9266476899496388, + "grad_norm": 0.08572157272231852, + "learning_rate": 1.297746418420374e-05, + "loss": 0.3949, + "step": 1587 + }, + { + "epoch": 0.9272315889351143, + "grad_norm": 0.07999878349474612, + "learning_rate": 1.2967728442280154e-05, + "loss": 0.3748, + "step": 1588 + }, + { + "epoch": 0.9278154879205898, + "grad_norm": 0.07269838029393746, + "learning_rate": 1.2957989614750569e-05, + "loss": 0.3924, + "step": 1589 + }, + { + "epoch": 0.9283993869060653, + "grad_norm": 0.08993654460883971, + "learning_rate": 1.2948247711740638e-05, + "loss": 0.4547, + "step": 1590 + }, + { + "epoch": 0.9289832858915408, + "grad_norm": 0.09085219095641311, + "learning_rate": 1.2938502743379212e-05, + "loss": 0.4525, + "step": 1591 + }, + { + "epoch": 0.9295671848770163, + "grad_norm": 0.08387747641619164, + "learning_rate": 1.2928754719798324e-05, + "loss": 0.4395, + "step": 1592 + }, + { + "epoch": 0.9301510838624918, + "grad_norm": 0.09046706412356152, + "learning_rate": 1.291900365113319e-05, + "loss": 0.4373, + "step": 1593 + }, + { + "epoch": 0.9307349828479673, + "grad_norm": 0.08023863002861284, + "learning_rate": 1.2909249547522184e-05, + "loss": 0.3951, + "step": 1594 + }, + { + "epoch": 0.9313188818334428, + "grad_norm": 0.08999716369293896, + "learning_rate": 1.2899492419106848e-05, + "loss": 0.4415, + "step": 1595 + }, + { + "epoch": 0.9319027808189183, + "grad_norm": 0.08722537012094478, + "learning_rate": 1.2889732276031856e-05, + "loss": 0.4401, + "step": 1596 + }, + { + "epoch": 0.9324866798043938, + "grad_norm": 
0.085937185462363, + "learning_rate": 1.2879969128445025e-05, + "loss": 0.3969, + "step": 1597 + }, + { + "epoch": 0.9330705787898693, + "grad_norm": 0.0843923089069521, + "learning_rate": 1.2870202986497291e-05, + "loss": 0.4036, + "step": 1598 + }, + { + "epoch": 0.9336544777753448, + "grad_norm": 0.08035453247147284, + "learning_rate": 1.2860433860342705e-05, + "loss": 0.4046, + "step": 1599 + }, + { + "epoch": 0.9342383767608203, + "grad_norm": 0.10479452837840691, + "learning_rate": 1.2850661760138423e-05, + "loss": 0.4238, + "step": 1600 + }, + { + "epoch": 0.9348222757462958, + "grad_norm": 0.09152209009531563, + "learning_rate": 1.284088669604469e-05, + "loss": 0.4092, + "step": 1601 + }, + { + "epoch": 0.9354061747317715, + "grad_norm": 0.08988822835947058, + "learning_rate": 1.283110867822483e-05, + "loss": 0.4306, + "step": 1602 + }, + { + "epoch": 0.935990073717247, + "grad_norm": 0.08677324176523413, + "learning_rate": 1.2821327716845246e-05, + "loss": 0.4191, + "step": 1603 + }, + { + "epoch": 0.9365739727027225, + "grad_norm": 0.10128762226487965, + "learning_rate": 1.2811543822075396e-05, + "loss": 0.475, + "step": 1604 + }, + { + "epoch": 0.937157871688198, + "grad_norm": 0.08556094864600355, + "learning_rate": 1.2801757004087792e-05, + "loss": 0.3719, + "step": 1605 + }, + { + "epoch": 0.9377417706736735, + "grad_norm": 0.08883033327323821, + "learning_rate": 1.2791967273057978e-05, + "loss": 0.4457, + "step": 1606 + }, + { + "epoch": 0.938325669659149, + "grad_norm": 0.08728769268842641, + "learning_rate": 1.2782174639164528e-05, + "loss": 0.3748, + "step": 1607 + }, + { + "epoch": 0.9389095686446245, + "grad_norm": 0.08568954766021829, + "learning_rate": 1.2772379112589043e-05, + "loss": 0.4093, + "step": 1608 + }, + { + "epoch": 0.9394934676301, + "grad_norm": 0.08334157459224775, + "learning_rate": 1.2762580703516127e-05, + "loss": 0.4087, + "step": 1609 + }, + { + "epoch": 0.9400773666155755, + "grad_norm": 0.08832730924097391, + "learning_rate": 1.2752779422133377e-05, + "loss": 0.3814, + "step": 1610 + }, + { + "epoch": 0.940661265601051, + "grad_norm": 0.08467665770473497, + "learning_rate": 1.2742975278631378e-05, + "loss": 0.3906, + "step": 1611 + }, + { + "epoch": 0.9412451645865265, + "grad_norm": 0.08401002989456251, + "learning_rate": 1.2733168283203692e-05, + "loss": 0.3782, + "step": 1612 + }, + { + "epoch": 0.941829063572002, + "grad_norm": 0.08449313525583382, + "learning_rate": 1.272335844604685e-05, + "loss": 0.4449, + "step": 1613 + }, + { + "epoch": 0.9424129625574775, + "grad_norm": 0.08608848089469454, + "learning_rate": 1.2713545777360334e-05, + "loss": 0.4357, + "step": 1614 + }, + { + "epoch": 0.942996861542953, + "grad_norm": 0.08203619940825888, + "learning_rate": 1.2703730287346565e-05, + "loss": 0.3933, + "step": 1615 + }, + { + "epoch": 0.9435807605284285, + "grad_norm": 0.08499929252915848, + "learning_rate": 1.2693911986210905e-05, + "loss": 0.3693, + "step": 1616 + }, + { + "epoch": 0.944164659513904, + "grad_norm": 0.08870992917302781, + "learning_rate": 1.2684090884161636e-05, + "loss": 0.465, + "step": 1617 + }, + { + "epoch": 0.9447485584993796, + "grad_norm": 0.08992503662394916, + "learning_rate": 1.2674266991409949e-05, + "loss": 0.4673, + "step": 1618 + }, + { + "epoch": 0.9453324574848552, + "grad_norm": 0.08362586773304165, + "learning_rate": 1.2664440318169949e-05, + "loss": 0.387, + "step": 1619 + }, + { + "epoch": 0.9459163564703307, + "grad_norm": 0.07854330662118399, + "learning_rate": 1.265461087465861e-05, + "loss": 
0.4095, + "step": 1620 + }, + { + "epoch": 0.9465002554558062, + "grad_norm": 0.08333242377355662, + "learning_rate": 1.2644778671095808e-05, + "loss": 0.4229, + "step": 1621 + }, + { + "epoch": 0.9470841544412817, + "grad_norm": 0.0859449922247313, + "learning_rate": 1.2634943717704275e-05, + "loss": 0.4375, + "step": 1622 + }, + { + "epoch": 0.9476680534267572, + "grad_norm": 0.07725253133383753, + "learning_rate": 1.262510602470961e-05, + "loss": 0.4222, + "step": 1623 + }, + { + "epoch": 0.9482519524122327, + "grad_norm": 0.08063102401433697, + "learning_rate": 1.2615265602340259e-05, + "loss": 0.397, + "step": 1624 + }, + { + "epoch": 0.9488358513977082, + "grad_norm": 0.0830663792063004, + "learning_rate": 1.2605422460827494e-05, + "loss": 0.4057, + "step": 1625 + }, + { + "epoch": 0.9494197503831837, + "grad_norm": 0.08678705020238921, + "learning_rate": 1.2595576610405436e-05, + "loss": 0.4197, + "step": 1626 + }, + { + "epoch": 0.9500036493686592, + "grad_norm": 0.08218777646921294, + "learning_rate": 1.2585728061311003e-05, + "loss": 0.3956, + "step": 1627 + }, + { + "epoch": 0.9505875483541347, + "grad_norm": 0.08541166386473001, + "learning_rate": 1.257587682378393e-05, + "loss": 0.4225, + "step": 1628 + }, + { + "epoch": 0.9511714473396102, + "grad_norm": 0.08418713298879263, + "learning_rate": 1.256602290806674e-05, + "loss": 0.3769, + "step": 1629 + }, + { + "epoch": 0.9517553463250857, + "grad_norm": 0.08041023544893418, + "learning_rate": 1.2556166324404747e-05, + "loss": 0.4274, + "step": 1630 + }, + { + "epoch": 0.9523392453105612, + "grad_norm": 0.08305168008644516, + "learning_rate": 1.2546307083046037e-05, + "loss": 0.4412, + "step": 1631 + }, + { + "epoch": 0.9529231442960368, + "grad_norm": 0.08668979668776144, + "learning_rate": 1.2536445194241455e-05, + "loss": 0.3972, + "step": 1632 + }, + { + "epoch": 0.9535070432815123, + "grad_norm": 0.07943819038919371, + "learning_rate": 1.2526580668244607e-05, + "loss": 0.4209, + "step": 1633 + }, + { + "epoch": 0.9540909422669878, + "grad_norm": 0.08739222505993903, + "learning_rate": 1.2516713515311832e-05, + "loss": 0.4387, + "step": 1634 + }, + { + "epoch": 0.9546748412524633, + "grad_norm": 0.07811327518234769, + "learning_rate": 1.2506843745702204e-05, + "loss": 0.4256, + "step": 1635 + }, + { + "epoch": 0.9552587402379389, + "grad_norm": 0.08183822248477049, + "learning_rate": 1.2496971369677518e-05, + "loss": 0.4036, + "step": 1636 + }, + { + "epoch": 0.9558426392234144, + "grad_norm": 0.09414996576565587, + "learning_rate": 1.248709639750228e-05, + "loss": 0.4933, + "step": 1637 + }, + { + "epoch": 0.9564265382088899, + "grad_norm": 0.08011743498844552, + "learning_rate": 1.2477218839443694e-05, + "loss": 0.4182, + "step": 1638 + }, + { + "epoch": 0.9570104371943654, + "grad_norm": 0.08749275635792429, + "learning_rate": 1.246733870577165e-05, + "loss": 0.422, + "step": 1639 + }, + { + "epoch": 0.9575943361798409, + "grad_norm": 0.07838666312372367, + "learning_rate": 1.2457456006758722e-05, + "loss": 0.3904, + "step": 1640 + }, + { + "epoch": 0.9581782351653164, + "grad_norm": 0.0799739666953882, + "learning_rate": 1.2447570752680147e-05, + "loss": 0.3911, + "step": 1641 + }, + { + "epoch": 0.9587621341507919, + "grad_norm": 0.08816481082326143, + "learning_rate": 1.243768295381382e-05, + "loss": 0.4875, + "step": 1642 + }, + { + "epoch": 0.9593460331362674, + "grad_norm": 0.0892110056571033, + "learning_rate": 1.242779262044028e-05, + "loss": 0.4382, + "step": 1643 + }, + { + "epoch": 0.9599299321217429, + 
"grad_norm": 0.08189532877726767, + "learning_rate": 1.24178997628427e-05, + "loss": 0.4087, + "step": 1644 + }, + { + "epoch": 0.9605138311072184, + "grad_norm": 0.07777760726416509, + "learning_rate": 1.2408004391306883e-05, + "loss": 0.3936, + "step": 1645 + }, + { + "epoch": 0.961097730092694, + "grad_norm": 0.08185938117090903, + "learning_rate": 1.2398106516121243e-05, + "loss": 0.4082, + "step": 1646 + }, + { + "epoch": 0.9616816290781695, + "grad_norm": 0.08446415754193325, + "learning_rate": 1.2388206147576796e-05, + "loss": 0.4219, + "step": 1647 + }, + { + "epoch": 0.962265528063645, + "grad_norm": 0.07742270087681763, + "learning_rate": 1.2378303295967147e-05, + "loss": 0.3896, + "step": 1648 + }, + { + "epoch": 0.9628494270491205, + "grad_norm": 0.08121525843678422, + "learning_rate": 1.2368397971588493e-05, + "loss": 0.3969, + "step": 1649 + }, + { + "epoch": 0.963433326034596, + "grad_norm": 0.08401345363693177, + "learning_rate": 1.2358490184739593e-05, + "loss": 0.4865, + "step": 1650 + }, + { + "epoch": 0.9640172250200715, + "grad_norm": 0.07599679322678936, + "learning_rate": 1.2348579945721769e-05, + "loss": 0.3943, + "step": 1651 + }, + { + "epoch": 0.9646011240055471, + "grad_norm": 0.07885341013435584, + "learning_rate": 1.2338667264838895e-05, + "loss": 0.3878, + "step": 1652 + }, + { + "epoch": 0.9651850229910226, + "grad_norm": 0.08359632869083061, + "learning_rate": 1.2328752152397373e-05, + "loss": 0.3858, + "step": 1653 + }, + { + "epoch": 0.9657689219764981, + "grad_norm": 0.08227062992001256, + "learning_rate": 1.2318834618706154e-05, + "loss": 0.4244, + "step": 1654 + }, + { + "epoch": 0.9663528209619736, + "grad_norm": 0.0839541415420231, + "learning_rate": 1.2308914674076687e-05, + "loss": 0.4217, + "step": 1655 + }, + { + "epoch": 0.9669367199474491, + "grad_norm": 0.08118188815872268, + "learning_rate": 1.2298992328822937e-05, + "loss": 0.4373, + "step": 1656 + }, + { + "epoch": 0.9675206189329246, + "grad_norm": 0.07903386148059148, + "learning_rate": 1.2289067593261358e-05, + "loss": 0.4092, + "step": 1657 + }, + { + "epoch": 0.9681045179184001, + "grad_norm": 0.08433017068912417, + "learning_rate": 1.2279140477710902e-05, + "loss": 0.4049, + "step": 1658 + }, + { + "epoch": 0.9686884169038756, + "grad_norm": 0.0821988247583829, + "learning_rate": 1.2269210992492982e-05, + "loss": 0.3934, + "step": 1659 + }, + { + "epoch": 0.9692723158893511, + "grad_norm": 0.08102401767630002, + "learning_rate": 1.2259279147931479e-05, + "loss": 0.4238, + "step": 1660 + }, + { + "epoch": 0.9698562148748266, + "grad_norm": 0.08474209088251464, + "learning_rate": 1.2249344954352735e-05, + "loss": 0.4693, + "step": 1661 + }, + { + "epoch": 0.9704401138603022, + "grad_norm": 0.08755247358618716, + "learning_rate": 1.2239408422085518e-05, + "loss": 0.4789, + "step": 1662 + }, + { + "epoch": 0.9710240128457777, + "grad_norm": 0.085234743286079, + "learning_rate": 1.2229469561461046e-05, + "loss": 0.439, + "step": 1663 + }, + { + "epoch": 0.9716079118312532, + "grad_norm": 0.08759103922437811, + "learning_rate": 1.2219528382812946e-05, + "loss": 0.4309, + "step": 1664 + }, + { + "epoch": 0.9721918108167287, + "grad_norm": 0.08126917251288683, + "learning_rate": 1.2209584896477258e-05, + "loss": 0.4034, + "step": 1665 + }, + { + "epoch": 0.9727757098022042, + "grad_norm": 0.08479920833250212, + "learning_rate": 1.2199639112792423e-05, + "loss": 0.4213, + "step": 1666 + }, + { + "epoch": 0.9733596087876797, + "grad_norm": 0.08804543978754983, + "learning_rate": 
1.2189691042099265e-05, + "loss": 0.4426, + "step": 1667 + }, + { + "epoch": 0.9739435077731552, + "grad_norm": 0.08660559635884385, + "learning_rate": 1.2179740694740993e-05, + "loss": 0.4209, + "step": 1668 + }, + { + "epoch": 0.9745274067586308, + "grad_norm": 0.0866005647601678, + "learning_rate": 1.2169788081063181e-05, + "loss": 0.4609, + "step": 1669 + }, + { + "epoch": 0.9751113057441063, + "grad_norm": 0.08010544408169253, + "learning_rate": 1.2159833211413759e-05, + "loss": 0.4071, + "step": 1670 + }, + { + "epoch": 0.9756952047295818, + "grad_norm": 0.08221812032325114, + "learning_rate": 1.2149876096142998e-05, + "loss": 0.4124, + "step": 1671 + }, + { + "epoch": 0.9762791037150573, + "grad_norm": 0.07865893297798797, + "learning_rate": 1.2139916745603509e-05, + "loss": 0.4377, + "step": 1672 + }, + { + "epoch": 0.9768630027005328, + "grad_norm": 0.08698691038398994, + "learning_rate": 1.2129955170150228e-05, + "loss": 0.4254, + "step": 1673 + }, + { + "epoch": 0.9774469016860083, + "grad_norm": 0.08593476966936738, + "learning_rate": 1.21199913801404e-05, + "loss": 0.4348, + "step": 1674 + }, + { + "epoch": 0.9780308006714838, + "grad_norm": 0.08502444225890222, + "learning_rate": 1.2110025385933582e-05, + "loss": 0.4211, + "step": 1675 + }, + { + "epoch": 0.9786146996569594, + "grad_norm": 0.08225481057650674, + "learning_rate": 1.2100057197891601e-05, + "loss": 0.3789, + "step": 1676 + }, + { + "epoch": 0.9791985986424349, + "grad_norm": 0.07744177162205788, + "learning_rate": 1.209008682637859e-05, + "loss": 0.4278, + "step": 1677 + }, + { + "epoch": 0.9797824976279104, + "grad_norm": 0.0785553065567487, + "learning_rate": 1.2080114281760942e-05, + "loss": 0.4155, + "step": 1678 + }, + { + "epoch": 0.9803663966133859, + "grad_norm": 0.0882384968757395, + "learning_rate": 1.2070139574407302e-05, + "loss": 0.4483, + "step": 1679 + }, + { + "epoch": 0.9809502955988614, + "grad_norm": 0.09013051923888415, + "learning_rate": 1.2060162714688582e-05, + "loss": 0.4313, + "step": 1680 + }, + { + "epoch": 0.9815341945843369, + "grad_norm": 0.07960864534641429, + "learning_rate": 1.2050183712977903e-05, + "loss": 0.4225, + "step": 1681 + }, + { + "epoch": 0.9821180935698124, + "grad_norm": 0.08154496731246945, + "learning_rate": 1.2040202579650649e-05, + "loss": 0.4644, + "step": 1682 + }, + { + "epoch": 0.9827019925552879, + "grad_norm": 0.08217476650583742, + "learning_rate": 1.2030219325084388e-05, + "loss": 0.4749, + "step": 1683 + }, + { + "epoch": 0.9832858915407634, + "grad_norm": 0.07514077179888998, + "learning_rate": 1.2020233959658918e-05, + "loss": 0.4024, + "step": 1684 + }, + { + "epoch": 0.9838697905262389, + "grad_norm": 0.07881696058620719, + "learning_rate": 1.2010246493756215e-05, + "loss": 0.378, + "step": 1685 + }, + { + "epoch": 0.9844536895117145, + "grad_norm": 0.08235865057317901, + "learning_rate": 1.2000256937760446e-05, + "loss": 0.4184, + "step": 1686 + }, + { + "epoch": 0.98503758849719, + "grad_norm": 0.08653840648214887, + "learning_rate": 1.1990265302057948e-05, + "loss": 0.4071, + "step": 1687 + }, + { + "epoch": 0.9856214874826655, + "grad_norm": 0.08066625489547946, + "learning_rate": 1.1980271597037228e-05, + "loss": 0.411, + "step": 1688 + }, + { + "epoch": 0.986205386468141, + "grad_norm": 0.08010818831702927, + "learning_rate": 1.1970275833088936e-05, + "loss": 0.4083, + "step": 1689 + }, + { + "epoch": 0.9867892854536165, + "grad_norm": 0.08128572446366071, + "learning_rate": 1.1960278020605861e-05, + "loss": 0.4241, + "step": 1690 + }, + { + 
"epoch": 0.987373184439092, + "grad_norm": 0.07764663078084932, + "learning_rate": 1.1950278169982934e-05, + "loss": 0.4058, + "step": 1691 + }, + { + "epoch": 0.9879570834245676, + "grad_norm": 0.08254047570788005, + "learning_rate": 1.1940276291617192e-05, + "loss": 0.3747, + "step": 1692 + }, + { + "epoch": 0.9885409824100431, + "grad_norm": 0.07882032576317895, + "learning_rate": 1.1930272395907789e-05, + "loss": 0.3936, + "step": 1693 + }, + { + "epoch": 0.9891248813955186, + "grad_norm": 0.08927596119313902, + "learning_rate": 1.1920266493255976e-05, + "loss": 0.4086, + "step": 1694 + }, + { + "epoch": 0.9897087803809941, + "grad_norm": 0.09696320300067417, + "learning_rate": 1.1910258594065079e-05, + "loss": 0.4221, + "step": 1695 + }, + { + "epoch": 0.9902926793664696, + "grad_norm": 0.08608948947973924, + "learning_rate": 1.1900248708740515e-05, + "loss": 0.3945, + "step": 1696 + }, + { + "epoch": 0.9908765783519451, + "grad_norm": 0.08380525492904839, + "learning_rate": 1.1890236847689762e-05, + "loss": 0.4246, + "step": 1697 + }, + { + "epoch": 0.9914604773374206, + "grad_norm": 0.08187112589037113, + "learning_rate": 1.1880223021322348e-05, + "loss": 0.4185, + "step": 1698 + }, + { + "epoch": 0.9920443763228961, + "grad_norm": 0.08779979573496938, + "learning_rate": 1.1870207240049845e-05, + "loss": 0.3784, + "step": 1699 + }, + { + "epoch": 0.9926282753083716, + "grad_norm": 0.07637269898764508, + "learning_rate": 1.1860189514285858e-05, + "loss": 0.3791, + "step": 1700 + }, + { + "epoch": 0.9932121742938471, + "grad_norm": 0.08513146337452204, + "learning_rate": 1.185016985444602e-05, + "loss": 0.4131, + "step": 1701 + }, + { + "epoch": 0.9937960732793226, + "grad_norm": 0.08417615053284033, + "learning_rate": 1.1840148270947962e-05, + "loss": 0.3908, + "step": 1702 + }, + { + "epoch": 0.9943799722647982, + "grad_norm": 0.08667065987305907, + "learning_rate": 1.183012477421133e-05, + "loss": 0.3984, + "step": 1703 + }, + { + "epoch": 0.9949638712502737, + "grad_norm": 0.08138329630926554, + "learning_rate": 1.1820099374657748e-05, + "loss": 0.3547, + "step": 1704 + }, + { + "epoch": 0.9955477702357493, + "grad_norm": 0.08452836715832338, + "learning_rate": 1.1810072082710823e-05, + "loss": 0.3965, + "step": 1705 + }, + { + "epoch": 0.9961316692212248, + "grad_norm": 0.09160294536129893, + "learning_rate": 1.180004290879613e-05, + "loss": 0.4599, + "step": 1706 + }, + { + "epoch": 0.9967155682067003, + "grad_norm": 0.07525722393259596, + "learning_rate": 1.1790011863341197e-05, + "loss": 0.3888, + "step": 1707 + }, + { + "epoch": 0.9972994671921758, + "grad_norm": 0.08346191760394343, + "learning_rate": 1.1779978956775507e-05, + "loss": 0.4171, + "step": 1708 + }, + { + "epoch": 0.9978833661776513, + "grad_norm": 0.08878368238953618, + "learning_rate": 1.1769944199530458e-05, + "loss": 0.3813, + "step": 1709 + }, + { + "epoch": 0.9984672651631268, + "grad_norm": 0.08306508133978227, + "learning_rate": 1.17599076020394e-05, + "loss": 0.3852, + "step": 1710 + }, + { + "epoch": 0.9990511641486023, + "grad_norm": 0.0759656847027431, + "learning_rate": 1.1749869174737575e-05, + "loss": 0.4092, + "step": 1711 + }, + { + "epoch": 0.9996350631340778, + "grad_norm": 0.08551687380290805, + "learning_rate": 1.173982892806214e-05, + "loss": 0.4261, + "step": 1712 + }, + { + "epoch": 1.0004379242391066, + "grad_norm": 0.18343593073046008, + "learning_rate": 1.172978687245213e-05, + "loss": 0.6859, + "step": 1713 + }, + { + "epoch": 1.0010218232245822, + "grad_norm": 0.09203391189735186, + 
"learning_rate": 1.1719743018348477e-05, + "loss": 0.3262, + "step": 1714 + }, + { + "epoch": 1.0016057222100576, + "grad_norm": 0.07879238831005524, + "learning_rate": 1.1709697376193967e-05, + "loss": 0.3244, + "step": 1715 + }, + { + "epoch": 1.0021896211955332, + "grad_norm": 0.08599050409747704, + "learning_rate": 1.169964995643326e-05, + "loss": 0.3166, + "step": 1716 + }, + { + "epoch": 1.0027735201810086, + "grad_norm": 0.09076424043037076, + "learning_rate": 1.1689600769512855e-05, + "loss": 0.3176, + "step": 1717 + }, + { + "epoch": 1.0033574191664842, + "grad_norm": 0.09809038951662452, + "learning_rate": 1.1679549825881087e-05, + "loss": 0.3429, + "step": 1718 + }, + { + "epoch": 1.0039413181519596, + "grad_norm": 0.1136463737708502, + "learning_rate": 1.1669497135988127e-05, + "loss": 0.3602, + "step": 1719 + }, + { + "epoch": 1.0045252171374353, + "grad_norm": 0.09090731390844707, + "learning_rate": 1.1659442710285948e-05, + "loss": 0.3374, + "step": 1720 + }, + { + "epoch": 1.0051091161229107, + "grad_norm": 0.0774598072579951, + "learning_rate": 1.1649386559228342e-05, + "loss": 0.2837, + "step": 1721 + }, + { + "epoch": 1.0056930151083863, + "grad_norm": 0.09849805011422244, + "learning_rate": 1.1639328693270887e-05, + "loss": 0.3073, + "step": 1722 + }, + { + "epoch": 1.0062769140938617, + "grad_norm": 0.09190742814378323, + "learning_rate": 1.1629269122870942e-05, + "loss": 0.3586, + "step": 1723 + }, + { + "epoch": 1.0068608130793373, + "grad_norm": 0.08728847568383297, + "learning_rate": 1.1619207858487646e-05, + "loss": 0.334, + "step": 1724 + }, + { + "epoch": 1.007444712064813, + "grad_norm": 0.0842053335492152, + "learning_rate": 1.1609144910581891e-05, + "loss": 0.3058, + "step": 1725 + }, + { + "epoch": 1.0080286110502883, + "grad_norm": 0.09712853048123377, + "learning_rate": 1.1599080289616329e-05, + "loss": 0.391, + "step": 1726 + }, + { + "epoch": 1.008612510035764, + "grad_norm": 0.09506006332535694, + "learning_rate": 1.1589014006055337e-05, + "loss": 0.3582, + "step": 1727 + }, + { + "epoch": 1.0091964090212393, + "grad_norm": 0.08630817233875346, + "learning_rate": 1.1578946070365035e-05, + "loss": 0.3095, + "step": 1728 + }, + { + "epoch": 1.009780308006715, + "grad_norm": 0.09949443613362181, + "learning_rate": 1.1568876493013255e-05, + "loss": 0.366, + "step": 1729 + }, + { + "epoch": 1.0103642069921903, + "grad_norm": 0.08421156976608057, + "learning_rate": 1.1558805284469533e-05, + "loss": 0.3134, + "step": 1730 + }, + { + "epoch": 1.010948105977666, + "grad_norm": 0.08701079309833505, + "learning_rate": 1.1548732455205105e-05, + "loss": 0.3204, + "step": 1731 + }, + { + "epoch": 1.0115320049631413, + "grad_norm": 0.08317658361899183, + "learning_rate": 1.1538658015692892e-05, + "loss": 0.315, + "step": 1732 + }, + { + "epoch": 1.012115903948617, + "grad_norm": 0.08845697869692616, + "learning_rate": 1.1528581976407485e-05, + "loss": 0.3292, + "step": 1733 + }, + { + "epoch": 1.0126998029340923, + "grad_norm": 0.10094431498396644, + "learning_rate": 1.1518504347825146e-05, + "loss": 0.326, + "step": 1734 + }, + { + "epoch": 1.013283701919568, + "grad_norm": 0.08844533277203921, + "learning_rate": 1.1508425140423782e-05, + "loss": 0.3427, + "step": 1735 + }, + { + "epoch": 1.0138676009050434, + "grad_norm": 0.08872484962801165, + "learning_rate": 1.1498344364682948e-05, + "loss": 0.3435, + "step": 1736 + }, + { + "epoch": 1.014451499890519, + "grad_norm": 0.07995897659746604, + "learning_rate": 1.1488262031083816e-05, + "loss": 0.2865, + "step": 1737 
+ }, + { + "epoch": 1.0150353988759944, + "grad_norm": 0.09910244675814082, + "learning_rate": 1.14781781501092e-05, + "loss": 0.3566, + "step": 1738 + }, + { + "epoch": 1.01561929786147, + "grad_norm": 0.09616822757360748, + "learning_rate": 1.1468092732243506e-05, + "loss": 0.3529, + "step": 1739 + }, + { + "epoch": 1.0162031968469454, + "grad_norm": 0.09185713482810755, + "learning_rate": 1.1458005787972743e-05, + "loss": 0.3473, + "step": 1740 + }, + { + "epoch": 1.016787095832421, + "grad_norm": 0.08817513967100557, + "learning_rate": 1.1447917327784504e-05, + "loss": 0.3275, + "step": 1741 + }, + { + "epoch": 1.0173709948178966, + "grad_norm": 0.08617785888239828, + "learning_rate": 1.143782736216796e-05, + "loss": 0.3332, + "step": 1742 + }, + { + "epoch": 1.017954893803372, + "grad_norm": 0.0870434791246094, + "learning_rate": 1.1427735901613854e-05, + "loss": 0.346, + "step": 1743 + }, + { + "epoch": 1.0185387927888476, + "grad_norm": 0.08537925550406313, + "learning_rate": 1.1417642956614474e-05, + "loss": 0.3326, + "step": 1744 + }, + { + "epoch": 1.019122691774323, + "grad_norm": 0.09307799829405257, + "learning_rate": 1.1407548537663655e-05, + "loss": 0.3041, + "step": 1745 + }, + { + "epoch": 1.0197065907597986, + "grad_norm": 0.09092761782142386, + "learning_rate": 1.1397452655256762e-05, + "loss": 0.3493, + "step": 1746 + }, + { + "epoch": 1.020290489745274, + "grad_norm": 0.08708082037966822, + "learning_rate": 1.1387355319890685e-05, + "loss": 0.3264, + "step": 1747 + }, + { + "epoch": 1.0208743887307496, + "grad_norm": 0.09147641312047902, + "learning_rate": 1.1377256542063822e-05, + "loss": 0.3428, + "step": 1748 + }, + { + "epoch": 1.021458287716225, + "grad_norm": 0.08364847417375318, + "learning_rate": 1.1367156332276077e-05, + "loss": 0.3024, + "step": 1749 + }, + { + "epoch": 1.0220421867017007, + "grad_norm": 0.09618739671744223, + "learning_rate": 1.1357054701028836e-05, + "loss": 0.3841, + "step": 1750 + }, + { + "epoch": 1.022626085687176, + "grad_norm": 0.08738293780793314, + "learning_rate": 1.1346951658824958e-05, + "loss": 0.3051, + "step": 1751 + }, + { + "epoch": 1.0232099846726517, + "grad_norm": 0.0870610103855313, + "learning_rate": 1.1336847216168785e-05, + "loss": 0.3216, + "step": 1752 + }, + { + "epoch": 1.023793883658127, + "grad_norm": 0.0831338532276279, + "learning_rate": 1.1326741383566102e-05, + "loss": 0.3375, + "step": 1753 + }, + { + "epoch": 1.0243777826436027, + "grad_norm": 0.0869117572278222, + "learning_rate": 1.1316634171524147e-05, + "loss": 0.3665, + "step": 1754 + }, + { + "epoch": 1.024961681629078, + "grad_norm": 0.09919917672032577, + "learning_rate": 1.1306525590551585e-05, + "loss": 0.4091, + "step": 1755 + }, + { + "epoch": 1.0255455806145537, + "grad_norm": 0.0905836704825324, + "learning_rate": 1.1296415651158506e-05, + "loss": 0.3362, + "step": 1756 + }, + { + "epoch": 1.026129479600029, + "grad_norm": 0.08769578422804242, + "learning_rate": 1.1286304363856418e-05, + "loss": 0.3174, + "step": 1757 + }, + { + "epoch": 1.0267133785855047, + "grad_norm": 0.08293558507704744, + "learning_rate": 1.1276191739158222e-05, + "loss": 0.3103, + "step": 1758 + }, + { + "epoch": 1.0272972775709803, + "grad_norm": 0.10648781217108515, + "learning_rate": 1.126607778757822e-05, + "loss": 0.3423, + "step": 1759 + }, + { + "epoch": 1.0278811765564557, + "grad_norm": 0.09003874048289456, + "learning_rate": 1.1255962519632082e-05, + "loss": 0.3055, + "step": 1760 + }, + { + "epoch": 1.0284650755419313, + "grad_norm": 0.08565474670509707, + 
"learning_rate": 1.1245845945836855e-05, + "loss": 0.3345, + "step": 1761 + }, + { + "epoch": 1.0290489745274067, + "grad_norm": 0.08572882925057697, + "learning_rate": 1.123572807671094e-05, + "loss": 0.3318, + "step": 1762 + }, + { + "epoch": 1.0296328735128824, + "grad_norm": 0.08878824128837064, + "learning_rate": 1.122560892277409e-05, + "loss": 0.343, + "step": 1763 + }, + { + "epoch": 1.0302167724983577, + "grad_norm": 0.09052309519078533, + "learning_rate": 1.1215488494547384e-05, + "loss": 0.3264, + "step": 1764 + }, + { + "epoch": 1.0308006714838334, + "grad_norm": 0.08672816181637746, + "learning_rate": 1.1205366802553231e-05, + "loss": 0.3233, + "step": 1765 + }, + { + "epoch": 1.0313845704693088, + "grad_norm": 0.09225641398898507, + "learning_rate": 1.1195243857315358e-05, + "loss": 0.3226, + "step": 1766 + }, + { + "epoch": 1.0319684694547844, + "grad_norm": 0.08485601469929122, + "learning_rate": 1.1185119669358792e-05, + "loss": 0.3721, + "step": 1767 + }, + { + "epoch": 1.0325523684402598, + "grad_norm": 0.09872442750457125, + "learning_rate": 1.1174994249209852e-05, + "loss": 0.3889, + "step": 1768 + }, + { + "epoch": 1.0331362674257354, + "grad_norm": 0.09128127740351515, + "learning_rate": 1.1164867607396136e-05, + "loss": 0.3245, + "step": 1769 + }, + { + "epoch": 1.0337201664112108, + "grad_norm": 0.08798053818498862, + "learning_rate": 1.115473975444651e-05, + "loss": 0.3303, + "step": 1770 + }, + { + "epoch": 1.0343040653966864, + "grad_norm": 0.0826969355394186, + "learning_rate": 1.1144610700891108e-05, + "loss": 0.3015, + "step": 1771 + }, + { + "epoch": 1.0348879643821618, + "grad_norm": 0.09472323962794102, + "learning_rate": 1.1134480457261308e-05, + "loss": 0.3429, + "step": 1772 + }, + { + "epoch": 1.0354718633676374, + "grad_norm": 0.08253541541894249, + "learning_rate": 1.1124349034089724e-05, + "loss": 0.2844, + "step": 1773 + }, + { + "epoch": 1.0360557623531128, + "grad_norm": 0.08984108508598738, + "learning_rate": 1.1114216441910195e-05, + "loss": 0.3202, + "step": 1774 + }, + { + "epoch": 1.0366396613385884, + "grad_norm": 0.09059022051586936, + "learning_rate": 1.1104082691257778e-05, + "loss": 0.3368, + "step": 1775 + }, + { + "epoch": 1.037223560324064, + "grad_norm": 0.0867017187999974, + "learning_rate": 1.1093947792668735e-05, + "loss": 0.3131, + "step": 1776 + }, + { + "epoch": 1.0378074593095394, + "grad_norm": 0.09168091745467101, + "learning_rate": 1.1083811756680523e-05, + "loss": 0.2915, + "step": 1777 + }, + { + "epoch": 1.038391358295015, + "grad_norm": 0.08837522115661714, + "learning_rate": 1.1073674593831778e-05, + "loss": 0.3508, + "step": 1778 + }, + { + "epoch": 1.0389752572804904, + "grad_norm": 0.09052816936091201, + "learning_rate": 1.1063536314662301e-05, + "loss": 0.3544, + "step": 1779 + }, + { + "epoch": 1.039559156265966, + "grad_norm": 0.09568562761563595, + "learning_rate": 1.1053396929713076e-05, + "loss": 0.3254, + "step": 1780 + }, + { + "epoch": 1.0401430552514415, + "grad_norm": 0.0962523222584643, + "learning_rate": 1.1043256449526214e-05, + "loss": 0.3333, + "step": 1781 + }, + { + "epoch": 1.040726954236917, + "grad_norm": 0.0887905385941228, + "learning_rate": 1.103311488464497e-05, + "loss": 0.3258, + "step": 1782 + }, + { + "epoch": 1.0413108532223925, + "grad_norm": 0.0891880248130801, + "learning_rate": 1.1022972245613735e-05, + "loss": 0.3087, + "step": 1783 + }, + { + "epoch": 1.041894752207868, + "grad_norm": 0.09177977079896037, + "learning_rate": 1.101282854297801e-05, + "loss": 0.3261, + "step": 1784 
+ }, + { + "epoch": 1.0424786511933435, + "grad_norm": 0.0878885526003928, + "learning_rate": 1.1002683787284403e-05, + "loss": 0.3282, + "step": 1785 + }, + { + "epoch": 1.043062550178819, + "grad_norm": 0.09440257939593051, + "learning_rate": 1.0992537989080618e-05, + "loss": 0.3429, + "step": 1786 + }, + { + "epoch": 1.0436464491642945, + "grad_norm": 0.0897722207132791, + "learning_rate": 1.0982391158915441e-05, + "loss": 0.3352, + "step": 1787 + }, + { + "epoch": 1.0442303481497701, + "grad_norm": 0.08080052339633029, + "learning_rate": 1.0972243307338733e-05, + "loss": 0.3043, + "step": 1788 + }, + { + "epoch": 1.0448142471352455, + "grad_norm": 0.08999881639059784, + "learning_rate": 1.0962094444901416e-05, + "loss": 0.3463, + "step": 1789 + }, + { + "epoch": 1.0453981461207211, + "grad_norm": 0.08590566437767125, + "learning_rate": 1.0951944582155463e-05, + "loss": 0.2915, + "step": 1790 + }, + { + "epoch": 1.0459820451061965, + "grad_norm": 0.08261276151069719, + "learning_rate": 1.094179372965389e-05, + "loss": 0.3069, + "step": 1791 + }, + { + "epoch": 1.0465659440916721, + "grad_norm": 0.09359768999844546, + "learning_rate": 1.0931641897950733e-05, + "loss": 0.3428, + "step": 1792 + }, + { + "epoch": 1.0471498430771478, + "grad_norm": 0.08796913164816149, + "learning_rate": 1.0921489097601054e-05, + "loss": 0.3095, + "step": 1793 + }, + { + "epoch": 1.0477337420626232, + "grad_norm": 0.08950396906292477, + "learning_rate": 1.0911335339160924e-05, + "loss": 0.3372, + "step": 1794 + }, + { + "epoch": 1.0483176410480988, + "grad_norm": 0.07904694275443369, + "learning_rate": 1.090118063318741e-05, + "loss": 0.3011, + "step": 1795 + }, + { + "epoch": 1.0489015400335742, + "grad_norm": 0.08882041814601999, + "learning_rate": 1.089102499023855e-05, + "loss": 0.3306, + "step": 1796 + }, + { + "epoch": 1.0494854390190498, + "grad_norm": 0.08653078537691698, + "learning_rate": 1.0880868420873375e-05, + "loss": 0.3107, + "step": 1797 + }, + { + "epoch": 1.0500693380045252, + "grad_norm": 0.09169604268761192, + "learning_rate": 1.0870710935651868e-05, + "loss": 0.3429, + "step": 1798 + }, + { + "epoch": 1.0506532369900008, + "grad_norm": 0.09493007605348174, + "learning_rate": 1.086055254513497e-05, + "loss": 0.3497, + "step": 1799 + }, + { + "epoch": 1.0512371359754762, + "grad_norm": 0.08490866799480071, + "learning_rate": 1.085039325988456e-05, + "loss": 0.3124, + "step": 1800 + }, + { + "epoch": 1.0518210349609518, + "grad_norm": 0.08922831939149456, + "learning_rate": 1.0840233090463443e-05, + "loss": 0.3132, + "step": 1801 + }, + { + "epoch": 1.0524049339464272, + "grad_norm": 0.0805713269594515, + "learning_rate": 1.0830072047435354e-05, + "loss": 0.2718, + "step": 1802 + }, + { + "epoch": 1.0529888329319028, + "grad_norm": 0.08991259334441137, + "learning_rate": 1.0819910141364929e-05, + "loss": 0.3115, + "step": 1803 + }, + { + "epoch": 1.0535727319173782, + "grad_norm": 0.09374781268378798, + "learning_rate": 1.0809747382817702e-05, + "loss": 0.3681, + "step": 1804 + }, + { + "epoch": 1.0541566309028538, + "grad_norm": 0.08627362260119713, + "learning_rate": 1.0799583782360097e-05, + "loss": 0.3267, + "step": 1805 + }, + { + "epoch": 1.0547405298883292, + "grad_norm": 0.09506387196540553, + "learning_rate": 1.0789419350559407e-05, + "loss": 0.3632, + "step": 1806 + }, + { + "epoch": 1.0553244288738048, + "grad_norm": 0.09798222485238818, + "learning_rate": 1.0779254097983788e-05, + "loss": 0.3165, + "step": 1807 + }, + { + "epoch": 1.0559083278592802, + "grad_norm": 
0.08768036666131321, + "learning_rate": 1.0769088035202268e-05, + "loss": 0.3072, + "step": 1808 + }, + { + "epoch": 1.0564922268447559, + "grad_norm": 0.08219342378639215, + "learning_rate": 1.0758921172784696e-05, + "loss": 0.3045, + "step": 1809 + }, + { + "epoch": 1.0570761258302315, + "grad_norm": 0.08448570506243047, + "learning_rate": 1.0748753521301758e-05, + "loss": 0.321, + "step": 1810 + }, + { + "epoch": 1.0576600248157069, + "grad_norm": 0.09029103370426278, + "learning_rate": 1.0738585091324966e-05, + "loss": 0.3553, + "step": 1811 + }, + { + "epoch": 1.0582439238011825, + "grad_norm": 0.0989027510012605, + "learning_rate": 1.0728415893426636e-05, + "loss": 0.339, + "step": 1812 + }, + { + "epoch": 1.0588278227866579, + "grad_norm": 0.09165678774356505, + "learning_rate": 1.0718245938179886e-05, + "loss": 0.312, + "step": 1813 + }, + { + "epoch": 1.0594117217721335, + "grad_norm": 0.08198090486209741, + "learning_rate": 1.0708075236158617e-05, + "loss": 0.3065, + "step": 1814 + }, + { + "epoch": 1.059995620757609, + "grad_norm": 0.09862188095755153, + "learning_rate": 1.0697903797937513e-05, + "loss": 0.3669, + "step": 1815 + }, + { + "epoch": 1.0605795197430845, + "grad_norm": 0.08531171999269131, + "learning_rate": 1.0687731634092016e-05, + "loss": 0.3163, + "step": 1816 + }, + { + "epoch": 1.06116341872856, + "grad_norm": 0.08589890088715009, + "learning_rate": 1.0677558755198327e-05, + "loss": 0.3187, + "step": 1817 + }, + { + "epoch": 1.0617473177140355, + "grad_norm": 0.09876564829875908, + "learning_rate": 1.0667385171833391e-05, + "loss": 0.3621, + "step": 1818 + }, + { + "epoch": 1.062331216699511, + "grad_norm": 0.08620276368200998, + "learning_rate": 1.0657210894574885e-05, + "loss": 0.2977, + "step": 1819 + }, + { + "epoch": 1.0629151156849865, + "grad_norm": 0.08804216875652, + "learning_rate": 1.0647035934001202e-05, + "loss": 0.332, + "step": 1820 + }, + { + "epoch": 1.063499014670462, + "grad_norm": 0.08794176555859715, + "learning_rate": 1.0636860300691452e-05, + "loss": 0.3394, + "step": 1821 + }, + { + "epoch": 1.0640829136559375, + "grad_norm": 0.08703184591676599, + "learning_rate": 1.0626684005225443e-05, + "loss": 0.3283, + "step": 1822 + }, + { + "epoch": 1.064666812641413, + "grad_norm": 0.08288200977204145, + "learning_rate": 1.0616507058183674e-05, + "loss": 0.3117, + "step": 1823 + }, + { + "epoch": 1.0652507116268886, + "grad_norm": 0.08799750108231359, + "learning_rate": 1.0606329470147313e-05, + "loss": 0.3343, + "step": 1824 + }, + { + "epoch": 1.065834610612364, + "grad_norm": 0.08509817566422585, + "learning_rate": 1.05961512516982e-05, + "loss": 0.3367, + "step": 1825 + }, + { + "epoch": 1.0664185095978396, + "grad_norm": 0.09374849162186365, + "learning_rate": 1.0585972413418833e-05, + "loss": 0.3363, + "step": 1826 + }, + { + "epoch": 1.0670024085833152, + "grad_norm": 0.09420625684969768, + "learning_rate": 1.0575792965892349e-05, + "loss": 0.3399, + "step": 1827 + }, + { + "epoch": 1.0675863075687906, + "grad_norm": 0.08674852776922133, + "learning_rate": 1.0565612919702527e-05, + "loss": 0.3337, + "step": 1828 + }, + { + "epoch": 1.0681702065542662, + "grad_norm": 0.08961677826804049, + "learning_rate": 1.0555432285433754e-05, + "loss": 0.308, + "step": 1829 + }, + { + "epoch": 1.0687541055397416, + "grad_norm": 0.09549812538635007, + "learning_rate": 1.0545251073671041e-05, + "loss": 0.352, + "step": 1830 + }, + { + "epoch": 1.0693380045252172, + "grad_norm": 0.09589061167158787, + "learning_rate": 1.0535069294999995e-05, + "loss": 
0.3791, + "step": 1831 + }, + { + "epoch": 1.0699219035106926, + "grad_norm": 0.08618241978843535, + "learning_rate": 1.0524886960006813e-05, + "loss": 0.3409, + "step": 1832 + }, + { + "epoch": 1.0705058024961682, + "grad_norm": 0.08468421853287515, + "learning_rate": 1.0514704079278273e-05, + "loss": 0.3056, + "step": 1833 + }, + { + "epoch": 1.0710897014816436, + "grad_norm": 0.08845548632689418, + "learning_rate": 1.0504520663401714e-05, + "loss": 0.3299, + "step": 1834 + }, + { + "epoch": 1.0716736004671192, + "grad_norm": 0.09975184954630602, + "learning_rate": 1.049433672296503e-05, + "loss": 0.3708, + "step": 1835 + }, + { + "epoch": 1.0722574994525946, + "grad_norm": 0.08192115437700936, + "learning_rate": 1.0484152268556677e-05, + "loss": 0.2846, + "step": 1836 + }, + { + "epoch": 1.0728413984380702, + "grad_norm": 0.09424147309220979, + "learning_rate": 1.0473967310765629e-05, + "loss": 0.3534, + "step": 1837 + }, + { + "epoch": 1.0734252974235456, + "grad_norm": 0.09426711549308708, + "learning_rate": 1.0463781860181385e-05, + "loss": 0.3191, + "step": 1838 + }, + { + "epoch": 1.0740091964090213, + "grad_norm": 0.0844337214699368, + "learning_rate": 1.0453595927393962e-05, + "loss": 0.3288, + "step": 1839 + }, + { + "epoch": 1.0745930953944967, + "grad_norm": 0.10003675500230814, + "learning_rate": 1.0443409522993877e-05, + "loss": 0.3512, + "step": 1840 + }, + { + "epoch": 1.0751769943799723, + "grad_norm": 0.08554916408576194, + "learning_rate": 1.0433222657572135e-05, + "loss": 0.319, + "step": 1841 + }, + { + "epoch": 1.0757608933654477, + "grad_norm": 0.09023755474119918, + "learning_rate": 1.0423035341720222e-05, + "loss": 0.3705, + "step": 1842 + }, + { + "epoch": 1.0763447923509233, + "grad_norm": 0.09276344551483391, + "learning_rate": 1.041284758603009e-05, + "loss": 0.3309, + "step": 1843 + }, + { + "epoch": 1.076928691336399, + "grad_norm": 0.08436152914085132, + "learning_rate": 1.0402659401094154e-05, + "loss": 0.3195, + "step": 1844 + }, + { + "epoch": 1.0775125903218743, + "grad_norm": 0.09336363470567033, + "learning_rate": 1.0392470797505268e-05, + "loss": 0.3333, + "step": 1845 + }, + { + "epoch": 1.07809648930735, + "grad_norm": 0.08591793618865334, + "learning_rate": 1.0382281785856725e-05, + "loss": 0.3488, + "step": 1846 + }, + { + "epoch": 1.0786803882928253, + "grad_norm": 0.08739884747320598, + "learning_rate": 1.0372092376742247e-05, + "loss": 0.3092, + "step": 1847 + }, + { + "epoch": 1.079264287278301, + "grad_norm": 0.09811282418304136, + "learning_rate": 1.0361902580755955e-05, + "loss": 0.3632, + "step": 1848 + }, + { + "epoch": 1.0798481862637763, + "grad_norm": 0.09557900509939282, + "learning_rate": 1.035171240849239e-05, + "loss": 0.3571, + "step": 1849 + }, + { + "epoch": 1.080432085249252, + "grad_norm": 0.07843247729669184, + "learning_rate": 1.0341521870546472e-05, + "loss": 0.2942, + "step": 1850 + }, + { + "epoch": 1.0810159842347273, + "grad_norm": 0.08064069872814345, + "learning_rate": 1.033133097751351e-05, + "loss": 0.2991, + "step": 1851 + }, + { + "epoch": 1.081599883220203, + "grad_norm": 0.08693433080232424, + "learning_rate": 1.0321139739989167e-05, + "loss": 0.3231, + "step": 1852 + }, + { + "epoch": 1.0821837822056783, + "grad_norm": 0.08101892520650408, + "learning_rate": 1.0310948168569483e-05, + "loss": 0.3034, + "step": 1853 + }, + { + "epoch": 1.082767681191154, + "grad_norm": 0.09510249897453325, + "learning_rate": 1.0300756273850837e-05, + "loss": 0.381, + "step": 1854 + }, + { + "epoch": 1.0833515801766294, + 
"grad_norm": 0.08549846404403888, + "learning_rate": 1.0290564066429935e-05, + "loss": 0.3223, + "step": 1855 + }, + { + "epoch": 1.083935479162105, + "grad_norm": 0.08813182374334028, + "learning_rate": 1.0280371556903827e-05, + "loss": 0.3511, + "step": 1856 + }, + { + "epoch": 1.0845193781475804, + "grad_norm": 0.09608852566634614, + "learning_rate": 1.0270178755869861e-05, + "loss": 0.3364, + "step": 1857 + }, + { + "epoch": 1.085103277133056, + "grad_norm": 0.09158418096477498, + "learning_rate": 1.0259985673925694e-05, + "loss": 0.3552, + "step": 1858 + }, + { + "epoch": 1.0856871761185314, + "grad_norm": 0.09021492669139962, + "learning_rate": 1.0249792321669276e-05, + "loss": 0.2869, + "step": 1859 + }, + { + "epoch": 1.086271075104007, + "grad_norm": 0.09300721079034427, + "learning_rate": 1.0239598709698839e-05, + "loss": 0.3464, + "step": 1860 + }, + { + "epoch": 1.0868549740894826, + "grad_norm": 0.0960055804396295, + "learning_rate": 1.0229404848612882e-05, + "loss": 0.3431, + "step": 1861 + }, + { + "epoch": 1.087438873074958, + "grad_norm": 0.08069203327882529, + "learning_rate": 1.0219210749010162e-05, + "loss": 0.2986, + "step": 1862 + }, + { + "epoch": 1.0880227720604336, + "grad_norm": 0.09368974931395188, + "learning_rate": 1.0209016421489685e-05, + "loss": 0.3446, + "step": 1863 + }, + { + "epoch": 1.088606671045909, + "grad_norm": 0.09601892819659338, + "learning_rate": 1.0198821876650702e-05, + "loss": 0.3597, + "step": 1864 + }, + { + "epoch": 1.0891905700313846, + "grad_norm": 0.08172385920986065, + "learning_rate": 1.0188627125092678e-05, + "loss": 0.2997, + "step": 1865 + }, + { + "epoch": 1.08977446901686, + "grad_norm": 0.09154874967328588, + "learning_rate": 1.0178432177415298e-05, + "loss": 0.3075, + "step": 1866 + }, + { + "epoch": 1.0903583680023357, + "grad_norm": 0.0911403002948232, + "learning_rate": 1.0168237044218452e-05, + "loss": 0.3249, + "step": 1867 + }, + { + "epoch": 1.090942266987811, + "grad_norm": 0.08654142501988739, + "learning_rate": 1.0158041736102221e-05, + "loss": 0.3054, + "step": 1868 + }, + { + "epoch": 1.0915261659732867, + "grad_norm": 0.09206368059431085, + "learning_rate": 1.014784626366687e-05, + "loss": 0.3247, + "step": 1869 + }, + { + "epoch": 1.092110064958762, + "grad_norm": 0.09435088235850858, + "learning_rate": 1.0137650637512835e-05, + "loss": 0.3307, + "step": 1870 + }, + { + "epoch": 1.0926939639442377, + "grad_norm": 0.09138399197441666, + "learning_rate": 1.0127454868240702e-05, + "loss": 0.3501, + "step": 1871 + }, + { + "epoch": 1.093277862929713, + "grad_norm": 0.09625330008174991, + "learning_rate": 1.0117258966451224e-05, + "loss": 0.3161, + "step": 1872 + }, + { + "epoch": 1.0938617619151887, + "grad_norm": 0.09946204105830425, + "learning_rate": 1.0107062942745276e-05, + "loss": 0.3683, + "step": 1873 + }, + { + "epoch": 1.094445660900664, + "grad_norm": 0.08258118205313959, + "learning_rate": 1.0096866807723868e-05, + "loss": 0.3119, + "step": 1874 + }, + { + "epoch": 1.0950295598861397, + "grad_norm": 0.08668960851809858, + "learning_rate": 1.0086670571988124e-05, + "loss": 0.3095, + "step": 1875 + }, + { + "epoch": 1.095613458871615, + "grad_norm": 0.08466900741840805, + "learning_rate": 1.0076474246139272e-05, + "loss": 0.3313, + "step": 1876 + }, + { + "epoch": 1.0961973578570907, + "grad_norm": 0.08826481835562267, + "learning_rate": 1.0066277840778626e-05, + "loss": 0.287, + "step": 1877 + }, + { + "epoch": 1.0967812568425663, + "grad_norm": 0.10170503863571902, + "learning_rate": 
1.0056081366507602e-05, + "loss": 0.4184, + "step": 1878 + }, + { + "epoch": 1.0973651558280417, + "grad_norm": 0.0908394116925519, + "learning_rate": 1.0045884833927673e-05, + "loss": 0.3385, + "step": 1879 + }, + { + "epoch": 1.0979490548135173, + "grad_norm": 0.08186101611614854, + "learning_rate": 1.0035688253640372e-05, + "loss": 0.2924, + "step": 1880 + }, + { + "epoch": 1.0985329537989927, + "grad_norm": 0.1020460010912883, + "learning_rate": 1.0025491636247287e-05, + "loss": 0.3709, + "step": 1881 + }, + { + "epoch": 1.0991168527844684, + "grad_norm": 0.09095734377070469, + "learning_rate": 1.0015294992350044e-05, + "loss": 0.3027, + "step": 1882 + }, + { + "epoch": 1.0997007517699438, + "grad_norm": 0.1261363141183274, + "learning_rate": 1.0005098332550293e-05, + "loss": 0.3214, + "step": 1883 + }, + { + "epoch": 1.1002846507554194, + "grad_norm": 0.0896128782523432, + "learning_rate": 9.994901667449708e-06, + "loss": 0.3352, + "step": 1884 + }, + { + "epoch": 1.1008685497408948, + "grad_norm": 0.0885084114401367, + "learning_rate": 9.98470500764996e-06, + "loss": 0.3638, + "step": 1885 + }, + { + "epoch": 1.1014524487263704, + "grad_norm": 0.08329796184729478, + "learning_rate": 9.974508363752715e-06, + "loss": 0.3177, + "step": 1886 + }, + { + "epoch": 1.1020363477118458, + "grad_norm": 0.09045340983354123, + "learning_rate": 9.964311746359631e-06, + "loss": 0.3548, + "step": 1887 + }, + { + "epoch": 1.1026202466973214, + "grad_norm": 0.08201552686318266, + "learning_rate": 9.95411516607233e-06, + "loss": 0.3442, + "step": 1888 + }, + { + "epoch": 1.1032041456827968, + "grad_norm": 0.09253972118624623, + "learning_rate": 9.943918633492401e-06, + "loss": 0.3262, + "step": 1889 + }, + { + "epoch": 1.1037880446682724, + "grad_norm": 0.10017683453064405, + "learning_rate": 9.933722159221375e-06, + "loss": 0.3587, + "step": 1890 + }, + { + "epoch": 1.1043719436537478, + "grad_norm": 0.09651968021882043, + "learning_rate": 9.923525753860735e-06, + "loss": 0.3429, + "step": 1891 + }, + { + "epoch": 1.1049558426392234, + "grad_norm": 0.07744407636594508, + "learning_rate": 9.91332942801188e-06, + "loss": 0.2575, + "step": 1892 + }, + { + "epoch": 1.1055397416246988, + "grad_norm": 0.09387865939097449, + "learning_rate": 9.903133192276134e-06, + "loss": 0.3942, + "step": 1893 + }, + { + "epoch": 1.1061236406101744, + "grad_norm": 0.08581735557704355, + "learning_rate": 9.892937057254729e-06, + "loss": 0.2986, + "step": 1894 + }, + { + "epoch": 1.10670753959565, + "grad_norm": 0.08687798158726862, + "learning_rate": 9.882741033548781e-06, + "loss": 0.3327, + "step": 1895 + }, + { + "epoch": 1.1072914385811254, + "grad_norm": 0.08748788987404404, + "learning_rate": 9.872545131759301e-06, + "loss": 0.311, + "step": 1896 + }, + { + "epoch": 1.107875337566601, + "grad_norm": 0.09543963107929415, + "learning_rate": 9.862349362487172e-06, + "loss": 0.3661, + "step": 1897 + }, + { + "epoch": 1.1084592365520765, + "grad_norm": 0.0875661572958271, + "learning_rate": 9.85215373633313e-06, + "loss": 0.3199, + "step": 1898 + }, + { + "epoch": 1.109043135537552, + "grad_norm": 0.09474444150147031, + "learning_rate": 9.841958263897779e-06, + "loss": 0.3464, + "step": 1899 + }, + { + "epoch": 1.1096270345230275, + "grad_norm": 0.08879942800812773, + "learning_rate": 9.831762955781548e-06, + "loss": 0.2864, + "step": 1900 + }, + { + "epoch": 1.110210933508503, + "grad_norm": 0.08791141654005762, + "learning_rate": 9.8215678225847e-06, + "loss": 0.338, + "step": 1901 + }, + { + "epoch": 
1.1107948324939785, + "grad_norm": 0.08623327404856848, + "learning_rate": 9.811372874907323e-06, + "loss": 0.3306, + "step": 1902 + }, + { + "epoch": 1.111378731479454, + "grad_norm": 0.0891946146718852, + "learning_rate": 9.801178123349298e-06, + "loss": 0.3609, + "step": 1903 + }, + { + "epoch": 1.1119626304649295, + "grad_norm": 0.08617093574491179, + "learning_rate": 9.790983578510315e-06, + "loss": 0.3649, + "step": 1904 + }, + { + "epoch": 1.112546529450405, + "grad_norm": 0.08592057731270362, + "learning_rate": 9.780789250989841e-06, + "loss": 0.2976, + "step": 1905 + }, + { + "epoch": 1.1131304284358805, + "grad_norm": 0.08887633672923068, + "learning_rate": 9.77059515138712e-06, + "loss": 0.3234, + "step": 1906 + }, + { + "epoch": 1.1137143274213561, + "grad_norm": 0.08525591094079199, + "learning_rate": 9.760401290301164e-06, + "loss": 0.3222, + "step": 1907 + }, + { + "epoch": 1.1142982264068315, + "grad_norm": 0.08728546007828297, + "learning_rate": 9.750207678330726e-06, + "loss": 0.352, + "step": 1908 + }, + { + "epoch": 1.1148821253923071, + "grad_norm": 0.08990332738981899, + "learning_rate": 9.740014326074308e-06, + "loss": 0.3432, + "step": 1909 + }, + { + "epoch": 1.1154660243777825, + "grad_norm": 0.09605260089569588, + "learning_rate": 9.729821244130142e-06, + "loss": 0.3498, + "step": 1910 + }, + { + "epoch": 1.1160499233632581, + "grad_norm": 0.08890973600408256, + "learning_rate": 9.719628443096175e-06, + "loss": 0.3341, + "step": 1911 + }, + { + "epoch": 1.1166338223487338, + "grad_norm": 0.09524397088339692, + "learning_rate": 9.709435933570068e-06, + "loss": 0.3465, + "step": 1912 + }, + { + "epoch": 1.1172177213342092, + "grad_norm": 0.09452993705443247, + "learning_rate": 9.699243726149168e-06, + "loss": 0.3305, + "step": 1913 + }, + { + "epoch": 1.1178016203196848, + "grad_norm": 0.09133219778555215, + "learning_rate": 9.689051831430518e-06, + "loss": 0.3405, + "step": 1914 + }, + { + "epoch": 1.1183855193051602, + "grad_norm": 0.09338435111138753, + "learning_rate": 9.678860260010834e-06, + "loss": 0.3042, + "step": 1915 + }, + { + "epoch": 1.1189694182906358, + "grad_norm": 0.08788457658099857, + "learning_rate": 9.668669022486495e-06, + "loss": 0.3185, + "step": 1916 + }, + { + "epoch": 1.1195533172761112, + "grad_norm": 0.09102244584692436, + "learning_rate": 9.658478129453532e-06, + "loss": 0.3145, + "step": 1917 + }, + { + "epoch": 1.1201372162615868, + "grad_norm": 0.09820977505822985, + "learning_rate": 9.648287591507613e-06, + "loss": 0.3359, + "step": 1918 + }, + { + "epoch": 1.1207211152470622, + "grad_norm": 0.09341075263925457, + "learning_rate": 9.638097419244048e-06, + "loss": 0.3408, + "step": 1919 + }, + { + "epoch": 1.1213050142325378, + "grad_norm": 0.09577786010014924, + "learning_rate": 9.627907623257758e-06, + "loss": 0.3544, + "step": 1920 + }, + { + "epoch": 1.1218889132180132, + "grad_norm": 0.08891238118643427, + "learning_rate": 9.617718214143279e-06, + "loss": 0.3143, + "step": 1921 + }, + { + "epoch": 1.1224728122034888, + "grad_norm": 0.0845200637862441, + "learning_rate": 9.607529202494739e-06, + "loss": 0.2964, + "step": 1922 + }, + { + "epoch": 1.1230567111889642, + "grad_norm": 0.09721915300410626, + "learning_rate": 9.597340598905851e-06, + "loss": 0.3652, + "step": 1923 + }, + { + "epoch": 1.1236406101744398, + "grad_norm": 0.0849865500424207, + "learning_rate": 9.587152413969915e-06, + "loss": 0.2882, + "step": 1924 + }, + { + "epoch": 1.1242245091599152, + "grad_norm": 0.08853704011023654, + "learning_rate": 
9.576964658279783e-06, + "loss": 0.3155, + "step": 1925 + }, + { + "epoch": 1.1248084081453908, + "grad_norm": 0.08982160880811822, + "learning_rate": 9.566777342427867e-06, + "loss": 0.3239, + "step": 1926 + }, + { + "epoch": 1.1253923071308662, + "grad_norm": 0.09466611244641286, + "learning_rate": 9.556590477006123e-06, + "loss": 0.3797, + "step": 1927 + }, + { + "epoch": 1.1259762061163419, + "grad_norm": 0.0861500141090069, + "learning_rate": 9.546404072606038e-06, + "loss": 0.3268, + "step": 1928 + }, + { + "epoch": 1.1265601051018175, + "grad_norm": 0.08712340601824575, + "learning_rate": 9.536218139818615e-06, + "loss": 0.3361, + "step": 1929 + }, + { + "epoch": 1.1271440040872929, + "grad_norm": 0.08641470725320563, + "learning_rate": 9.526032689234374e-06, + "loss": 0.3574, + "step": 1930 + }, + { + "epoch": 1.1277279030727685, + "grad_norm": 0.09093367953856038, + "learning_rate": 9.515847731443324e-06, + "loss": 0.3043, + "step": 1931 + }, + { + "epoch": 1.1283118020582439, + "grad_norm": 0.08474927025618849, + "learning_rate": 9.50566327703497e-06, + "loss": 0.3285, + "step": 1932 + }, + { + "epoch": 1.1288957010437195, + "grad_norm": 0.09369777973696722, + "learning_rate": 9.49547933659829e-06, + "loss": 0.3439, + "step": 1933 + }, + { + "epoch": 1.129479600029195, + "grad_norm": 0.0885202869896613, + "learning_rate": 9.48529592072173e-06, + "loss": 0.3244, + "step": 1934 + }, + { + "epoch": 1.1300634990146705, + "grad_norm": 0.08637043895605073, + "learning_rate": 9.475113039993188e-06, + "loss": 0.312, + "step": 1935 + }, + { + "epoch": 1.130647398000146, + "grad_norm": 0.08667231566000794, + "learning_rate": 9.464930705000008e-06, + "loss": 0.3318, + "step": 1936 + }, + { + "epoch": 1.1312312969856215, + "grad_norm": 0.08429349690611236, + "learning_rate": 9.454748926328962e-06, + "loss": 0.3224, + "step": 1937 + }, + { + "epoch": 1.131815195971097, + "grad_norm": 0.08637131944071748, + "learning_rate": 9.44456771456625e-06, + "loss": 0.3445, + "step": 1938 + }, + { + "epoch": 1.1323990949565725, + "grad_norm": 0.08855513734451015, + "learning_rate": 9.434387080297477e-06, + "loss": 0.3207, + "step": 1939 + }, + { + "epoch": 1.132982993942048, + "grad_norm": 0.09126002491419556, + "learning_rate": 9.424207034107653e-06, + "loss": 0.3387, + "step": 1940 + }, + { + "epoch": 1.1335668929275236, + "grad_norm": 0.0855312278483802, + "learning_rate": 9.41402758658117e-06, + "loss": 0.3053, + "step": 1941 + }, + { + "epoch": 1.1341507919129992, + "grad_norm": 0.080055141081926, + "learning_rate": 9.403848748301802e-06, + "loss": 0.3185, + "step": 1942 + }, + { + "epoch": 1.1347346908984746, + "grad_norm": 0.0872126021081361, + "learning_rate": 9.39367052985269e-06, + "loss": 0.3374, + "step": 1943 + }, + { + "epoch": 1.13531858988395, + "grad_norm": 0.09003257434312742, + "learning_rate": 9.38349294181633e-06, + "loss": 0.317, + "step": 1944 + }, + { + "epoch": 1.1359024888694256, + "grad_norm": 0.09005787075795883, + "learning_rate": 9.373315994774558e-06, + "loss": 0.3081, + "step": 1945 + }, + { + "epoch": 1.1364863878549012, + "grad_norm": 0.0879423308725366, + "learning_rate": 9.363139699308552e-06, + "loss": 0.3694, + "step": 1946 + }, + { + "epoch": 1.1370702868403766, + "grad_norm": 0.0897252869537841, + "learning_rate": 9.352964065998801e-06, + "loss": 0.3661, + "step": 1947 + }, + { + "epoch": 1.1376541858258522, + "grad_norm": 0.08750478822635603, + "learning_rate": 9.34278910542512e-06, + "loss": 0.329, + "step": 1948 + }, + { + "epoch": 1.1382380848113276, + 
"grad_norm": 0.09462196011160937, + "learning_rate": 9.332614828166612e-06, + "loss": 0.3644, + "step": 1949 + }, + { + "epoch": 1.1388219837968032, + "grad_norm": 0.0966323802417669, + "learning_rate": 9.322441244801678e-06, + "loss": 0.343, + "step": 1950 + }, + { + "epoch": 1.1394058827822786, + "grad_norm": 0.0876051977019757, + "learning_rate": 9.312268365907989e-06, + "loss": 0.3203, + "step": 1951 + }, + { + "epoch": 1.1399897817677542, + "grad_norm": 0.08808572259773834, + "learning_rate": 9.302096202062492e-06, + "loss": 0.3151, + "step": 1952 + }, + { + "epoch": 1.1405736807532296, + "grad_norm": 0.09327218945882221, + "learning_rate": 9.291924763841387e-06, + "loss": 0.3073, + "step": 1953 + }, + { + "epoch": 1.1411575797387052, + "grad_norm": 0.09370052182546974, + "learning_rate": 9.281754061820116e-06, + "loss": 0.3812, + "step": 1954 + }, + { + "epoch": 1.1417414787241806, + "grad_norm": 0.08830101505639888, + "learning_rate": 9.271584106573364e-06, + "loss": 0.3046, + "step": 1955 + }, + { + "epoch": 1.1423253777096563, + "grad_norm": 0.09636322610297128, + "learning_rate": 9.261414908675036e-06, + "loss": 0.3367, + "step": 1956 + }, + { + "epoch": 1.1429092766951316, + "grad_norm": 0.08440938717924151, + "learning_rate": 9.251246478698242e-06, + "loss": 0.2962, + "step": 1957 + }, + { + "epoch": 1.1434931756806073, + "grad_norm": 0.09128667816804639, + "learning_rate": 9.241078827215305e-06, + "loss": 0.3592, + "step": 1958 + }, + { + "epoch": 1.1440770746660829, + "grad_norm": 0.09389906607057764, + "learning_rate": 9.230911964797734e-06, + "loss": 0.3315, + "step": 1959 + }, + { + "epoch": 1.1446609736515583, + "grad_norm": 0.09837540301060549, + "learning_rate": 9.22074590201621e-06, + "loss": 0.3819, + "step": 1960 + }, + { + "epoch": 1.1452448726370337, + "grad_norm": 0.08707778711773677, + "learning_rate": 9.210580649440598e-06, + "loss": 0.303, + "step": 1961 + }, + { + "epoch": 1.1458287716225093, + "grad_norm": 0.0913934683112321, + "learning_rate": 9.200416217639906e-06, + "loss": 0.3506, + "step": 1962 + }, + { + "epoch": 1.146412670607985, + "grad_norm": 0.08986913466504597, + "learning_rate": 9.190252617182301e-06, + "loss": 0.2968, + "step": 1963 + }, + { + "epoch": 1.1469965695934603, + "grad_norm": 0.08363074851294831, + "learning_rate": 9.180089858635075e-06, + "loss": 0.3339, + "step": 1964 + }, + { + "epoch": 1.147580468578936, + "grad_norm": 0.09167795760631908, + "learning_rate": 9.169927952564649e-06, + "loss": 0.3346, + "step": 1965 + }, + { + "epoch": 1.1481643675644113, + "grad_norm": 0.08951413858063466, + "learning_rate": 9.159766909536559e-06, + "loss": 0.3289, + "step": 1966 + }, + { + "epoch": 1.148748266549887, + "grad_norm": 0.0877588964335431, + "learning_rate": 9.149606740115444e-06, + "loss": 0.3254, + "step": 1967 + }, + { + "epoch": 1.1493321655353623, + "grad_norm": 0.08633956915256566, + "learning_rate": 9.139447454865034e-06, + "loss": 0.3122, + "step": 1968 + }, + { + "epoch": 1.149916064520838, + "grad_norm": 0.094524543212872, + "learning_rate": 9.129289064348135e-06, + "loss": 0.3246, + "step": 1969 + }, + { + "epoch": 1.1504999635063133, + "grad_norm": 0.10255989673364829, + "learning_rate": 9.119131579126628e-06, + "loss": 0.359, + "step": 1970 + }, + { + "epoch": 1.151083862491789, + "grad_norm": 0.09036458505641354, + "learning_rate": 9.108975009761452e-06, + "loss": 0.3557, + "step": 1971 + }, + { + "epoch": 1.1516677614772644, + "grad_norm": 0.08486439547057462, + "learning_rate": 9.098819366812594e-06, + "loss": 0.3164, + 
"step": 1972 + }, + { + "epoch": 1.15225166046274, + "grad_norm": 0.08610447263168651, + "learning_rate": 9.088664660839078e-06, + "loss": 0.3315, + "step": 1973 + }, + { + "epoch": 1.1528355594482154, + "grad_norm": 0.0821416341536052, + "learning_rate": 9.078510902398948e-06, + "loss": 0.3021, + "step": 1974 + }, + { + "epoch": 1.153419458433691, + "grad_norm": 0.08737575297618551, + "learning_rate": 9.068358102049272e-06, + "loss": 0.3095, + "step": 1975 + }, + { + "epoch": 1.1540033574191666, + "grad_norm": 0.08942829383846747, + "learning_rate": 9.058206270346115e-06, + "loss": 0.3247, + "step": 1976 + }, + { + "epoch": 1.154587256404642, + "grad_norm": 0.08786813475121386, + "learning_rate": 9.04805541784454e-06, + "loss": 0.3365, + "step": 1977 + }, + { + "epoch": 1.1551711553901174, + "grad_norm": 0.08419588326779412, + "learning_rate": 9.037905555098589e-06, + "loss": 0.3065, + "step": 1978 + }, + { + "epoch": 1.155755054375593, + "grad_norm": 0.08507313609289438, + "learning_rate": 9.027756692661272e-06, + "loss": 0.3009, + "step": 1979 + }, + { + "epoch": 1.1563389533610686, + "grad_norm": 0.09340794481966345, + "learning_rate": 9.017608841084564e-06, + "loss": 0.3412, + "step": 1980 + }, + { + "epoch": 1.156922852346544, + "grad_norm": 0.09000360743008402, + "learning_rate": 9.007462010919387e-06, + "loss": 0.3105, + "step": 1981 + }, + { + "epoch": 1.1575067513320196, + "grad_norm": 0.10725680285629115, + "learning_rate": 8.997316212715599e-06, + "loss": 0.3618, + "step": 1982 + }, + { + "epoch": 1.158090650317495, + "grad_norm": 0.10158171229696798, + "learning_rate": 8.987171457021992e-06, + "loss": 0.3699, + "step": 1983 + }, + { + "epoch": 1.1586745493029706, + "grad_norm": 0.09466697949043662, + "learning_rate": 8.977027754386267e-06, + "loss": 0.3134, + "step": 1984 + }, + { + "epoch": 1.159258448288446, + "grad_norm": 0.09574576573575216, + "learning_rate": 8.966885115355033e-06, + "loss": 0.3449, + "step": 1985 + }, + { + "epoch": 1.1598423472739217, + "grad_norm": 0.08604147597181305, + "learning_rate": 8.95674355047379e-06, + "loss": 0.3185, + "step": 1986 + }, + { + "epoch": 1.160426246259397, + "grad_norm": 0.09185857480692544, + "learning_rate": 8.946603070286926e-06, + "loss": 0.3346, + "step": 1987 + }, + { + "epoch": 1.1610101452448727, + "grad_norm": 0.08843123498991942, + "learning_rate": 8.936463685337697e-06, + "loss": 0.3117, + "step": 1988 + }, + { + "epoch": 1.161594044230348, + "grad_norm": 0.0965653971633371, + "learning_rate": 8.926325406168225e-06, + "loss": 0.3393, + "step": 1989 + }, + { + "epoch": 1.1621779432158237, + "grad_norm": 0.09224900450230931, + "learning_rate": 8.91618824331948e-06, + "loss": 0.3045, + "step": 1990 + }, + { + "epoch": 1.162761842201299, + "grad_norm": 0.09327984735466081, + "learning_rate": 8.906052207331268e-06, + "loss": 0.3595, + "step": 1991 + }, + { + "epoch": 1.1633457411867747, + "grad_norm": 0.0898004683215074, + "learning_rate": 8.895917308742224e-06, + "loss": 0.354, + "step": 1992 + }, + { + "epoch": 1.1639296401722503, + "grad_norm": 0.09450340933352126, + "learning_rate": 8.88578355808981e-06, + "loss": 0.3372, + "step": 1993 + }, + { + "epoch": 1.1645135391577257, + "grad_norm": 0.096049954830058, + "learning_rate": 8.87565096591028e-06, + "loss": 0.3422, + "step": 1994 + }, + { + "epoch": 1.165097438143201, + "grad_norm": 0.08759726327723343, + "learning_rate": 8.865519542738696e-06, + "loss": 0.3328, + "step": 1995 + }, + { + "epoch": 1.1656813371286767, + "grad_norm": 0.0855851612212482, + 
"learning_rate": 8.855389299108894e-06, + "loss": 0.3619, + "step": 1996 + }, + { + "epoch": 1.1662652361141523, + "grad_norm": 0.07977637337252606, + "learning_rate": 8.845260245553493e-06, + "loss": 0.2931, + "step": 1997 + }, + { + "epoch": 1.1668491350996277, + "grad_norm": 0.08858841568240232, + "learning_rate": 8.83513239260387e-06, + "loss": 0.3002, + "step": 1998 + }, + { + "epoch": 1.1674330340851033, + "grad_norm": 0.08865197901365328, + "learning_rate": 8.82500575079015e-06, + "loss": 0.3414, + "step": 1999 + }, + { + "epoch": 1.1680169330705787, + "grad_norm": 0.08836497268257691, + "learning_rate": 8.81488033064121e-06, + "loss": 0.3064, + "step": 2000 + }, + { + "epoch": 1.1686008320560544, + "grad_norm": 0.08726725846232473, + "learning_rate": 8.804756142684644e-06, + "loss": 0.3232, + "step": 2001 + }, + { + "epoch": 1.1691847310415298, + "grad_norm": 0.1003175501300991, + "learning_rate": 8.79463319744677e-06, + "loss": 0.3418, + "step": 2002 + }, + { + "epoch": 1.1697686300270054, + "grad_norm": 0.09579676483673263, + "learning_rate": 8.78451150545262e-06, + "loss": 0.3275, + "step": 2003 + }, + { + "epoch": 1.1703525290124808, + "grad_norm": 0.08340171263925608, + "learning_rate": 8.774391077225914e-06, + "loss": 0.2823, + "step": 2004 + }, + { + "epoch": 1.1709364279979564, + "grad_norm": 0.08471021410803596, + "learning_rate": 8.764271923289064e-06, + "loss": 0.333, + "step": 2005 + }, + { + "epoch": 1.1715203269834318, + "grad_norm": 0.08286583324571123, + "learning_rate": 8.754154054163148e-06, + "loss": 0.3199, + "step": 2006 + }, + { + "epoch": 1.1721042259689074, + "grad_norm": 0.08938709323429189, + "learning_rate": 8.744037480367922e-06, + "loss": 0.3156, + "step": 2007 + }, + { + "epoch": 1.1726881249543828, + "grad_norm": 0.09071984817071674, + "learning_rate": 8.733922212421785e-06, + "loss": 0.3103, + "step": 2008 + }, + { + "epoch": 1.1732720239398584, + "grad_norm": 0.09296823278237291, + "learning_rate": 8.723808260841781e-06, + "loss": 0.3524, + "step": 2009 + }, + { + "epoch": 1.173855922925334, + "grad_norm": 0.09797577152058487, + "learning_rate": 8.713695636143584e-06, + "loss": 0.3292, + "step": 2010 + }, + { + "epoch": 1.1744398219108094, + "grad_norm": 0.08157491255408575, + "learning_rate": 8.703584348841494e-06, + "loss": 0.3096, + "step": 2011 + }, + { + "epoch": 1.1750237208962848, + "grad_norm": 0.08533795315988303, + "learning_rate": 8.693474409448416e-06, + "loss": 0.3058, + "step": 2012 + }, + { + "epoch": 1.1756076198817604, + "grad_norm": 0.09020326129304207, + "learning_rate": 8.683365828475855e-06, + "loss": 0.3205, + "step": 2013 + }, + { + "epoch": 1.176191518867236, + "grad_norm": 0.08628119935114273, + "learning_rate": 8.673258616433898e-06, + "loss": 0.3184, + "step": 2014 + }, + { + "epoch": 1.1767754178527114, + "grad_norm": 0.0936071568525904, + "learning_rate": 8.663152783831215e-06, + "loss": 0.3622, + "step": 2015 + }, + { + "epoch": 1.177359316838187, + "grad_norm": 0.08506301902193308, + "learning_rate": 8.653048341175044e-06, + "loss": 0.3025, + "step": 2016 + }, + { + "epoch": 1.1779432158236625, + "grad_norm": 0.08486492419968637, + "learning_rate": 8.642945298971168e-06, + "loss": 0.3278, + "step": 2017 + }, + { + "epoch": 1.178527114809138, + "grad_norm": 0.08643089452139781, + "learning_rate": 8.632843667723927e-06, + "loss": 0.3168, + "step": 2018 + }, + { + "epoch": 1.1791110137946135, + "grad_norm": 0.08311772011829664, + "learning_rate": 8.62274345793618e-06, + "loss": 0.3291, + "step": 2019 + }, + { + "epoch": 
1.179694912780089, + "grad_norm": 0.08518278542348555, + "learning_rate": 8.61264468010932e-06, + "loss": 0.3407, + "step": 2020 + }, + { + "epoch": 1.1802788117655645, + "grad_norm": 0.08950711091173093, + "learning_rate": 8.602547344743241e-06, + "loss": 0.3604, + "step": 2021 + }, + { + "epoch": 1.18086271075104, + "grad_norm": 0.08009010302762865, + "learning_rate": 8.592451462336348e-06, + "loss": 0.3108, + "step": 2022 + }, + { + "epoch": 1.1814466097365155, + "grad_norm": 0.08536565204964701, + "learning_rate": 8.582357043385529e-06, + "loss": 0.3274, + "step": 2023 + }, + { + "epoch": 1.1820305087219911, + "grad_norm": 0.10093033959494314, + "learning_rate": 8.572264098386149e-06, + "loss": 0.3948, + "step": 2024 + }, + { + "epoch": 1.1826144077074665, + "grad_norm": 0.09359737967716063, + "learning_rate": 8.562172637832041e-06, + "loss": 0.3249, + "step": 2025 + }, + { + "epoch": 1.1831983066929421, + "grad_norm": 0.08419883167045322, + "learning_rate": 8.5520826722155e-06, + "loss": 0.3185, + "step": 2026 + }, + { + "epoch": 1.1837822056784177, + "grad_norm": 0.09418754010571004, + "learning_rate": 8.54199421202726e-06, + "loss": 0.3457, + "step": 2027 + }, + { + "epoch": 1.1843661046638931, + "grad_norm": 0.09364267136227672, + "learning_rate": 8.531907267756498e-06, + "loss": 0.3135, + "step": 2028 + }, + { + "epoch": 1.1849500036493688, + "grad_norm": 0.09517343293166886, + "learning_rate": 8.521821849890802e-06, + "loss": 0.3432, + "step": 2029 + }, + { + "epoch": 1.1855339026348442, + "grad_norm": 0.08899341831106064, + "learning_rate": 8.511737968916185e-06, + "loss": 0.3666, + "step": 2030 + }, + { + "epoch": 1.1861178016203198, + "grad_norm": 0.09225512938249174, + "learning_rate": 8.50165563531706e-06, + "loss": 0.3556, + "step": 2031 + }, + { + "epoch": 1.1867017006057952, + "grad_norm": 0.08182079619534831, + "learning_rate": 8.491574859576222e-06, + "loss": 0.2991, + "step": 2032 + }, + { + "epoch": 1.1872855995912708, + "grad_norm": 0.0834382289868222, + "learning_rate": 8.481495652174859e-06, + "loss": 0.3274, + "step": 2033 + }, + { + "epoch": 1.1878694985767462, + "grad_norm": 0.08848447473008489, + "learning_rate": 8.47141802359252e-06, + "loss": 0.314, + "step": 2034 + }, + { + "epoch": 1.1884533975622218, + "grad_norm": 0.08885211099252358, + "learning_rate": 8.461341984307115e-06, + "loss": 0.3111, + "step": 2035 + }, + { + "epoch": 1.1890372965476972, + "grad_norm": 0.08374474532687923, + "learning_rate": 8.4512675447949e-06, + "loss": 0.3098, + "step": 2036 + }, + { + "epoch": 1.1896211955331728, + "grad_norm": 0.08851997469428567, + "learning_rate": 8.441194715530472e-06, + "loss": 0.3248, + "step": 2037 + }, + { + "epoch": 1.1902050945186482, + "grad_norm": 0.08155370546586357, + "learning_rate": 8.431123506986747e-06, + "loss": 0.3005, + "step": 2038 + }, + { + "epoch": 1.1907889935041238, + "grad_norm": 0.08316419877005728, + "learning_rate": 8.421053929634966e-06, + "loss": 0.3174, + "step": 2039 + }, + { + "epoch": 1.1913728924895992, + "grad_norm": 0.08646171446567384, + "learning_rate": 8.410985993944663e-06, + "loss": 0.312, + "step": 2040 + }, + { + "epoch": 1.1919567914750748, + "grad_norm": 0.09041485750204672, + "learning_rate": 8.400919710383673e-06, + "loss": 0.3443, + "step": 2041 + }, + { + "epoch": 1.1925406904605502, + "grad_norm": 0.08246920817556226, + "learning_rate": 8.390855089418109e-06, + "loss": 0.3031, + "step": 2042 + }, + { + "epoch": 1.1931245894460258, + "grad_norm": 0.0811914653360878, + "learning_rate": 
8.380792141512355e-06, + "loss": 0.2956, + "step": 2043 + }, + { + "epoch": 1.1937084884315015, + "grad_norm": 0.08202782421981449, + "learning_rate": 8.37073087712906e-06, + "loss": 0.2931, + "step": 2044 + }, + { + "epoch": 1.1942923874169769, + "grad_norm": 0.08635896376418187, + "learning_rate": 8.360671306729114e-06, + "loss": 0.3062, + "step": 2045 + }, + { + "epoch": 1.1948762864024525, + "grad_norm": 0.08775343594750251, + "learning_rate": 8.350613440771661e-06, + "loss": 0.2896, + "step": 2046 + }, + { + "epoch": 1.1954601853879279, + "grad_norm": 0.08491976655178993, + "learning_rate": 8.340557289714055e-06, + "loss": 0.3098, + "step": 2047 + }, + { + "epoch": 1.1960440843734035, + "grad_norm": 0.0908519474896532, + "learning_rate": 8.330502864011878e-06, + "loss": 0.3052, + "step": 2048 + }, + { + "epoch": 1.1966279833588789, + "grad_norm": 0.08912397670221836, + "learning_rate": 8.320450174118914e-06, + "loss": 0.3044, + "step": 2049 + }, + { + "epoch": 1.1972118823443545, + "grad_norm": 0.0954851380432506, + "learning_rate": 8.310399230487148e-06, + "loss": 0.3589, + "step": 2050 + }, + { + "epoch": 1.19779578132983, + "grad_norm": 0.0866851430666876, + "learning_rate": 8.300350043566742e-06, + "loss": 0.348, + "step": 2051 + }, + { + "epoch": 1.1983796803153055, + "grad_norm": 0.08701828972001084, + "learning_rate": 8.290302623806035e-06, + "loss": 0.2881, + "step": 2052 + }, + { + "epoch": 1.198963579300781, + "grad_norm": 0.08558329331497519, + "learning_rate": 8.280256981651527e-06, + "loss": 0.3344, + "step": 2053 + }, + { + "epoch": 1.1995474782862565, + "grad_norm": 0.08872220597770586, + "learning_rate": 8.270213127547871e-06, + "loss": 0.3296, + "step": 2054 + }, + { + "epoch": 1.200131377271732, + "grad_norm": 0.08763877816884778, + "learning_rate": 8.260171071937863e-06, + "loss": 0.3382, + "step": 2055 + }, + { + "epoch": 1.2007152762572075, + "grad_norm": 0.08645770562055194, + "learning_rate": 8.250130825262426e-06, + "loss": 0.3047, + "step": 2056 + }, + { + "epoch": 1.201299175242683, + "grad_norm": 0.09020848283827998, + "learning_rate": 8.240092397960601e-06, + "loss": 0.3234, + "step": 2057 + }, + { + "epoch": 1.2018830742281585, + "grad_norm": 0.09172660715230793, + "learning_rate": 8.230055800469543e-06, + "loss": 0.3226, + "step": 2058 + }, + { + "epoch": 1.202466973213634, + "grad_norm": 0.08085673436785963, + "learning_rate": 8.2200210432245e-06, + "loss": 0.2804, + "step": 2059 + }, + { + "epoch": 1.2030508721991096, + "grad_norm": 0.08400725026168468, + "learning_rate": 8.209988136658805e-06, + "loss": 0.3092, + "step": 2060 + }, + { + "epoch": 1.2036347711845852, + "grad_norm": 0.09010977366947723, + "learning_rate": 8.199957091203876e-06, + "loss": 0.2953, + "step": 2061 + }, + { + "epoch": 1.2042186701700606, + "grad_norm": 0.08870089761512859, + "learning_rate": 8.189927917289182e-06, + "loss": 0.2976, + "step": 2062 + }, + { + "epoch": 1.2048025691555362, + "grad_norm": 0.0983006010441233, + "learning_rate": 8.179900625342256e-06, + "loss": 0.3816, + "step": 2063 + }, + { + "epoch": 1.2053864681410116, + "grad_norm": 0.08858299676276171, + "learning_rate": 8.169875225788675e-06, + "loss": 0.3538, + "step": 2064 + }, + { + "epoch": 1.2059703671264872, + "grad_norm": 0.08176245408855808, + "learning_rate": 8.159851729052041e-06, + "loss": 0.3006, + "step": 2065 + }, + { + "epoch": 1.2065542661119626, + "grad_norm": 0.09528740460599704, + "learning_rate": 8.149830145553982e-06, + "loss": 0.366, + "step": 2066 + }, + { + "epoch": 1.2071381650974382, 
+ "grad_norm": 0.09165988064565685, + "learning_rate": 8.139810485714142e-06, + "loss": 0.3401, + "step": 2067 + }, + { + "epoch": 1.2077220640829136, + "grad_norm": 0.09088025931989592, + "learning_rate": 8.129792759950157e-06, + "loss": 0.3652, + "step": 2068 + }, + { + "epoch": 1.2083059630683892, + "grad_norm": 0.09429153703216536, + "learning_rate": 8.119776978677655e-06, + "loss": 0.35, + "step": 2069 + }, + { + "epoch": 1.2088898620538646, + "grad_norm": 0.09583452625159206, + "learning_rate": 8.10976315231024e-06, + "loss": 0.3712, + "step": 2070 + }, + { + "epoch": 1.2094737610393402, + "grad_norm": 0.09269155046051072, + "learning_rate": 8.099751291259485e-06, + "loss": 0.336, + "step": 2071 + }, + { + "epoch": 1.2100576600248156, + "grad_norm": 0.08554524420094772, + "learning_rate": 8.089741405934923e-06, + "loss": 0.2997, + "step": 2072 + }, + { + "epoch": 1.2106415590102912, + "grad_norm": 0.09557355383696006, + "learning_rate": 8.079733506744027e-06, + "loss": 0.3388, + "step": 2073 + }, + { + "epoch": 1.2112254579957666, + "grad_norm": 0.09686085859596412, + "learning_rate": 8.069727604092213e-06, + "loss": 0.3164, + "step": 2074 + }, + { + "epoch": 1.2118093569812423, + "grad_norm": 0.09033688330494663, + "learning_rate": 8.05972370838281e-06, + "loss": 0.2947, + "step": 2075 + }, + { + "epoch": 1.2123932559667177, + "grad_norm": 0.09377017429739727, + "learning_rate": 8.04972183001707e-06, + "loss": 0.3203, + "step": 2076 + }, + { + "epoch": 1.2129771549521933, + "grad_norm": 0.094785731784518, + "learning_rate": 8.03972197939414e-06, + "loss": 0.3422, + "step": 2077 + }, + { + "epoch": 1.2135610539376689, + "grad_norm": 0.09004417194853796, + "learning_rate": 8.029724166911069e-06, + "loss": 0.3556, + "step": 2078 + }, + { + "epoch": 1.2141449529231443, + "grad_norm": 0.08473785025212613, + "learning_rate": 8.019728402962776e-06, + "loss": 0.2942, + "step": 2079 + }, + { + "epoch": 1.21472885190862, + "grad_norm": 0.09377688849726686, + "learning_rate": 8.009734697942054e-06, + "loss": 0.3421, + "step": 2080 + }, + { + "epoch": 1.2153127508940953, + "grad_norm": 0.0847032110238424, + "learning_rate": 7.999743062239557e-06, + "loss": 0.3062, + "step": 2081 + }, + { + "epoch": 1.215896649879571, + "grad_norm": 0.08963636950525647, + "learning_rate": 7.989753506243787e-06, + "loss": 0.3308, + "step": 2082 + }, + { + "epoch": 1.2164805488650463, + "grad_norm": 0.08278240427189165, + "learning_rate": 7.979766040341084e-06, + "loss": 0.3181, + "step": 2083 + }, + { + "epoch": 1.217064447850522, + "grad_norm": 0.08426764510891403, + "learning_rate": 7.969780674915613e-06, + "loss": 0.3362, + "step": 2084 + }, + { + "epoch": 1.2176483468359973, + "grad_norm": 0.0817778687871385, + "learning_rate": 7.959797420349356e-06, + "loss": 0.3029, + "step": 2085 + }, + { + "epoch": 1.218232245821473, + "grad_norm": 0.08971186618482638, + "learning_rate": 7.949816287022098e-06, + "loss": 0.3412, + "step": 2086 + }, + { + "epoch": 1.2188161448069483, + "grad_norm": 0.0991056308527583, + "learning_rate": 7.939837285311425e-06, + "loss": 0.3824, + "step": 2087 + }, + { + "epoch": 1.219400043792424, + "grad_norm": 0.09080812645886331, + "learning_rate": 7.9298604255927e-06, + "loss": 0.3503, + "step": 2088 + }, + { + "epoch": 1.2199839427778993, + "grad_norm": 0.0860805993747914, + "learning_rate": 7.919885718239063e-06, + "loss": 0.3155, + "step": 2089 + }, + { + "epoch": 1.220567841763375, + "grad_norm": 0.09450351103914792, + "learning_rate": 7.909913173621413e-06, + "loss": 0.3544, + 
"step": 2090 + }, + { + "epoch": 1.2211517407488504, + "grad_norm": 0.09059345485425477, + "learning_rate": 7.899942802108402e-06, + "loss": 0.3341, + "step": 2091 + }, + { + "epoch": 1.221735639734326, + "grad_norm": 0.08901615566492187, + "learning_rate": 7.889974614066425e-06, + "loss": 0.3108, + "step": 2092 + }, + { + "epoch": 1.2223195387198014, + "grad_norm": 0.08412818517312293, + "learning_rate": 7.880008619859601e-06, + "loss": 0.3434, + "step": 2093 + }, + { + "epoch": 1.222903437705277, + "grad_norm": 0.08889742269975304, + "learning_rate": 7.870044829849772e-06, + "loss": 0.3184, + "step": 2094 + }, + { + "epoch": 1.2234873366907526, + "grad_norm": 0.0889264989382093, + "learning_rate": 7.860083254396491e-06, + "loss": 0.3176, + "step": 2095 + }, + { + "epoch": 1.224071235676228, + "grad_norm": 0.09503912735138634, + "learning_rate": 7.850123903857004e-06, + "loss": 0.3367, + "step": 2096 + }, + { + "epoch": 1.2246551346617036, + "grad_norm": 0.08696946176200984, + "learning_rate": 7.840166788586244e-06, + "loss": 0.3044, + "step": 2097 + }, + { + "epoch": 1.225239033647179, + "grad_norm": 0.09002158096072227, + "learning_rate": 7.83021191893682e-06, + "loss": 0.3339, + "step": 2098 + }, + { + "epoch": 1.2258229326326546, + "grad_norm": 0.08500182917475167, + "learning_rate": 7.820259305259009e-06, + "loss": 0.3057, + "step": 2099 + }, + { + "epoch": 1.22640683161813, + "grad_norm": 0.0868798777966659, + "learning_rate": 7.810308957900736e-06, + "loss": 0.3084, + "step": 2100 + }, + { + "epoch": 1.2269907306036056, + "grad_norm": 0.09647998993093253, + "learning_rate": 7.800360887207579e-06, + "loss": 0.3355, + "step": 2101 + }, + { + "epoch": 1.227574629589081, + "grad_norm": 0.09278055985883078, + "learning_rate": 7.790415103522744e-06, + "loss": 0.324, + "step": 2102 + }, + { + "epoch": 1.2281585285745567, + "grad_norm": 0.08881490473212136, + "learning_rate": 7.780471617187056e-06, + "loss": 0.3494, + "step": 2103 + }, + { + "epoch": 1.228742427560032, + "grad_norm": 0.07830010752707076, + "learning_rate": 7.770530438538955e-06, + "loss": 0.259, + "step": 2104 + }, + { + "epoch": 1.2293263265455077, + "grad_norm": 0.09231147705540103, + "learning_rate": 7.760591577914483e-06, + "loss": 0.3242, + "step": 2105 + }, + { + "epoch": 1.229910225530983, + "grad_norm": 0.0948175286391098, + "learning_rate": 7.750655045647268e-06, + "loss": 0.341, + "step": 2106 + }, + { + "epoch": 1.2304941245164587, + "grad_norm": 0.10138261012242991, + "learning_rate": 7.740720852068524e-06, + "loss": 0.3417, + "step": 2107 + }, + { + "epoch": 1.231078023501934, + "grad_norm": 0.08720982712502892, + "learning_rate": 7.730789007507023e-06, + "loss": 0.2918, + "step": 2108 + }, + { + "epoch": 1.2316619224874097, + "grad_norm": 0.09433216845095259, + "learning_rate": 7.720859522289101e-06, + "loss": 0.3609, + "step": 2109 + }, + { + "epoch": 1.232245821472885, + "grad_norm": 0.09007355004493099, + "learning_rate": 7.710932406738643e-06, + "loss": 0.3162, + "step": 2110 + }, + { + "epoch": 1.2328297204583607, + "grad_norm": 0.0902764246795122, + "learning_rate": 7.701007671177066e-06, + "loss": 0.3275, + "step": 2111 + }, + { + "epoch": 1.2334136194438363, + "grad_norm": 0.09175902602301357, + "learning_rate": 7.691085325923317e-06, + "loss": 0.3717, + "step": 2112 + }, + { + "epoch": 1.2339975184293117, + "grad_norm": 0.08325644544545185, + "learning_rate": 7.68116538129385e-06, + "loss": 0.3156, + "step": 2113 + }, + { + "epoch": 1.2345814174147873, + "grad_norm": 0.08475367139227559, + 
"learning_rate": 7.671247847602628e-06, + "loss": 0.326, + "step": 2114 + }, + { + "epoch": 1.2351653164002627, + "grad_norm": 0.08824218481038557, + "learning_rate": 7.661332735161111e-06, + "loss": 0.2932, + "step": 2115 + }, + { + "epoch": 1.2357492153857383, + "grad_norm": 0.09357777543389917, + "learning_rate": 7.651420054278234e-06, + "loss": 0.3547, + "step": 2116 + }, + { + "epoch": 1.2363331143712137, + "grad_norm": 0.08257600107304924, + "learning_rate": 7.641509815260412e-06, + "loss": 0.3119, + "step": 2117 + }, + { + "epoch": 1.2369170133566894, + "grad_norm": 0.09086093748377244, + "learning_rate": 7.631602028411512e-06, + "loss": 0.3621, + "step": 2118 + }, + { + "epoch": 1.2375009123421647, + "grad_norm": 0.08877450222714746, + "learning_rate": 7.621696704032857e-06, + "loss": 0.3319, + "step": 2119 + }, + { + "epoch": 1.2380848113276404, + "grad_norm": 0.08629618934457023, + "learning_rate": 7.6117938524232105e-06, + "loss": 0.2752, + "step": 2120 + }, + { + "epoch": 1.2386687103131158, + "grad_norm": 0.09260099613989543, + "learning_rate": 7.601893483878761e-06, + "loss": 0.3242, + "step": 2121 + }, + { + "epoch": 1.2392526092985914, + "grad_norm": 0.09638837192692037, + "learning_rate": 7.591995608693118e-06, + "loss": 0.3304, + "step": 2122 + }, + { + "epoch": 1.2398365082840668, + "grad_norm": 0.08944137922931576, + "learning_rate": 7.5821002371573005e-06, + "loss": 0.298, + "step": 2123 + }, + { + "epoch": 1.2404204072695424, + "grad_norm": 0.0928329254971466, + "learning_rate": 7.572207379559722e-06, + "loss": 0.3651, + "step": 2124 + }, + { + "epoch": 1.2410043062550178, + "grad_norm": 0.083530204917641, + "learning_rate": 7.562317046186182e-06, + "loss": 0.2979, + "step": 2125 + }, + { + "epoch": 1.2415882052404934, + "grad_norm": 0.08385221896372473, + "learning_rate": 7.552429247319854e-06, + "loss": 0.2847, + "step": 2126 + }, + { + "epoch": 1.2421721042259688, + "grad_norm": 0.08905509883427613, + "learning_rate": 7.542543993241278e-06, + "loss": 0.3355, + "step": 2127 + }, + { + "epoch": 1.2427560032114444, + "grad_norm": 0.09170532490783884, + "learning_rate": 7.53266129422835e-06, + "loss": 0.3217, + "step": 2128 + }, + { + "epoch": 1.24333990219692, + "grad_norm": 0.09559023854479497, + "learning_rate": 7.522781160556308e-06, + "loss": 0.3567, + "step": 2129 + }, + { + "epoch": 1.2439238011823954, + "grad_norm": 0.08425778978756891, + "learning_rate": 7.512903602497723e-06, + "loss": 0.28, + "step": 2130 + }, + { + "epoch": 1.244507700167871, + "grad_norm": 0.0923496107481391, + "learning_rate": 7.503028630322486e-06, + "loss": 0.3308, + "step": 2131 + }, + { + "epoch": 1.2450915991533464, + "grad_norm": 0.09200687690065452, + "learning_rate": 7.4931562542977994e-06, + "loss": 0.3519, + "step": 2132 + }, + { + "epoch": 1.245675498138822, + "grad_norm": 0.08595469953917594, + "learning_rate": 7.483286484688172e-06, + "loss": 0.3423, + "step": 2133 + }, + { + "epoch": 1.2462593971242975, + "grad_norm": 0.08380654279510158, + "learning_rate": 7.473419331755395e-06, + "loss": 0.2676, + "step": 2134 + }, + { + "epoch": 1.246843296109773, + "grad_norm": 0.09162630582583986, + "learning_rate": 7.463554805758546e-06, + "loss": 0.3391, + "step": 2135 + }, + { + "epoch": 1.2474271950952485, + "grad_norm": 0.08344837522392781, + "learning_rate": 7.453692916953965e-06, + "loss": 0.2986, + "step": 2136 + }, + { + "epoch": 1.248011094080724, + "grad_norm": 0.08590676047840917, + "learning_rate": 7.443833675595254e-06, + "loss": 0.3104, + "step": 2137 + }, + { + "epoch": 
1.2485949930661995, + "grad_norm": 0.08727713583487184, + "learning_rate": 7.433977091933262e-06, + "loss": 0.3204, + "step": 2138 + }, + { + "epoch": 1.249178892051675, + "grad_norm": 0.0893831073259057, + "learning_rate": 7.424123176216072e-06, + "loss": 0.3536, + "step": 2139 + }, + { + "epoch": 1.2497627910371505, + "grad_norm": 0.09296090809386962, + "learning_rate": 7.414271938689e-06, + "loss": 0.3378, + "step": 2140 + }, + { + "epoch": 1.250346690022626, + "grad_norm": 0.09565704202369064, + "learning_rate": 7.404423389594569e-06, + "loss": 0.3592, + "step": 2141 + }, + { + "epoch": 1.2509305890081017, + "grad_norm": 0.08904583474133027, + "learning_rate": 7.394577539172507e-06, + "loss": 0.2896, + "step": 2142 + }, + { + "epoch": 1.2515144879935771, + "grad_norm": 0.09310089349679415, + "learning_rate": 7.3847343976597454e-06, + "loss": 0.3373, + "step": 2143 + }, + { + "epoch": 1.2520983869790525, + "grad_norm": 0.09898287781251883, + "learning_rate": 7.374893975290391e-06, + "loss": 0.3678, + "step": 2144 + }, + { + "epoch": 1.2526822859645281, + "grad_norm": 0.09139450392915356, + "learning_rate": 7.3650562822957285e-06, + "loss": 0.3001, + "step": 2145 + }, + { + "epoch": 1.2532661849500037, + "grad_norm": 0.09089933067684172, + "learning_rate": 7.355221328904196e-06, + "loss": 0.3278, + "step": 2146 + }, + { + "epoch": 1.2538500839354791, + "grad_norm": 0.0886111087164762, + "learning_rate": 7.3453891253413935e-06, + "loss": 0.332, + "step": 2147 + }, + { + "epoch": 1.2544339829209545, + "grad_norm": 0.09257488314179184, + "learning_rate": 7.335559681830058e-06, + "loss": 0.3439, + "step": 2148 + }, + { + "epoch": 1.2550178819064302, + "grad_norm": 0.08532806334972541, + "learning_rate": 7.325733008590053e-06, + "loss": 0.3004, + "step": 2149 + }, + { + "epoch": 1.2556017808919058, + "grad_norm": 0.08535436371065264, + "learning_rate": 7.315909115838367e-06, + "loss": 0.3178, + "step": 2150 + }, + { + "epoch": 1.2561856798773812, + "grad_norm": 0.07842060269302502, + "learning_rate": 7.306088013789097e-06, + "loss": 0.2919, + "step": 2151 + }, + { + "epoch": 1.2567695788628568, + "grad_norm": 0.09659238495831153, + "learning_rate": 7.296269712653436e-06, + "loss": 0.3769, + "step": 2152 + }, + { + "epoch": 1.2573534778483322, + "grad_norm": 0.08957122963897593, + "learning_rate": 7.28645422263967e-06, + "loss": 0.2861, + "step": 2153 + }, + { + "epoch": 1.2579373768338078, + "grad_norm": 0.08855421026899454, + "learning_rate": 7.27664155395315e-06, + "loss": 0.3635, + "step": 2154 + }, + { + "epoch": 1.2585212758192832, + "grad_norm": 0.08603340736389661, + "learning_rate": 7.266831716796307e-06, + "loss": 0.3123, + "step": 2155 + }, + { + "epoch": 1.2591051748047588, + "grad_norm": 0.08390293651041925, + "learning_rate": 7.257024721368624e-06, + "loss": 0.3079, + "step": 2156 + }, + { + "epoch": 1.2596890737902342, + "grad_norm": 0.08585749084197501, + "learning_rate": 7.247220577866625e-06, + "loss": 0.2973, + "step": 2157 + }, + { + "epoch": 1.2602729727757098, + "grad_norm": 0.08752801496908506, + "learning_rate": 7.237419296483876e-06, + "loss": 0.3127, + "step": 2158 + }, + { + "epoch": 1.2608568717611854, + "grad_norm": 0.08607776861634736, + "learning_rate": 7.227620887410958e-06, + "loss": 0.3293, + "step": 2159 + }, + { + "epoch": 1.2614407707466608, + "grad_norm": 0.0855948674623974, + "learning_rate": 7.217825360835475e-06, + "loss": 0.3294, + "step": 2160 + }, + { + "epoch": 1.2620246697321362, + "grad_norm": 0.08438160607030644, + "learning_rate": 
7.208032726942027e-06, + "loss": 0.3088, + "step": 2161 + }, + { + "epoch": 1.2626085687176118, + "grad_norm": 0.08408724429883209, + "learning_rate": 7.198242995912211e-06, + "loss": 0.2711, + "step": 2162 + }, + { + "epoch": 1.2631924677030875, + "grad_norm": 0.0905791674999825, + "learning_rate": 7.1884561779246055e-06, + "loss": 0.3465, + "step": 2163 + }, + { + "epoch": 1.2637763666885629, + "grad_norm": 0.08661087049744949, + "learning_rate": 7.178672283154756e-06, + "loss": 0.3134, + "step": 2164 + }, + { + "epoch": 1.2643602656740383, + "grad_norm": 0.09017883695829913, + "learning_rate": 7.168891321775172e-06, + "loss": 0.3735, + "step": 2165 + }, + { + "epoch": 1.2649441646595139, + "grad_norm": 0.08446965060452846, + "learning_rate": 7.159113303955314e-06, + "loss": 0.3, + "step": 2166 + }, + { + "epoch": 1.2655280636449895, + "grad_norm": 0.08253933103553424, + "learning_rate": 7.149338239861579e-06, + "loss": 0.2883, + "step": 2167 + }, + { + "epoch": 1.2661119626304649, + "grad_norm": 0.08575623719159506, + "learning_rate": 7.139566139657298e-06, + "loss": 0.3233, + "step": 2168 + }, + { + "epoch": 1.2666958616159405, + "grad_norm": 0.08418084973676067, + "learning_rate": 7.129797013502713e-06, + "loss": 0.3132, + "step": 2169 + }, + { + "epoch": 1.267279760601416, + "grad_norm": 0.08828193297500339, + "learning_rate": 7.12003087155498e-06, + "loss": 0.3175, + "step": 2170 + }, + { + "epoch": 1.2678636595868915, + "grad_norm": 0.09080420646597591, + "learning_rate": 7.110267723968147e-06, + "loss": 0.3234, + "step": 2171 + }, + { + "epoch": 1.268447558572367, + "grad_norm": 0.08552695616141476, + "learning_rate": 7.100507580893156e-06, + "loss": 0.3242, + "step": 2172 + }, + { + "epoch": 1.2690314575578425, + "grad_norm": 0.07819110042017013, + "learning_rate": 7.09075045247782e-06, + "loss": 0.2996, + "step": 2173 + }, + { + "epoch": 1.269615356543318, + "grad_norm": 0.08465204152697191, + "learning_rate": 7.080996348866817e-06, + "loss": 0.3041, + "step": 2174 + }, + { + "epoch": 1.2701992555287935, + "grad_norm": 0.08197392828512999, + "learning_rate": 7.071245280201682e-06, + "loss": 0.2885, + "step": 2175 + }, + { + "epoch": 1.2707831545142692, + "grad_norm": 0.09135695794292213, + "learning_rate": 7.061497256620793e-06, + "loss": 0.3424, + "step": 2176 + }, + { + "epoch": 1.2713670534997445, + "grad_norm": 0.09016288741766847, + "learning_rate": 7.051752288259366e-06, + "loss": 0.3095, + "step": 2177 + }, + { + "epoch": 1.27195095248522, + "grad_norm": 0.08689267685938844, + "learning_rate": 7.042010385249433e-06, + "loss": 0.3023, + "step": 2178 + }, + { + "epoch": 1.2725348514706956, + "grad_norm": 0.08701026711324569, + "learning_rate": 7.032271557719847e-06, + "loss": 0.3456, + "step": 2179 + }, + { + "epoch": 1.2731187504561712, + "grad_norm": 0.0877401259725806, + "learning_rate": 7.022535815796261e-06, + "loss": 0.3444, + "step": 2180 + }, + { + "epoch": 1.2737026494416466, + "grad_norm": 0.09311060888179258, + "learning_rate": 7.012803169601118e-06, + "loss": 0.3072, + "step": 2181 + }, + { + "epoch": 1.2742865484271222, + "grad_norm": 0.0877088562797437, + "learning_rate": 7.003073629253638e-06, + "loss": 0.2839, + "step": 2182 + }, + { + "epoch": 1.2748704474125976, + "grad_norm": 0.08818796564307313, + "learning_rate": 6.9933472048698225e-06, + "loss": 0.3098, + "step": 2183 + }, + { + "epoch": 1.2754543463980732, + "grad_norm": 0.09359175728029971, + "learning_rate": 6.983623906562422e-06, + "loss": 0.3487, + "step": 2184 + }, + { + "epoch": 
1.2760382453835486, + "grad_norm": 0.08516415412088504, + "learning_rate": 6.973903744440949e-06, + "loss": 0.2897, + "step": 2185 + }, + { + "epoch": 1.2766221443690242, + "grad_norm": 0.09261767199327858, + "learning_rate": 6.964186728611644e-06, + "loss": 0.3368, + "step": 2186 + }, + { + "epoch": 1.2772060433544996, + "grad_norm": 0.09248284264252786, + "learning_rate": 6.954472869177479e-06, + "loss": 0.3361, + "step": 2187 + }, + { + "epoch": 1.2777899423399752, + "grad_norm": 0.08569528811744197, + "learning_rate": 6.944762176238149e-06, + "loss": 0.3231, + "step": 2188 + }, + { + "epoch": 1.2783738413254506, + "grad_norm": 0.08942214516761585, + "learning_rate": 6.935054659890053e-06, + "loss": 0.335, + "step": 2189 + }, + { + "epoch": 1.2789577403109262, + "grad_norm": 0.09616967050095145, + "learning_rate": 6.9253503302262855e-06, + "loss": 0.3502, + "step": 2190 + }, + { + "epoch": 1.2795416392964016, + "grad_norm": 0.08420300863991267, + "learning_rate": 6.915649197336638e-06, + "loss": 0.3064, + "step": 2191 + }, + { + "epoch": 1.2801255382818773, + "grad_norm": 0.08412636442580161, + "learning_rate": 6.905951271307561e-06, + "loss": 0.316, + "step": 2192 + }, + { + "epoch": 1.2807094372673529, + "grad_norm": 0.09616666932174228, + "learning_rate": 6.896256562222184e-06, + "loss": 0.3416, + "step": 2193 + }, + { + "epoch": 1.2812933362528283, + "grad_norm": 0.1008877355099578, + "learning_rate": 6.8865650801602855e-06, + "loss": 0.42, + "step": 2194 + }, + { + "epoch": 1.2818772352383037, + "grad_norm": 0.07988114301219877, + "learning_rate": 6.8768768351982964e-06, + "loss": 0.2824, + "step": 2195 + }, + { + "epoch": 1.2824611342237793, + "grad_norm": 0.08236933044443509, + "learning_rate": 6.867191837409275e-06, + "loss": 0.2913, + "step": 2196 + }, + { + "epoch": 1.283045033209255, + "grad_norm": 0.09080908360713386, + "learning_rate": 6.857510096862901e-06, + "loss": 0.3338, + "step": 2197 + }, + { + "epoch": 1.2836289321947303, + "grad_norm": 0.09112184857960147, + "learning_rate": 6.847831623625476e-06, + "loss": 0.3542, + "step": 2198 + }, + { + "epoch": 1.284212831180206, + "grad_norm": 0.08492954494818158, + "learning_rate": 6.8381564277598974e-06, + "loss": 0.2895, + "step": 2199 + }, + { + "epoch": 1.2847967301656813, + "grad_norm": 0.08764422666889649, + "learning_rate": 6.82848451932566e-06, + "loss": 0.324, + "step": 2200 + }, + { + "epoch": 1.285380629151157, + "grad_norm": 0.10242026715071606, + "learning_rate": 6.81881590837884e-06, + "loss": 0.406, + "step": 2201 + }, + { + "epoch": 1.2859645281366323, + "grad_norm": 0.08597184043256181, + "learning_rate": 6.809150604972079e-06, + "loss": 0.2856, + "step": 2202 + }, + { + "epoch": 1.286548427122108, + "grad_norm": 0.07621260625413552, + "learning_rate": 6.799488619154586e-06, + "loss": 0.2734, + "step": 2203 + }, + { + "epoch": 1.2871323261075833, + "grad_norm": 0.09336758701902104, + "learning_rate": 6.7898299609721186e-06, + "loss": 0.3259, + "step": 2204 + }, + { + "epoch": 1.287716225093059, + "grad_norm": 0.09711728283209334, + "learning_rate": 6.780174640466976e-06, + "loss": 0.3388, + "step": 2205 + }, + { + "epoch": 1.2883001240785343, + "grad_norm": 0.10620903445766622, + "learning_rate": 6.7705226676779855e-06, + "loss": 0.3508, + "step": 2206 + }, + { + "epoch": 1.28888402306401, + "grad_norm": 0.08626800542483866, + "learning_rate": 6.760874052640494e-06, + "loss": 0.2895, + "step": 2207 + }, + { + "epoch": 1.2894679220494853, + "grad_norm": 0.08031672973715964, + "learning_rate": 
6.751228805386363e-06, + "loss": 0.2987, + "step": 2208 + }, + { + "epoch": 1.290051821034961, + "grad_norm": 0.08976524286231323, + "learning_rate": 6.741586935943937e-06, + "loss": 0.3321, + "step": 2209 + }, + { + "epoch": 1.2906357200204366, + "grad_norm": 0.08745319732277276, + "learning_rate": 6.731948454338064e-06, + "loss": 0.293, + "step": 2210 + }, + { + "epoch": 1.291219619005912, + "grad_norm": 0.09028163718088607, + "learning_rate": 6.7223133705900635e-06, + "loss": 0.3413, + "step": 2211 + }, + { + "epoch": 1.2918035179913874, + "grad_norm": 0.08892035818716731, + "learning_rate": 6.712681694717723e-06, + "loss": 0.3291, + "step": 2212 + }, + { + "epoch": 1.292387416976863, + "grad_norm": 0.0940637969912293, + "learning_rate": 6.7030534367352884e-06, + "loss": 0.3441, + "step": 2213 + }, + { + "epoch": 1.2929713159623386, + "grad_norm": 0.09422753340458027, + "learning_rate": 6.693428606653445e-06, + "loss": 0.3624, + "step": 2214 + }, + { + "epoch": 1.293555214947814, + "grad_norm": 0.09483873426990524, + "learning_rate": 6.683807214479323e-06, + "loss": 0.332, + "step": 2215 + }, + { + "epoch": 1.2941391139332896, + "grad_norm": 0.08995445952764203, + "learning_rate": 6.6741892702164735e-06, + "loss": 0.3431, + "step": 2216 + }, + { + "epoch": 1.294723012918765, + "grad_norm": 0.0837743296916354, + "learning_rate": 6.664574783864862e-06, + "loss": 0.3048, + "step": 2217 + }, + { + "epoch": 1.2953069119042406, + "grad_norm": 0.08744013428007931, + "learning_rate": 6.654963765420866e-06, + "loss": 0.3184, + "step": 2218 + }, + { + "epoch": 1.295890810889716, + "grad_norm": 0.07554607588145608, + "learning_rate": 6.645356224877242e-06, + "loss": 0.2774, + "step": 2219 + }, + { + "epoch": 1.2964747098751916, + "grad_norm": 0.09097309522067505, + "learning_rate": 6.635752172223146e-06, + "loss": 0.3421, + "step": 2220 + }, + { + "epoch": 1.297058608860667, + "grad_norm": 0.08233005581478425, + "learning_rate": 6.626151617444103e-06, + "loss": 0.3005, + "step": 2221 + }, + { + "epoch": 1.2976425078461427, + "grad_norm": 0.08752739492407434, + "learning_rate": 6.6165545705219955e-06, + "loss": 0.2901, + "step": 2222 + }, + { + "epoch": 1.298226406831618, + "grad_norm": 0.09068936353364801, + "learning_rate": 6.606961041435068e-06, + "loss": 0.3684, + "step": 2223 + }, + { + "epoch": 1.2988103058170937, + "grad_norm": 0.08982675126176749, + "learning_rate": 6.5973710401578985e-06, + "loss": 0.3381, + "step": 2224 + }, + { + "epoch": 1.299394204802569, + "grad_norm": 0.09074750032597248, + "learning_rate": 6.587784576661401e-06, + "loss": 0.2994, + "step": 2225 + }, + { + "epoch": 1.2999781037880447, + "grad_norm": 0.08729585544203403, + "learning_rate": 6.578201660912814e-06, + "loss": 0.2869, + "step": 2226 + }, + { + "epoch": 1.3005620027735203, + "grad_norm": 0.09495752202530981, + "learning_rate": 6.568622302875682e-06, + "loss": 0.3491, + "step": 2227 + }, + { + "epoch": 1.3011459017589957, + "grad_norm": 0.08549156796822828, + "learning_rate": 6.559046512509859e-06, + "loss": 0.2931, + "step": 2228 + }, + { + "epoch": 1.301729800744471, + "grad_norm": 0.08471469635005309, + "learning_rate": 6.5494742997714765e-06, + "loss": 0.292, + "step": 2229 + }, + { + "epoch": 1.3023136997299467, + "grad_norm": 0.0820828483989417, + "learning_rate": 6.539905674612956e-06, + "loss": 0.3271, + "step": 2230 + }, + { + "epoch": 1.3028975987154223, + "grad_norm": 0.08652593876725581, + "learning_rate": 6.530340646982987e-06, + "loss": 0.34, + "step": 2231 + }, + { + "epoch": 
1.3034814977008977, + "grad_norm": 0.09132262828249602, + "learning_rate": 6.520779226826517e-06, + "loss": 0.2995, + "step": 2232 + }, + { + "epoch": 1.3040653966863733, + "grad_norm": 0.08362934650429593, + "learning_rate": 6.511221424084748e-06, + "loss": 0.2908, + "step": 2233 + }, + { + "epoch": 1.3046492956718487, + "grad_norm": 0.08992030898838053, + "learning_rate": 6.501667248695107e-06, + "loss": 0.3082, + "step": 2234 + }, + { + "epoch": 1.3052331946573243, + "grad_norm": 0.07898833881846185, + "learning_rate": 6.4921167105912696e-06, + "loss": 0.2594, + "step": 2235 + }, + { + "epoch": 1.3058170936427997, + "grad_norm": 0.0876149955839265, + "learning_rate": 6.482569819703117e-06, + "loss": 0.2913, + "step": 2236 + }, + { + "epoch": 1.3064009926282754, + "grad_norm": 0.09193639934893987, + "learning_rate": 6.473026585956736e-06, + "loss": 0.3655, + "step": 2237 + }, + { + "epoch": 1.3069848916137508, + "grad_norm": 0.09575444768370414, + "learning_rate": 6.4634870192744205e-06, + "loss": 0.3287, + "step": 2238 + }, + { + "epoch": 1.3075687905992264, + "grad_norm": 0.08852133768890016, + "learning_rate": 6.453951129574644e-06, + "loss": 0.3027, + "step": 2239 + }, + { + "epoch": 1.3081526895847018, + "grad_norm": 0.09326848511389527, + "learning_rate": 6.44441892677206e-06, + "loss": 0.3492, + "step": 2240 + }, + { + "epoch": 1.3087365885701774, + "grad_norm": 0.09086327815404624, + "learning_rate": 6.434890420777491e-06, + "loss": 0.3115, + "step": 2241 + }, + { + "epoch": 1.3093204875556528, + "grad_norm": 0.08772912315526857, + "learning_rate": 6.42536562149791e-06, + "loss": 0.3071, + "step": 2242 + }, + { + "epoch": 1.3099043865411284, + "grad_norm": 0.0922033033400455, + "learning_rate": 6.41584453883644e-06, + "loss": 0.3093, + "step": 2243 + }, + { + "epoch": 1.310488285526604, + "grad_norm": 0.08936009136418017, + "learning_rate": 6.40632718269234e-06, + "loss": 0.307, + "step": 2244 + }, + { + "epoch": 1.3110721845120794, + "grad_norm": 0.09021718135916507, + "learning_rate": 6.396813562960993e-06, + "loss": 0.3061, + "step": 2245 + }, + { + "epoch": 1.3116560834975548, + "grad_norm": 0.08992121526506444, + "learning_rate": 6.387303689533899e-06, + "loss": 0.2875, + "step": 2246 + }, + { + "epoch": 1.3122399824830304, + "grad_norm": 0.0852803727394185, + "learning_rate": 6.377797572298661e-06, + "loss": 0.3437, + "step": 2247 + }, + { + "epoch": 1.312823881468506, + "grad_norm": 0.0908064451701135, + "learning_rate": 6.3682952211389735e-06, + "loss": 0.3314, + "step": 2248 + }, + { + "epoch": 1.3134077804539814, + "grad_norm": 0.08487180971281678, + "learning_rate": 6.358796645934624e-06, + "loss": 0.3001, + "step": 2249 + }, + { + "epoch": 1.313991679439457, + "grad_norm": 0.08497997587352095, + "learning_rate": 6.349301856561468e-06, + "loss": 0.338, + "step": 2250 + }, + { + "epoch": 1.3145755784249324, + "grad_norm": 0.0894974313476187, + "learning_rate": 6.3398108628914264e-06, + "loss": 0.3287, + "step": 2251 + }, + { + "epoch": 1.315159477410408, + "grad_norm": 0.0844865210053765, + "learning_rate": 6.330323674792472e-06, + "loss": 0.3276, + "step": 2252 + }, + { + "epoch": 1.3157433763958835, + "grad_norm": 0.08265232853869993, + "learning_rate": 6.320840302128619e-06, + "loss": 0.3115, + "step": 2253 + }, + { + "epoch": 1.316327275381359, + "grad_norm": 0.08207649027856014, + "learning_rate": 6.311360754759923e-06, + "loss": 0.3391, + "step": 2254 + }, + { + "epoch": 1.3169111743668345, + "grad_norm": 0.08636987716456682, + "learning_rate": 
6.301885042542455e-06, + "loss": 0.3155, + "step": 2255 + }, + { + "epoch": 1.31749507335231, + "grad_norm": 0.08927553627852344, + "learning_rate": 6.292413175328302e-06, + "loss": 0.3288, + "step": 2256 + }, + { + "epoch": 1.3180789723377855, + "grad_norm": 0.08785150379664414, + "learning_rate": 6.282945162965548e-06, + "loss": 0.3242, + "step": 2257 + }, + { + "epoch": 1.318662871323261, + "grad_norm": 0.08251333861950808, + "learning_rate": 6.273481015298275e-06, + "loss": 0.3035, + "step": 2258 + }, + { + "epoch": 1.3192467703087365, + "grad_norm": 0.0880586319347823, + "learning_rate": 6.264020742166543e-06, + "loss": 0.3199, + "step": 2259 + }, + { + "epoch": 1.319830669294212, + "grad_norm": 0.08587110021390001, + "learning_rate": 6.2545643534063894e-06, + "loss": 0.3233, + "step": 2260 + }, + { + "epoch": 1.3204145682796877, + "grad_norm": 0.08970088554360765, + "learning_rate": 6.245111858849808e-06, + "loss": 0.3143, + "step": 2261 + }, + { + "epoch": 1.3209984672651631, + "grad_norm": 0.08672468431565483, + "learning_rate": 6.235663268324735e-06, + "loss": 0.3038, + "step": 2262 + }, + { + "epoch": 1.3215823662506385, + "grad_norm": 0.09465642014809919, + "learning_rate": 6.226218591655071e-06, + "loss": 0.3288, + "step": 2263 + }, + { + "epoch": 1.3221662652361141, + "grad_norm": 0.09375551208713268, + "learning_rate": 6.216777838660627e-06, + "loss": 0.3233, + "step": 2264 + }, + { + "epoch": 1.3227501642215898, + "grad_norm": 0.08963715042018003, + "learning_rate": 6.2073410191571395e-06, + "loss": 0.3401, + "step": 2265 + }, + { + "epoch": 1.3233340632070651, + "grad_norm": 0.08409008280490306, + "learning_rate": 6.1979081429562575e-06, + "loss": 0.3043, + "step": 2266 + }, + { + "epoch": 1.3239179621925408, + "grad_norm": 0.08421715359657778, + "learning_rate": 6.188479219865529e-06, + "loss": 0.3294, + "step": 2267 + }, + { + "epoch": 1.3245018611780162, + "grad_norm": 0.09152018200214336, + "learning_rate": 6.179054259688393e-06, + "loss": 0.2975, + "step": 2268 + }, + { + "epoch": 1.3250857601634918, + "grad_norm": 0.08465863249059051, + "learning_rate": 6.169633272224167e-06, + "loss": 0.2869, + "step": 2269 + }, + { + "epoch": 1.3256696591489672, + "grad_norm": 0.08192991638661111, + "learning_rate": 6.160216267268037e-06, + "loss": 0.2994, + "step": 2270 + }, + { + "epoch": 1.3262535581344428, + "grad_norm": 0.08416419878399281, + "learning_rate": 6.1508032546110485e-06, + "loss": 0.3328, + "step": 2271 + }, + { + "epoch": 1.3268374571199182, + "grad_norm": 0.09097247349739727, + "learning_rate": 6.1413942440400994e-06, + "loss": 0.336, + "step": 2272 + }, + { + "epoch": 1.3274213561053938, + "grad_norm": 0.09588493043502216, + "learning_rate": 6.1319892453379235e-06, + "loss": 0.3499, + "step": 2273 + }, + { + "epoch": 1.3280052550908692, + "grad_norm": 0.10019225153311204, + "learning_rate": 6.122588268283085e-06, + "loss": 0.3476, + "step": 2274 + }, + { + "epoch": 1.3285891540763448, + "grad_norm": 0.07885383963938226, + "learning_rate": 6.113191322649964e-06, + "loss": 0.2745, + "step": 2275 + }, + { + "epoch": 1.3291730530618202, + "grad_norm": 0.0961393669052555, + "learning_rate": 6.10379841820875e-06, + "loss": 0.3563, + "step": 2276 + }, + { + "epoch": 1.3297569520472958, + "grad_norm": 0.09079312175679641, + "learning_rate": 6.094409564725435e-06, + "loss": 0.3322, + "step": 2277 + }, + { + "epoch": 1.3303408510327714, + "grad_norm": 0.08529134769716017, + "learning_rate": 6.085024771961792e-06, + "loss": 0.3068, + "step": 2278 + }, + { + "epoch": 
1.3309247500182468, + "grad_norm": 0.09095069977334372, + "learning_rate": 6.07564404967538e-06, + "loss": 0.3196, + "step": 2279 + }, + { + "epoch": 1.3315086490037222, + "grad_norm": 0.09220079070713902, + "learning_rate": 6.06626740761952e-06, + "loss": 0.3174, + "step": 2280 + }, + { + "epoch": 1.3320925479891979, + "grad_norm": 0.09315870355841284, + "learning_rate": 6.056894855543289e-06, + "loss": 0.3042, + "step": 2281 + }, + { + "epoch": 1.3326764469746735, + "grad_norm": 0.0927035883105029, + "learning_rate": 6.047526403191517e-06, + "loss": 0.3383, + "step": 2282 + }, + { + "epoch": 1.3332603459601489, + "grad_norm": 0.0934526495055974, + "learning_rate": 6.038162060304771e-06, + "loss": 0.3471, + "step": 2283 + }, + { + "epoch": 1.3338442449456245, + "grad_norm": 0.08308904613200654, + "learning_rate": 6.028801836619345e-06, + "loss": 0.3266, + "step": 2284 + }, + { + "epoch": 1.3344281439310999, + "grad_norm": 0.08711519509926798, + "learning_rate": 6.019445741867245e-06, + "loss": 0.3309, + "step": 2285 + }, + { + "epoch": 1.3350120429165755, + "grad_norm": 0.09850579396456162, + "learning_rate": 6.010093785776188e-06, + "loss": 0.3188, + "step": 2286 + }, + { + "epoch": 1.3355959419020509, + "grad_norm": 0.08664242125847466, + "learning_rate": 6.0007459780695885e-06, + "loss": 0.3119, + "step": 2287 + }, + { + "epoch": 1.3361798408875265, + "grad_norm": 0.08804636783157405, + "learning_rate": 5.991402328466549e-06, + "loss": 0.2901, + "step": 2288 + }, + { + "epoch": 1.336763739873002, + "grad_norm": 0.0897055635498984, + "learning_rate": 5.982062846681848e-06, + "loss": 0.3325, + "step": 2289 + }, + { + "epoch": 1.3373476388584775, + "grad_norm": 0.08868624138802335, + "learning_rate": 5.97272754242592e-06, + "loss": 0.3018, + "step": 2290 + }, + { + "epoch": 1.3379315378439531, + "grad_norm": 0.09267649539708721, + "learning_rate": 5.963396425404877e-06, + "loss": 0.3549, + "step": 2291 + }, + { + "epoch": 1.3385154368294285, + "grad_norm": 0.0905492375222619, + "learning_rate": 5.954069505320466e-06, + "loss": 0.2922, + "step": 2292 + }, + { + "epoch": 1.339099335814904, + "grad_norm": 0.07938074202567831, + "learning_rate": 5.944746791870062e-06, + "loss": 0.2841, + "step": 2293 + }, + { + "epoch": 1.3396832348003795, + "grad_norm": 0.09491290505536017, + "learning_rate": 5.935428294746679e-06, + "loss": 0.3233, + "step": 2294 + }, + { + "epoch": 1.3402671337858552, + "grad_norm": 0.08982539093778345, + "learning_rate": 5.926114023638944e-06, + "loss": 0.3142, + "step": 2295 + }, + { + "epoch": 1.3408510327713306, + "grad_norm": 0.08415242787315182, + "learning_rate": 5.916803988231087e-06, + "loss": 0.2782, + "step": 2296 + }, + { + "epoch": 1.341434931756806, + "grad_norm": 0.0901244823907529, + "learning_rate": 5.907498198202939e-06, + "loss": 0.3413, + "step": 2297 + }, + { + "epoch": 1.3420188307422816, + "grad_norm": 0.0854653562708484, + "learning_rate": 5.898196663229912e-06, + "loss": 0.3112, + "step": 2298 + }, + { + "epoch": 1.3426027297277572, + "grad_norm": 0.09657466161365869, + "learning_rate": 5.888899392982994e-06, + "loss": 0.3395, + "step": 2299 + }, + { + "epoch": 1.3431866287132326, + "grad_norm": 0.08933989483018863, + "learning_rate": 5.879606397128743e-06, + "loss": 0.3175, + "step": 2300 + }, + { + "epoch": 1.3437705276987082, + "grad_norm": 0.08793395769254472, + "learning_rate": 5.8703176853292705e-06, + "loss": 0.3238, + "step": 2301 + }, + { + "epoch": 1.3443544266841836, + "grad_norm": 0.09489643414218522, + "learning_rate": 
5.861033267242238e-06, + "loss": 0.3194, + "step": 2302 + }, + { + "epoch": 1.3449383256696592, + "grad_norm": 0.08146929049976696, + "learning_rate": 5.85175315252083e-06, + "loss": 0.2864, + "step": 2303 + }, + { + "epoch": 1.3455222246551346, + "grad_norm": 0.08962029537333602, + "learning_rate": 5.842477350813773e-06, + "loss": 0.3215, + "step": 2304 + }, + { + "epoch": 1.3461061236406102, + "grad_norm": 0.09932078165586779, + "learning_rate": 5.833205871765297e-06, + "loss": 0.4093, + "step": 2305 + }, + { + "epoch": 1.3466900226260856, + "grad_norm": 0.08264374264297154, + "learning_rate": 5.823938725015148e-06, + "loss": 0.2694, + "step": 2306 + }, + { + "epoch": 1.3472739216115612, + "grad_norm": 0.09210225730665865, + "learning_rate": 5.8146759201985525e-06, + "loss": 0.3423, + "step": 2307 + }, + { + "epoch": 1.3478578205970368, + "grad_norm": 0.09642793780094958, + "learning_rate": 5.8054174669462425e-06, + "loss": 0.3438, + "step": 2308 + }, + { + "epoch": 1.3484417195825122, + "grad_norm": 0.0941957871039035, + "learning_rate": 5.796163374884406e-06, + "loss": 0.3191, + "step": 2309 + }, + { + "epoch": 1.3490256185679876, + "grad_norm": 0.08477501235147696, + "learning_rate": 5.786913653634714e-06, + "loss": 0.2993, + "step": 2310 + }, + { + "epoch": 1.3496095175534633, + "grad_norm": 0.1040868923852869, + "learning_rate": 5.77766831281428e-06, + "loss": 0.3713, + "step": 2311 + }, + { + "epoch": 1.3501934165389389, + "grad_norm": 0.08889556568888123, + "learning_rate": 5.768427362035665e-06, + "loss": 0.3343, + "step": 2312 + }, + { + "epoch": 1.3507773155244143, + "grad_norm": 0.08833662100742769, + "learning_rate": 5.759190810906876e-06, + "loss": 0.2917, + "step": 2313 + }, + { + "epoch": 1.3513612145098897, + "grad_norm": 0.08762407991793804, + "learning_rate": 5.749958669031329e-06, + "loss": 0.2977, + "step": 2314 + }, + { + "epoch": 1.3519451134953653, + "grad_norm": 0.09611865005319543, + "learning_rate": 5.740730946007874e-06, + "loss": 0.3328, + "step": 2315 + }, + { + "epoch": 1.352529012480841, + "grad_norm": 0.09744197012244712, + "learning_rate": 5.7315076514307535e-06, + "loss": 0.3483, + "step": 2316 + }, + { + "epoch": 1.3531129114663163, + "grad_norm": 0.08736967597051962, + "learning_rate": 5.722288794889603e-06, + "loss": 0.3081, + "step": 2317 + }, + { + "epoch": 1.353696810451792, + "grad_norm": 0.08696178625041079, + "learning_rate": 5.713074385969457e-06, + "loss": 0.2967, + "step": 2318 + }, + { + "epoch": 1.3542807094372673, + "grad_norm": 0.08809003623691669, + "learning_rate": 5.703864434250721e-06, + "loss": 0.3138, + "step": 2319 + }, + { + "epoch": 1.354864608422743, + "grad_norm": 0.08923836295390841, + "learning_rate": 5.694658949309158e-06, + "loss": 0.2979, + "step": 2320 + }, + { + "epoch": 1.3554485074082183, + "grad_norm": 0.0914873273755702, + "learning_rate": 5.685457940715898e-06, + "loss": 0.3161, + "step": 2321 + }, + { + "epoch": 1.356032406393694, + "grad_norm": 0.08458578739512944, + "learning_rate": 5.67626141803741e-06, + "loss": 0.2973, + "step": 2322 + }, + { + "epoch": 1.3566163053791693, + "grad_norm": 0.08919375028311262, + "learning_rate": 5.667069390835496e-06, + "loss": 0.3073, + "step": 2323 + }, + { + "epoch": 1.357200204364645, + "grad_norm": 0.09638264898662965, + "learning_rate": 5.657881868667296e-06, + "loss": 0.3423, + "step": 2324 + }, + { + "epoch": 1.3577841033501206, + "grad_norm": 0.08514646725897815, + "learning_rate": 5.648698861085254e-06, + "loss": 0.2919, + "step": 2325 + }, + { + "epoch": 
1.358368002335596, + "grad_norm": 0.08898868734936971, + "learning_rate": 5.639520377637127e-06, + "loss": 0.3064, + "step": 2326 + }, + { + "epoch": 1.3589519013210714, + "grad_norm": 0.09089482632797917, + "learning_rate": 5.630346427865965e-06, + "loss": 0.3123, + "step": 2327 + }, + { + "epoch": 1.359535800306547, + "grad_norm": 0.09382254468991998, + "learning_rate": 5.621177021310101e-06, + "loss": 0.3375, + "step": 2328 + }, + { + "epoch": 1.3601196992920226, + "grad_norm": 0.09318852129177874, + "learning_rate": 5.612012167503157e-06, + "loss": 0.3441, + "step": 2329 + }, + { + "epoch": 1.360703598277498, + "grad_norm": 0.08873060752720996, + "learning_rate": 5.602851875974005e-06, + "loss": 0.3064, + "step": 2330 + }, + { + "epoch": 1.3612874972629734, + "grad_norm": 0.08728502041268663, + "learning_rate": 5.593696156246788e-06, + "loss": 0.314, + "step": 2331 + }, + { + "epoch": 1.361871396248449, + "grad_norm": 0.0927463275361709, + "learning_rate": 5.584545017840886e-06, + "loss": 0.3685, + "step": 2332 + }, + { + "epoch": 1.3624552952339246, + "grad_norm": 0.09369001279195396, + "learning_rate": 5.575398470270913e-06, + "loss": 0.3686, + "step": 2333 + }, + { + "epoch": 1.3630391942194, + "grad_norm": 0.08668617407630662, + "learning_rate": 5.566256523046727e-06, + "loss": 0.2923, + "step": 2334 + }, + { + "epoch": 1.3636230932048756, + "grad_norm": 0.0888838804135982, + "learning_rate": 5.5571191856733795e-06, + "loss": 0.311, + "step": 2335 + }, + { + "epoch": 1.364206992190351, + "grad_norm": 0.09254991206820427, + "learning_rate": 5.547986467651152e-06, + "loss": 0.3295, + "step": 2336 + }, + { + "epoch": 1.3647908911758266, + "grad_norm": 0.09068196504570432, + "learning_rate": 5.538858378475508e-06, + "loss": 0.3497, + "step": 2337 + }, + { + "epoch": 1.365374790161302, + "grad_norm": 0.1034669344195245, + "learning_rate": 5.529734927637096e-06, + "loss": 0.3083, + "step": 2338 + }, + { + "epoch": 1.3659586891467776, + "grad_norm": 0.08684356593366298, + "learning_rate": 5.520616124621759e-06, + "loss": 0.308, + "step": 2339 + }, + { + "epoch": 1.366542588132253, + "grad_norm": 0.0836675718229037, + "learning_rate": 5.511501978910488e-06, + "loss": 0.2955, + "step": 2340 + }, + { + "epoch": 1.3671264871177287, + "grad_norm": 0.08678145120349268, + "learning_rate": 5.50239249997945e-06, + "loss": 0.3173, + "step": 2341 + }, + { + "epoch": 1.3677103861032043, + "grad_norm": 0.09463097829253442, + "learning_rate": 5.493287697299943e-06, + "loss": 0.349, + "step": 2342 + }, + { + "epoch": 1.3682942850886797, + "grad_norm": 0.08915587942394304, + "learning_rate": 5.484187580338409e-06, + "loss": 0.2712, + "step": 2343 + }, + { + "epoch": 1.368878184074155, + "grad_norm": 0.09014612794458693, + "learning_rate": 5.475092158556429e-06, + "loss": 0.3215, + "step": 2344 + }, + { + "epoch": 1.3694620830596307, + "grad_norm": 0.09777350564059445, + "learning_rate": 5.4660014414106825e-06, + "loss": 0.3614, + "step": 2345 + }, + { + "epoch": 1.3700459820451063, + "grad_norm": 0.08131766318418737, + "learning_rate": 5.4569154383529736e-06, + "loss": 0.2895, + "step": 2346 + }, + { + "epoch": 1.3706298810305817, + "grad_norm": 0.08481668626094135, + "learning_rate": 5.447834158830202e-06, + "loss": 0.2973, + "step": 2347 + }, + { + "epoch": 1.371213780016057, + "grad_norm": 0.08857640370260278, + "learning_rate": 5.438757612284348e-06, + "loss": 0.3202, + "step": 2348 + }, + { + "epoch": 1.3717976790015327, + "grad_norm": 0.09360356427689126, + "learning_rate": 5.429685808152483e-06, + 
"loss": 0.3291, + "step": 2349 + }, + { + "epoch": 1.3723815779870083, + "grad_norm": 0.09088786221232266, + "learning_rate": 5.420618755866736e-06, + "loss": 0.3401, + "step": 2350 + }, + { + "epoch": 1.3729654769724837, + "grad_norm": 0.08738037990918432, + "learning_rate": 5.411556464854301e-06, + "loss": 0.3057, + "step": 2351 + }, + { + "epoch": 1.3735493759579593, + "grad_norm": 0.09166121931080977, + "learning_rate": 5.4024989445374245e-06, + "loss": 0.3122, + "step": 2352 + }, + { + "epoch": 1.3741332749434347, + "grad_norm": 0.09067182252476852, + "learning_rate": 5.393446204333386e-06, + "loss": 0.3123, + "step": 2353 + }, + { + "epoch": 1.3747171739289104, + "grad_norm": 0.08584275851832228, + "learning_rate": 5.384398253654504e-06, + "loss": 0.2996, + "step": 2354 + }, + { + "epoch": 1.3753010729143857, + "grad_norm": 0.08953651166555315, + "learning_rate": 5.3753551019081065e-06, + "loss": 0.3319, + "step": 2355 + }, + { + "epoch": 1.3758849718998614, + "grad_norm": 0.08459576431836931, + "learning_rate": 5.366316758496537e-06, + "loss": 0.298, + "step": 2356 + }, + { + "epoch": 1.3764688708853368, + "grad_norm": 0.08688944981926627, + "learning_rate": 5.357283232817147e-06, + "loss": 0.308, + "step": 2357 + }, + { + "epoch": 1.3770527698708124, + "grad_norm": 0.08749063579595985, + "learning_rate": 5.348254534262262e-06, + "loss": 0.3272, + "step": 2358 + }, + { + "epoch": 1.377636668856288, + "grad_norm": 0.09900726384641693, + "learning_rate": 5.339230672219209e-06, + "loss": 0.3622, + "step": 2359 + }, + { + "epoch": 1.3782205678417634, + "grad_norm": 0.0938802197059796, + "learning_rate": 5.330211656070269e-06, + "loss": 0.3331, + "step": 2360 + }, + { + "epoch": 1.3788044668272388, + "grad_norm": 0.09336466505481651, + "learning_rate": 5.3211974951926906e-06, + "loss": 0.3568, + "step": 2361 + }, + { + "epoch": 1.3793883658127144, + "grad_norm": 0.08804332717108786, + "learning_rate": 5.312188198958681e-06, + "loss": 0.3083, + "step": 2362 + }, + { + "epoch": 1.37997226479819, + "grad_norm": 0.09008200820945853, + "learning_rate": 5.303183776735379e-06, + "loss": 0.3028, + "step": 2363 + }, + { + "epoch": 1.3805561637836654, + "grad_norm": 0.0892474916519469, + "learning_rate": 5.294184237884865e-06, + "loss": 0.3144, + "step": 2364 + }, + { + "epoch": 1.3811400627691408, + "grad_norm": 0.08223145774836955, + "learning_rate": 5.2851895917641345e-06, + "loss": 0.306, + "step": 2365 + }, + { + "epoch": 1.3817239617546164, + "grad_norm": 0.08098177841713741, + "learning_rate": 5.276199847725098e-06, + "loss": 0.2933, + "step": 2366 + }, + { + "epoch": 1.382307860740092, + "grad_norm": 0.08688322863987814, + "learning_rate": 5.267215015114574e-06, + "loss": 0.3069, + "step": 2367 + }, + { + "epoch": 1.3828917597255674, + "grad_norm": 0.08378347266843482, + "learning_rate": 5.258235103274265e-06, + "loss": 0.3034, + "step": 2368 + }, + { + "epoch": 1.383475658711043, + "grad_norm": 0.08454380710933902, + "learning_rate": 5.249260121540772e-06, + "loss": 0.2976, + "step": 2369 + }, + { + "epoch": 1.3840595576965184, + "grad_norm": 0.08839053097244433, + "learning_rate": 5.240290079245555e-06, + "loss": 0.2879, + "step": 2370 + }, + { + "epoch": 1.384643456681994, + "grad_norm": 0.09006590705762338, + "learning_rate": 5.231324985714942e-06, + "loss": 0.3213, + "step": 2371 + }, + { + "epoch": 1.3852273556674695, + "grad_norm": 0.09706417197920225, + "learning_rate": 5.222364850270125e-06, + "loss": 0.3494, + "step": 2372 + }, + { + "epoch": 1.385811254652945, + "grad_norm": 
0.08343025312016701, + "learning_rate": 5.213409682227129e-06, + "loss": 0.3262, + "step": 2373 + }, + { + "epoch": 1.3863951536384205, + "grad_norm": 0.08742227091949549, + "learning_rate": 5.204459490896818e-06, + "loss": 0.3178, + "step": 2374 + }, + { + "epoch": 1.386979052623896, + "grad_norm": 0.0797910582517202, + "learning_rate": 5.195514285584893e-06, + "loss": 0.3126, + "step": 2375 + }, + { + "epoch": 1.3875629516093717, + "grad_norm": 0.08858594630841747, + "learning_rate": 5.1865740755918496e-06, + "loss": 0.3168, + "step": 2376 + }, + { + "epoch": 1.388146850594847, + "grad_norm": 0.09631330127847068, + "learning_rate": 5.177638870213008e-06, + "loss": 0.4051, + "step": 2377 + }, + { + "epoch": 1.3887307495803225, + "grad_norm": 0.08355308751976169, + "learning_rate": 5.1687086787384786e-06, + "loss": 0.3115, + "step": 2378 + }, + { + "epoch": 1.3893146485657981, + "grad_norm": 0.0863955143558716, + "learning_rate": 5.1597835104531514e-06, + "loss": 0.3057, + "step": 2379 + }, + { + "epoch": 1.3898985475512737, + "grad_norm": 0.07591133976512111, + "learning_rate": 5.1508633746367075e-06, + "loss": 0.2764, + "step": 2380 + }, + { + "epoch": 1.3904824465367491, + "grad_norm": 0.08008593536481562, + "learning_rate": 5.141948280563582e-06, + "loss": 0.2802, + "step": 2381 + }, + { + "epoch": 1.3910663455222245, + "grad_norm": 0.08798366857304221, + "learning_rate": 5.133038237502983e-06, + "loss": 0.3115, + "step": 2382 + }, + { + "epoch": 1.3916502445077001, + "grad_norm": 0.0896288630572044, + "learning_rate": 5.1241332547188535e-06, + "loss": 0.3092, + "step": 2383 + }, + { + "epoch": 1.3922341434931758, + "grad_norm": 0.09434405164124895, + "learning_rate": 5.1152333414698774e-06, + "loss": 0.3169, + "step": 2384 + }, + { + "epoch": 1.3928180424786512, + "grad_norm": 0.09115571048804291, + "learning_rate": 5.106338507009478e-06, + "loss": 0.3238, + "step": 2385 + }, + { + "epoch": 1.3934019414641268, + "grad_norm": 0.09490079900382614, + "learning_rate": 5.097448760585784e-06, + "loss": 0.3303, + "step": 2386 + }, + { + "epoch": 1.3939858404496022, + "grad_norm": 0.0888260244262516, + "learning_rate": 5.088564111441645e-06, + "loss": 0.3111, + "step": 2387 + }, + { + "epoch": 1.3945697394350778, + "grad_norm": 0.0915085677667555, + "learning_rate": 5.079684568814607e-06, + "loss": 0.3118, + "step": 2388 + }, + { + "epoch": 1.3951536384205532, + "grad_norm": 0.0823939819979883, + "learning_rate": 5.070810141936901e-06, + "loss": 0.2832, + "step": 2389 + }, + { + "epoch": 1.3957375374060288, + "grad_norm": 0.08914409721994397, + "learning_rate": 5.06194084003545e-06, + "loss": 0.3272, + "step": 2390 + }, + { + "epoch": 1.3963214363915042, + "grad_norm": 0.08624331955605972, + "learning_rate": 5.053076672331837e-06, + "loss": 0.293, + "step": 2391 + }, + { + "epoch": 1.3969053353769798, + "grad_norm": 0.08523689707460945, + "learning_rate": 5.04421764804232e-06, + "loss": 0.2851, + "step": 2392 + }, + { + "epoch": 1.3974892343624554, + "grad_norm": 0.09400706998854881, + "learning_rate": 5.035363776377797e-06, + "loss": 0.2878, + "step": 2393 + }, + { + "epoch": 1.3980731333479308, + "grad_norm": 0.09569727179332492, + "learning_rate": 5.026515066543813e-06, + "loss": 0.3504, + "step": 2394 + }, + { + "epoch": 1.3986570323334062, + "grad_norm": 0.08909777953613215, + "learning_rate": 5.017671527740551e-06, + "loss": 0.3243, + "step": 2395 + }, + { + "epoch": 1.3992409313188818, + "grad_norm": 0.08829680049071838, + "learning_rate": 5.008833169162805e-06, + "loss": 0.2971, + 
"step": 2396 + }, + { + "epoch": 1.3998248303043574, + "grad_norm": 0.09062963508407255, + "learning_rate": 5.000000000000003e-06, + "loss": 0.3123, + "step": 2397 + }, + { + "epoch": 1.4004087292898328, + "grad_norm": 0.09466043378086396, + "learning_rate": 4.991172029436157e-06, + "loss": 0.3151, + "step": 2398 + }, + { + "epoch": 1.4009926282753082, + "grad_norm": 0.08276613525555017, + "learning_rate": 4.982349266649884e-06, + "loss": 0.2993, + "step": 2399 + }, + { + "epoch": 1.4015765272607839, + "grad_norm": 0.0783168915983525, + "learning_rate": 4.97353172081439e-06, + "loss": 0.2667, + "step": 2400 + }, + { + "epoch": 1.4021604262462595, + "grad_norm": 0.09277063821120075, + "learning_rate": 4.964719401097444e-06, + "loss": 0.3575, + "step": 2401 + }, + { + "epoch": 1.4027443252317349, + "grad_norm": 0.08034353893032221, + "learning_rate": 4.9559123166613935e-06, + "loss": 0.278, + "step": 2402 + }, + { + "epoch": 1.4033282242172105, + "grad_norm": 0.089987609682839, + "learning_rate": 4.947110476663143e-06, + "loss": 0.3279, + "step": 2403 + }, + { + "epoch": 1.4039121232026859, + "grad_norm": 0.08410256785086956, + "learning_rate": 4.93831389025413e-06, + "loss": 0.2947, + "step": 2404 + }, + { + "epoch": 1.4044960221881615, + "grad_norm": 0.08174319185043308, + "learning_rate": 4.9295225665803504e-06, + "loss": 0.2679, + "step": 2405 + }, + { + "epoch": 1.405079921173637, + "grad_norm": 0.0934377459393833, + "learning_rate": 4.9207365147823114e-06, + "loss": 0.3464, + "step": 2406 + }, + { + "epoch": 1.4056638201591125, + "grad_norm": 0.09087806109456224, + "learning_rate": 4.911955743995042e-06, + "loss": 0.3686, + "step": 2407 + }, + { + "epoch": 1.406247719144588, + "grad_norm": 0.08702607275646844, + "learning_rate": 4.90318026334809e-06, + "loss": 0.2951, + "step": 2408 + }, + { + "epoch": 1.4068316181300635, + "grad_norm": 0.08745615324701193, + "learning_rate": 4.894410081965489e-06, + "loss": 0.3125, + "step": 2409 + }, + { + "epoch": 1.4074155171155391, + "grad_norm": 0.08164698269954809, + "learning_rate": 4.885645208965779e-06, + "loss": 0.28, + "step": 2410 + }, + { + "epoch": 1.4079994161010145, + "grad_norm": 0.09556156302921621, + "learning_rate": 4.876885653461967e-06, + "loss": 0.3702, + "step": 2411 + }, + { + "epoch": 1.40858331508649, + "grad_norm": 0.08805956721964225, + "learning_rate": 4.868131424561532e-06, + "loss": 0.297, + "step": 2412 + }, + { + "epoch": 1.4091672140719655, + "grad_norm": 0.08170501075075969, + "learning_rate": 4.859382531366428e-06, + "loss": 0.3054, + "step": 2413 + }, + { + "epoch": 1.4097511130574412, + "grad_norm": 0.09075494274467835, + "learning_rate": 4.850638982973043e-06, + "loss": 0.3416, + "step": 2414 + }, + { + "epoch": 1.4103350120429166, + "grad_norm": 0.09100186519968893, + "learning_rate": 4.841900788472227e-06, + "loss": 0.3102, + "step": 2415 + }, + { + "epoch": 1.410918911028392, + "grad_norm": 0.08782295170474512, + "learning_rate": 4.833167956949249e-06, + "loss": 0.3154, + "step": 2416 + }, + { + "epoch": 1.4115028100138676, + "grad_norm": 0.08189470709325967, + "learning_rate": 4.824440497483802e-06, + "loss": 0.2698, + "step": 2417 + }, + { + "epoch": 1.4120867089993432, + "grad_norm": 0.0923971262599707, + "learning_rate": 4.815718419150007e-06, + "loss": 0.2986, + "step": 2418 + }, + { + "epoch": 1.4126706079848186, + "grad_norm": 0.08991001348597132, + "learning_rate": 4.807001731016374e-06, + "loss": 0.3356, + "step": 2419 + }, + { + "epoch": 1.4132545069702942, + "grad_norm": 0.09044980403159322, + 
"learning_rate": 4.7982904421458245e-06, + "loss": 0.3189, + "step": 2420 + }, + { + "epoch": 1.4138384059557696, + "grad_norm": 0.08840491072029048, + "learning_rate": 4.789584561595651e-06, + "loss": 0.2987, + "step": 2421 + }, + { + "epoch": 1.4144223049412452, + "grad_norm": 0.0945298079999019, + "learning_rate": 4.780884098417531e-06, + "loss": 0.3517, + "step": 2422 + }, + { + "epoch": 1.4150062039267206, + "grad_norm": 0.09052684780179952, + "learning_rate": 4.772189061657511e-06, + "loss": 0.3092, + "step": 2423 + }, + { + "epoch": 1.4155901029121962, + "grad_norm": 0.08538798541210658, + "learning_rate": 4.763499460355988e-06, + "loss": 0.2917, + "step": 2424 + }, + { + "epoch": 1.4161740018976716, + "grad_norm": 0.09447112095262956, + "learning_rate": 4.7548153035477185e-06, + "loss": 0.3388, + "step": 2425 + }, + { + "epoch": 1.4167579008831472, + "grad_norm": 0.08306824905023341, + "learning_rate": 4.746136600261791e-06, + "loss": 0.2784, + "step": 2426 + }, + { + "epoch": 1.4173417998686229, + "grad_norm": 0.08317019946958368, + "learning_rate": 4.737463359521618e-06, + "loss": 0.3086, + "step": 2427 + }, + { + "epoch": 1.4179256988540982, + "grad_norm": 0.09183631302562928, + "learning_rate": 4.728795590344948e-06, + "loss": 0.3528, + "step": 2428 + }, + { + "epoch": 1.4185095978395736, + "grad_norm": 0.08554191953557318, + "learning_rate": 4.7201333017438266e-06, + "loss": 0.3335, + "step": 2429 + }, + { + "epoch": 1.4190934968250493, + "grad_norm": 0.08686460350054413, + "learning_rate": 4.711476502724609e-06, + "loss": 0.3284, + "step": 2430 + }, + { + "epoch": 1.4196773958105249, + "grad_norm": 0.08560859367525159, + "learning_rate": 4.702825202287944e-06, + "loss": 0.2908, + "step": 2431 + }, + { + "epoch": 1.4202612947960003, + "grad_norm": 0.09273816507312574, + "learning_rate": 4.694179409428752e-06, + "loss": 0.3324, + "step": 2432 + }, + { + "epoch": 1.4208451937814757, + "grad_norm": 0.07591668372459705, + "learning_rate": 4.685539133136244e-06, + "loss": 0.2641, + "step": 2433 + }, + { + "epoch": 1.4214290927669513, + "grad_norm": 0.09459004175564577, + "learning_rate": 4.6769043823938806e-06, + "loss": 0.3078, + "step": 2434 + }, + { + "epoch": 1.422012991752427, + "grad_norm": 0.09473727281588787, + "learning_rate": 4.668275166179383e-06, + "loss": 0.3299, + "step": 2435 + }, + { + "epoch": 1.4225968907379023, + "grad_norm": 0.0866202685062746, + "learning_rate": 4.659651493464721e-06, + "loss": 0.2891, + "step": 2436 + }, + { + "epoch": 1.423180789723378, + "grad_norm": 0.08605177588483497, + "learning_rate": 4.6510333732160915e-06, + "loss": 0.3173, + "step": 2437 + }, + { + "epoch": 1.4237646887088533, + "grad_norm": 0.0815916438584674, + "learning_rate": 4.642420814393934e-06, + "loss": 0.2944, + "step": 2438 + }, + { + "epoch": 1.424348587694329, + "grad_norm": 0.09018274144450666, + "learning_rate": 4.633813825952892e-06, + "loss": 0.2897, + "step": 2439 + }, + { + "epoch": 1.4249324866798043, + "grad_norm": 0.08459155392195206, + "learning_rate": 4.625212416841816e-06, + "loss": 0.3184, + "step": 2440 + }, + { + "epoch": 1.42551638566528, + "grad_norm": 0.0931650549253166, + "learning_rate": 4.616616596003772e-06, + "loss": 0.3558, + "step": 2441 + }, + { + "epoch": 1.4261002846507553, + "grad_norm": 0.09535102609661507, + "learning_rate": 4.6080263723759955e-06, + "loss": 0.3535, + "step": 2442 + }, + { + "epoch": 1.426684183636231, + "grad_norm": 0.1080525681892022, + "learning_rate": 4.599441754889919e-06, + "loss": 0.3991, + "step": 2443 + }, + { + 
"epoch": 1.4272680826217066, + "grad_norm": 0.09629100812596107, + "learning_rate": 4.590862752471138e-06, + "loss": 0.3511, + "step": 2444 + }, + { + "epoch": 1.427851981607182, + "grad_norm": 0.08973101095096933, + "learning_rate": 4.582289374039405e-06, + "loss": 0.3108, + "step": 2445 + }, + { + "epoch": 1.4284358805926574, + "grad_norm": 0.0916666497014747, + "learning_rate": 4.573721628508638e-06, + "loss": 0.3235, + "step": 2446 + }, + { + "epoch": 1.429019779578133, + "grad_norm": 0.08732416647869701, + "learning_rate": 4.565159524786888e-06, + "loss": 0.2806, + "step": 2447 + }, + { + "epoch": 1.4296036785636086, + "grad_norm": 0.08606772704786021, + "learning_rate": 4.556603071776347e-06, + "loss": 0.2712, + "step": 2448 + }, + { + "epoch": 1.430187577549084, + "grad_norm": 0.09088188622011086, + "learning_rate": 4.548052278373327e-06, + "loss": 0.3074, + "step": 2449 + }, + { + "epoch": 1.4307714765345594, + "grad_norm": 0.08694864993018561, + "learning_rate": 4.539507153468254e-06, + "loss": 0.2979, + "step": 2450 + }, + { + "epoch": 1.431355375520035, + "grad_norm": 0.0947627072328961, + "learning_rate": 4.530967705945668e-06, + "loss": 0.3329, + "step": 2451 + }, + { + "epoch": 1.4319392745055106, + "grad_norm": 0.09571808714681529, + "learning_rate": 4.522433944684197e-06, + "loss": 0.3382, + "step": 2452 + }, + { + "epoch": 1.432523173490986, + "grad_norm": 0.07745021664423844, + "learning_rate": 4.513905878556568e-06, + "loss": 0.2676, + "step": 2453 + }, + { + "epoch": 1.4331070724764616, + "grad_norm": 0.10463757182580202, + "learning_rate": 4.505383516429577e-06, + "loss": 0.3449, + "step": 2454 + }, + { + "epoch": 1.433690971461937, + "grad_norm": 0.08662135172294493, + "learning_rate": 4.496866867164087e-06, + "loss": 0.2878, + "step": 2455 + }, + { + "epoch": 1.4342748704474126, + "grad_norm": 0.08426146600816206, + "learning_rate": 4.488355939615035e-06, + "loss": 0.3103, + "step": 2456 + }, + { + "epoch": 1.434858769432888, + "grad_norm": 0.09044895178724251, + "learning_rate": 4.479850742631396e-06, + "loss": 0.3373, + "step": 2457 + }, + { + "epoch": 1.4354426684183637, + "grad_norm": 0.08688801918272432, + "learning_rate": 4.471351285056192e-06, + "loss": 0.3242, + "step": 2458 + }, + { + "epoch": 1.436026567403839, + "grad_norm": 0.08513908152948639, + "learning_rate": 4.462857575726482e-06, + "loss": 0.2895, + "step": 2459 + }, + { + "epoch": 1.4366104663893147, + "grad_norm": 0.08618501270964886, + "learning_rate": 4.454369623473337e-06, + "loss": 0.3055, + "step": 2460 + }, + { + "epoch": 1.4371943653747903, + "grad_norm": 0.08471623236365555, + "learning_rate": 4.445887437121855e-06, + "loss": 0.3016, + "step": 2461 + }, + { + "epoch": 1.4377782643602657, + "grad_norm": 0.08665903034832784, + "learning_rate": 4.437411025491131e-06, + "loss": 0.3213, + "step": 2462 + }, + { + "epoch": 1.438362163345741, + "grad_norm": 0.09289445369451396, + "learning_rate": 4.428940397394253e-06, + "loss": 0.324, + "step": 2463 + }, + { + "epoch": 1.4389460623312167, + "grad_norm": 0.08757740022629228, + "learning_rate": 4.420475561638309e-06, + "loss": 0.3108, + "step": 2464 + }, + { + "epoch": 1.4395299613166923, + "grad_norm": 0.08969677865224199, + "learning_rate": 4.412016527024348e-06, + "loss": 0.2948, + "step": 2465 + }, + { + "epoch": 1.4401138603021677, + "grad_norm": 0.08867170913372029, + "learning_rate": 4.4035633023474035e-06, + "loss": 0.3171, + "step": 2466 + }, + { + "epoch": 1.4406977592876433, + "grad_norm": 0.09475795171605284, + "learning_rate": 
4.395115896396457e-06, + "loss": 0.3348, + "step": 2467 + }, + { + "epoch": 1.4412816582731187, + "grad_norm": 0.08522651113351296, + "learning_rate": 4.386674317954439e-06, + "loss": 0.2684, + "step": 2468 + }, + { + "epoch": 1.4418655572585943, + "grad_norm": 0.09562030369299673, + "learning_rate": 4.378238575798233e-06, + "loss": 0.3353, + "step": 2469 + }, + { + "epoch": 1.4424494562440697, + "grad_norm": 0.09000601690321446, + "learning_rate": 4.3698086786986425e-06, + "loss": 0.3196, + "step": 2470 + }, + { + "epoch": 1.4430333552295453, + "grad_norm": 0.08823640117087708, + "learning_rate": 4.3613846354204025e-06, + "loss": 0.2943, + "step": 2471 + }, + { + "epoch": 1.4436172542150207, + "grad_norm": 0.09022163394196821, + "learning_rate": 4.352966454722155e-06, + "loss": 0.3233, + "step": 2472 + }, + { + "epoch": 1.4442011532004964, + "grad_norm": 0.0936239700642225, + "learning_rate": 4.344554145356447e-06, + "loss": 0.3441, + "step": 2473 + }, + { + "epoch": 1.4447850521859718, + "grad_norm": 0.08294014304076505, + "learning_rate": 4.336147716069727e-06, + "loss": 0.2991, + "step": 2474 + }, + { + "epoch": 1.4453689511714474, + "grad_norm": 0.09143392677166375, + "learning_rate": 4.327747175602321e-06, + "loss": 0.3465, + "step": 2475 + }, + { + "epoch": 1.4459528501569228, + "grad_norm": 0.08934921625126872, + "learning_rate": 4.319352532688444e-06, + "loss": 0.2977, + "step": 2476 + }, + { + "epoch": 1.4465367491423984, + "grad_norm": 0.08647089246504024, + "learning_rate": 4.310963796056168e-06, + "loss": 0.2919, + "step": 2477 + }, + { + "epoch": 1.447120648127874, + "grad_norm": 0.08299328999723277, + "learning_rate": 4.302580974427426e-06, + "loss": 0.291, + "step": 2478 + }, + { + "epoch": 1.4477045471133494, + "grad_norm": 0.08523415242118329, + "learning_rate": 4.29420407651801e-06, + "loss": 0.3247, + "step": 2479 + }, + { + "epoch": 1.4482884460988248, + "grad_norm": 0.09698688995030566, + "learning_rate": 4.2858331110375406e-06, + "loss": 0.3382, + "step": 2480 + }, + { + "epoch": 1.4488723450843004, + "grad_norm": 0.09015256766046738, + "learning_rate": 4.277468086689481e-06, + "loss": 0.3167, + "step": 2481 + }, + { + "epoch": 1.449456244069776, + "grad_norm": 0.08616163482900188, + "learning_rate": 4.269109012171112e-06, + "loss": 0.3035, + "step": 2482 + }, + { + "epoch": 1.4500401430552514, + "grad_norm": 0.09569918693794723, + "learning_rate": 4.260755896173523e-06, + "loss": 0.3554, + "step": 2483 + }, + { + "epoch": 1.450624042040727, + "grad_norm": 0.08976739770868741, + "learning_rate": 4.252408747381622e-06, + "loss": 0.303, + "step": 2484 + }, + { + "epoch": 1.4512079410262024, + "grad_norm": 0.08595994660246761, + "learning_rate": 4.244067574474098e-06, + "loss": 0.3138, + "step": 2485 + }, + { + "epoch": 1.451791840011678, + "grad_norm": 0.09117098429154519, + "learning_rate": 4.235732386123437e-06, + "loss": 0.3351, + "step": 2486 + }, + { + "epoch": 1.4523757389971534, + "grad_norm": 0.08412735606790589, + "learning_rate": 4.227403190995901e-06, + "loss": 0.2753, + "step": 2487 + }, + { + "epoch": 1.452959637982629, + "grad_norm": 0.09714700659280308, + "learning_rate": 4.219079997751515e-06, + "loss": 0.3496, + "step": 2488 + }, + { + "epoch": 1.4535435369681045, + "grad_norm": 0.09793100692926154, + "learning_rate": 4.210762815044073e-06, + "loss": 0.3517, + "step": 2489 + }, + { + "epoch": 1.45412743595358, + "grad_norm": 0.08243131072608699, + "learning_rate": 4.20245165152111e-06, + "loss": 0.2817, + "step": 2490 + }, + { + "epoch": 
1.4547113349390555, + "grad_norm": 0.090659357023587, + "learning_rate": 4.194146515823906e-06, + "loss": 0.2986, + "step": 2491 + }, + { + "epoch": 1.455295233924531, + "grad_norm": 0.09841364726880468, + "learning_rate": 4.185847416587481e-06, + "loss": 0.3486, + "step": 2492 + }, + { + "epoch": 1.4558791329100065, + "grad_norm": 0.09069963864071329, + "learning_rate": 4.177554362440565e-06, + "loss": 0.3161, + "step": 2493 + }, + { + "epoch": 1.456463031895482, + "grad_norm": 0.08986172202801727, + "learning_rate": 4.169267362005619e-06, + "loss": 0.309, + "step": 2494 + }, + { + "epoch": 1.4570469308809577, + "grad_norm": 0.09012422968549345, + "learning_rate": 4.160986423898798e-06, + "loss": 0.3286, + "step": 2495 + }, + { + "epoch": 1.457630829866433, + "grad_norm": 0.085718043486365, + "learning_rate": 4.15271155672995e-06, + "loss": 0.2813, + "step": 2496 + }, + { + "epoch": 1.4582147288519085, + "grad_norm": 0.08771758264618021, + "learning_rate": 4.14444276910263e-06, + "loss": 0.3202, + "step": 2497 + }, + { + "epoch": 1.4587986278373841, + "grad_norm": 0.08735065285679727, + "learning_rate": 4.1361800696140505e-06, + "loss": 0.2979, + "step": 2498 + }, + { + "epoch": 1.4593825268228597, + "grad_norm": 0.0904841789766028, + "learning_rate": 4.127923466855111e-06, + "loss": 0.3188, + "step": 2499 + }, + { + "epoch": 1.4599664258083351, + "grad_norm": 0.08601495404465023, + "learning_rate": 4.119672969410362e-06, + "loss": 0.3099, + "step": 2500 + }, + { + "epoch": 1.4605503247938108, + "grad_norm": 0.09038222921848654, + "learning_rate": 4.111428585858005e-06, + "loss": 0.3184, + "step": 2501 + }, + { + "epoch": 1.4611342237792861, + "grad_norm": 0.07973056962940092, + "learning_rate": 4.103190324769895e-06, + "loss": 0.2783, + "step": 2502 + }, + { + "epoch": 1.4617181227647618, + "grad_norm": 0.08298472264243383, + "learning_rate": 4.0949581947115106e-06, + "loss": 0.2845, + "step": 2503 + }, + { + "epoch": 1.4623020217502372, + "grad_norm": 0.08924768110019642, + "learning_rate": 4.086732204241964e-06, + "loss": 0.3013, + "step": 2504 + }, + { + "epoch": 1.4628859207357128, + "grad_norm": 0.09002577048218759, + "learning_rate": 4.07851236191398e-06, + "loss": 0.3275, + "step": 2505 + }, + { + "epoch": 1.4634698197211882, + "grad_norm": 0.08596161218087442, + "learning_rate": 4.070298676273886e-06, + "loss": 0.2875, + "step": 2506 + }, + { + "epoch": 1.4640537187066638, + "grad_norm": 0.09130206808679746, + "learning_rate": 4.06209115586162e-06, + "loss": 0.3363, + "step": 2507 + }, + { + "epoch": 1.4646376176921392, + "grad_norm": 0.09154604539302902, + "learning_rate": 4.053889809210698e-06, + "loss": 0.332, + "step": 2508 + }, + { + "epoch": 1.4652215166776148, + "grad_norm": 0.08610567723334644, + "learning_rate": 4.045694644848228e-06, + "loss": 0.3025, + "step": 2509 + }, + { + "epoch": 1.4658054156630902, + "grad_norm": 0.08971847447697227, + "learning_rate": 4.037505671294883e-06, + "loss": 0.354, + "step": 2510 + }, + { + "epoch": 1.4663893146485658, + "grad_norm": 0.08994065936774182, + "learning_rate": 4.0293228970648955e-06, + "loss": 0.3342, + "step": 2511 + }, + { + "epoch": 1.4669732136340414, + "grad_norm": 0.09226998653848904, + "learning_rate": 4.021146330666065e-06, + "loss": 0.3155, + "step": 2512 + }, + { + "epoch": 1.4675571126195168, + "grad_norm": 0.08901069281514984, + "learning_rate": 4.012975980599724e-06, + "loss": 0.2829, + "step": 2513 + }, + { + "epoch": 1.4681410116049922, + "grad_norm": 0.09930505789529599, + "learning_rate": 
4.0048118553607485e-06, + "loss": 0.3751, + "step": 2514 + }, + { + "epoch": 1.4687249105904678, + "grad_norm": 0.09322828397572255, + "learning_rate": 3.996653963437546e-06, + "loss": 0.3033, + "step": 2515 + }, + { + "epoch": 1.4693088095759435, + "grad_norm": 0.09044024227995366, + "learning_rate": 3.98850231331203e-06, + "loss": 0.3368, + "step": 2516 + }, + { + "epoch": 1.4698927085614188, + "grad_norm": 0.08846955800188881, + "learning_rate": 3.980356913459642e-06, + "loss": 0.316, + "step": 2517 + }, + { + "epoch": 1.4704766075468945, + "grad_norm": 0.08495556857034177, + "learning_rate": 3.972217772349309e-06, + "loss": 0.2905, + "step": 2518 + }, + { + "epoch": 1.4710605065323699, + "grad_norm": 0.08894861750563823, + "learning_rate": 3.9640848984434556e-06, + "loss": 0.3356, + "step": 2519 + }, + { + "epoch": 1.4716444055178455, + "grad_norm": 0.08500942367504839, + "learning_rate": 3.955958300197998e-06, + "loss": 0.3036, + "step": 2520 + }, + { + "epoch": 1.4722283045033209, + "grad_norm": 0.08188570479040262, + "learning_rate": 3.947837986062314e-06, + "loss": 0.2781, + "step": 2521 + }, + { + "epoch": 1.4728122034887965, + "grad_norm": 0.08569879099897162, + "learning_rate": 3.939723964479262e-06, + "loss": 0.2973, + "step": 2522 + }, + { + "epoch": 1.4733961024742719, + "grad_norm": 0.08641788500052708, + "learning_rate": 3.931616243885148e-06, + "loss": 0.3141, + "step": 2523 + }, + { + "epoch": 1.4739800014597475, + "grad_norm": 0.08691195686905638, + "learning_rate": 3.923514832709725e-06, + "loss": 0.3172, + "step": 2524 + }, + { + "epoch": 1.474563900445223, + "grad_norm": 0.08584473568889367, + "learning_rate": 3.915419739376198e-06, + "loss": 0.3126, + "step": 2525 + }, + { + "epoch": 1.4751477994306985, + "grad_norm": 0.08611923632545083, + "learning_rate": 3.90733097230119e-06, + "loss": 0.3089, + "step": 2526 + }, + { + "epoch": 1.475731698416174, + "grad_norm": 0.08948726847762559, + "learning_rate": 3.899248539894756e-06, + "loss": 0.3027, + "step": 2527 + }, + { + "epoch": 1.4763155974016495, + "grad_norm": 0.08381643629689604, + "learning_rate": 3.891172450560362e-06, + "loss": 0.2719, + "step": 2528 + }, + { + "epoch": 1.4768994963871251, + "grad_norm": 0.0941175627440657, + "learning_rate": 3.883102712694871e-06, + "loss": 0.3337, + "step": 2529 + }, + { + "epoch": 1.4774833953726005, + "grad_norm": 0.09718621021739013, + "learning_rate": 3.875039334688556e-06, + "loss": 0.3388, + "step": 2530 + }, + { + "epoch": 1.478067294358076, + "grad_norm": 0.08661953305324183, + "learning_rate": 3.866982324925066e-06, + "loss": 0.3014, + "step": 2531 + }, + { + "epoch": 1.4786511933435516, + "grad_norm": 0.09089634804311249, + "learning_rate": 3.858931691781439e-06, + "loss": 0.3229, + "step": 2532 + }, + { + "epoch": 1.4792350923290272, + "grad_norm": 0.09039762119126787, + "learning_rate": 3.850887443628075e-06, + "loss": 0.3348, + "step": 2533 + }, + { + "epoch": 1.4798189913145026, + "grad_norm": 0.08940398427298936, + "learning_rate": 3.842849588828733e-06, + "loss": 0.3273, + "step": 2534 + }, + { + "epoch": 1.4804028902999782, + "grad_norm": 0.08570585142528248, + "learning_rate": 3.834818135740539e-06, + "loss": 0.3026, + "step": 2535 + }, + { + "epoch": 1.4809867892854536, + "grad_norm": 0.08542179907438183, + "learning_rate": 3.826793092713944e-06, + "loss": 0.311, + "step": 2536 + }, + { + "epoch": 1.4815706882709292, + "grad_norm": 0.08851597417883289, + "learning_rate": 3.818774468092754e-06, + "loss": 0.3093, + "step": 2537 + }, + { + "epoch": 
1.4821545872564046, + "grad_norm": 0.08771587984695163, + "learning_rate": 3.8107622702140856e-06, + "loss": 0.3415, + "step": 2538 + }, + { + "epoch": 1.4827384862418802, + "grad_norm": 0.09182055460716745, + "learning_rate": 3.802756507408377e-06, + "loss": 0.3585, + "step": 2539 + }, + { + "epoch": 1.4833223852273556, + "grad_norm": 0.08147962579414032, + "learning_rate": 3.794757187999386e-06, + "loss": 0.2934, + "step": 2540 + }, + { + "epoch": 1.4839062842128312, + "grad_norm": 0.08872049638110728, + "learning_rate": 3.7867643203041548e-06, + "loss": 0.3367, + "step": 2541 + }, + { + "epoch": 1.4844901831983066, + "grad_norm": 0.08463648626939235, + "learning_rate": 3.7787779126330314e-06, + "loss": 0.2889, + "step": 2542 + }, + { + "epoch": 1.4850740821837822, + "grad_norm": 0.09160817203545947, + "learning_rate": 3.770797973289644e-06, + "loss": 0.3352, + "step": 2543 + }, + { + "epoch": 1.4856579811692576, + "grad_norm": 0.07906587125508356, + "learning_rate": 3.762824510570887e-06, + "loss": 0.2621, + "step": 2544 + }, + { + "epoch": 1.4862418801547332, + "grad_norm": 0.08662648277395785, + "learning_rate": 3.7548575327669345e-06, + "loss": 0.318, + "step": 2545 + }, + { + "epoch": 1.4868257791402089, + "grad_norm": 0.08180147157320809, + "learning_rate": 3.7468970481612077e-06, + "loss": 0.2601, + "step": 2546 + }, + { + "epoch": 1.4874096781256843, + "grad_norm": 0.08655746823378133, + "learning_rate": 3.738943065030376e-06, + "loss": 0.2962, + "step": 2547 + }, + { + "epoch": 1.4879935771111596, + "grad_norm": 0.07873237379988372, + "learning_rate": 3.7309955916443597e-06, + "loss": 0.2686, + "step": 2548 + }, + { + "epoch": 1.4885774760966353, + "grad_norm": 0.08163741978287954, + "learning_rate": 3.723054636266299e-06, + "loss": 0.3108, + "step": 2549 + }, + { + "epoch": 1.4891613750821109, + "grad_norm": 0.08527190370409653, + "learning_rate": 3.715120207152567e-06, + "loss": 0.3151, + "step": 2550 + }, + { + "epoch": 1.4897452740675863, + "grad_norm": 0.08696312638227123, + "learning_rate": 3.7071923125527444e-06, + "loss": 0.3193, + "step": 2551 + }, + { + "epoch": 1.490329173053062, + "grad_norm": 0.09558432119539427, + "learning_rate": 3.6992709607096167e-06, + "loss": 0.3448, + "step": 2552 + }, + { + "epoch": 1.4909130720385373, + "grad_norm": 0.08442125172674488, + "learning_rate": 3.6913561598591775e-06, + "loss": 0.2939, + "step": 2553 + }, + { + "epoch": 1.491496971024013, + "grad_norm": 0.08255553310023878, + "learning_rate": 3.683447918230594e-06, + "loss": 0.3199, + "step": 2554 + }, + { + "epoch": 1.4920808700094883, + "grad_norm": 0.10286818756099826, + "learning_rate": 3.6755462440462288e-06, + "loss": 0.3776, + "step": 2555 + }, + { + "epoch": 1.492664768994964, + "grad_norm": 0.0888991866433996, + "learning_rate": 3.6676511455216056e-06, + "loss": 0.3265, + "step": 2556 + }, + { + "epoch": 1.4932486679804393, + "grad_norm": 0.08375636020273769, + "learning_rate": 3.659762630865411e-06, + "loss": 0.2968, + "step": 2557 + }, + { + "epoch": 1.493832566965915, + "grad_norm": 0.08867117037489732, + "learning_rate": 3.651880708279497e-06, + "loss": 0.3111, + "step": 2558 + }, + { + "epoch": 1.4944164659513903, + "grad_norm": 0.08934907392782562, + "learning_rate": 3.6440053859588478e-06, + "loss": 0.2917, + "step": 2559 + }, + { + "epoch": 1.495000364936866, + "grad_norm": 0.08864204546079371, + "learning_rate": 3.636136672091598e-06, + "loss": 0.3, + "step": 2560 + }, + { + "epoch": 1.4955842639223413, + "grad_norm": 0.07945773609817983, + "learning_rate": 
3.628274574859002e-06, + "loss": 0.3017, + "step": 2561 + }, + { + "epoch": 1.496168162907817, + "grad_norm": 0.08632780858489379, + "learning_rate": 3.6204191024354352e-06, + "loss": 0.3192, + "step": 2562 + }, + { + "epoch": 1.4967520618932926, + "grad_norm": 0.0920333257674575, + "learning_rate": 3.612570262988393e-06, + "loss": 0.3203, + "step": 2563 + }, + { + "epoch": 1.497335960878768, + "grad_norm": 0.08033343836559535, + "learning_rate": 3.604728064678464e-06, + "loss": 0.2561, + "step": 2564 + }, + { + "epoch": 1.4979198598642434, + "grad_norm": 0.09440564287873016, + "learning_rate": 3.5968925156593426e-06, + "loss": 0.3251, + "step": 2565 + }, + { + "epoch": 1.498503758849719, + "grad_norm": 0.08981023086653087, + "learning_rate": 3.589063624077802e-06, + "loss": 0.3286, + "step": 2566 + }, + { + "epoch": 1.4990876578351946, + "grad_norm": 0.0927846259650536, + "learning_rate": 3.5812413980736916e-06, + "loss": 0.3366, + "step": 2567 + }, + { + "epoch": 1.49967155682067, + "grad_norm": 0.09348132732321833, + "learning_rate": 3.5734258457799407e-06, + "loss": 0.3228, + "step": 2568 + }, + { + "epoch": 1.5002554558061454, + "grad_norm": 0.09497713925309846, + "learning_rate": 3.5656169753225278e-06, + "loss": 0.3452, + "step": 2569 + }, + { + "epoch": 1.500839354791621, + "grad_norm": 0.08466706237381316, + "learning_rate": 3.5578147948204934e-06, + "loss": 0.302, + "step": 2570 + }, + { + "epoch": 1.5014232537770966, + "grad_norm": 0.08286093636108129, + "learning_rate": 3.5500193123859227e-06, + "loss": 0.2957, + "step": 2571 + }, + { + "epoch": 1.5020071527625722, + "grad_norm": 0.08337855999661459, + "learning_rate": 3.542230536123925e-06, + "loss": 0.2751, + "step": 2572 + }, + { + "epoch": 1.5025910517480476, + "grad_norm": 0.08485056055211938, + "learning_rate": 3.5344484741326533e-06, + "loss": 0.3194, + "step": 2573 + }, + { + "epoch": 1.503174950733523, + "grad_norm": 0.09361459492862763, + "learning_rate": 3.526673134503267e-06, + "loss": 0.3099, + "step": 2574 + }, + { + "epoch": 1.5037588497189986, + "grad_norm": 0.08884435042229283, + "learning_rate": 3.5189045253199384e-06, + "loss": 0.3455, + "step": 2575 + }, + { + "epoch": 1.5043427487044743, + "grad_norm": 0.09601250042016751, + "learning_rate": 3.51114265465985e-06, + "loss": 0.332, + "step": 2576 + }, + { + "epoch": 1.5049266476899497, + "grad_norm": 0.09257052469454667, + "learning_rate": 3.5033875305931662e-06, + "loss": 0.3065, + "step": 2577 + }, + { + "epoch": 1.505510546675425, + "grad_norm": 0.09372779239842753, + "learning_rate": 3.4956391611830486e-06, + "loss": 0.3348, + "step": 2578 + }, + { + "epoch": 1.5060944456609007, + "grad_norm": 0.0829884477470413, + "learning_rate": 3.4878975544856285e-06, + "loss": 0.3059, + "step": 2579 + }, + { + "epoch": 1.5066783446463763, + "grad_norm": 0.08551848987221668, + "learning_rate": 3.4801627185500033e-06, + "loss": 0.2992, + "step": 2580 + }, + { + "epoch": 1.5072622436318517, + "grad_norm": 0.07745200177784105, + "learning_rate": 3.4724346614182427e-06, + "loss": 0.2661, + "step": 2581 + }, + { + "epoch": 1.507846142617327, + "grad_norm": 0.08614037594933865, + "learning_rate": 3.4647133911253516e-06, + "loss": 0.3119, + "step": 2582 + }, + { + "epoch": 1.5084300416028027, + "grad_norm": 0.0841382840188242, + "learning_rate": 3.4569989156992965e-06, + "loss": 0.2981, + "step": 2583 + }, + { + "epoch": 1.5090139405882783, + "grad_norm": 0.08799092104109033, + "learning_rate": 3.449291243160966e-06, + "loss": 0.3021, + "step": 2584 + }, + { + "epoch": 
1.5095978395737537, + "grad_norm": 0.09521764317264678, + "learning_rate": 3.4415903815241757e-06, + "loss": 0.3149, + "step": 2585 + }, + { + "epoch": 1.510181738559229, + "grad_norm": 0.0829306073036616, + "learning_rate": 3.4338963387956726e-06, + "loss": 0.3061, + "step": 2586 + }, + { + "epoch": 1.5107656375447047, + "grad_norm": 0.0828357964172012, + "learning_rate": 3.4262091229750973e-06, + "loss": 0.3036, + "step": 2587 + }, + { + "epoch": 1.5113495365301803, + "grad_norm": 0.08789551410781374, + "learning_rate": 3.418528742055006e-06, + "loss": 0.3199, + "step": 2588 + }, + { + "epoch": 1.511933435515656, + "grad_norm": 0.08947715900152868, + "learning_rate": 3.4108552040208408e-06, + "loss": 0.3491, + "step": 2589 + }, + { + "epoch": 1.5125173345011313, + "grad_norm": 0.09391907859650965, + "learning_rate": 3.403188516850927e-06, + "loss": 0.3112, + "step": 2590 + }, + { + "epoch": 1.5131012334866067, + "grad_norm": 0.08889938266253214, + "learning_rate": 3.3955286885164786e-06, + "loss": 0.3027, + "step": 2591 + }, + { + "epoch": 1.5136851324720824, + "grad_norm": 0.08726169364435987, + "learning_rate": 3.387875726981563e-06, + "loss": 0.3061, + "step": 2592 + }, + { + "epoch": 1.514269031457558, + "grad_norm": 0.08678719146493954, + "learning_rate": 3.3802296402031234e-06, + "loss": 0.3076, + "step": 2593 + }, + { + "epoch": 1.5148529304430334, + "grad_norm": 0.08682086994141712, + "learning_rate": 3.3725904361309426e-06, + "loss": 0.2977, + "step": 2594 + }, + { + "epoch": 1.5154368294285088, + "grad_norm": 0.08388156683792423, + "learning_rate": 3.36495812270765e-06, + "loss": 0.2694, + "step": 2595 + }, + { + "epoch": 1.5160207284139844, + "grad_norm": 0.08616675525847933, + "learning_rate": 3.35733270786872e-06, + "loss": 0.2904, + "step": 2596 + }, + { + "epoch": 1.51660462739946, + "grad_norm": 0.0864191019802009, + "learning_rate": 3.3497141995424397e-06, + "loss": 0.3369, + "step": 2597 + }, + { + "epoch": 1.5171885263849354, + "grad_norm": 0.08183762103315785, + "learning_rate": 3.3421026056499273e-06, + "loss": 0.2839, + "step": 2598 + }, + { + "epoch": 1.5177724253704108, + "grad_norm": 0.08584654494455739, + "learning_rate": 3.3344979341051108e-06, + "loss": 0.3087, + "step": 2599 + }, + { + "epoch": 1.5183563243558864, + "grad_norm": 0.08725161722219169, + "learning_rate": 3.3269001928147103e-06, + "loss": 0.3033, + "step": 2600 + }, + { + "epoch": 1.518940223341362, + "grad_norm": 0.10228855075944325, + "learning_rate": 3.3193093896782546e-06, + "loss": 0.3631, + "step": 2601 + }, + { + "epoch": 1.5195241223268374, + "grad_norm": 0.09474712834853626, + "learning_rate": 3.311725532588049e-06, + "loss": 0.3198, + "step": 2602 + }, + { + "epoch": 1.5201080213123128, + "grad_norm": 0.08648975643119687, + "learning_rate": 3.3041486294291767e-06, + "loss": 0.2903, + "step": 2603 + }, + { + "epoch": 1.5206919202977884, + "grad_norm": 0.09402215150929882, + "learning_rate": 3.2965786880795005e-06, + "loss": 0.3297, + "step": 2604 + }, + { + "epoch": 1.521275819283264, + "grad_norm": 0.08872398985139705, + "learning_rate": 3.2890157164096315e-06, + "loss": 0.3122, + "step": 2605 + }, + { + "epoch": 1.5218597182687397, + "grad_norm": 0.09075411213507607, + "learning_rate": 3.2814597222829468e-06, + "loss": 0.3153, + "step": 2606 + }, + { + "epoch": 1.522443617254215, + "grad_norm": 0.08570983270220695, + "learning_rate": 3.2739107135555603e-06, + "loss": 0.3186, + "step": 2607 + }, + { + "epoch": 1.5230275162396905, + "grad_norm": 0.08843518837298886, + "learning_rate": 
3.266368698076323e-06, + "loss": 0.3164, + "step": 2608 + }, + { + "epoch": 1.523611415225166, + "grad_norm": 0.0827392835133779, + "learning_rate": 3.258833683686824e-06, + "loss": 0.2684, + "step": 2609 + }, + { + "epoch": 1.5241953142106417, + "grad_norm": 0.08720838487671367, + "learning_rate": 3.251305678221359e-06, + "loss": 0.3214, + "step": 2610 + }, + { + "epoch": 1.524779213196117, + "grad_norm": 0.09348680073155007, + "learning_rate": 3.2437846895069535e-06, + "loss": 0.3284, + "step": 2611 + }, + { + "epoch": 1.5253631121815925, + "grad_norm": 0.08481581095343046, + "learning_rate": 3.236270725363323e-06, + "loss": 0.2941, + "step": 2612 + }, + { + "epoch": 1.525947011167068, + "grad_norm": 0.08381514498329123, + "learning_rate": 3.2287637936028814e-06, + "loss": 0.3176, + "step": 2613 + }, + { + "epoch": 1.5265309101525437, + "grad_norm": 0.08338791988891553, + "learning_rate": 3.2212639020307423e-06, + "loss": 0.2866, + "step": 2614 + }, + { + "epoch": 1.5271148091380191, + "grad_norm": 0.09290709784062363, + "learning_rate": 3.2137710584446837e-06, + "loss": 0.3312, + "step": 2615 + }, + { + "epoch": 1.5276987081234945, + "grad_norm": 0.08932695327280428, + "learning_rate": 3.20628527063517e-06, + "loss": 0.3288, + "step": 2616 + }, + { + "epoch": 1.5282826071089701, + "grad_norm": 0.09090177125105138, + "learning_rate": 3.1988065463853204e-06, + "loss": 0.3265, + "step": 2617 + }, + { + "epoch": 1.5288665060944457, + "grad_norm": 0.08645434605355905, + "learning_rate": 3.1913348934709076e-06, + "loss": 0.307, + "step": 2618 + }, + { + "epoch": 1.5294504050799211, + "grad_norm": 0.08485574788037434, + "learning_rate": 3.183870319660365e-06, + "loss": 0.3002, + "step": 2619 + }, + { + "epoch": 1.5300343040653965, + "grad_norm": 0.08527221019291757, + "learning_rate": 3.1764128327147515e-06, + "loss": 0.3029, + "step": 2620 + }, + { + "epoch": 1.5306182030508722, + "grad_norm": 0.0923780581947832, + "learning_rate": 3.1689624403877685e-06, + "loss": 0.3125, + "step": 2621 + }, + { + "epoch": 1.5312021020363478, + "grad_norm": 0.09102404419588203, + "learning_rate": 3.161519150425735e-06, + "loss": 0.3125, + "step": 2622 + }, + { + "epoch": 1.5317860010218234, + "grad_norm": 0.08643019349340769, + "learning_rate": 3.1540829705675835e-06, + "loss": 0.3032, + "step": 2623 + }, + { + "epoch": 1.5323699000072988, + "grad_norm": 0.08234402307534415, + "learning_rate": 3.1466539085448624e-06, + "loss": 0.293, + "step": 2624 + }, + { + "epoch": 1.5329537989927742, + "grad_norm": 0.09150110302046388, + "learning_rate": 3.139231972081709e-06, + "loss": 0.3195, + "step": 2625 + }, + { + "epoch": 1.5335376979782498, + "grad_norm": 0.08291190589508031, + "learning_rate": 3.1318171688948618e-06, + "loss": 0.2932, + "step": 2626 + }, + { + "epoch": 1.5341215969637254, + "grad_norm": 0.08926677222244253, + "learning_rate": 3.1244095066936396e-06, + "loss": 0.2993, + "step": 2627 + }, + { + "epoch": 1.5347054959492008, + "grad_norm": 0.08850964726349937, + "learning_rate": 3.1170089931799296e-06, + "loss": 0.3048, + "step": 2628 + }, + { + "epoch": 1.5352893949346762, + "grad_norm": 0.09124258470336576, + "learning_rate": 3.1096156360482e-06, + "loss": 0.3094, + "step": 2629 + }, + { + "epoch": 1.5358732939201518, + "grad_norm": 0.09108393581693536, + "learning_rate": 3.102229442985466e-06, + "loss": 0.2866, + "step": 2630 + }, + { + "epoch": 1.5364571929056274, + "grad_norm": 0.09393894173965676, + "learning_rate": 3.094850421671295e-06, + "loss": 0.3293, + "step": 2631 + }, + { + "epoch": 
1.5370410918911028, + "grad_norm": 0.08122528978720524, + "learning_rate": 3.0874785797778096e-06, + "loss": 0.2947, + "step": 2632 + }, + { + "epoch": 1.5376249908765782, + "grad_norm": 0.08887020105251107, + "learning_rate": 3.080113924969652e-06, + "loss": 0.3431, + "step": 2633 + }, + { + "epoch": 1.5382088898620538, + "grad_norm": 0.09330874645408001, + "learning_rate": 3.0727564649040066e-06, + "loss": 0.3363, + "step": 2634 + }, + { + "epoch": 1.5387927888475295, + "grad_norm": 0.08809782335326645, + "learning_rate": 3.0654062072305667e-06, + "loss": 0.3301, + "step": 2635 + }, + { + "epoch": 1.5393766878330049, + "grad_norm": 0.09328648602144836, + "learning_rate": 3.0580631595915368e-06, + "loss": 0.3135, + "step": 2636 + }, + { + "epoch": 1.5399605868184802, + "grad_norm": 0.09454765439259959, + "learning_rate": 3.050727329621637e-06, + "loss": 0.3371, + "step": 2637 + }, + { + "epoch": 1.5405444858039559, + "grad_norm": 0.08066730682246062, + "learning_rate": 3.043398724948068e-06, + "loss": 0.2819, + "step": 2638 + }, + { + "epoch": 1.5411283847894315, + "grad_norm": 0.09118225596361167, + "learning_rate": 3.03607735319053e-06, + "loss": 0.3378, + "step": 2639 + }, + { + "epoch": 1.541712283774907, + "grad_norm": 0.09376977082075572, + "learning_rate": 3.028763221961196e-06, + "loss": 0.3036, + "step": 2640 + }, + { + "epoch": 1.5422961827603825, + "grad_norm": 0.09196880151592596, + "learning_rate": 3.02145633886471e-06, + "loss": 0.346, + "step": 2641 + }, + { + "epoch": 1.542880081745858, + "grad_norm": 0.08850645767533999, + "learning_rate": 3.0141567114981897e-06, + "loss": 0.2866, + "step": 2642 + }, + { + "epoch": 1.5434639807313335, + "grad_norm": 0.08629889728451555, + "learning_rate": 3.006864347451195e-06, + "loss": 0.2929, + "step": 2643 + }, + { + "epoch": 1.5440478797168091, + "grad_norm": 0.09156427980351761, + "learning_rate": 2.999579254305748e-06, + "loss": 0.3062, + "step": 2644 + }, + { + "epoch": 1.5446317787022845, + "grad_norm": 0.0888650709853789, + "learning_rate": 2.992301439636299e-06, + "loss": 0.3063, + "step": 2645 + }, + { + "epoch": 1.54521567768776, + "grad_norm": 0.08695230597429232, + "learning_rate": 2.9850309110097364e-06, + "loss": 0.299, + "step": 2646 + }, + { + "epoch": 1.5457995766732355, + "grad_norm": 0.08888549430796489, + "learning_rate": 2.977767675985377e-06, + "loss": 0.3165, + "step": 2647 + }, + { + "epoch": 1.5463834756587111, + "grad_norm": 0.09296495844226355, + "learning_rate": 2.970511742114943e-06, + "loss": 0.3623, + "step": 2648 + }, + { + "epoch": 1.5469673746441865, + "grad_norm": 0.07870418346208977, + "learning_rate": 2.963263116942581e-06, + "loss": 0.2588, + "step": 2649 + }, + { + "epoch": 1.547551273629662, + "grad_norm": 0.09178741251416972, + "learning_rate": 2.9560218080048243e-06, + "loss": 0.3114, + "step": 2650 + }, + { + "epoch": 1.5481351726151376, + "grad_norm": 0.09784579911663943, + "learning_rate": 2.9487878228306044e-06, + "loss": 0.3337, + "step": 2651 + }, + { + "epoch": 1.5487190716006132, + "grad_norm": 0.09350923421827556, + "learning_rate": 2.9415611689412426e-06, + "loss": 0.3001, + "step": 2652 + }, + { + "epoch": 1.5493029705860886, + "grad_norm": 0.0895801196231565, + "learning_rate": 2.9343418538504297e-06, + "loss": 0.3056, + "step": 2653 + }, + { + "epoch": 1.549886869571564, + "grad_norm": 0.08541940155857061, + "learning_rate": 2.9271298850642337e-06, + "loss": 0.2577, + "step": 2654 + }, + { + "epoch": 1.5504707685570396, + "grad_norm": 0.08080385037734253, + "learning_rate": 
2.9199252700810833e-06, + "loss": 0.2825, + "step": 2655 + }, + { + "epoch": 1.5510546675425152, + "grad_norm": 0.08596095378140892, + "learning_rate": 2.912728016391753e-06, + "loss": 0.3061, + "step": 2656 + }, + { + "epoch": 1.5516385665279908, + "grad_norm": 0.08554282081643991, + "learning_rate": 2.905538131479376e-06, + "loss": 0.2901, + "step": 2657 + }, + { + "epoch": 1.5522224655134662, + "grad_norm": 0.0877928065084015, + "learning_rate": 2.8983556228194165e-06, + "loss": 0.2891, + "step": 2658 + }, + { + "epoch": 1.5528063644989416, + "grad_norm": 0.09345385674591772, + "learning_rate": 2.8911804978796664e-06, + "loss": 0.3517, + "step": 2659 + }, + { + "epoch": 1.5533902634844172, + "grad_norm": 0.08543641532848185, + "learning_rate": 2.884012764120252e-06, + "loss": 0.3226, + "step": 2660 + }, + { + "epoch": 1.5539741624698928, + "grad_norm": 0.0894385760526303, + "learning_rate": 2.8768524289936007e-06, + "loss": 0.3199, + "step": 2661 + }, + { + "epoch": 1.5545580614553682, + "grad_norm": 0.0939097421572987, + "learning_rate": 2.8696994999444614e-06, + "loss": 0.3417, + "step": 2662 + }, + { + "epoch": 1.5551419604408436, + "grad_norm": 0.09072711674169534, + "learning_rate": 2.8625539844098736e-06, + "loss": 0.3077, + "step": 2663 + }, + { + "epoch": 1.5557258594263192, + "grad_norm": 0.09204619348525285, + "learning_rate": 2.8554158898191674e-06, + "loss": 0.3231, + "step": 2664 + }, + { + "epoch": 1.5563097584117949, + "grad_norm": 0.0830030993816261, + "learning_rate": 2.8482852235939672e-06, + "loss": 0.3138, + "step": 2665 + }, + { + "epoch": 1.5568936573972703, + "grad_norm": 0.09335085719218644, + "learning_rate": 2.8411619931481627e-06, + "loss": 0.3241, + "step": 2666 + }, + { + "epoch": 1.5574775563827457, + "grad_norm": 0.09227349669957297, + "learning_rate": 2.8340462058879214e-06, + "loss": 0.3523, + "step": 2667 + }, + { + "epoch": 1.5580614553682213, + "grad_norm": 0.09346809095628791, + "learning_rate": 2.8269378692116676e-06, + "loss": 0.3235, + "step": 2668 + }, + { + "epoch": 1.5586453543536969, + "grad_norm": 0.08715656923266973, + "learning_rate": 2.8198369905100754e-06, + "loss": 0.3288, + "step": 2669 + }, + { + "epoch": 1.5592292533391723, + "grad_norm": 0.08927154668491762, + "learning_rate": 2.812743577166075e-06, + "loss": 0.319, + "step": 2670 + }, + { + "epoch": 1.5598131523246477, + "grad_norm": 0.08928291381659015, + "learning_rate": 2.8056576365548216e-06, + "loss": 0.2803, + "step": 2671 + }, + { + "epoch": 1.5603970513101233, + "grad_norm": 0.09018299932973615, + "learning_rate": 2.7985791760437163e-06, + "loss": 0.3001, + "step": 2672 + }, + { + "epoch": 1.560980950295599, + "grad_norm": 0.09726547739723537, + "learning_rate": 2.79150820299237e-06, + "loss": 0.3206, + "step": 2673 + }, + { + "epoch": 1.5615648492810745, + "grad_norm": 0.09646197351281602, + "learning_rate": 2.784444724752611e-06, + "loss": 0.3251, + "step": 2674 + }, + { + "epoch": 1.56214874826655, + "grad_norm": 0.08697824163225679, + "learning_rate": 2.7773887486684815e-06, + "loss": 0.3092, + "step": 2675 + }, + { + "epoch": 1.5627326472520253, + "grad_norm": 0.0923683710646133, + "learning_rate": 2.770340282076216e-06, + "loss": 0.3262, + "step": 2676 + }, + { + "epoch": 1.563316546237501, + "grad_norm": 0.08682935258751044, + "learning_rate": 2.76329933230425e-06, + "loss": 0.2717, + "step": 2677 + }, + { + "epoch": 1.5639004452229766, + "grad_norm": 0.08505212099711616, + "learning_rate": 2.7562659066731947e-06, + "loss": 0.2802, + "step": 2678 + }, + { + "epoch": 
1.564484344208452, + "grad_norm": 0.09151551662023606, + "learning_rate": 2.7492400124958397e-06, + "loss": 0.3282, + "step": 2679 + }, + { + "epoch": 1.5650682431939273, + "grad_norm": 0.08680322206744127, + "learning_rate": 2.742221657077151e-06, + "loss": 0.314, + "step": 2680 + }, + { + "epoch": 1.565652142179403, + "grad_norm": 0.08845186481868289, + "learning_rate": 2.735210847714247e-06, + "loss": 0.321, + "step": 2681 + }, + { + "epoch": 1.5662360411648786, + "grad_norm": 0.08444803810472204, + "learning_rate": 2.7282075916964077e-06, + "loss": 0.2845, + "step": 2682 + }, + { + "epoch": 1.566819940150354, + "grad_norm": 0.08574044410889803, + "learning_rate": 2.721211896305059e-06, + "loss": 0.3172, + "step": 2683 + }, + { + "epoch": 1.5674038391358294, + "grad_norm": 0.08325089208611512, + "learning_rate": 2.7142237688137594e-06, + "loss": 0.2825, + "step": 2684 + }, + { + "epoch": 1.567987738121305, + "grad_norm": 0.08653466280417799, + "learning_rate": 2.707243216488208e-06, + "loss": 0.3286, + "step": 2685 + }, + { + "epoch": 1.5685716371067806, + "grad_norm": 0.08782633762064919, + "learning_rate": 2.7002702465862206e-06, + "loss": 0.2948, + "step": 2686 + }, + { + "epoch": 1.569155536092256, + "grad_norm": 0.09323948153474308, + "learning_rate": 2.6933048663577297e-06, + "loss": 0.2969, + "step": 2687 + }, + { + "epoch": 1.5697394350777314, + "grad_norm": 0.08736682846849728, + "learning_rate": 2.6863470830447837e-06, + "loss": 0.3081, + "step": 2688 + }, + { + "epoch": 1.570323334063207, + "grad_norm": 0.08156043603300059, + "learning_rate": 2.6793969038815224e-06, + "loss": 0.2602, + "step": 2689 + }, + { + "epoch": 1.5709072330486826, + "grad_norm": 0.08938889266124213, + "learning_rate": 2.672454336094191e-06, + "loss": 0.338, + "step": 2690 + }, + { + "epoch": 1.5714911320341582, + "grad_norm": 0.09233684398799216, + "learning_rate": 2.665519386901111e-06, + "loss": 0.3502, + "step": 2691 + }, + { + "epoch": 1.5720750310196336, + "grad_norm": 0.08714149249097451, + "learning_rate": 2.658592063512684e-06, + "loss": 0.3684, + "step": 2692 + }, + { + "epoch": 1.572658930005109, + "grad_norm": 0.09733977891030005, + "learning_rate": 2.6516723731313896e-06, + "loss": 0.3517, + "step": 2693 + }, + { + "epoch": 1.5732428289905847, + "grad_norm": 0.08550959770346168, + "learning_rate": 2.644760322951764e-06, + "loss": 0.2811, + "step": 2694 + }, + { + "epoch": 1.5738267279760603, + "grad_norm": 0.09033829352710061, + "learning_rate": 2.6378559201604047e-06, + "loss": 0.3268, + "step": 2695 + }, + { + "epoch": 1.5744106269615357, + "grad_norm": 0.08506894819848403, + "learning_rate": 2.6309591719359563e-06, + "loss": 0.3111, + "step": 2696 + }, + { + "epoch": 1.574994525947011, + "grad_norm": 0.08767198472428339, + "learning_rate": 2.6240700854490988e-06, + "loss": 0.2934, + "step": 2697 + }, + { + "epoch": 1.5755784249324867, + "grad_norm": 0.08895283288324536, + "learning_rate": 2.6171886678625593e-06, + "loss": 0.2974, + "step": 2698 + }, + { + "epoch": 1.5761623239179623, + "grad_norm": 0.08482031631716146, + "learning_rate": 2.6103149263310768e-06, + "loss": 0.2944, + "step": 2699 + }, + { + "epoch": 1.5767462229034377, + "grad_norm": 0.08517241939697799, + "learning_rate": 2.6034488680014236e-06, + "loss": 0.2857, + "step": 2700 + }, + { + "epoch": 1.577330121888913, + "grad_norm": 0.08447574686558407, + "learning_rate": 2.5965905000123736e-06, + "loss": 0.283, + "step": 2701 + }, + { + "epoch": 1.5779140208743887, + "grad_norm": 0.09184672982850255, + "learning_rate": 
2.5897398294947027e-06, + "loss": 0.309, + "step": 2702 + }, + { + "epoch": 1.5784979198598643, + "grad_norm": 0.09660331757828053, + "learning_rate": 2.582896863571197e-06, + "loss": 0.3025, + "step": 2703 + }, + { + "epoch": 1.5790818188453397, + "grad_norm": 0.09190800503887955, + "learning_rate": 2.576061609356617e-06, + "loss": 0.3253, + "step": 2704 + }, + { + "epoch": 1.579665717830815, + "grad_norm": 0.08054801360990027, + "learning_rate": 2.569234073957717e-06, + "loss": 0.2895, + "step": 2705 + }, + { + "epoch": 1.5802496168162907, + "grad_norm": 0.08640992842469485, + "learning_rate": 2.5624142644732177e-06, + "loss": 0.2953, + "step": 2706 + }, + { + "epoch": 1.5808335158017663, + "grad_norm": 0.08966832709155412, + "learning_rate": 2.5556021879938074e-06, + "loss": 0.3056, + "step": 2707 + }, + { + "epoch": 1.581417414787242, + "grad_norm": 0.08451164996270802, + "learning_rate": 2.5487978516021426e-06, + "loss": 0.3375, + "step": 2708 + }, + { + "epoch": 1.5820013137727174, + "grad_norm": 0.08917717416950068, + "learning_rate": 2.542001262372821e-06, + "loss": 0.327, + "step": 2709 + }, + { + "epoch": 1.5825852127581927, + "grad_norm": 0.08250213479910032, + "learning_rate": 2.535212427372393e-06, + "loss": 0.2809, + "step": 2710 + }, + { + "epoch": 1.5831691117436684, + "grad_norm": 0.09866178022007711, + "learning_rate": 2.52843135365935e-06, + "loss": 0.3683, + "step": 2711 + }, + { + "epoch": 1.583753010729144, + "grad_norm": 0.08811572881840915, + "learning_rate": 2.5216580482840993e-06, + "loss": 0.2977, + "step": 2712 + }, + { + "epoch": 1.5843369097146194, + "grad_norm": 0.09172022014120546, + "learning_rate": 2.514892518288988e-06, + "loss": 0.3035, + "step": 2713 + }, + { + "epoch": 1.5849208087000948, + "grad_norm": 0.09378374984722455, + "learning_rate": 2.50813477070827e-06, + "loss": 0.3119, + "step": 2714 + }, + { + "epoch": 1.5855047076855704, + "grad_norm": 0.08909583883690714, + "learning_rate": 2.501384812568104e-06, + "loss": 0.3024, + "step": 2715 + }, + { + "epoch": 1.586088606671046, + "grad_norm": 0.09263310693223463, + "learning_rate": 2.494642650886563e-06, + "loss": 0.356, + "step": 2716 + }, + { + "epoch": 1.5866725056565214, + "grad_norm": 0.08893496883808139, + "learning_rate": 2.4879082926735974e-06, + "loss": 0.2998, + "step": 2717 + }, + { + "epoch": 1.5872564046419968, + "grad_norm": 0.08284151789673047, + "learning_rate": 2.4811817449310615e-06, + "loss": 0.2718, + "step": 2718 + }, + { + "epoch": 1.5878403036274724, + "grad_norm": 0.09084178588015923, + "learning_rate": 2.4744630146526762e-06, + "loss": 0.3206, + "step": 2719 + }, + { + "epoch": 1.588424202612948, + "grad_norm": 0.086323245663931, + "learning_rate": 2.467752108824034e-06, + "loss": 0.2891, + "step": 2720 + }, + { + "epoch": 1.5890081015984234, + "grad_norm": 0.0884577415763019, + "learning_rate": 2.4610490344226034e-06, + "loss": 0.3092, + "step": 2721 + }, + { + "epoch": 1.5895920005838988, + "grad_norm": 0.0818500205141588, + "learning_rate": 2.454353798417698e-06, + "loss": 0.2843, + "step": 2722 + }, + { + "epoch": 1.5901758995693744, + "grad_norm": 0.09416553423981949, + "learning_rate": 2.4476664077704926e-06, + "loss": 0.3576, + "step": 2723 + }, + { + "epoch": 1.59075979855485, + "grad_norm": 0.08986458777721984, + "learning_rate": 2.4409868694339965e-06, + "loss": 0.3143, + "step": 2724 + }, + { + "epoch": 1.5913436975403257, + "grad_norm": 0.08767536768030496, + "learning_rate": 2.434315190353056e-06, + "loss": 0.3165, + "step": 2725 + }, + { + "epoch": 
1.591927596525801, + "grad_norm": 0.09330982855020656, + "learning_rate": 2.427651377464353e-06, + "loss": 0.3429, + "step": 2726 + }, + { + "epoch": 1.5925114955112765, + "grad_norm": 0.08746004903527647, + "learning_rate": 2.4209954376963797e-06, + "loss": 0.2963, + "step": 2727 + }, + { + "epoch": 1.593095394496752, + "grad_norm": 0.08303011080905368, + "learning_rate": 2.4143473779694548e-06, + "loss": 0.289, + "step": 2728 + }, + { + "epoch": 1.5936792934822277, + "grad_norm": 0.09308780560836026, + "learning_rate": 2.407707205195694e-06, + "loss": 0.3499, + "step": 2729 + }, + { + "epoch": 1.594263192467703, + "grad_norm": 0.08516663578266107, + "learning_rate": 2.4010749262790136e-06, + "loss": 0.2943, + "step": 2730 + }, + { + "epoch": 1.5948470914531785, + "grad_norm": 0.08465246805407467, + "learning_rate": 2.3944505481151303e-06, + "loss": 0.2953, + "step": 2731 + }, + { + "epoch": 1.595430990438654, + "grad_norm": 0.09259138693675312, + "learning_rate": 2.387834077591538e-06, + "loss": 0.2962, + "step": 2732 + }, + { + "epoch": 1.5960148894241297, + "grad_norm": 0.09176116115783456, + "learning_rate": 2.3812255215875147e-06, + "loss": 0.3214, + "step": 2733 + }, + { + "epoch": 1.5965987884096051, + "grad_norm": 0.08985085094261724, + "learning_rate": 2.374624886974106e-06, + "loss": 0.2923, + "step": 2734 + }, + { + "epoch": 1.5971826873950805, + "grad_norm": 0.09489964262454795, + "learning_rate": 2.3680321806141182e-06, + "loss": 0.3103, + "step": 2735 + }, + { + "epoch": 1.5977665863805561, + "grad_norm": 0.09289017103861547, + "learning_rate": 2.3614474093621255e-06, + "loss": 0.2964, + "step": 2736 + }, + { + "epoch": 1.5983504853660317, + "grad_norm": 0.09140275599245228, + "learning_rate": 2.354870580064439e-06, + "loss": 0.2892, + "step": 2737 + }, + { + "epoch": 1.5989343843515071, + "grad_norm": 0.09087921172597256, + "learning_rate": 2.34830169955912e-06, + "loss": 0.3083, + "step": 2738 + }, + { + "epoch": 1.5995182833369825, + "grad_norm": 0.08788028584753778, + "learning_rate": 2.341740774675968e-06, + "loss": 0.3075, + "step": 2739 + }, + { + "epoch": 1.6001021823224582, + "grad_norm": 0.08525095272695739, + "learning_rate": 2.335187812236499e-06, + "loss": 0.3186, + "step": 2740 + }, + { + "epoch": 1.6006860813079338, + "grad_norm": 0.09574970013068906, + "learning_rate": 2.3286428190539645e-06, + "loss": 0.3483, + "step": 2741 + }, + { + "epoch": 1.6012699802934094, + "grad_norm": 0.08413741784784741, + "learning_rate": 2.322105801933321e-06, + "loss": 0.3083, + "step": 2742 + }, + { + "epoch": 1.6018538792788848, + "grad_norm": 0.09187576858312102, + "learning_rate": 2.3155767676712317e-06, + "loss": 0.3061, + "step": 2743 + }, + { + "epoch": 1.6024377782643602, + "grad_norm": 0.08510561686468221, + "learning_rate": 2.3090557230560673e-06, + "loss": 0.3119, + "step": 2744 + }, + { + "epoch": 1.6030216772498358, + "grad_norm": 0.08860064308458952, + "learning_rate": 2.3025426748678814e-06, + "loss": 0.3292, + "step": 2745 + }, + { + "epoch": 1.6036055762353114, + "grad_norm": 0.08658574015572966, + "learning_rate": 2.296037629878426e-06, + "loss": 0.2817, + "step": 2746 + }, + { + "epoch": 1.6041894752207868, + "grad_norm": 0.08844626757338564, + "learning_rate": 2.289540594851122e-06, + "loss": 0.3226, + "step": 2747 + }, + { + "epoch": 1.6047733742062622, + "grad_norm": 0.08355894308483867, + "learning_rate": 2.283051576541062e-06, + "loss": 0.2772, + "step": 2748 + }, + { + "epoch": 1.6053572731917378, + "grad_norm": 0.0848364271827414, + "learning_rate": 
2.2765705816950124e-06, + "loss": 0.3188, + "step": 2749 + }, + { + "epoch": 1.6059411721772134, + "grad_norm": 0.08846065769546503, + "learning_rate": 2.2700976170513855e-06, + "loss": 0.3308, + "step": 2750 + }, + { + "epoch": 1.6065250711626888, + "grad_norm": 0.07819095097985644, + "learning_rate": 2.263632689340257e-06, + "loss": 0.2545, + "step": 2751 + }, + { + "epoch": 1.6071089701481642, + "grad_norm": 0.08058291861617611, + "learning_rate": 2.257175805283338e-06, + "loss": 0.2964, + "step": 2752 + }, + { + "epoch": 1.6076928691336398, + "grad_norm": 0.08405814202322932, + "learning_rate": 2.250726971593976e-06, + "loss": 0.2746, + "step": 2753 + }, + { + "epoch": 1.6082767681191155, + "grad_norm": 0.09328015871229196, + "learning_rate": 2.2442861949771554e-06, + "loss": 0.3506, + "step": 2754 + }, + { + "epoch": 1.6088606671045909, + "grad_norm": 0.08303312664847921, + "learning_rate": 2.237853482129475e-06, + "loss": 0.3033, + "step": 2755 + }, + { + "epoch": 1.6094445660900663, + "grad_norm": 0.07984400527236234, + "learning_rate": 2.231428839739157e-06, + "loss": 0.2857, + "step": 2756 + }, + { + "epoch": 1.6100284650755419, + "grad_norm": 0.08744380592243957, + "learning_rate": 2.225012274486028e-06, + "loss": 0.3037, + "step": 2757 + }, + { + "epoch": 1.6106123640610175, + "grad_norm": 0.09088453544607751, + "learning_rate": 2.218603793041516e-06, + "loss": 0.3001, + "step": 2758 + }, + { + "epoch": 1.611196263046493, + "grad_norm": 0.09026610478464688, + "learning_rate": 2.21220340206865e-06, + "loss": 0.3046, + "step": 2759 + }, + { + "epoch": 1.6117801620319685, + "grad_norm": 0.09098809963333981, + "learning_rate": 2.205811108222038e-06, + "loss": 0.3431, + "step": 2760 + }, + { + "epoch": 1.612364061017444, + "grad_norm": 0.08772021848790403, + "learning_rate": 2.19942691814788e-06, + "loss": 0.3115, + "step": 2761 + }, + { + "epoch": 1.6129479600029195, + "grad_norm": 0.08669736677869853, + "learning_rate": 2.193050838483942e-06, + "loss": 0.294, + "step": 2762 + }, + { + "epoch": 1.6135318589883951, + "grad_norm": 0.08727015598829929, + "learning_rate": 2.186682875859557e-06, + "loss": 0.2925, + "step": 2763 + }, + { + "epoch": 1.6141157579738705, + "grad_norm": 0.09485499357000195, + "learning_rate": 2.1803230368956296e-06, + "loss": 0.3123, + "step": 2764 + }, + { + "epoch": 1.614699656959346, + "grad_norm": 0.08696691410466795, + "learning_rate": 2.1739713282046017e-06, + "loss": 0.3223, + "step": 2765 + }, + { + "epoch": 1.6152835559448215, + "grad_norm": 0.08880831142231013, + "learning_rate": 2.1676277563904747e-06, + "loss": 0.2761, + "step": 2766 + }, + { + "epoch": 1.6158674549302972, + "grad_norm": 0.09106035939165831, + "learning_rate": 2.1612923280487883e-06, + "loss": 0.3114, + "step": 2767 + }, + { + "epoch": 1.6164513539157725, + "grad_norm": 0.08624575921325564, + "learning_rate": 2.1549650497666096e-06, + "loss": 0.325, + "step": 2768 + }, + { + "epoch": 1.617035252901248, + "grad_norm": 0.08125263841724942, + "learning_rate": 2.1486459281225337e-06, + "loss": 0.3086, + "step": 2769 + }, + { + "epoch": 1.6176191518867236, + "grad_norm": 0.08439384503801833, + "learning_rate": 2.14233496968668e-06, + "loss": 0.3098, + "step": 2770 + }, + { + "epoch": 1.6182030508721992, + "grad_norm": 0.08956714966106338, + "learning_rate": 2.136032181020673e-06, + "loss": 0.3273, + "step": 2771 + }, + { + "epoch": 1.6187869498576746, + "grad_norm": 0.09140142113942591, + "learning_rate": 2.1297375686776522e-06, + "loss": 0.3118, + "step": 2772 + }, + { + "epoch": 
1.6193708488431502, + "grad_norm": 0.0827603847732067, + "learning_rate": 2.1234511392022473e-06, + "loss": 0.2807, + "step": 2773 + }, + { + "epoch": 1.6199547478286256, + "grad_norm": 0.08607646469223701, + "learning_rate": 2.1171728991305797e-06, + "loss": 0.2888, + "step": 2774 + }, + { + "epoch": 1.6205386468141012, + "grad_norm": 0.09076316567553312, + "learning_rate": 2.110902854990268e-06, + "loss": 0.3209, + "step": 2775 + }, + { + "epoch": 1.6211225457995768, + "grad_norm": 0.09121347688000975, + "learning_rate": 2.1046410133003923e-06, + "loss": 0.3145, + "step": 2776 + }, + { + "epoch": 1.6217064447850522, + "grad_norm": 0.08584428610182016, + "learning_rate": 2.0983873805715216e-06, + "loss": 0.2937, + "step": 2777 + }, + { + "epoch": 1.6222903437705276, + "grad_norm": 0.0934038184338137, + "learning_rate": 2.0921419633056782e-06, + "loss": 0.3033, + "step": 2778 + }, + { + "epoch": 1.6228742427560032, + "grad_norm": 0.07992325979960868, + "learning_rate": 2.085904767996343e-06, + "loss": 0.2831, + "step": 2779 + }, + { + "epoch": 1.6234581417414788, + "grad_norm": 0.0841799170044487, + "learning_rate": 2.0796758011284567e-06, + "loss": 0.2904, + "step": 2780 + }, + { + "epoch": 1.6240420407269542, + "grad_norm": 0.08712466125143144, + "learning_rate": 2.0734550691783937e-06, + "loss": 0.3211, + "step": 2781 + }, + { + "epoch": 1.6246259397124296, + "grad_norm": 0.0872232650039621, + "learning_rate": 2.0672425786139794e-06, + "loss": 0.32, + "step": 2782 + }, + { + "epoch": 1.6252098386979053, + "grad_norm": 0.08754129203366758, + "learning_rate": 2.0610383358944584e-06, + "loss": 0.3377, + "step": 2783 + }, + { + "epoch": 1.6257937376833809, + "grad_norm": 0.08608138777671846, + "learning_rate": 2.0548423474705024e-06, + "loss": 0.3174, + "step": 2784 + }, + { + "epoch": 1.6263776366688563, + "grad_norm": 0.08032686087654864, + "learning_rate": 2.0486546197842096e-06, + "loss": 0.2693, + "step": 2785 + }, + { + "epoch": 1.6269615356543317, + "grad_norm": 0.089483921173535, + "learning_rate": 2.0424751592690762e-06, + "loss": 0.3174, + "step": 2786 + }, + { + "epoch": 1.6275454346398073, + "grad_norm": 0.08617809531575664, + "learning_rate": 2.0363039723500155e-06, + "loss": 0.3063, + "step": 2787 + }, + { + "epoch": 1.628129333625283, + "grad_norm": 0.08968442614503079, + "learning_rate": 2.0301410654433307e-06, + "loss": 0.3144, + "step": 2788 + }, + { + "epoch": 1.6287132326107583, + "grad_norm": 0.08156988283072003, + "learning_rate": 2.023986444956715e-06, + "loss": 0.2785, + "step": 2789 + }, + { + "epoch": 1.629297131596234, + "grad_norm": 0.09116781346257606, + "learning_rate": 2.017840117289254e-06, + "loss": 0.3358, + "step": 2790 + }, + { + "epoch": 1.6298810305817093, + "grad_norm": 0.08401221358505406, + "learning_rate": 2.0117020888313998e-06, + "loss": 0.2911, + "step": 2791 + }, + { + "epoch": 1.630464929567185, + "grad_norm": 0.09653655833916723, + "learning_rate": 2.0055723659649907e-06, + "loss": 0.3142, + "step": 2792 + }, + { + "epoch": 1.6310488285526605, + "grad_norm": 0.08670860622041503, + "learning_rate": 1.999450955063216e-06, + "loss": 0.2979, + "step": 2793 + }, + { + "epoch": 1.631632727538136, + "grad_norm": 0.09858825530519852, + "learning_rate": 1.9933378624906218e-06, + "loss": 0.333, + "step": 2794 + }, + { + "epoch": 1.6322166265236113, + "grad_norm": 0.08623638911889556, + "learning_rate": 1.9872330946031237e-06, + "loss": 0.3059, + "step": 2795 + }, + { + "epoch": 1.632800525509087, + "grad_norm": 0.08598147443545051, + "learning_rate": 
1.981136657747963e-06, + "loss": 0.3217, + "step": 2796 + }, + { + "epoch": 1.6333844244945626, + "grad_norm": 0.08502716609781817, + "learning_rate": 1.9750485582637245e-06, + "loss": 0.2828, + "step": 2797 + }, + { + "epoch": 1.633968323480038, + "grad_norm": 0.08505007136886303, + "learning_rate": 1.9689688024803298e-06, + "loss": 0.2924, + "step": 2798 + }, + { + "epoch": 1.6345522224655133, + "grad_norm": 0.08556760222593965, + "learning_rate": 1.962897396719018e-06, + "loss": 0.3065, + "step": 2799 + }, + { + "epoch": 1.635136121450989, + "grad_norm": 0.08744304389546018, + "learning_rate": 1.9568343472923524e-06, + "loss": 0.2994, + "step": 2800 + }, + { + "epoch": 1.6357200204364646, + "grad_norm": 0.09515695038217707, + "learning_rate": 1.950779660504204e-06, + "loss": 0.3091, + "step": 2801 + }, + { + "epoch": 1.63630391942194, + "grad_norm": 0.09493789462280555, + "learning_rate": 1.944733342649748e-06, + "loss": 0.3395, + "step": 2802 + }, + { + "epoch": 1.6368878184074154, + "grad_norm": 0.08673594631596067, + "learning_rate": 1.938695400015467e-06, + "loss": 0.32, + "step": 2803 + }, + { + "epoch": 1.637471717392891, + "grad_norm": 0.08699916410337348, + "learning_rate": 1.932665838879123e-06, + "loss": 0.3058, + "step": 2804 + }, + { + "epoch": 1.6380556163783666, + "grad_norm": 0.09113248366833748, + "learning_rate": 1.926644665509775e-06, + "loss": 0.3075, + "step": 2805 + }, + { + "epoch": 1.638639515363842, + "grad_norm": 0.09110648020788036, + "learning_rate": 1.920631886167754e-06, + "loss": 0.3319, + "step": 2806 + }, + { + "epoch": 1.6392234143493176, + "grad_norm": 0.08911752075097037, + "learning_rate": 1.9146275071046626e-06, + "loss": 0.3135, + "step": 2807 + }, + { + "epoch": 1.639807313334793, + "grad_norm": 0.0882402253142257, + "learning_rate": 1.9086315345633786e-06, + "loss": 0.3224, + "step": 2808 + }, + { + "epoch": 1.6403912123202686, + "grad_norm": 0.09110069464136926, + "learning_rate": 1.9026439747780278e-06, + "loss": 0.3385, + "step": 2809 + }, + { + "epoch": 1.6409751113057442, + "grad_norm": 0.08762147323041139, + "learning_rate": 1.8966648339740002e-06, + "loss": 0.298, + "step": 2810 + }, + { + "epoch": 1.6415590102912196, + "grad_norm": 0.08949794464304249, + "learning_rate": 1.8906941183679227e-06, + "loss": 0.3137, + "step": 2811 + }, + { + "epoch": 1.642142909276695, + "grad_norm": 0.09176597857013076, + "learning_rate": 1.8847318341676657e-06, + "loss": 0.2989, + "step": 2812 + }, + { + "epoch": 1.6427268082621707, + "grad_norm": 0.0994264903196625, + "learning_rate": 1.8787779875723389e-06, + "loss": 0.3702, + "step": 2813 + }, + { + "epoch": 1.6433107072476463, + "grad_norm": 0.09270206638845155, + "learning_rate": 1.8728325847722684e-06, + "loss": 0.3193, + "step": 2814 + }, + { + "epoch": 1.6438946062331217, + "grad_norm": 0.08913036765616508, + "learning_rate": 1.8668956319490128e-06, + "loss": 0.3186, + "step": 2815 + }, + { + "epoch": 1.644478505218597, + "grad_norm": 0.08265056599294728, + "learning_rate": 1.8609671352753367e-06, + "loss": 0.3073, + "step": 2816 + }, + { + "epoch": 1.6450624042040727, + "grad_norm": 0.07911311362680286, + "learning_rate": 1.8550471009152138e-06, + "loss": 0.26, + "step": 2817 + }, + { + "epoch": 1.6456463031895483, + "grad_norm": 0.09621955010602276, + "learning_rate": 1.849135535023825e-06, + "loss": 0.3464, + "step": 2818 + }, + { + "epoch": 1.6462302021750237, + "grad_norm": 0.09756544754200423, + "learning_rate": 1.843232443747538e-06, + "loss": 0.3346, + "step": 2819 + }, + { + "epoch": 
1.646814101160499, + "grad_norm": 0.09357448192120124, + "learning_rate": 1.8373378332239177e-06, + "loss": 0.3257, + "step": 2820 + }, + { + "epoch": 1.6473980001459747, + "grad_norm": 0.08859877768876348, + "learning_rate": 1.8314517095817052e-06, + "loss": 0.324, + "step": 2821 + }, + { + "epoch": 1.6479818991314503, + "grad_norm": 0.09579539365119887, + "learning_rate": 1.8255740789408161e-06, + "loss": 0.345, + "step": 2822 + }, + { + "epoch": 1.6485657981169257, + "grad_norm": 0.08721052499665495, + "learning_rate": 1.8197049474123475e-06, + "loss": 0.3247, + "step": 2823 + }, + { + "epoch": 1.6491496971024013, + "grad_norm": 0.09171852034977418, + "learning_rate": 1.8138443210985468e-06, + "loss": 0.3147, + "step": 2824 + }, + { + "epoch": 1.6497335960878767, + "grad_norm": 0.08917116308696059, + "learning_rate": 1.8079922060928223e-06, + "loss": 0.28, + "step": 2825 + }, + { + "epoch": 1.6503174950733523, + "grad_norm": 0.08129940459525777, + "learning_rate": 1.8021486084797368e-06, + "loss": 0.2878, + "step": 2826 + }, + { + "epoch": 1.650901394058828, + "grad_norm": 0.08479943118528724, + "learning_rate": 1.7963135343349914e-06, + "loss": 0.3035, + "step": 2827 + }, + { + "epoch": 1.6514852930443034, + "grad_norm": 0.08221478018297448, + "learning_rate": 1.7904869897254308e-06, + "loss": 0.2929, + "step": 2828 + }, + { + "epoch": 1.6520691920297788, + "grad_norm": 0.09160826681200954, + "learning_rate": 1.7846689807090277e-06, + "loss": 0.3046, + "step": 2829 + }, + { + "epoch": 1.6526530910152544, + "grad_norm": 0.08636458341530073, + "learning_rate": 1.7788595133348796e-06, + "loss": 0.3104, + "step": 2830 + }, + { + "epoch": 1.65323699000073, + "grad_norm": 0.08802047188857515, + "learning_rate": 1.7730585936432077e-06, + "loss": 0.31, + "step": 2831 + }, + { + "epoch": 1.6538208889862054, + "grad_norm": 0.08791509532330856, + "learning_rate": 1.7672662276653384e-06, + "loss": 0.3348, + "step": 2832 + }, + { + "epoch": 1.6544047879716808, + "grad_norm": 0.08297104745574767, + "learning_rate": 1.7614824214237158e-06, + "loss": 0.296, + "step": 2833 + }, + { + "epoch": 1.6549886869571564, + "grad_norm": 0.08791623585611412, + "learning_rate": 1.7557071809318737e-06, + "loss": 0.3143, + "step": 2834 + }, + { + "epoch": 1.655572585942632, + "grad_norm": 0.09316204230757288, + "learning_rate": 1.7499405121944423e-06, + "loss": 0.3388, + "step": 2835 + }, + { + "epoch": 1.6561564849281074, + "grad_norm": 0.08778281671717643, + "learning_rate": 1.7441824212071455e-06, + "loss": 0.3086, + "step": 2836 + }, + { + "epoch": 1.6567403839135828, + "grad_norm": 0.09007331194804802, + "learning_rate": 1.73843291395678e-06, + "loss": 0.2956, + "step": 2837 + }, + { + "epoch": 1.6573242828990584, + "grad_norm": 0.08878832213379277, + "learning_rate": 1.7326919964212275e-06, + "loss": 0.2885, + "step": 2838 + }, + { + "epoch": 1.657908181884534, + "grad_norm": 0.09036430420349095, + "learning_rate": 1.7269596745694295e-06, + "loss": 0.3305, + "step": 2839 + }, + { + "epoch": 1.6584920808700094, + "grad_norm": 0.09216127117646374, + "learning_rate": 1.7212359543613943e-06, + "loss": 0.3372, + "step": 2840 + }, + { + "epoch": 1.659075979855485, + "grad_norm": 0.09820317378606691, + "learning_rate": 1.7155208417481906e-06, + "loss": 0.3366, + "step": 2841 + }, + { + "epoch": 1.6596598788409604, + "grad_norm": 0.09117126477224899, + "learning_rate": 1.7098143426719293e-06, + "loss": 0.3261, + "step": 2842 + }, + { + "epoch": 1.660243777826436, + "grad_norm": 0.08553945948044997, + "learning_rate": 
1.7041164630657757e-06, + "loss": 0.3082, + "step": 2843 + }, + { + "epoch": 1.6608276768119117, + "grad_norm": 0.08958712378517425, + "learning_rate": 1.6984272088539256e-06, + "loss": 0.3567, + "step": 2844 + }, + { + "epoch": 1.661411575797387, + "grad_norm": 0.09072469748500103, + "learning_rate": 1.6927465859516057e-06, + "loss": 0.3106, + "step": 2845 + }, + { + "epoch": 1.6619954747828625, + "grad_norm": 0.09860664836170104, + "learning_rate": 1.6870746002650784e-06, + "loss": 0.3399, + "step": 2846 + }, + { + "epoch": 1.662579373768338, + "grad_norm": 0.0827375437898929, + "learning_rate": 1.6814112576916142e-06, + "loss": 0.2813, + "step": 2847 + }, + { + "epoch": 1.6631632727538137, + "grad_norm": 0.09241480139870366, + "learning_rate": 1.6757565641195073e-06, + "loss": 0.3086, + "step": 2848 + }, + { + "epoch": 1.663747171739289, + "grad_norm": 0.09700325855960075, + "learning_rate": 1.6701105254280513e-06, + "loss": 0.3292, + "step": 2849 + }, + { + "epoch": 1.6643310707247645, + "grad_norm": 0.08747121477337459, + "learning_rate": 1.664473147487541e-06, + "loss": 0.3178, + "step": 2850 + }, + { + "epoch": 1.6649149697102401, + "grad_norm": 0.0875923666318606, + "learning_rate": 1.658844436159277e-06, + "loss": 0.3204, + "step": 2851 + }, + { + "epoch": 1.6654988686957157, + "grad_norm": 0.08775752343744311, + "learning_rate": 1.6532243972955397e-06, + "loss": 0.2583, + "step": 2852 + }, + { + "epoch": 1.6660827676811911, + "grad_norm": 0.09289694075783327, + "learning_rate": 1.6476130367395914e-06, + "loss": 0.303, + "step": 2853 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.08927246816150207, + "learning_rate": 1.64201036032568e-06, + "loss": 0.3208, + "step": 2854 + }, + { + "epoch": 1.6672505656521421, + "grad_norm": 0.08946071059840194, + "learning_rate": 1.6364163738790128e-06, + "loss": 0.3224, + "step": 2855 + }, + { + "epoch": 1.6678344646376178, + "grad_norm": 0.08426060293902987, + "learning_rate": 1.6308310832157737e-06, + "loss": 0.309, + "step": 2856 + }, + { + "epoch": 1.6684183636230934, + "grad_norm": 0.08199270750406029, + "learning_rate": 1.6252544941430982e-06, + "loss": 0.2926, + "step": 2857 + }, + { + "epoch": 1.6690022626085688, + "grad_norm": 0.08849252669970706, + "learning_rate": 1.6196866124590737e-06, + "loss": 0.2631, + "step": 2858 + }, + { + "epoch": 1.6695861615940442, + "grad_norm": 0.08163494619932608, + "learning_rate": 1.614127443952741e-06, + "loss": 0.2871, + "step": 2859 + }, + { + "epoch": 1.6701700605795198, + "grad_norm": 0.08585100645751118, + "learning_rate": 1.608576994404074e-06, + "loss": 0.3046, + "step": 2860 + }, + { + "epoch": 1.6707539595649954, + "grad_norm": 0.08546777112764414, + "learning_rate": 1.603035269583989e-06, + "loss": 0.285, + "step": 2861 + }, + { + "epoch": 1.6713378585504708, + "grad_norm": 0.0934959360605802, + "learning_rate": 1.5975022752543247e-06, + "loss": 0.3543, + "step": 2862 + }, + { + "epoch": 1.6719217575359462, + "grad_norm": 0.09005690290461564, + "learning_rate": 1.5919780171678412e-06, + "loss": 0.2755, + "step": 2863 + }, + { + "epoch": 1.6725056565214218, + "grad_norm": 0.09448358362638784, + "learning_rate": 1.5864625010682266e-06, + "loss": 0.3269, + "step": 2864 + }, + { + "epoch": 1.6730895555068974, + "grad_norm": 0.08939949459248223, + "learning_rate": 1.580955732690065e-06, + "loss": 0.2986, + "step": 2865 + }, + { + "epoch": 1.6736734544923728, + "grad_norm": 0.08266845760352323, + "learning_rate": 1.5754577177588581e-06, + "loss": 0.3003, + "step": 2866 + }, + { + 
"epoch": 1.6742573534778482, + "grad_norm": 0.08615282390753227, + "learning_rate": 1.5699684619909983e-06, + "loss": 0.3261, + "step": 2867 + }, + { + "epoch": 1.6748412524633238, + "grad_norm": 0.08950842787324602, + "learning_rate": 1.5644879710937722e-06, + "loss": 0.3013, + "step": 2868 + }, + { + "epoch": 1.6754251514487994, + "grad_norm": 0.09679790078100706, + "learning_rate": 1.5590162507653573e-06, + "loss": 0.3421, + "step": 2869 + }, + { + "epoch": 1.6760090504342748, + "grad_norm": 0.12027406396866881, + "learning_rate": 1.5535533066948062e-06, + "loss": 0.3047, + "step": 2870 + }, + { + "epoch": 1.6765929494197502, + "grad_norm": 0.09822003416748987, + "learning_rate": 1.5480991445620541e-06, + "loss": 0.3238, + "step": 2871 + }, + { + "epoch": 1.6771768484052259, + "grad_norm": 0.09470422628946744, + "learning_rate": 1.5426537700378985e-06, + "loss": 0.3269, + "step": 2872 + }, + { + "epoch": 1.6777607473907015, + "grad_norm": 0.0848033452909602, + "learning_rate": 1.5372171887840026e-06, + "loss": 0.3114, + "step": 2873 + }, + { + "epoch": 1.678344646376177, + "grad_norm": 0.09106771894639788, + "learning_rate": 1.5317894064528905e-06, + "loss": 0.3544, + "step": 2874 + }, + { + "epoch": 1.6789285453616525, + "grad_norm": 0.08789548279740132, + "learning_rate": 1.5263704286879311e-06, + "loss": 0.3158, + "step": 2875 + }, + { + "epoch": 1.6795124443471279, + "grad_norm": 0.09947805270917219, + "learning_rate": 1.5209602611233465e-06, + "loss": 0.352, + "step": 2876 + }, + { + "epoch": 1.6800963433326035, + "grad_norm": 0.09392885969770205, + "learning_rate": 1.5155589093841939e-06, + "loss": 0.3018, + "step": 2877 + }, + { + "epoch": 1.680680242318079, + "grad_norm": 0.08318944514473092, + "learning_rate": 1.5101663790863597e-06, + "loss": 0.2804, + "step": 2878 + }, + { + "epoch": 1.6812641413035545, + "grad_norm": 0.08881842734414754, + "learning_rate": 1.5047826758365748e-06, + "loss": 0.3515, + "step": 2879 + }, + { + "epoch": 1.68184804028903, + "grad_norm": 0.07913413524904181, + "learning_rate": 1.4994078052323767e-06, + "loss": 0.2792, + "step": 2880 + }, + { + "epoch": 1.6824319392745055, + "grad_norm": 0.08064758730542633, + "learning_rate": 1.4940417728621236e-06, + "loss": 0.2666, + "step": 2881 + }, + { + "epoch": 1.6830158382599811, + "grad_norm": 0.0835113590551349, + "learning_rate": 1.488684584304988e-06, + "loss": 0.2661, + "step": 2882 + }, + { + "epoch": 1.6835997372454565, + "grad_norm": 0.08830738792788109, + "learning_rate": 1.483336245130942e-06, + "loss": 0.2995, + "step": 2883 + }, + { + "epoch": 1.684183636230932, + "grad_norm": 0.0873527134812208, + "learning_rate": 1.477996760900764e-06, + "loss": 0.2852, + "step": 2884 + }, + { + "epoch": 1.6847675352164075, + "grad_norm": 0.08741474010870152, + "learning_rate": 1.4726661371660189e-06, + "loss": 0.302, + "step": 2885 + }, + { + "epoch": 1.6853514342018832, + "grad_norm": 0.08891088209234972, + "learning_rate": 1.467344379469059e-06, + "loss": 0.3187, + "step": 2886 + }, + { + "epoch": 1.6859353331873586, + "grad_norm": 0.08191670759020986, + "learning_rate": 1.4620314933430269e-06, + "loss": 0.2738, + "step": 2887 + }, + { + "epoch": 1.686519232172834, + "grad_norm": 0.09666942280908271, + "learning_rate": 1.4567274843118296e-06, + "loss": 0.369, + "step": 2888 + }, + { + "epoch": 1.6871031311583096, + "grad_norm": 0.08770542723826355, + "learning_rate": 1.4514323578901545e-06, + "loss": 0.2962, + "step": 2889 + }, + { + "epoch": 1.6876870301437852, + "grad_norm": 0.09304335568122375, + 
"learning_rate": 1.4461461195834491e-06, + "loss": 0.3165, + "step": 2890 + }, + { + "epoch": 1.6882709291292608, + "grad_norm": 0.08177859756561333, + "learning_rate": 1.4408687748879157e-06, + "loss": 0.2905, + "step": 2891 + }, + { + "epoch": 1.6888548281147362, + "grad_norm": 0.08627717706603934, + "learning_rate": 1.4356003292905197e-06, + "loss": 0.3233, + "step": 2892 + }, + { + "epoch": 1.6894387271002116, + "grad_norm": 0.09659418373653059, + "learning_rate": 1.4303407882689635e-06, + "loss": 0.3549, + "step": 2893 + }, + { + "epoch": 1.6900226260856872, + "grad_norm": 0.08102657944480783, + "learning_rate": 1.4250901572917009e-06, + "loss": 0.2768, + "step": 2894 + }, + { + "epoch": 1.6906065250711628, + "grad_norm": 0.08655195203900928, + "learning_rate": 1.4198484418179137e-06, + "loss": 0.3318, + "step": 2895 + }, + { + "epoch": 1.6911904240566382, + "grad_norm": 0.0874243326883842, + "learning_rate": 1.4146156472975147e-06, + "loss": 0.2855, + "step": 2896 + }, + { + "epoch": 1.6917743230421136, + "grad_norm": 0.0872033168816935, + "learning_rate": 1.4093917791711497e-06, + "loss": 0.2704, + "step": 2897 + }, + { + "epoch": 1.6923582220275892, + "grad_norm": 0.08286347600782107, + "learning_rate": 1.404176842870173e-06, + "loss": 0.2949, + "step": 2898 + }, + { + "epoch": 1.6929421210130648, + "grad_norm": 0.0889997850804021, + "learning_rate": 1.3989708438166605e-06, + "loss": 0.3215, + "step": 2899 + }, + { + "epoch": 1.6935260199985402, + "grad_norm": 0.09351430770480976, + "learning_rate": 1.3937737874233913e-06, + "loss": 0.2961, + "step": 2900 + }, + { + "epoch": 1.6941099189840156, + "grad_norm": 0.09202651507901273, + "learning_rate": 1.3885856790938457e-06, + "loss": 0.3509, + "step": 2901 + }, + { + "epoch": 1.6946938179694913, + "grad_norm": 0.09100273540574393, + "learning_rate": 1.383406524222206e-06, + "loss": 0.2945, + "step": 2902 + }, + { + "epoch": 1.6952777169549669, + "grad_norm": 0.09325452734003567, + "learning_rate": 1.3782363281933387e-06, + "loss": 0.3297, + "step": 2903 + }, + { + "epoch": 1.6958616159404423, + "grad_norm": 0.08751548476805564, + "learning_rate": 1.3730750963828033e-06, + "loss": 0.3289, + "step": 2904 + }, + { + "epoch": 1.6964455149259177, + "grad_norm": 0.08400738434019195, + "learning_rate": 1.3679228341568308e-06, + "loss": 0.2702, + "step": 2905 + }, + { + "epoch": 1.6970294139113933, + "grad_norm": 0.09583743080159342, + "learning_rate": 1.362779546872327e-06, + "loss": 0.3294, + "step": 2906 + }, + { + "epoch": 1.697613312896869, + "grad_norm": 0.08729184318782618, + "learning_rate": 1.357645239876879e-06, + "loss": 0.2925, + "step": 2907 + }, + { + "epoch": 1.6981972118823445, + "grad_norm": 0.09232744715271367, + "learning_rate": 1.3525199185087223e-06, + "loss": 0.3039, + "step": 2908 + }, + { + "epoch": 1.69878111086782, + "grad_norm": 0.09499404933458276, + "learning_rate": 1.3474035880967529e-06, + "loss": 0.3302, + "step": 2909 + }, + { + "epoch": 1.6993650098532953, + "grad_norm": 0.09161544085112154, + "learning_rate": 1.3422962539605245e-06, + "loss": 0.3227, + "step": 2910 + }, + { + "epoch": 1.699948908838771, + "grad_norm": 0.08637249323304211, + "learning_rate": 1.3371979214102293e-06, + "loss": 0.2861, + "step": 2911 + }, + { + "epoch": 1.7005328078242465, + "grad_norm": 0.08641529328586525, + "learning_rate": 1.3321085957467107e-06, + "loss": 0.2838, + "step": 2912 + }, + { + "epoch": 1.701116706809722, + "grad_norm": 0.08440164168361179, + "learning_rate": 1.3270282822614366e-06, + "loss": 0.2866, + "step": 
2913 + }, + { + "epoch": 1.7017006057951973, + "grad_norm": 0.0866450122459194, + "learning_rate": 1.321956986236509e-06, + "loss": 0.3114, + "step": 2914 + }, + { + "epoch": 1.702284504780673, + "grad_norm": 0.0909469094339131, + "learning_rate": 1.3168947129446574e-06, + "loss": 0.328, + "step": 2915 + }, + { + "epoch": 1.7028684037661486, + "grad_norm": 0.08168635741525229, + "learning_rate": 1.3118414676492252e-06, + "loss": 0.2637, + "step": 2916 + }, + { + "epoch": 1.703452302751624, + "grad_norm": 0.09373405494212547, + "learning_rate": 1.3067972556041753e-06, + "loss": 0.3085, + "step": 2917 + }, + { + "epoch": 1.7040362017370994, + "grad_norm": 0.08649420409310042, + "learning_rate": 1.3017620820540721e-06, + "loss": 0.3057, + "step": 2918 + }, + { + "epoch": 1.704620100722575, + "grad_norm": 0.09578706187759553, + "learning_rate": 1.2967359522340828e-06, + "loss": 0.3494, + "step": 2919 + }, + { + "epoch": 1.7052039997080506, + "grad_norm": 0.0877557310825074, + "learning_rate": 1.2917188713699791e-06, + "loss": 0.2729, + "step": 2920 + }, + { + "epoch": 1.705787898693526, + "grad_norm": 0.08329641444778102, + "learning_rate": 1.286710844678114e-06, + "loss": 0.2709, + "step": 2921 + }, + { + "epoch": 1.7063717976790014, + "grad_norm": 0.09051928367296017, + "learning_rate": 1.2817118773654381e-06, + "loss": 0.3214, + "step": 2922 + }, + { + "epoch": 1.706955696664477, + "grad_norm": 0.0782143053287659, + "learning_rate": 1.2767219746294724e-06, + "loss": 0.2578, + "step": 2923 + }, + { + "epoch": 1.7075395956499526, + "grad_norm": 0.08291342986854032, + "learning_rate": 1.271741141658317e-06, + "loss": 0.2822, + "step": 2924 + }, + { + "epoch": 1.7081234946354282, + "grad_norm": 0.08658723292980255, + "learning_rate": 1.266769383630646e-06, + "loss": 0.3026, + "step": 2925 + }, + { + "epoch": 1.7087073936209036, + "grad_norm": 0.08776376326284951, + "learning_rate": 1.2618067057156901e-06, + "loss": 0.3134, + "step": 2926 + }, + { + "epoch": 1.709291292606379, + "grad_norm": 0.09383811077226842, + "learning_rate": 1.2568531130732498e-06, + "loss": 0.3556, + "step": 2927 + }, + { + "epoch": 1.7098751915918546, + "grad_norm": 0.09178053838688537, + "learning_rate": 1.2519086108536683e-06, + "loss": 0.2995, + "step": 2928 + }, + { + "epoch": 1.7104590905773303, + "grad_norm": 0.08915185058271462, + "learning_rate": 1.2469732041978422e-06, + "loss": 0.3139, + "step": 2929 + }, + { + "epoch": 1.7110429895628056, + "grad_norm": 0.08745948578626743, + "learning_rate": 1.2420468982372158e-06, + "loss": 0.2959, + "step": 2930 + }, + { + "epoch": 1.711626888548281, + "grad_norm": 0.09040872912068385, + "learning_rate": 1.237129698093762e-06, + "loss": 0.3113, + "step": 2931 + }, + { + "epoch": 1.7122107875337567, + "grad_norm": 0.09024185716782082, + "learning_rate": 1.2322216088799955e-06, + "loss": 0.3388, + "step": 2932 + }, + { + "epoch": 1.7127946865192323, + "grad_norm": 0.08785765278970642, + "learning_rate": 1.227322635698952e-06, + "loss": 0.2675, + "step": 2933 + }, + { + "epoch": 1.7133785855047077, + "grad_norm": 0.09708399148400682, + "learning_rate": 1.2224327836441863e-06, + "loss": 0.3259, + "step": 2934 + }, + { + "epoch": 1.713962484490183, + "grad_norm": 0.08303100067502155, + "learning_rate": 1.2175520577997834e-06, + "loss": 0.2971, + "step": 2935 + }, + { + "epoch": 1.7145463834756587, + "grad_norm": 0.0865470488383438, + "learning_rate": 1.2126804632403255e-06, + "loss": 0.3114, + "step": 2936 + }, + { + "epoch": 1.7151302824611343, + "grad_norm": 
0.08849184980666493, + "learning_rate": 1.207818005030904e-06, + "loss": 0.3161, + "step": 2937 + }, + { + "epoch": 1.7157141814466097, + "grad_norm": 0.08130631698112382, + "learning_rate": 1.2029646882271173e-06, + "loss": 0.2786, + "step": 2938 + }, + { + "epoch": 1.716298080432085, + "grad_norm": 0.09210220549558422, + "learning_rate": 1.1981205178750511e-06, + "loss": 0.3518, + "step": 2939 + }, + { + "epoch": 1.7168819794175607, + "grad_norm": 0.0842619466820285, + "learning_rate": 1.1932854990112896e-06, + "loss": 0.2723, + "step": 2940 + }, + { + "epoch": 1.7174658784030363, + "grad_norm": 0.09256310164226095, + "learning_rate": 1.1884596366628942e-06, + "loss": 0.2973, + "step": 2941 + }, + { + "epoch": 1.718049777388512, + "grad_norm": 0.08740234343292702, + "learning_rate": 1.1836429358474077e-06, + "loss": 0.284, + "step": 2942 + }, + { + "epoch": 1.7186336763739873, + "grad_norm": 0.0867983165798341, + "learning_rate": 1.1788354015728543e-06, + "loss": 0.2805, + "step": 2943 + }, + { + "epoch": 1.7192175753594627, + "grad_norm": 0.09313683951093196, + "learning_rate": 1.1740370388377188e-06, + "loss": 0.3446, + "step": 2944 + }, + { + "epoch": 1.7198014743449384, + "grad_norm": 0.08906207898572369, + "learning_rate": 1.1692478526309558e-06, + "loss": 0.3216, + "step": 2945 + }, + { + "epoch": 1.720385373330414, + "grad_norm": 0.08675038774491685, + "learning_rate": 1.1644678479319772e-06, + "loss": 0.3022, + "step": 2946 + }, + { + "epoch": 1.7209692723158894, + "grad_norm": 0.08624644735214483, + "learning_rate": 1.1596970297106458e-06, + "loss": 0.2864, + "step": 2947 + }, + { + "epoch": 1.7215531713013648, + "grad_norm": 0.09155820786327558, + "learning_rate": 1.1549354029272786e-06, + "loss": 0.3108, + "step": 2948 + }, + { + "epoch": 1.7221370702868404, + "grad_norm": 0.09491452711812041, + "learning_rate": 1.1501829725326307e-06, + "loss": 0.36, + "step": 2949 + }, + { + "epoch": 1.722720969272316, + "grad_norm": 0.08732012073743244, + "learning_rate": 1.1454397434679022e-06, + "loss": 0.2865, + "step": 2950 + }, + { + "epoch": 1.7233048682577914, + "grad_norm": 0.08664279021422953, + "learning_rate": 1.1407057206647188e-06, + "loss": 0.2898, + "step": 2951 + }, + { + "epoch": 1.7238887672432668, + "grad_norm": 0.0840888574089827, + "learning_rate": 1.1359809090451357e-06, + "loss": 0.2696, + "step": 2952 + }, + { + "epoch": 1.7244726662287424, + "grad_norm": 0.09106068531170415, + "learning_rate": 1.131265313521639e-06, + "loss": 0.3154, + "step": 2953 + }, + { + "epoch": 1.725056565214218, + "grad_norm": 0.09892522841508108, + "learning_rate": 1.126558938997121e-06, + "loss": 0.3392, + "step": 2954 + }, + { + "epoch": 1.7256404641996934, + "grad_norm": 0.0834147120833047, + "learning_rate": 1.1218617903648966e-06, + "loss": 0.2962, + "step": 2955 + }, + { + "epoch": 1.7262243631851688, + "grad_norm": 0.08924313457398392, + "learning_rate": 1.1171738725086833e-06, + "loss": 0.3053, + "step": 2956 + }, + { + "epoch": 1.7268082621706444, + "grad_norm": 0.07689446135943286, + "learning_rate": 1.1124951903025981e-06, + "loss": 0.254, + "step": 2957 + }, + { + "epoch": 1.72739216115612, + "grad_norm": 0.08080578482931383, + "learning_rate": 1.1078257486111654e-06, + "loss": 0.2747, + "step": 2958 + }, + { + "epoch": 1.7279760601415957, + "grad_norm": 0.09742601349018544, + "learning_rate": 1.1031655522892915e-06, + "loss": 0.301, + "step": 2959 + }, + { + "epoch": 1.728559959127071, + "grad_norm": 0.08815284400574176, + "learning_rate": 1.0985146061822794e-06, + "loss": 
0.3067, + "step": 2960 + }, + { + "epoch": 1.7291438581125465, + "grad_norm": 0.08313534309618158, + "learning_rate": 1.0938729151258065e-06, + "loss": 0.3058, + "step": 2961 + }, + { + "epoch": 1.729727757098022, + "grad_norm": 0.09196632768067683, + "learning_rate": 1.0892404839459269e-06, + "loss": 0.3049, + "step": 2962 + }, + { + "epoch": 1.7303116560834977, + "grad_norm": 0.08402073914174016, + "learning_rate": 1.0846173174590802e-06, + "loss": 0.2882, + "step": 2963 + }, + { + "epoch": 1.730895555068973, + "grad_norm": 0.09098693028795556, + "learning_rate": 1.0800034204720588e-06, + "loss": 0.3065, + "step": 2964 + }, + { + "epoch": 1.7314794540544485, + "grad_norm": 0.09000081961362168, + "learning_rate": 1.0753987977820214e-06, + "loss": 0.3095, + "step": 2965 + }, + { + "epoch": 1.732063353039924, + "grad_norm": 0.08326249600881369, + "learning_rate": 1.07080345417649e-06, + "loss": 0.303, + "step": 2966 + }, + { + "epoch": 1.7326472520253997, + "grad_norm": 0.0909999083086441, + "learning_rate": 1.0662173944333288e-06, + "loss": 0.3126, + "step": 2967 + }, + { + "epoch": 1.733231151010875, + "grad_norm": 0.0941694256557739, + "learning_rate": 1.06164062332076e-06, + "loss": 0.3538, + "step": 2968 + }, + { + "epoch": 1.7338150499963505, + "grad_norm": 0.08057751956313591, + "learning_rate": 1.0570731455973415e-06, + "loss": 0.2837, + "step": 2969 + }, + { + "epoch": 1.7343989489818261, + "grad_norm": 0.0815138782520205, + "learning_rate": 1.052514966011966e-06, + "loss": 0.2796, + "step": 2970 + }, + { + "epoch": 1.7349828479673017, + "grad_norm": 0.08065742893302558, + "learning_rate": 1.0479660893038702e-06, + "loss": 0.2646, + "step": 2971 + }, + { + "epoch": 1.7355667469527771, + "grad_norm": 0.08943583689932598, + "learning_rate": 1.043426520202605e-06, + "loss": 0.3316, + "step": 2972 + }, + { + "epoch": 1.7361506459382525, + "grad_norm": 0.08422663568711866, + "learning_rate": 1.0388962634280543e-06, + "loss": 0.2825, + "step": 2973 + }, + { + "epoch": 1.7367345449237281, + "grad_norm": 0.08429212547010055, + "learning_rate": 1.0343753236904152e-06, + "loss": 0.2754, + "step": 2974 + }, + { + "epoch": 1.7373184439092038, + "grad_norm": 0.08150254497547402, + "learning_rate": 1.029863705690195e-06, + "loss": 0.2806, + "step": 2975 + }, + { + "epoch": 1.7379023428946794, + "grad_norm": 0.0916345786964083, + "learning_rate": 1.0253614141182167e-06, + "loss": 0.2894, + "step": 2976 + }, + { + "epoch": 1.7384862418801548, + "grad_norm": 0.0922043541935784, + "learning_rate": 1.0208684536555968e-06, + "loss": 0.3091, + "step": 2977 + }, + { + "epoch": 1.7390701408656302, + "grad_norm": 0.08857601696544157, + "learning_rate": 1.016384828973761e-06, + "loss": 0.3187, + "step": 2978 + }, + { + "epoch": 1.7396540398511058, + "grad_norm": 0.09026624999364392, + "learning_rate": 1.0119105447344203e-06, + "loss": 0.314, + "step": 2979 + }, + { + "epoch": 1.7402379388365814, + "grad_norm": 0.08468503589420602, + "learning_rate": 1.007445605589573e-06, + "loss": 0.3014, + "step": 2980 + }, + { + "epoch": 1.7408218378220568, + "grad_norm": 0.09143415533377398, + "learning_rate": 1.0029900161815109e-06, + "loss": 0.3564, + "step": 2981 + }, + { + "epoch": 1.7414057368075322, + "grad_norm": 0.09070453059558233, + "learning_rate": 9.985437811427934e-07, + "loss": 0.3301, + "step": 2982 + }, + { + "epoch": 1.7419896357930078, + "grad_norm": 0.09294933017119607, + "learning_rate": 9.941069050962626e-07, + "loss": 0.2998, + "step": 2983 + }, + { + "epoch": 1.7425735347784834, + "grad_norm": 
0.09132636691491733, + "learning_rate": 9.896793926550252e-07, + "loss": 0.2987, + "step": 2984 + }, + { + "epoch": 1.7431574337639588, + "grad_norm": 0.08708747938142473, + "learning_rate": 9.8526124842245e-07, + "loss": 0.2997, + "step": 2985 + }, + { + "epoch": 1.7437413327494342, + "grad_norm": 0.08754272732357196, + "learning_rate": 9.808524769921756e-07, + "loss": 0.2844, + "step": 2986 + }, + { + "epoch": 1.7443252317349098, + "grad_norm": 0.0905031943791269, + "learning_rate": 9.764530829480822e-07, + "loss": 0.3172, + "step": 2987 + }, + { + "epoch": 1.7449091307203854, + "grad_norm": 0.08341318940248507, + "learning_rate": 9.720630708643131e-07, + "loss": 0.2987, + "step": 2988 + }, + { + "epoch": 1.7454930297058608, + "grad_norm": 0.09037358424546788, + "learning_rate": 9.67682445305248e-07, + "loss": 0.2983, + "step": 2989 + }, + { + "epoch": 1.7460769286913362, + "grad_norm": 0.09080141037102694, + "learning_rate": 9.63311210825505e-07, + "loss": 0.3061, + "step": 2990 + }, + { + "epoch": 1.7466608276768119, + "grad_norm": 0.08646664830662824, + "learning_rate": 9.589493719699517e-07, + "loss": 0.2912, + "step": 2991 + }, + { + "epoch": 1.7472447266622875, + "grad_norm": 0.09155215076753025, + "learning_rate": 9.545969332736748e-07, + "loss": 0.3379, + "step": 2992 + }, + { + "epoch": 1.747828625647763, + "grad_norm": 0.08199975647187971, + "learning_rate": 9.502538992619892e-07, + "loss": 0.2594, + "step": 2993 + }, + { + "epoch": 1.7484125246332385, + "grad_norm": 0.08403297780926129, + "learning_rate": 9.459202744504359e-07, + "loss": 0.2858, + "step": 2994 + }, + { + "epoch": 1.7489964236187139, + "grad_norm": 0.08201713475118468, + "learning_rate": 9.415960633447674e-07, + "loss": 0.285, + "step": 2995 + }, + { + "epoch": 1.7495803226041895, + "grad_norm": 0.08223497669645283, + "learning_rate": 9.372812704409551e-07, + "loss": 0.2784, + "step": 2996 + }, + { + "epoch": 1.7501642215896651, + "grad_norm": 0.09487527326438247, + "learning_rate": 9.329759002251726e-07, + "loss": 0.3322, + "step": 2997 + }, + { + "epoch": 1.7507481205751405, + "grad_norm": 0.08352656473342496, + "learning_rate": 9.286799571737981e-07, + "loss": 0.3052, + "step": 2998 + }, + { + "epoch": 1.751332019560616, + "grad_norm": 0.09107316249596624, + "learning_rate": 9.243934457534098e-07, + "loss": 0.3206, + "step": 2999 + }, + { + "epoch": 1.7519159185460915, + "grad_norm": 0.09041574054746052, + "learning_rate": 9.201163704207771e-07, + "loss": 0.303, + "step": 3000 + }, + { + "epoch": 1.7524998175315671, + "grad_norm": 0.09145235721638599, + "learning_rate": 9.158487356228618e-07, + "loss": 0.3052, + "step": 3001 + }, + { + "epoch": 1.7530837165170425, + "grad_norm": 0.0876363328396699, + "learning_rate": 9.115905457968077e-07, + "loss": 0.3052, + "step": 3002 + }, + { + "epoch": 1.753667615502518, + "grad_norm": 0.08514432919750392, + "learning_rate": 9.073418053699368e-07, + "loss": 0.3057, + "step": 3003 + }, + { + "epoch": 1.7542515144879935, + "grad_norm": 0.08385041002415641, + "learning_rate": 9.031025187597519e-07, + "loss": 0.2858, + "step": 3004 + }, + { + "epoch": 1.7548354134734692, + "grad_norm": 0.08340917680319916, + "learning_rate": 8.988726903739197e-07, + "loss": 0.291, + "step": 3005 + }, + { + "epoch": 1.7554193124589446, + "grad_norm": 0.08114825170155826, + "learning_rate": 8.946523246102811e-07, + "loss": 0.2785, + "step": 3006 + }, + { + "epoch": 1.75600321144442, + "grad_norm": 0.09022391405996857, + "learning_rate": 8.904414258568306e-07, + "loss": 0.2993, + "step": 3007 
+ }, + { + "epoch": 1.7565871104298956, + "grad_norm": 0.08489748961471949, + "learning_rate": 8.862399984917214e-07, + "loss": 0.2736, + "step": 3008 + }, + { + "epoch": 1.7571710094153712, + "grad_norm": 0.09423216732161882, + "learning_rate": 8.820480468832649e-07, + "loss": 0.3161, + "step": 3009 + }, + { + "epoch": 1.7577549084008468, + "grad_norm": 0.09609931893809386, + "learning_rate": 8.778655753899124e-07, + "loss": 0.3407, + "step": 3010 + }, + { + "epoch": 1.7583388073863222, + "grad_norm": 0.08174269781267338, + "learning_rate": 8.736925883602665e-07, + "loss": 0.3036, + "step": 3011 + }, + { + "epoch": 1.7589227063717976, + "grad_norm": 0.08679874988879148, + "learning_rate": 8.695290901330611e-07, + "loss": 0.3187, + "step": 3012 + }, + { + "epoch": 1.7595066053572732, + "grad_norm": 0.08565262568904367, + "learning_rate": 8.653750850371667e-07, + "loss": 0.291, + "step": 3013 + }, + { + "epoch": 1.7600905043427488, + "grad_norm": 0.08631909365571475, + "learning_rate": 8.612305773915886e-07, + "loss": 0.3094, + "step": 3014 + }, + { + "epoch": 1.7606744033282242, + "grad_norm": 0.08967244308173025, + "learning_rate": 8.570955715054496e-07, + "loss": 0.3198, + "step": 3015 + }, + { + "epoch": 1.7612583023136996, + "grad_norm": 0.0827073093083439, + "learning_rate": 8.529700716780009e-07, + "loss": 0.3192, + "step": 3016 + }, + { + "epoch": 1.7618422012991752, + "grad_norm": 0.09134001466499862, + "learning_rate": 8.488540821986035e-07, + "loss": 0.3058, + "step": 3017 + }, + { + "epoch": 1.7624261002846509, + "grad_norm": 0.08854772270827294, + "learning_rate": 8.447476073467309e-07, + "loss": 0.3083, + "step": 3018 + }, + { + "epoch": 1.7630099992701262, + "grad_norm": 0.08804646503800304, + "learning_rate": 8.406506513919721e-07, + "loss": 0.2933, + "step": 3019 + }, + { + "epoch": 1.7635938982556016, + "grad_norm": 0.09229637551480614, + "learning_rate": 8.365632185940109e-07, + "loss": 0.3081, + "step": 3020 + }, + { + "epoch": 1.7641777972410773, + "grad_norm": 0.08508570586340651, + "learning_rate": 8.3248531320263e-07, + "loss": 0.2944, + "step": 3021 + }, + { + "epoch": 1.7647616962265529, + "grad_norm": 0.09076148310745154, + "learning_rate": 8.284169394577124e-07, + "loss": 0.31, + "step": 3022 + }, + { + "epoch": 1.7653455952120283, + "grad_norm": 0.09073344326954152, + "learning_rate": 8.243581015892221e-07, + "loss": 0.3002, + "step": 3023 + }, + { + "epoch": 1.7659294941975037, + "grad_norm": 0.08409036947594231, + "learning_rate": 8.203088038172169e-07, + "loss": 0.2842, + "step": 3024 + }, + { + "epoch": 1.7665133931829793, + "grad_norm": 0.08309645792566152, + "learning_rate": 8.1626905035183e-07, + "loss": 0.2745, + "step": 3025 + }, + { + "epoch": 1.767097292168455, + "grad_norm": 0.08426566264507526, + "learning_rate": 8.122388453932728e-07, + "loss": 0.2882, + "step": 3026 + }, + { + "epoch": 1.7676811911539305, + "grad_norm": 0.08188924654134366, + "learning_rate": 8.082181931318311e-07, + "loss": 0.2795, + "step": 3027 + }, + { + "epoch": 1.768265090139406, + "grad_norm": 0.09312566151926178, + "learning_rate": 8.042070977478533e-07, + "loss": 0.2967, + "step": 3028 + }, + { + "epoch": 1.7688489891248813, + "grad_norm": 0.08957555992163242, + "learning_rate": 8.002055634117578e-07, + "loss": 0.3018, + "step": 3029 + }, + { + "epoch": 1.769432888110357, + "grad_norm": 0.08867518747282177, + "learning_rate": 7.962135942840188e-07, + "loss": 0.3127, + "step": 3030 + }, + { + "epoch": 1.7700167870958325, + "grad_norm": 0.0903875916347569, + 
"learning_rate": 7.922311945151629e-07, + "loss": 0.3284, + "step": 3031 + }, + { + "epoch": 1.770600686081308, + "grad_norm": 0.0861983829225451, + "learning_rate": 7.882583682457734e-07, + "loss": 0.3311, + "step": 3032 + }, + { + "epoch": 1.7711845850667833, + "grad_norm": 0.08457649661870313, + "learning_rate": 7.84295119606473e-07, + "loss": 0.2896, + "step": 3033 + }, + { + "epoch": 1.771768484052259, + "grad_norm": 0.09091313036882272, + "learning_rate": 7.803414527179343e-07, + "loss": 0.33, + "step": 3034 + }, + { + "epoch": 1.7723523830377346, + "grad_norm": 0.08099493046895603, + "learning_rate": 7.76397371690859e-07, + "loss": 0.2584, + "step": 3035 + }, + { + "epoch": 1.77293628202321, + "grad_norm": 0.09022967842005367, + "learning_rate": 7.72462880625986e-07, + "loss": 0.3135, + "step": 3036 + }, + { + "epoch": 1.7735201810086854, + "grad_norm": 0.08760234305283941, + "learning_rate": 7.685379836140872e-07, + "loss": 0.3009, + "step": 3037 + }, + { + "epoch": 1.774104079994161, + "grad_norm": 0.08079301449708769, + "learning_rate": 7.646226847359506e-07, + "loss": 0.2914, + "step": 3038 + }, + { + "epoch": 1.7746879789796366, + "grad_norm": 0.09348629642500565, + "learning_rate": 7.607169880623955e-07, + "loss": 0.3373, + "step": 3039 + }, + { + "epoch": 1.775271877965112, + "grad_norm": 0.08341310157485701, + "learning_rate": 7.568208976542491e-07, + "loss": 0.2839, + "step": 3040 + }, + { + "epoch": 1.7758557769505874, + "grad_norm": 0.08800133414103774, + "learning_rate": 7.529344175623521e-07, + "loss": 0.3029, + "step": 3041 + }, + { + "epoch": 1.776439675936063, + "grad_norm": 0.08137749533851439, + "learning_rate": 7.490575518275589e-07, + "loss": 0.2921, + "step": 3042 + }, + { + "epoch": 1.7770235749215386, + "grad_norm": 0.08354796555055849, + "learning_rate": 7.451903044807185e-07, + "loss": 0.2969, + "step": 3043 + }, + { + "epoch": 1.7776074739070142, + "grad_norm": 0.08921737817137815, + "learning_rate": 7.4133267954269e-07, + "loss": 0.2876, + "step": 3044 + }, + { + "epoch": 1.7781913728924896, + "grad_norm": 0.086145328649435, + "learning_rate": 7.374846810243197e-07, + "loss": 0.3139, + "step": 3045 + }, + { + "epoch": 1.778775271877965, + "grad_norm": 0.0928396701260336, + "learning_rate": 7.336463129264437e-07, + "loss": 0.3365, + "step": 3046 + }, + { + "epoch": 1.7793591708634406, + "grad_norm": 0.08560000141224082, + "learning_rate": 7.298175792398976e-07, + "loss": 0.2987, + "step": 3047 + }, + { + "epoch": 1.7799430698489163, + "grad_norm": 0.08864910068645142, + "learning_rate": 7.25998483945487e-07, + "loss": 0.3314, + "step": 3048 + }, + { + "epoch": 1.7805269688343917, + "grad_norm": 0.08432115089380217, + "learning_rate": 7.22189031013999e-07, + "loss": 0.303, + "step": 3049 + }, + { + "epoch": 1.781110867819867, + "grad_norm": 0.08015805136973005, + "learning_rate": 7.183892244062018e-07, + "loss": 0.2773, + "step": 3050 + }, + { + "epoch": 1.7816947668053427, + "grad_norm": 0.0887181756401946, + "learning_rate": 7.145990680728243e-07, + "loss": 0.3154, + "step": 3051 + }, + { + "epoch": 1.7822786657908183, + "grad_norm": 0.07888148611941954, + "learning_rate": 7.10818565954573e-07, + "loss": 0.2723, + "step": 3052 + }, + { + "epoch": 1.7828625647762937, + "grad_norm": 0.09311012535185148, + "learning_rate": 7.07047721982107e-07, + "loss": 0.3349, + "step": 3053 + }, + { + "epoch": 1.783446463761769, + "grad_norm": 0.08862758114884235, + "learning_rate": 7.032865400760469e-07, + "loss": 0.3282, + "step": 3054 + }, + { + "epoch": 
1.7840303627472447, + "grad_norm": 0.0949043846434443, + "learning_rate": 6.995350241469701e-07, + "loss": 0.3344, + "step": 3055 + }, + { + "epoch": 1.7846142617327203, + "grad_norm": 0.08897902186861191, + "learning_rate": 6.957931780954008e-07, + "loss": 0.3363, + "step": 3056 + }, + { + "epoch": 1.7851981607181957, + "grad_norm": 0.08622222944415293, + "learning_rate": 6.920610058118105e-07, + "loss": 0.3136, + "step": 3057 + }, + { + "epoch": 1.785782059703671, + "grad_norm": 0.09154743012033052, + "learning_rate": 6.883385111766139e-07, + "loss": 0.3339, + "step": 3058 + }, + { + "epoch": 1.7863659586891467, + "grad_norm": 0.08477672730173282, + "learning_rate": 6.846256980601596e-07, + "loss": 0.2817, + "step": 3059 + }, + { + "epoch": 1.7869498576746223, + "grad_norm": 0.09238347471564864, + "learning_rate": 6.809225703227352e-07, + "loss": 0.2901, + "step": 3060 + }, + { + "epoch": 1.787533756660098, + "grad_norm": 0.0846504706803368, + "learning_rate": 6.772291318145541e-07, + "loss": 0.2981, + "step": 3061 + }, + { + "epoch": 1.7881176556455733, + "grad_norm": 0.08259263860792515, + "learning_rate": 6.735453863757602e-07, + "loss": 0.2484, + "step": 3062 + }, + { + "epoch": 1.7887015546310487, + "grad_norm": 0.09247416458650663, + "learning_rate": 6.698713378364142e-07, + "loss": 0.3453, + "step": 3063 + }, + { + "epoch": 1.7892854536165244, + "grad_norm": 0.08602191423466199, + "learning_rate": 6.662069900164969e-07, + "loss": 0.277, + "step": 3064 + }, + { + "epoch": 1.789869352602, + "grad_norm": 0.08297330720469921, + "learning_rate": 6.625523467259043e-07, + "loss": 0.3102, + "step": 3065 + }, + { + "epoch": 1.7904532515874754, + "grad_norm": 0.08451481325810808, + "learning_rate": 6.589074117644411e-07, + "loss": 0.3003, + "step": 3066 + }, + { + "epoch": 1.7910371505729508, + "grad_norm": 0.0884134010554438, + "learning_rate": 6.552721889218194e-07, + "loss": 0.3113, + "step": 3067 + }, + { + "epoch": 1.7916210495584264, + "grad_norm": 0.09159335526547072, + "learning_rate": 6.516466819776502e-07, + "loss": 0.3343, + "step": 3068 + }, + { + "epoch": 1.792204948543902, + "grad_norm": 0.0905556787552893, + "learning_rate": 6.480308947014458e-07, + "loss": 0.3029, + "step": 3069 + }, + { + "epoch": 1.7927888475293774, + "grad_norm": 0.09113663962438948, + "learning_rate": 6.444248308526125e-07, + "loss": 0.336, + "step": 3070 + }, + { + "epoch": 1.7933727465148528, + "grad_norm": 0.08722885557501651, + "learning_rate": 6.408284941804444e-07, + "loss": 0.2998, + "step": 3071 + }, + { + "epoch": 1.7939566455003284, + "grad_norm": 0.08871877859970728, + "learning_rate": 6.372418884241271e-07, + "loss": 0.2955, + "step": 3072 + }, + { + "epoch": 1.794540544485804, + "grad_norm": 0.08915213859264737, + "learning_rate": 6.336650173127224e-07, + "loss": 0.307, + "step": 3073 + }, + { + "epoch": 1.7951244434712794, + "grad_norm": 0.08801229458970226, + "learning_rate": 6.300978845651728e-07, + "loss": 0.3126, + "step": 3074 + }, + { + "epoch": 1.795708342456755, + "grad_norm": 0.08808806124962226, + "learning_rate": 6.26540493890303e-07, + "loss": 0.323, + "step": 3075 + }, + { + "epoch": 1.7962922414422304, + "grad_norm": 0.08356787568934955, + "learning_rate": 6.229928489867987e-07, + "loss": 0.3003, + "step": 3076 + }, + { + "epoch": 1.796876140427706, + "grad_norm": 0.08647916119010585, + "learning_rate": 6.194549535432137e-07, + "loss": 0.2951, + "step": 3077 + }, + { + "epoch": 1.7974600394131817, + "grad_norm": 0.08558280134261098, + "learning_rate": 6.159268112379734e-07, + 
"loss": 0.2868, + "step": 3078 + }, + { + "epoch": 1.798043938398657, + "grad_norm": 0.08355155447964693, + "learning_rate": 6.124084257393525e-07, + "loss": 0.2875, + "step": 3079 + }, + { + "epoch": 1.7986278373841325, + "grad_norm": 0.0850014748520539, + "learning_rate": 6.088998007054903e-07, + "loss": 0.3057, + "step": 3080 + }, + { + "epoch": 1.799211736369608, + "grad_norm": 0.09389667408684704, + "learning_rate": 6.054009397843708e-07, + "loss": 0.3198, + "step": 3081 + }, + { + "epoch": 1.7997956353550837, + "grad_norm": 0.08536081789943037, + "learning_rate": 6.019118466138285e-07, + "loss": 0.2919, + "step": 3082 + }, + { + "epoch": 1.800379534340559, + "grad_norm": 0.08760033626259021, + "learning_rate": 5.98432524821545e-07, + "loss": 0.3166, + "step": 3083 + }, + { + "epoch": 1.8009634333260345, + "grad_norm": 0.08865546248746785, + "learning_rate": 5.949629780250376e-07, + "loss": 0.3198, + "step": 3084 + }, + { + "epoch": 1.80154733231151, + "grad_norm": 0.08911828220258214, + "learning_rate": 5.915032098316653e-07, + "loss": 0.3129, + "step": 3085 + }, + { + "epoch": 1.8021312312969857, + "grad_norm": 0.08954187857557658, + "learning_rate": 5.880532238386161e-07, + "loss": 0.3073, + "step": 3086 + }, + { + "epoch": 1.802715130282461, + "grad_norm": 0.09058640671903884, + "learning_rate": 5.846130236329073e-07, + "loss": 0.342, + "step": 3087 + }, + { + "epoch": 1.8032990292679365, + "grad_norm": 0.08137842106277654, + "learning_rate": 5.811826127913855e-07, + "loss": 0.2415, + "step": 3088 + }, + { + "epoch": 1.8038829282534121, + "grad_norm": 0.08491472253587062, + "learning_rate": 5.777619948807156e-07, + "loss": 0.2942, + "step": 3089 + }, + { + "epoch": 1.8044668272388877, + "grad_norm": 0.08442518925589029, + "learning_rate": 5.743511734573837e-07, + "loss": 0.2897, + "step": 3090 + }, + { + "epoch": 1.8050507262243631, + "grad_norm": 0.09006109519836486, + "learning_rate": 5.709501520676853e-07, + "loss": 0.3157, + "step": 3091 + }, + { + "epoch": 1.8056346252098388, + "grad_norm": 0.08700723878646559, + "learning_rate": 5.675589342477305e-07, + "loss": 0.2486, + "step": 3092 + }, + { + "epoch": 1.8062185241953141, + "grad_norm": 0.08660593405409539, + "learning_rate": 5.641775235234381e-07, + "loss": 0.3109, + "step": 3093 + }, + { + "epoch": 1.8068024231807898, + "grad_norm": 0.08669610604282885, + "learning_rate": 5.608059234105234e-07, + "loss": 0.3354, + "step": 3094 + }, + { + "epoch": 1.8073863221662654, + "grad_norm": 0.08967327278694774, + "learning_rate": 5.5744413741451e-07, + "loss": 0.2912, + "step": 3095 + }, + { + "epoch": 1.8079702211517408, + "grad_norm": 0.09392617532249603, + "learning_rate": 5.540921690307111e-07, + "loss": 0.3093, + "step": 3096 + }, + { + "epoch": 1.8085541201372162, + "grad_norm": 0.09267261391599414, + "learning_rate": 5.507500217442341e-07, + "loss": 0.3394, + "step": 3097 + }, + { + "epoch": 1.8091380191226918, + "grad_norm": 0.08855291533625607, + "learning_rate": 5.474176990299773e-07, + "loss": 0.3028, + "step": 3098 + }, + { + "epoch": 1.8097219181081674, + "grad_norm": 0.09123201002158064, + "learning_rate": 5.440952043526215e-07, + "loss": 0.3083, + "step": 3099 + }, + { + "epoch": 1.8103058170936428, + "grad_norm": 0.09245820941953903, + "learning_rate": 5.407825411666312e-07, + "loss": 0.3281, + "step": 3100 + }, + { + "epoch": 1.8108897160791182, + "grad_norm": 0.08547364043156434, + "learning_rate": 5.374797129162468e-07, + "loss": 0.2933, + "step": 3101 + }, + { + "epoch": 1.8114736150645938, + "grad_norm": 
0.0916370172092071, + "learning_rate": 5.341867230354824e-07, + "loss": 0.3154, + "step": 3102 + }, + { + "epoch": 1.8120575140500694, + "grad_norm": 0.09641031418825906, + "learning_rate": 5.309035749481295e-07, + "loss": 0.3122, + "step": 3103 + }, + { + "epoch": 1.8126414130355448, + "grad_norm": 0.09701937212809623, + "learning_rate": 5.276302720677395e-07, + "loss": 0.3263, + "step": 3104 + }, + { + "epoch": 1.8132253120210202, + "grad_norm": 0.08972202822830284, + "learning_rate": 5.243668177976291e-07, + "loss": 0.3048, + "step": 3105 + }, + { + "epoch": 1.8138092110064958, + "grad_norm": 0.09089359741301575, + "learning_rate": 5.211132155308785e-07, + "loss": 0.3385, + "step": 3106 + }, + { + "epoch": 1.8143931099919715, + "grad_norm": 0.08963015016028791, + "learning_rate": 5.178694686503205e-07, + "loss": 0.3261, + "step": 3107 + }, + { + "epoch": 1.8149770089774468, + "grad_norm": 0.0810265122122705, + "learning_rate": 5.146355805285452e-07, + "loss": 0.2696, + "step": 3108 + }, + { + "epoch": 1.8155609079629225, + "grad_norm": 0.09485773282543213, + "learning_rate": 5.114115545278875e-07, + "loss": 0.3415, + "step": 3109 + }, + { + "epoch": 1.8161448069483979, + "grad_norm": 0.08575443383043431, + "learning_rate": 5.081973940004315e-07, + "loss": 0.2829, + "step": 3110 + }, + { + "epoch": 1.8167287059338735, + "grad_norm": 0.0909113042016174, + "learning_rate": 5.049931022880061e-07, + "loss": 0.32, + "step": 3111 + }, + { + "epoch": 1.817312604919349, + "grad_norm": 0.09155412333265182, + "learning_rate": 5.017986827221733e-07, + "loss": 0.3639, + "step": 3112 + }, + { + "epoch": 1.8178965039048245, + "grad_norm": 0.0950729738919312, + "learning_rate": 4.986141386242371e-07, + "loss": 0.3304, + "step": 3113 + }, + { + "epoch": 1.8184804028902999, + "grad_norm": 0.08359608827368223, + "learning_rate": 4.954394733052293e-07, + "loss": 0.2912, + "step": 3114 + }, + { + "epoch": 1.8190643018757755, + "grad_norm": 0.08340285916748422, + "learning_rate": 4.922746900659125e-07, + "loss": 0.278, + "step": 3115 + }, + { + "epoch": 1.8196482008612511, + "grad_norm": 0.09105220308455507, + "learning_rate": 4.89119792196776e-07, + "loss": 0.3371, + "step": 3116 + }, + { + "epoch": 1.8202320998467265, + "grad_norm": 0.08625856811506136, + "learning_rate": 4.85974782978027e-07, + "loss": 0.3058, + "step": 3117 + }, + { + "epoch": 1.820815998832202, + "grad_norm": 0.08661146497987254, + "learning_rate": 4.828396656795964e-07, + "loss": 0.3247, + "step": 3118 + }, + { + "epoch": 1.8213998978176775, + "grad_norm": 0.08947981937411116, + "learning_rate": 4.797144435611256e-07, + "loss": 0.2972, + "step": 3119 + }, + { + "epoch": 1.8219837968031531, + "grad_norm": 0.08081022071185949, + "learning_rate": 4.76599119871971e-07, + "loss": 0.2688, + "step": 3120 + }, + { + "epoch": 1.8225676957886285, + "grad_norm": 0.08484992124099142, + "learning_rate": 4.734936978511961e-07, + "loss": 0.299, + "step": 3121 + }, + { + "epoch": 1.823151594774104, + "grad_norm": 0.08866610057065485, + "learning_rate": 4.7039818072756927e-07, + "loss": 0.3156, + "step": 3122 + }, + { + "epoch": 1.8237354937595796, + "grad_norm": 0.09024101994688459, + "learning_rate": 4.6731257171956256e-07, + "loss": 0.2958, + "step": 3123 + }, + { + "epoch": 1.8243193927450552, + "grad_norm": 0.09436832638470785, + "learning_rate": 4.642368740353431e-07, + "loss": 0.3464, + "step": 3124 + }, + { + "epoch": 1.8249032917305306, + "grad_norm": 0.0933704039055966, + "learning_rate": 4.61171090872774e-07, + "loss": 0.3272, + "step": 3125 
+ }, + { + "epoch": 1.8254871907160062, + "grad_norm": 0.08312294560880973, + "learning_rate": 4.581152254194121e-07, + "loss": 0.2833, + "step": 3126 + }, + { + "epoch": 1.8260710897014816, + "grad_norm": 0.08134414595542594, + "learning_rate": 4.5506928085250033e-07, + "loss": 0.27, + "step": 3127 + }, + { + "epoch": 1.8266549886869572, + "grad_norm": 0.08271946304081075, + "learning_rate": 4.520332603389699e-07, + "loss": 0.2817, + "step": 3128 + }, + { + "epoch": 1.8272388876724328, + "grad_norm": 0.08975232644690755, + "learning_rate": 4.490071670354279e-07, + "loss": 0.3041, + "step": 3129 + }, + { + "epoch": 1.8278227866579082, + "grad_norm": 0.08517706877136907, + "learning_rate": 4.459910040881632e-07, + "loss": 0.3204, + "step": 3130 + }, + { + "epoch": 1.8284066856433836, + "grad_norm": 0.08675107749475566, + "learning_rate": 4.42984774633145e-07, + "loss": 0.3113, + "step": 3131 + }, + { + "epoch": 1.8289905846288592, + "grad_norm": 0.09207978385102382, + "learning_rate": 4.399884817960065e-07, + "loss": 0.3372, + "step": 3132 + }, + { + "epoch": 1.8295744836143348, + "grad_norm": 0.08902457663714347, + "learning_rate": 4.3700212869205117e-07, + "loss": 0.3016, + "step": 3133 + }, + { + "epoch": 1.8301583825998102, + "grad_norm": 0.09055004541382568, + "learning_rate": 4.34025718426252e-07, + "loss": 0.3281, + "step": 3134 + }, + { + "epoch": 1.8307422815852856, + "grad_norm": 0.0902215366329159, + "learning_rate": 4.310592540932401e-07, + "loss": 0.3215, + "step": 3135 + }, + { + "epoch": 1.8313261805707612, + "grad_norm": 0.09277319308869392, + "learning_rate": 4.2810273877730843e-07, + "loss": 0.2996, + "step": 3136 + }, + { + "epoch": 1.8319100795562369, + "grad_norm": 0.08520605147793228, + "learning_rate": 4.251561755524036e-07, + "loss": 0.2708, + "step": 3137 + }, + { + "epoch": 1.8324939785417123, + "grad_norm": 0.08252671735121442, + "learning_rate": 4.222195674821239e-07, + "loss": 0.2999, + "step": 3138 + }, + { + "epoch": 1.8330778775271876, + "grad_norm": 0.07794689632844597, + "learning_rate": 4.192929176197236e-07, + "loss": 0.2772, + "step": 3139 + }, + { + "epoch": 1.8336617765126633, + "grad_norm": 0.09676878242431126, + "learning_rate": 4.1637622900809304e-07, + "loss": 0.3266, + "step": 3140 + }, + { + "epoch": 1.8342456754981389, + "grad_norm": 0.08424717311023028, + "learning_rate": 4.1346950467977545e-07, + "loss": 0.2831, + "step": 3141 + }, + { + "epoch": 1.8348295744836145, + "grad_norm": 0.08266643241591842, + "learning_rate": 4.1057274765694765e-07, + "loss": 0.2987, + "step": 3142 + }, + { + "epoch": 1.83541347346909, + "grad_norm": 0.0788840629318298, + "learning_rate": 4.0768596095142497e-07, + "loss": 0.2812, + "step": 3143 + }, + { + "epoch": 1.8359973724545653, + "grad_norm": 0.08668299173422872, + "learning_rate": 4.048091475646576e-07, + "loss": 0.2945, + "step": 3144 + }, + { + "epoch": 1.836581271440041, + "grad_norm": 0.08751908467981026, + "learning_rate": 4.0194231048772514e-07, + "loss": 0.3292, + "step": 3145 + }, + { + "epoch": 1.8371651704255165, + "grad_norm": 0.08191513577987361, + "learning_rate": 3.9908545270133436e-07, + "loss": 0.31, + "step": 3146 + }, + { + "epoch": 1.837749069410992, + "grad_norm": 0.09072225843059746, + "learning_rate": 3.9623857717581813e-07, + "loss": 0.3147, + "step": 3147 + }, + { + "epoch": 1.8383329683964673, + "grad_norm": 0.08811102019457796, + "learning_rate": 3.934016868711266e-07, + "loss": 0.3006, + "step": 3148 + }, + { + "epoch": 1.838916867381943, + "grad_norm": 0.08229933280202427, + 
"learning_rate": 3.905747847368335e-07, + "loss": 0.309, + "step": 3149 + }, + { + "epoch": 1.8395007663674185, + "grad_norm": 0.08669267020055269, + "learning_rate": 3.8775787371212346e-07, + "loss": 0.3207, + "step": 3150 + }, + { + "epoch": 1.840084665352894, + "grad_norm": 0.07615405138252046, + "learning_rate": 3.8495095672579584e-07, + "loss": 0.2614, + "step": 3151 + }, + { + "epoch": 1.8406685643383693, + "grad_norm": 0.09286334404839111, + "learning_rate": 3.8215403669625726e-07, + "loss": 0.3345, + "step": 3152 + }, + { + "epoch": 1.841252463323845, + "grad_norm": 0.09033208564816443, + "learning_rate": 3.793671165315194e-07, + "loss": 0.295, + "step": 3153 + }, + { + "epoch": 1.8418363623093206, + "grad_norm": 0.08391988391945902, + "learning_rate": 3.765901991291998e-07, + "loss": 0.3069, + "step": 3154 + }, + { + "epoch": 1.842420261294796, + "grad_norm": 0.08685905883417439, + "learning_rate": 3.738232873765146e-07, + "loss": 0.3281, + "step": 3155 + }, + { + "epoch": 1.8430041602802714, + "grad_norm": 0.09148053741292843, + "learning_rate": 3.7106638415027594e-07, + "loss": 0.3313, + "step": 3156 + }, + { + "epoch": 1.843588059265747, + "grad_norm": 0.09315875406467528, + "learning_rate": 3.6831949231689203e-07, + "loss": 0.3537, + "step": 3157 + }, + { + "epoch": 1.8441719582512226, + "grad_norm": 0.09241275711250932, + "learning_rate": 3.65582614732356e-07, + "loss": 0.2842, + "step": 3158 + }, + { + "epoch": 1.8447558572366982, + "grad_norm": 0.08976572847134162, + "learning_rate": 3.628557542422606e-07, + "loss": 0.2999, + "step": 3159 + }, + { + "epoch": 1.8453397562221736, + "grad_norm": 0.08776205783874906, + "learning_rate": 3.6013891368177345e-07, + "loss": 0.2993, + "step": 3160 + }, + { + "epoch": 1.845923655207649, + "grad_norm": 0.09459194394554675, + "learning_rate": 3.574320958756461e-07, + "loss": 0.3606, + "step": 3161 + }, + { + "epoch": 1.8465075541931246, + "grad_norm": 0.09054482305214508, + "learning_rate": 3.547353036382117e-07, + "loss": 0.3253, + "step": 3162 + }, + { + "epoch": 1.8470914531786002, + "grad_norm": 0.08725923341958984, + "learning_rate": 3.520485397733786e-07, + "loss": 0.2928, + "step": 3163 + }, + { + "epoch": 1.8476753521640756, + "grad_norm": 0.07802876513240707, + "learning_rate": 3.493718070746299e-07, + "loss": 0.263, + "step": 3164 + }, + { + "epoch": 1.848259251149551, + "grad_norm": 0.08322598692498386, + "learning_rate": 3.467051083250161e-07, + "loss": 0.2909, + "step": 3165 + }, + { + "epoch": 1.8488431501350266, + "grad_norm": 0.08410033918342226, + "learning_rate": 3.440484462971549e-07, + "loss": 0.2846, + "step": 3166 + }, + { + "epoch": 1.8494270491205023, + "grad_norm": 0.08197128175713382, + "learning_rate": 3.414018237532335e-07, + "loss": 0.2799, + "step": 3167 + }, + { + "epoch": 1.8500109481059777, + "grad_norm": 0.0863145794435158, + "learning_rate": 3.3876524344499507e-07, + "loss": 0.3151, + "step": 3168 + }, + { + "epoch": 1.850594847091453, + "grad_norm": 0.08723388037548001, + "learning_rate": 3.3613870811374574e-07, + "loss": 0.2897, + "step": 3169 + }, + { + "epoch": 1.8511787460769287, + "grad_norm": 0.08269365993072068, + "learning_rate": 3.335222204903477e-07, + "loss": 0.2977, + "step": 3170 + }, + { + "epoch": 1.8517626450624043, + "grad_norm": 0.08447040225722642, + "learning_rate": 3.3091578329521147e-07, + "loss": 0.2883, + "step": 3171 + }, + { + "epoch": 1.8523465440478797, + "grad_norm": 0.08950847030679537, + "learning_rate": 3.283193992383049e-07, + "loss": 0.3109, + "step": 3172 + }, + { + 
"epoch": 1.852930443033355, + "grad_norm": 0.08620602734830173, + "learning_rate": 3.2573307101913756e-07, + "loss": 0.288, + "step": 3173 + }, + { + "epoch": 1.8535143420188307, + "grad_norm": 0.08192045479089176, + "learning_rate": 3.231568013267672e-07, + "loss": 0.2672, + "step": 3174 + }, + { + "epoch": 1.8540982410043063, + "grad_norm": 0.09826024001649672, + "learning_rate": 3.205905928397923e-07, + "loss": 0.328, + "step": 3175 + }, + { + "epoch": 1.854682139989782, + "grad_norm": 0.0972440038984401, + "learning_rate": 3.180344482263487e-07, + "loss": 0.3021, + "step": 3176 + }, + { + "epoch": 1.8552660389752573, + "grad_norm": 0.08485490383572236, + "learning_rate": 3.154883701441136e-07, + "loss": 0.2946, + "step": 3177 + }, + { + "epoch": 1.8558499379607327, + "grad_norm": 0.0840354739846377, + "learning_rate": 3.129523612402918e-07, + "loss": 0.2746, + "step": 3178 + }, + { + "epoch": 1.8564338369462083, + "grad_norm": 0.0871703862470433, + "learning_rate": 3.1042642415162526e-07, + "loss": 0.3186, + "step": 3179 + }, + { + "epoch": 1.857017735931684, + "grad_norm": 0.08852151011154678, + "learning_rate": 3.079105615043787e-07, + "loss": 0.3198, + "step": 3180 + }, + { + "epoch": 1.8576016349171594, + "grad_norm": 0.0914341386024884, + "learning_rate": 3.0540477591434415e-07, + "loss": 0.3241, + "step": 3181 + }, + { + "epoch": 1.8581855339026347, + "grad_norm": 0.08669972534431317, + "learning_rate": 3.0290906998683755e-07, + "loss": 0.3018, + "step": 3182 + }, + { + "epoch": 1.8587694328881104, + "grad_norm": 0.08806061336196293, + "learning_rate": 3.0042344631669217e-07, + "loss": 0.2952, + "step": 3183 + }, + { + "epoch": 1.859353331873586, + "grad_norm": 0.08952733878076594, + "learning_rate": 2.9794790748826184e-07, + "loss": 0.3581, + "step": 3184 + }, + { + "epoch": 1.8599372308590614, + "grad_norm": 0.08687185133534035, + "learning_rate": 2.9548245607541326e-07, + "loss": 0.2757, + "step": 3185 + }, + { + "epoch": 1.8605211298445368, + "grad_norm": 0.08786819796985532, + "learning_rate": 2.9302709464152144e-07, + "loss": 0.3136, + "step": 3186 + }, + { + "epoch": 1.8611050288300124, + "grad_norm": 0.08575676580216932, + "learning_rate": 2.905818257394799e-07, + "loss": 0.2794, + "step": 3187 + }, + { + "epoch": 1.861688927815488, + "grad_norm": 0.07891634390871168, + "learning_rate": 2.881466519116793e-07, + "loss": 0.2793, + "step": 3188 + }, + { + "epoch": 1.8622728268009634, + "grad_norm": 0.08007119691365285, + "learning_rate": 2.857215756900189e-07, + "loss": 0.2819, + "step": 3189 + }, + { + "epoch": 1.8628567257864388, + "grad_norm": 0.08195681699011033, + "learning_rate": 2.8330659959589944e-07, + "loss": 0.2647, + "step": 3190 + }, + { + "epoch": 1.8634406247719144, + "grad_norm": 0.09650253635797564, + "learning_rate": 2.8090172614021804e-07, + "loss": 0.3238, + "step": 3191 + }, + { + "epoch": 1.86402452375739, + "grad_norm": 0.08551426047704232, + "learning_rate": 2.7850695782337124e-07, + "loss": 0.2657, + "step": 3192 + }, + { + "epoch": 1.8646084227428656, + "grad_norm": 0.08837075820897847, + "learning_rate": 2.761222971352451e-07, + "loss": 0.3137, + "step": 3193 + }, + { + "epoch": 1.865192321728341, + "grad_norm": 0.09077610751019068, + "learning_rate": 2.737477465552174e-07, + "loss": 0.2963, + "step": 3194 + }, + { + "epoch": 1.8657762207138164, + "grad_norm": 0.09455282402306084, + "learning_rate": 2.713833085521589e-07, + "loss": 0.3593, + "step": 3195 + }, + { + "epoch": 1.866360119699292, + "grad_norm": 0.08081325103718767, + "learning_rate": 
2.690289855844186e-07, + "loss": 0.2983, + "step": 3196 + }, + { + "epoch": 1.8669440186847677, + "grad_norm": 0.08506019789340659, + "learning_rate": 2.666847800998362e-07, + "loss": 0.2689, + "step": 3197 + }, + { + "epoch": 1.867527917670243, + "grad_norm": 0.08644219973309167, + "learning_rate": 2.643506945357277e-07, + "loss": 0.3247, + "step": 3198 + }, + { + "epoch": 1.8681118166557185, + "grad_norm": 0.08987540255514424, + "learning_rate": 2.620267313188862e-07, + "loss": 0.3064, + "step": 3199 + }, + { + "epoch": 1.868695715641194, + "grad_norm": 0.09317588496188967, + "learning_rate": 2.5971289286558455e-07, + "loss": 0.3136, + "step": 3200 + }, + { + "epoch": 1.8692796146266697, + "grad_norm": 0.08539757352217105, + "learning_rate": 2.574091815815649e-07, + "loss": 0.2734, + "step": 3201 + }, + { + "epoch": 1.869863513612145, + "grad_norm": 0.08161787722936374, + "learning_rate": 2.5511559986204247e-07, + "loss": 0.2826, + "step": 3202 + }, + { + "epoch": 1.8704474125976205, + "grad_norm": 0.09213231695788839, + "learning_rate": 2.528321500916986e-07, + "loss": 0.3368, + "step": 3203 + }, + { + "epoch": 1.871031311583096, + "grad_norm": 0.08361574694619404, + "learning_rate": 2.505588346446808e-07, + "loss": 0.2908, + "step": 3204 + }, + { + "epoch": 1.8716152105685717, + "grad_norm": 0.08821455724135002, + "learning_rate": 2.482956558846017e-07, + "loss": 0.3271, + "step": 3205 + }, + { + "epoch": 1.8721991095540471, + "grad_norm": 0.08229075952173019, + "learning_rate": 2.460426161645324e-07, + "loss": 0.2664, + "step": 3206 + }, + { + "epoch": 1.8727830085395225, + "grad_norm": 0.09415507736178977, + "learning_rate": 2.437997178270035e-07, + "loss": 0.2907, + "step": 3207 + }, + { + "epoch": 1.8733669075249981, + "grad_norm": 0.08627940505611785, + "learning_rate": 2.4156696320399963e-07, + "loss": 0.2713, + "step": 3208 + }, + { + "epoch": 1.8739508065104737, + "grad_norm": 0.09680665753777715, + "learning_rate": 2.3934435461695936e-07, + "loss": 0.3187, + "step": 3209 + }, + { + "epoch": 1.8745347054959494, + "grad_norm": 0.09255995190827426, + "learning_rate": 2.371318943767753e-07, + "loss": 0.3413, + "step": 3210 + }, + { + "epoch": 1.8751186044814248, + "grad_norm": 0.08541804195173397, + "learning_rate": 2.3492958478378402e-07, + "loss": 0.281, + "step": 3211 + }, + { + "epoch": 1.8757025034669002, + "grad_norm": 0.09123942261150293, + "learning_rate": 2.3273742812777166e-07, + "loss": 0.3434, + "step": 3212 + }, + { + "epoch": 1.8762864024523758, + "grad_norm": 0.08855450411921056, + "learning_rate": 2.3055542668796617e-07, + "loss": 0.3128, + "step": 3213 + }, + { + "epoch": 1.8768703014378514, + "grad_norm": 0.08515644474409495, + "learning_rate": 2.2838358273303717e-07, + "loss": 0.2754, + "step": 3214 + }, + { + "epoch": 1.8774542004233268, + "grad_norm": 0.09387018942326927, + "learning_rate": 2.26221898521094e-07, + "loss": 0.3245, + "step": 3215 + }, + { + "epoch": 1.8780380994088022, + "grad_norm": 0.09034704966195904, + "learning_rate": 2.2407037629968431e-07, + "loss": 0.3223, + "step": 3216 + }, + { + "epoch": 1.8786219983942778, + "grad_norm": 0.09026738537397574, + "learning_rate": 2.219290183057865e-07, + "loss": 0.3291, + "step": 3217 + }, + { + "epoch": 1.8792058973797534, + "grad_norm": 0.09068078026589278, + "learning_rate": 2.1979782676581408e-07, + "loss": 0.2897, + "step": 3218 + }, + { + "epoch": 1.8797897963652288, + "grad_norm": 0.08551304592587101, + "learning_rate": 2.1767680389560785e-07, + "loss": 0.2922, + "step": 3219 + }, + { + "epoch": 
1.8803736953507042, + "grad_norm": 0.09132081659079024, + "learning_rate": 2.1556595190043718e-07, + "loss": 0.3332, + "step": 3220 + }, + { + "epoch": 1.8809575943361798, + "grad_norm": 0.08358816343847154, + "learning_rate": 2.1346527297499752e-07, + "loss": 0.2982, + "step": 3221 + }, + { + "epoch": 1.8815414933216554, + "grad_norm": 0.0871492176785439, + "learning_rate": 2.1137476930340628e-07, + "loss": 0.304, + "step": 3222 + }, + { + "epoch": 1.8821253923071308, + "grad_norm": 0.08380579270789314, + "learning_rate": 2.0929444305920142e-07, + "loss": 0.297, + "step": 3223 + }, + { + "epoch": 1.8827092912926062, + "grad_norm": 0.08541066393223072, + "learning_rate": 2.0722429640533948e-07, + "loss": 0.2874, + "step": 3224 + }, + { + "epoch": 1.8832931902780818, + "grad_norm": 0.09346538334309482, + "learning_rate": 2.051643314941909e-07, + "loss": 0.302, + "step": 3225 + }, + { + "epoch": 1.8838770892635575, + "grad_norm": 0.08917579921590958, + "learning_rate": 2.0311455046754581e-07, + "loss": 0.3014, + "step": 3226 + }, + { + "epoch": 1.884460988249033, + "grad_norm": 0.09379681249232001, + "learning_rate": 2.0107495545659829e-07, + "loss": 0.3221, + "step": 3227 + }, + { + "epoch": 1.8850448872345085, + "grad_norm": 0.0874846419113198, + "learning_rate": 1.990455485819587e-07, + "loss": 0.3033, + "step": 3228 + }, + { + "epoch": 1.8856287862199839, + "grad_norm": 0.09225148192881429, + "learning_rate": 1.9702633195363918e-07, + "loss": 0.3028, + "step": 3229 + }, + { + "epoch": 1.8862126852054595, + "grad_norm": 0.09251500315968184, + "learning_rate": 1.9501730767106043e-07, + "loss": 0.3127, + "step": 3230 + }, + { + "epoch": 1.886796584190935, + "grad_norm": 0.08183849793934239, + "learning_rate": 1.9301847782304484e-07, + "loss": 0.2865, + "step": 3231 + }, + { + "epoch": 1.8873804831764105, + "grad_norm": 0.08956749551953964, + "learning_rate": 1.9102984448781337e-07, + "loss": 0.34, + "step": 3232 + }, + { + "epoch": 1.887964382161886, + "grad_norm": 0.08418174445155997, + "learning_rate": 1.8905140973299096e-07, + "loss": 0.2815, + "step": 3233 + }, + { + "epoch": 1.8885482811473615, + "grad_norm": 0.08640429200500463, + "learning_rate": 1.870831756155933e-07, + "loss": 0.3057, + "step": 3234 + }, + { + "epoch": 1.8891321801328371, + "grad_norm": 0.08955601072010039, + "learning_rate": 1.851251441820323e-07, + "loss": 0.2839, + "step": 3235 + }, + { + "epoch": 1.8897160791183125, + "grad_norm": 0.08500965608715504, + "learning_rate": 1.8317731746811285e-07, + "loss": 0.2892, + "step": 3236 + }, + { + "epoch": 1.890299978103788, + "grad_norm": 0.08840280275707525, + "learning_rate": 1.8123969749902714e-07, + "loss": 0.3383, + "step": 3237 + }, + { + "epoch": 1.8908838770892635, + "grad_norm": 0.09212901245702498, + "learning_rate": 1.7931228628935926e-07, + "loss": 0.3501, + "step": 3238 + }, + { + "epoch": 1.8914677760747391, + "grad_norm": 0.0915260124277174, + "learning_rate": 1.773950858430762e-07, + "loss": 0.3023, + "step": 3239 + }, + { + "epoch": 1.8920516750602145, + "grad_norm": 0.08610838755883235, + "learning_rate": 1.7548809815352785e-07, + "loss": 0.3081, + "step": 3240 + }, + { + "epoch": 1.89263557404569, + "grad_norm": 0.09094026213243935, + "learning_rate": 1.7359132520344823e-07, + "loss": 0.3193, + "step": 3241 + }, + { + "epoch": 1.8932194730311656, + "grad_norm": 0.09093317323230175, + "learning_rate": 1.717047689649487e-07, + "loss": 0.3123, + "step": 3242 + }, + { + "epoch": 1.8938033720166412, + "grad_norm": 0.09071846100800283, + "learning_rate": 
1.6982843139952022e-07, + "loss": 0.2915, + "step": 3243 + }, + { + "epoch": 1.8943872710021168, + "grad_norm": 0.08854311548053366, + "learning_rate": 1.6796231445802892e-07, + "loss": 0.3019, + "step": 3244 + }, + { + "epoch": 1.8949711699875922, + "grad_norm": 0.0810464142052983, + "learning_rate": 1.6610642008071166e-07, + "loss": 0.2763, + "step": 3245 + }, + { + "epoch": 1.8955550689730676, + "grad_norm": 0.08696444133058868, + "learning_rate": 1.6426075019717935e-07, + "loss": 0.3007, + "step": 3246 + }, + { + "epoch": 1.8961389679585432, + "grad_norm": 0.08732098495461693, + "learning_rate": 1.6242530672641143e-07, + "loss": 0.272, + "step": 3247 + }, + { + "epoch": 1.8967228669440188, + "grad_norm": 0.09043033265572574, + "learning_rate": 1.6060009157675472e-07, + "loss": 0.3239, + "step": 3248 + }, + { + "epoch": 1.8973067659294942, + "grad_norm": 0.0864583436748374, + "learning_rate": 1.5878510664592116e-07, + "loss": 0.26, + "step": 3249 + }, + { + "epoch": 1.8978906649149696, + "grad_norm": 0.0862446901961188, + "learning_rate": 1.5698035382098687e-07, + "loss": 0.2877, + "step": 3250 + }, + { + "epoch": 1.8984745639004452, + "grad_norm": 0.08838148299393109, + "learning_rate": 1.551858349783908e-07, + "loss": 0.3028, + "step": 3251 + }, + { + "epoch": 1.8990584628859208, + "grad_norm": 0.0852029115766844, + "learning_rate": 1.5340155198392716e-07, + "loss": 0.314, + "step": 3252 + }, + { + "epoch": 1.8996423618713962, + "grad_norm": 0.08452315696803864, + "learning_rate": 1.5162750669274973e-07, + "loss": 0.2915, + "step": 3253 + }, + { + "epoch": 1.9002262608568716, + "grad_norm": 0.08500862640558815, + "learning_rate": 1.4986370094937197e-07, + "loss": 0.3058, + "step": 3254 + }, + { + "epoch": 1.9008101598423472, + "grad_norm": 0.08023575392964903, + "learning_rate": 1.4811013658765471e-07, + "loss": 0.2742, + "step": 3255 + }, + { + "epoch": 1.9013940588278229, + "grad_norm": 0.08335248608549668, + "learning_rate": 1.46366815430814e-07, + "loss": 0.28, + "step": 3256 + }, + { + "epoch": 1.9019779578132983, + "grad_norm": 0.09386766595997481, + "learning_rate": 1.4463373929141766e-07, + "loss": 0.3319, + "step": 3257 + }, + { + "epoch": 1.9025618567987737, + "grad_norm": 0.08778873662607665, + "learning_rate": 1.4291090997137547e-07, + "loss": 0.2702, + "step": 3258 + }, + { + "epoch": 1.9031457557842493, + "grad_norm": 0.08459530262756508, + "learning_rate": 1.411983292619501e-07, + "loss": 0.2601, + "step": 3259 + }, + { + "epoch": 1.9037296547697249, + "grad_norm": 0.07978226563882382, + "learning_rate": 1.3949599894374276e-07, + "loss": 0.2743, + "step": 3260 + }, + { + "epoch": 1.9043135537552005, + "grad_norm": 0.08933399276973576, + "learning_rate": 1.3780392078670436e-07, + "loss": 0.3094, + "step": 3261 + }, + { + "epoch": 1.904897452740676, + "grad_norm": 0.08394492857382206, + "learning_rate": 1.3612209655011866e-07, + "loss": 0.2761, + "step": 3262 + }, + { + "epoch": 1.9054813517261513, + "grad_norm": 0.08297989498196962, + "learning_rate": 1.3445052798261137e-07, + "loss": 0.312, + "step": 3263 + }, + { + "epoch": 1.906065250711627, + "grad_norm": 0.08870465484248745, + "learning_rate": 1.3278921682214784e-07, + "loss": 0.3179, + "step": 3264 + }, + { + "epoch": 1.9066491496971025, + "grad_norm": 0.08651584062026267, + "learning_rate": 1.3113816479602304e-07, + "loss": 0.2962, + "step": 3265 + }, + { + "epoch": 1.907233048682578, + "grad_norm": 0.08057590478577395, + "learning_rate": 1.2949737362087156e-07, + "loss": 0.2697, + "step": 3266 + }, + { + "epoch": 
1.9078169476680533, + "grad_norm": 0.0883433968015239, + "learning_rate": 1.2786684500265546e-07, + "loss": 0.3076, + "step": 3267 + }, + { + "epoch": 1.908400846653529, + "grad_norm": 0.08601988352867314, + "learning_rate": 1.262465806366664e-07, + "loss": 0.2864, + "step": 3268 + }, + { + "epoch": 1.9089847456390046, + "grad_norm": 0.08587146410231444, + "learning_rate": 1.2463658220752683e-07, + "loss": 0.2905, + "step": 3269 + }, + { + "epoch": 1.90956864462448, + "grad_norm": 0.08497012121751156, + "learning_rate": 1.230368513891822e-07, + "loss": 0.3095, + "step": 3270 + }, + { + "epoch": 1.9101525436099553, + "grad_norm": 0.08658101437799964, + "learning_rate": 1.2144738984490533e-07, + "loss": 0.2855, + "step": 3271 + }, + { + "epoch": 1.910736442595431, + "grad_norm": 0.09022665165076132, + "learning_rate": 1.1986819922729209e-07, + "loss": 0.301, + "step": 3272 + }, + { + "epoch": 1.9113203415809066, + "grad_norm": 0.08494441160215055, + "learning_rate": 1.1829928117825685e-07, + "loss": 0.2975, + "step": 3273 + }, + { + "epoch": 1.911904240566382, + "grad_norm": 0.0887324999354136, + "learning_rate": 1.1674063732903473e-07, + "loss": 0.3195, + "step": 3274 + }, + { + "epoch": 1.9124881395518574, + "grad_norm": 0.07889434482168667, + "learning_rate": 1.1519226930017946e-07, + "loss": 0.2604, + "step": 3275 + }, + { + "epoch": 1.913072038537333, + "grad_norm": 0.09327428987189089, + "learning_rate": 1.1365417870155881e-07, + "loss": 0.3578, + "step": 3276 + }, + { + "epoch": 1.9136559375228086, + "grad_norm": 0.09219320530632974, + "learning_rate": 1.1212636713235581e-07, + "loss": 0.3211, + "step": 3277 + }, + { + "epoch": 1.9142398365082842, + "grad_norm": 0.08727352800569778, + "learning_rate": 1.1060883618106754e-07, + "loss": 0.3034, + "step": 3278 + }, + { + "epoch": 1.9148237354937596, + "grad_norm": 0.08822691973215406, + "learning_rate": 1.0910158742550081e-07, + "loss": 0.2905, + "step": 3279 + }, + { + "epoch": 1.915407634479235, + "grad_norm": 0.08479653754798519, + "learning_rate": 1.0760462243277204e-07, + "loss": 0.3003, + "step": 3280 + }, + { + "epoch": 1.9159915334647106, + "grad_norm": 0.08395137530334444, + "learning_rate": 1.0611794275930398e-07, + "loss": 0.2804, + "step": 3281 + }, + { + "epoch": 1.9165754324501862, + "grad_norm": 0.08533474144185332, + "learning_rate": 1.0464154995082909e-07, + "loss": 0.2889, + "step": 3282 + }, + { + "epoch": 1.9171593314356616, + "grad_norm": 0.08872486457432453, + "learning_rate": 1.0317544554238058e-07, + "loss": 0.2968, + "step": 3283 + }, + { + "epoch": 1.917743230421137, + "grad_norm": 0.08906032507389257, + "learning_rate": 1.0171963105829686e-07, + "loss": 0.3126, + "step": 3284 + }, + { + "epoch": 1.9183271294066127, + "grad_norm": 0.08841227870501739, + "learning_rate": 1.0027410801221604e-07, + "loss": 0.3096, + "step": 3285 + }, + { + "epoch": 1.9189110283920883, + "grad_norm": 0.08875778063133168, + "learning_rate": 9.883887790707814e-08, + "loss": 0.305, + "step": 3286 + }, + { + "epoch": 1.9194949273775637, + "grad_norm": 0.08844902158080259, + "learning_rate": 9.741394223512057e-08, + "loss": 0.3411, + "step": 3287 + }, + { + "epoch": 1.920078826363039, + "grad_norm": 0.09321319693522438, + "learning_rate": 9.599930247787604e-08, + "loss": 0.3062, + "step": 3288 + }, + { + "epoch": 1.9206627253485147, + "grad_norm": 0.09077616996406981, + "learning_rate": 9.459496010617464e-08, + "loss": 0.3125, + "step": 3289 + }, + { + "epoch": 1.9212466243339903, + "grad_norm": 0.09069869127901617, + "learning_rate": 
9.320091658013841e-08, + "loss": 0.3162, + "step": 3290 + }, + { + "epoch": 1.9218305233194657, + "grad_norm": 0.08328455797508777, + "learning_rate": 9.181717334918127e-08, + "loss": 0.2872, + "step": 3291 + }, + { + "epoch": 1.922414422304941, + "grad_norm": 0.09466832648116058, + "learning_rate": 9.044373185200906e-08, + "loss": 0.3247, + "step": 3292 + }, + { + "epoch": 1.9229983212904167, + "grad_norm": 0.08217708261164038, + "learning_rate": 8.908059351661725e-08, + "loss": 0.2542, + "step": 3293 + }, + { + "epoch": 1.9235822202758923, + "grad_norm": 0.0913592477222516, + "learning_rate": 8.772775976028547e-08, + "loss": 0.3086, + "step": 3294 + }, + { + "epoch": 1.924166119261368, + "grad_norm": 0.08236324041744228, + "learning_rate": 8.638523198958415e-08, + "loss": 0.2972, + "step": 3295 + }, + { + "epoch": 1.9247500182468433, + "grad_norm": 0.08887099958280814, + "learning_rate": 8.505301160036339e-08, + "loss": 0.2934, + "step": 3296 + }, + { + "epoch": 1.9253339172323187, + "grad_norm": 0.09258338075557108, + "learning_rate": 8.373109997776185e-08, + "loss": 0.2764, + "step": 3297 + }, + { + "epoch": 1.9259178162177943, + "grad_norm": 0.08911681232115273, + "learning_rate": 8.241949849619457e-08, + "loss": 0.3032, + "step": 3298 + }, + { + "epoch": 1.92650171520327, + "grad_norm": 0.09884327370409937, + "learning_rate": 8.11182085193607e-08, + "loss": 0.3608, + "step": 3299 + }, + { + "epoch": 1.9270856141887454, + "grad_norm": 0.08698796043019967, + "learning_rate": 7.982723140023906e-08, + "loss": 0.2787, + "step": 3300 + }, + { + "epoch": 1.9276695131742208, + "grad_norm": 0.08841198577699688, + "learning_rate": 7.854656848108044e-08, + "loss": 0.2987, + "step": 3301 + }, + { + "epoch": 1.9282534121596964, + "grad_norm": 0.08674667157787658, + "learning_rate": 7.727622109341859e-08, + "loss": 0.2928, + "step": 3302 + }, + { + "epoch": 1.928837311145172, + "grad_norm": 0.0848531687198172, + "learning_rate": 7.601619055805697e-08, + "loss": 0.2786, + "step": 3303 + }, + { + "epoch": 1.9294212101306474, + "grad_norm": 0.09849198412348105, + "learning_rate": 7.476647818507542e-08, + "loss": 0.3635, + "step": 3304 + }, + { + "epoch": 1.9300051091161228, + "grad_norm": 0.0770603658737684, + "learning_rate": 7.352708527382346e-08, + "loss": 0.2653, + "step": 3305 + }, + { + "epoch": 1.9305890081015984, + "grad_norm": 0.08742500671343845, + "learning_rate": 7.229801311292361e-08, + "loss": 0.3357, + "step": 3306 + }, + { + "epoch": 1.931172907087074, + "grad_norm": 0.08965121359081024, + "learning_rate": 7.10792629802659e-08, + "loss": 0.285, + "step": 3307 + }, + { + "epoch": 1.9317568060725494, + "grad_norm": 0.09325005219602046, + "learning_rate": 6.987083614300893e-08, + "loss": 0.3329, + "step": 3308 + }, + { + "epoch": 1.9323407050580248, + "grad_norm": 0.08704457724374133, + "learning_rate": 6.867273385757767e-08, + "loss": 0.305, + "step": 3309 + }, + { + "epoch": 1.9329246040435004, + "grad_norm": 0.09213902376029563, + "learning_rate": 6.748495736966454e-08, + "loss": 0.3348, + "step": 3310 + }, + { + "epoch": 1.933508503028976, + "grad_norm": 0.09103459544423603, + "learning_rate": 6.630750791422169e-08, + "loss": 0.3223, + "step": 3311 + }, + { + "epoch": 1.9340924020144517, + "grad_norm": 0.0907849348138741, + "learning_rate": 6.514038671546874e-08, + "loss": 0.3002, + "step": 3312 + }, + { + "epoch": 1.934676300999927, + "grad_norm": 0.08652477787517233, + "learning_rate": 6.398359498688278e-08, + "loss": 0.2882, + "step": 3313 + }, + { + "epoch": 1.9352601999854024, + 
"grad_norm": 0.07840541697859246, + "learning_rate": 6.283713393120505e-08, + "loss": 0.2593, + "step": 3314 + }, + { + "epoch": 1.935844098970878, + "grad_norm": 0.08785658898893908, + "learning_rate": 6.170100474043206e-08, + "loss": 0.3051, + "step": 3315 + }, + { + "epoch": 1.9364279979563537, + "grad_norm": 0.09123019641097002, + "learning_rate": 6.05752085958211e-08, + "loss": 0.3515, + "step": 3316 + }, + { + "epoch": 1.937011896941829, + "grad_norm": 0.0795072560081246, + "learning_rate": 5.945974666788479e-08, + "loss": 0.276, + "step": 3317 + }, + { + "epoch": 1.9375957959273045, + "grad_norm": 0.07905638050592936, + "learning_rate": 5.835462011638982e-08, + "loss": 0.2815, + "step": 3318 + }, + { + "epoch": 1.93817969491278, + "grad_norm": 0.08695193261786537, + "learning_rate": 5.725983009035818e-08, + "loss": 0.3222, + "step": 3319 + }, + { + "epoch": 1.9387635938982557, + "grad_norm": 0.0813946885777394, + "learning_rate": 5.617537772806603e-08, + "loss": 0.2794, + "step": 3320 + }, + { + "epoch": 1.939347492883731, + "grad_norm": 0.08273690380557347, + "learning_rate": 5.5101264157039203e-08, + "loss": 0.2738, + "step": 3321 + }, + { + "epoch": 1.9399313918692065, + "grad_norm": 0.09070383419452105, + "learning_rate": 5.403749049405438e-08, + "loss": 0.3323, + "step": 3322 + }, + { + "epoch": 1.940515290854682, + "grad_norm": 0.08072343948107576, + "learning_rate": 5.298405784513905e-08, + "loss": 0.2551, + "step": 3323 + }, + { + "epoch": 1.9410991898401577, + "grad_norm": 0.08646296618674335, + "learning_rate": 5.194096730556708e-08, + "loss": 0.3257, + "step": 3324 + }, + { + "epoch": 1.9416830888256331, + "grad_norm": 0.07884393258923861, + "learning_rate": 5.090821995986095e-08, + "loss": 0.2873, + "step": 3325 + }, + { + "epoch": 1.9422669878111085, + "grad_norm": 0.08348931694479016, + "learning_rate": 4.9885816881787287e-08, + "loss": 0.267, + "step": 3326 + }, + { + "epoch": 1.9428508867965841, + "grad_norm": 0.08332780384321477, + "learning_rate": 4.887375913436132e-08, + "loss": 0.3016, + "step": 3327 + }, + { + "epoch": 1.9434347857820597, + "grad_norm": 0.08451296097732952, + "learning_rate": 4.787204776983689e-08, + "loss": 0.3031, + "step": 3328 + }, + { + "epoch": 1.9440186847675354, + "grad_norm": 0.08472349298659076, + "learning_rate": 4.6880683829715335e-08, + "loss": 0.318, + "step": 3329 + }, + { + "epoch": 1.9446025837530108, + "grad_norm": 0.08163409201244898, + "learning_rate": 4.589966834473547e-08, + "loss": 0.2596, + "step": 3330 + }, + { + "epoch": 1.9451864827384862, + "grad_norm": 0.08754727073441718, + "learning_rate": 4.492900233488029e-08, + "loss": 0.2864, + "step": 3331 + }, + { + "epoch": 1.9457703817239618, + "grad_norm": 0.08789166939607805, + "learning_rate": 4.3968686809369165e-08, + "loss": 0.2927, + "step": 3332 + }, + { + "epoch": 1.9463542807094374, + "grad_norm": 0.08464519343437119, + "learning_rate": 4.3018722766661194e-08, + "loss": 0.3091, + "step": 3333 + }, + { + "epoch": 1.9469381796949128, + "grad_norm": 0.08476477891720595, + "learning_rate": 4.207911119445296e-08, + "loss": 0.2826, + "step": 3334 + }, + { + "epoch": 1.9475220786803882, + "grad_norm": 0.08904957977063978, + "learning_rate": 4.114985306967745e-08, + "loss": 0.2641, + "step": 3335 + }, + { + "epoch": 1.9481059776658638, + "grad_norm": 0.08464610184850326, + "learning_rate": 4.0230949358502915e-08, + "loss": 0.2791, + "step": 3336 + }, + { + "epoch": 1.9486898766513394, + "grad_norm": 0.09278286151654397, + "learning_rate": 3.932240101633178e-08, + "loss": 
0.3096, + "step": 3337 + }, + { + "epoch": 1.9492737756368148, + "grad_norm": 0.08051276040016539, + "learning_rate": 3.8424208987798415e-08, + "loss": 0.2509, + "step": 3338 + }, + { + "epoch": 1.9498576746222902, + "grad_norm": 0.09173248125211747, + "learning_rate": 3.7536374206772475e-08, + "loss": 0.3343, + "step": 3339 + }, + { + "epoch": 1.9504415736077658, + "grad_norm": 0.07917066091148602, + "learning_rate": 3.665889759635222e-08, + "loss": 0.2797, + "step": 3340 + }, + { + "epoch": 1.9510254725932414, + "grad_norm": 0.08229874907161107, + "learning_rate": 3.579178006886896e-08, + "loss": 0.2824, + "step": 3341 + }, + { + "epoch": 1.9516093715787168, + "grad_norm": 0.09232997470928599, + "learning_rate": 3.4935022525880434e-08, + "loss": 0.3606, + "step": 3342 + }, + { + "epoch": 1.9521932705641922, + "grad_norm": 0.08183483324580507, + "learning_rate": 3.4088625858174075e-08, + "loss": 0.273, + "step": 3343 + }, + { + "epoch": 1.9527771695496678, + "grad_norm": 0.08489117350869732, + "learning_rate": 3.3252590945767047e-08, + "loss": 0.2642, + "step": 3344 + }, + { + "epoch": 1.9533610685351435, + "grad_norm": 0.08412711197238158, + "learning_rate": 3.242691865790071e-08, + "loss": 0.3106, + "step": 3345 + }, + { + "epoch": 1.953944967520619, + "grad_norm": 0.09557969904867417, + "learning_rate": 3.161160985304168e-08, + "loss": 0.3246, + "step": 3346 + }, + { + "epoch": 1.9545288665060945, + "grad_norm": 0.08577636345884287, + "learning_rate": 3.0806665378884106e-08, + "loss": 0.3002, + "step": 3347 + }, + { + "epoch": 1.9551127654915699, + "grad_norm": 0.08980144537186979, + "learning_rate": 3.001208607234407e-08, + "loss": 0.2999, + "step": 3348 + }, + { + "epoch": 1.9556966644770455, + "grad_norm": 0.0890925496002654, + "learning_rate": 2.922787275956074e-08, + "loss": 0.2776, + "step": 3349 + }, + { + "epoch": 1.956280563462521, + "grad_norm": 0.08984306896658098, + "learning_rate": 2.845402625589855e-08, + "loss": 0.314, + "step": 3350 + }, + { + "epoch": 1.9568644624479965, + "grad_norm": 0.08664814232702836, + "learning_rate": 2.7690547365938348e-08, + "loss": 0.283, + "step": 3351 + }, + { + "epoch": 1.957448361433472, + "grad_norm": 0.08143319030871435, + "learning_rate": 2.693743688348627e-08, + "loss": 0.2782, + "step": 3352 + }, + { + "epoch": 1.9580322604189475, + "grad_norm": 0.08140564318331081, + "learning_rate": 2.6194695591563733e-08, + "loss": 0.2659, + "step": 3353 + }, + { + "epoch": 1.9586161594044231, + "grad_norm": 0.08888703012248274, + "learning_rate": 2.546232426241635e-08, + "loss": 0.3236, + "step": 3354 + }, + { + "epoch": 1.9592000583898985, + "grad_norm": 0.08154302229921201, + "learning_rate": 2.4740323657503895e-08, + "loss": 0.2981, + "step": 3355 + }, + { + "epoch": 1.959783957375374, + "grad_norm": 0.09102327625408586, + "learning_rate": 2.4028694527503673e-08, + "loss": 0.2992, + "step": 3356 + }, + { + "epoch": 1.9603678563608495, + "grad_norm": 0.09610666429269728, + "learning_rate": 2.33274376123116e-08, + "loss": 0.3541, + "step": 3357 + }, + { + "epoch": 1.9609517553463252, + "grad_norm": 0.08324357583038143, + "learning_rate": 2.2636553641040003e-08, + "loss": 0.3018, + "step": 3358 + }, + { + "epoch": 1.9615356543318005, + "grad_norm": 0.10307374085729537, + "learning_rate": 2.1956043332010957e-08, + "loss": 0.3122, + "step": 3359 + }, + { + "epoch": 1.9621195533172762, + "grad_norm": 0.08211493065982868, + "learning_rate": 2.1285907392767367e-08, + "loss": 0.261, + "step": 3360 + }, + { + "epoch": 1.9627034523027516, + "grad_norm": 
0.0934916042603021, + "learning_rate": 2.0626146520061897e-08, + "loss": 0.3094, + "step": 3361 + }, + { + "epoch": 1.9632873512882272, + "grad_norm": 0.08952816445594301, + "learning_rate": 1.997676139986138e-08, + "loss": 0.307, + "step": 3362 + }, + { + "epoch": 1.9638712502737028, + "grad_norm": 0.08375374089120795, + "learning_rate": 1.9337752707343504e-08, + "loss": 0.3022, + "step": 3363 + }, + { + "epoch": 1.9644551492591782, + "grad_norm": 0.0827587774657481, + "learning_rate": 1.8709121106899043e-08, + "loss": 0.2856, + "step": 3364 + }, + { + "epoch": 1.9650390482446536, + "grad_norm": 0.09215451055855153, + "learning_rate": 1.8090867252127387e-08, + "loss": 0.292, + "step": 3365 + }, + { + "epoch": 1.9656229472301292, + "grad_norm": 0.0887176089513695, + "learning_rate": 1.748299178584101e-08, + "loss": 0.2768, + "step": 3366 + }, + { + "epoch": 1.9662068462156048, + "grad_norm": 0.09426260499073368, + "learning_rate": 1.68854953400599e-08, + "loss": 0.325, + "step": 3367 + }, + { + "epoch": 1.9667907452010802, + "grad_norm": 0.09086486207370185, + "learning_rate": 1.6298378536012682e-08, + "loss": 0.312, + "step": 3368 + }, + { + "epoch": 1.9673746441865556, + "grad_norm": 0.08646753252498654, + "learning_rate": 1.5721641984135505e-08, + "loss": 0.2984, + "step": 3369 + }, + { + "epoch": 1.9679585431720312, + "grad_norm": 0.08946262499391498, + "learning_rate": 1.5155286284073146e-08, + "loss": 0.3071, + "step": 3370 + }, + { + "epoch": 1.9685424421575068, + "grad_norm": 0.08803312222684086, + "learning_rate": 1.4599312024676792e-08, + "loss": 0.3285, + "step": 3371 + }, + { + "epoch": 1.9691263411429822, + "grad_norm": 0.08662475313648645, + "learning_rate": 1.405371978400516e-08, + "loss": 0.296, + "step": 3372 + }, + { + "epoch": 1.9697102401284576, + "grad_norm": 0.08639409919511111, + "learning_rate": 1.351851012931893e-08, + "loss": 0.3196, + "step": 3373 + }, + { + "epoch": 1.9702941391139333, + "grad_norm": 0.0933340151108549, + "learning_rate": 1.2993683617088526e-08, + "loss": 0.2836, + "step": 3374 + }, + { + "epoch": 1.9708780380994089, + "grad_norm": 0.093734848637988, + "learning_rate": 1.2479240792985237e-08, + "loss": 0.3101, + "step": 3375 + }, + { + "epoch": 1.9714619370848843, + "grad_norm": 0.10618346075726733, + "learning_rate": 1.1975182191885648e-08, + "loss": 0.3271, + "step": 3376 + }, + { + "epoch": 1.9720458360703599, + "grad_norm": 0.07955618016827691, + "learning_rate": 1.1481508337869429e-08, + "loss": 0.2615, + "step": 3377 + }, + { + "epoch": 1.9726297350558353, + "grad_norm": 0.0987702973032722, + "learning_rate": 1.099821974421933e-08, + "loss": 0.3695, + "step": 3378 + }, + { + "epoch": 1.973213634041311, + "grad_norm": 0.08623602168514154, + "learning_rate": 1.0525316913420069e-08, + "loss": 0.2829, + "step": 3379 + }, + { + "epoch": 1.9737975330267865, + "grad_norm": 0.09087316313493338, + "learning_rate": 1.006280033715723e-08, + "loss": 0.3156, + "step": 3380 + }, + { + "epoch": 1.974381432012262, + "grad_norm": 0.08866776408556791, + "learning_rate": 9.610670496319475e-09, + "loss": 0.3128, + "step": 3381 + }, + { + "epoch": 1.9749653309977373, + "grad_norm": 0.08821511509593097, + "learning_rate": 9.168927860994104e-09, + "loss": 0.2894, + "step": 3382 + }, + { + "epoch": 1.975549229983213, + "grad_norm": 0.09090044101590138, + "learning_rate": 8.737572890470391e-09, + "loss": 0.3017, + "step": 3383 + }, + { + "epoch": 1.9761331289686885, + "grad_norm": 0.0845520084678788, + "learning_rate": 8.316606033237362e-09, + "loss": 0.2905, + 
"step": 3384 + }, + { + "epoch": 1.976717027954164, + "grad_norm": 0.08871731832767937, + "learning_rate": 7.906027726981568e-09, + "loss": 0.3017, + "step": 3385 + }, + { + "epoch": 1.9773009269396393, + "grad_norm": 0.08901246375127354, + "learning_rate": 7.505838398589316e-09, + "loss": 0.3163, + "step": 3386 + }, + { + "epoch": 1.977884825925115, + "grad_norm": 0.08779010049941986, + "learning_rate": 7.1160384641455475e-09, + "loss": 0.2856, + "step": 3387 + }, + { + "epoch": 1.9784687249105906, + "grad_norm": 0.0933899253582005, + "learning_rate": 6.736628328933847e-09, + "loss": 0.3405, + "step": 3388 + }, + { + "epoch": 1.979052623896066, + "grad_norm": 0.08680388451624943, + "learning_rate": 6.367608387433111e-09, + "loss": 0.3109, + "step": 3389 + }, + { + "epoch": 1.9796365228815413, + "grad_norm": 0.09083585629966501, + "learning_rate": 6.008979023320871e-09, + "loss": 0.3488, + "step": 3390 + }, + { + "epoch": 1.980220421867017, + "grad_norm": 0.08659789749481435, + "learning_rate": 5.660740609472193e-09, + "loss": 0.2973, + "step": 3391 + }, + { + "epoch": 1.9808043208524926, + "grad_norm": 0.11025254732687806, + "learning_rate": 5.322893507956339e-09, + "loss": 0.438, + "step": 3392 + }, + { + "epoch": 1.981388219837968, + "grad_norm": 0.08460400568312539, + "learning_rate": 4.995438070041214e-09, + "loss": 0.3323, + "step": 3393 + }, + { + "epoch": 1.9819721188234436, + "grad_norm": 0.09191371619527383, + "learning_rate": 4.6783746361867e-09, + "loss": 0.3334, + "step": 3394 + }, + { + "epoch": 1.982556017808919, + "grad_norm": 0.08893325004301772, + "learning_rate": 4.3717035360502094e-09, + "loss": 0.3305, + "step": 3395 + }, + { + "epoch": 1.9831399167943946, + "grad_norm": 0.08986526888407086, + "learning_rate": 4.075425088485574e-09, + "loss": 0.3256, + "step": 3396 + }, + { + "epoch": 1.9837238157798702, + "grad_norm": 0.08030255842195448, + "learning_rate": 3.7895396015374955e-09, + "loss": 0.275, + "step": 3397 + }, + { + "epoch": 1.9843077147653456, + "grad_norm": 0.0858559715651346, + "learning_rate": 3.514047372448204e-09, + "loss": 0.2945, + "step": 3398 + }, + { + "epoch": 1.984891613750821, + "grad_norm": 0.08766707655086647, + "learning_rate": 3.248948687650799e-09, + "loss": 0.3256, + "step": 3399 + }, + { + "epoch": 1.9854755127362966, + "grad_norm": 0.0837384840388082, + "learning_rate": 2.9942438227748004e-09, + "loss": 0.2773, + "step": 3400 + }, + { + "epoch": 1.9860594117217722, + "grad_norm": 0.08891988599712869, + "learning_rate": 2.749933042641706e-09, + "loss": 0.2874, + "step": 3401 + }, + { + "epoch": 1.9866433107072476, + "grad_norm": 0.08856080988206466, + "learning_rate": 2.5160166012661024e-09, + "loss": 0.2812, + "step": 3402 + }, + { + "epoch": 1.987227209692723, + "grad_norm": 0.0862566727606461, + "learning_rate": 2.2924947418556666e-09, + "loss": 0.2968, + "step": 3403 + }, + { + "epoch": 1.9878111086781987, + "grad_norm": 0.09030188029164822, + "learning_rate": 2.079367696810053e-09, + "loss": 0.306, + "step": 3404 + }, + { + "epoch": 1.9883950076636743, + "grad_norm": 0.09290966332988419, + "learning_rate": 1.876635687722006e-09, + "loss": 0.3154, + "step": 3405 + }, + { + "epoch": 1.9889789066491497, + "grad_norm": 0.08336321890156337, + "learning_rate": 1.684298925377359e-09, + "loss": 0.2941, + "step": 3406 + }, + { + "epoch": 1.989562805634625, + "grad_norm": 0.0899593545824502, + "learning_rate": 1.502357609749483e-09, + "loss": 0.3371, + "step": 3407 + }, + { + "epoch": 1.9901467046201007, + "grad_norm": 0.0843544968127468, + 
"learning_rate": 1.3308119300092793e-09, + "loss": 0.2889, + "step": 3408 + }, + { + "epoch": 1.9907306036055763, + "grad_norm": 0.09301561557058147, + "learning_rate": 1.1696620645140765e-09, + "loss": 0.3545, + "step": 3409 + }, + { + "epoch": 1.9913145025910517, + "grad_norm": 0.09596198953632827, + "learning_rate": 1.0189081808154033e-09, + "loss": 0.3755, + "step": 3410 + }, + { + "epoch": 1.9918984015765273, + "grad_norm": 0.0874992607583942, + "learning_rate": 8.785504356556562e-10, + "loss": 0.2879, + "step": 3411 + }, + { + "epoch": 1.9924823005620027, + "grad_norm": 0.08332511592467594, + "learning_rate": 7.485889749658803e-10, + "loss": 0.2684, + "step": 3412 + }, + { + "epoch": 1.9930661995474783, + "grad_norm": 0.09436630021936444, + "learning_rate": 6.290239338724302e-10, + "loss": 0.3268, + "step": 3413 + }, + { + "epoch": 1.993650098532954, + "grad_norm": 0.0817893443220606, + "learning_rate": 5.198554366858676e-10, + "loss": 0.2969, + "step": 3414 + }, + { + "epoch": 1.9942339975184293, + "grad_norm": 0.09119111132593043, + "learning_rate": 4.210835969142846e-10, + "loss": 0.3097, + "step": 3415 + }, + { + "epoch": 1.9948178965039047, + "grad_norm": 0.09194966295337331, + "learning_rate": 3.3270851724998e-10, + "loss": 0.331, + "step": 3416 + }, + { + "epoch": 1.9954017954893803, + "grad_norm": 0.08897365414412585, + "learning_rate": 2.5473028957945234e-10, + "loss": 0.3331, + "step": 3417 + }, + { + "epoch": 1.995985694474856, + "grad_norm": 0.08345200609249673, + "learning_rate": 1.8714899497895845e-10, + "loss": 0.3003, + "step": 3418 + }, + { + "epoch": 1.9965695934603314, + "grad_norm": 0.08325422449737846, + "learning_rate": 1.2996470371229307e-10, + "loss": 0.2933, + "step": 3419 + }, + { + "epoch": 1.9971534924458068, + "grad_norm": 0.08783635772778162, + "learning_rate": 8.317747523745035e-11, + "loss": 0.3164, + "step": 3420 + }, + { + "epoch": 1.9977373914312824, + "grad_norm": 0.08554606822048123, + "learning_rate": 4.678735819774183e-11, + "loss": 0.2979, + "step": 3421 + }, + { + "epoch": 1.998321290416758, + "grad_norm": 0.08920880701367692, + "learning_rate": 2.0794390429568212e-11, + "loss": 0.3194, + "step": 3422 + }, + { + "epoch": 1.9989051894022334, + "grad_norm": 0.0843178557930931, + "learning_rate": 5.198598959088586e-12, + "loss": 0.2895, + "step": 3423 + }, + { + "epoch": 1.9994890883877088, + "grad_norm": 0.08418720064891395, + "learning_rate": 0.0, + "loss": 0.3179, + "step": 3424 + }, + { + "epoch": 1.9994890883877088, + "step": 3424, + "total_flos": 311934616780800.0, + "train_loss": 0.38544139599375354, + "train_runtime": 12440.8908, + "train_samples_per_second": 17.62, + "train_steps_per_second": 0.275 + } + ], + "logging_steps": 1, + "max_steps": 3424, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 311934616780800.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}