diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,3306 +1,3921 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 3.0, + "epoch": 1.9963963963963964, "eval_steps": 500, - "global_step": 465, + "global_step": 554, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.0064516129032258064, - "grad_norm": 0.15169227525554954, - "learning_rate": 4.255319148936171e-06, - "loss": 0.1803, + "epoch": 0.0036036036036036037, + "grad_norm": 0.1164047870616515, + "learning_rate": 3.5714285714285714e-06, + "loss": 0.1263, "step": 1 }, { - "epoch": 0.012903225806451613, - "grad_norm": 0.1108587617034759, - "learning_rate": 8.510638297872341e-06, - "loss": 0.1441, + "epoch": 0.007207207207207207, + "grad_norm": 0.1627219461416066, + "learning_rate": 7.142857142857143e-06, + "loss": 0.1446, "step": 2 }, { - "epoch": 0.01935483870967742, - "grad_norm": 0.08345105115839851, - "learning_rate": 1.2765957446808511e-05, - "loss": 0.1118, + "epoch": 0.010810810810810811, + "grad_norm": 0.1157756817304506, + "learning_rate": 1.0714285714285714e-05, + "loss": 0.1441, "step": 3 }, { - "epoch": 0.025806451612903226, - "grad_norm": 0.1426234633379846, - "learning_rate": 1.7021276595744682e-05, - "loss": 0.1313, + "epoch": 0.014414414414414415, + "grad_norm": 0.14566785288918435, + "learning_rate": 1.4285714285714285e-05, + "loss": 0.1466, "step": 4 }, { - "epoch": 0.03225806451612903, - "grad_norm": 0.14217530777409554, - "learning_rate": 2.1276595744680852e-05, - "loss": 0.1195, + "epoch": 0.018018018018018018, + "grad_norm": 0.13068033224281192, + "learning_rate": 1.785714285714286e-05, + "loss": 0.1342, "step": 5 }, { - "epoch": 0.03870967741935484, - "grad_norm": 0.1459141077043439, - "learning_rate": 2.5531914893617022e-05, - "loss": 0.1453, + "epoch": 0.021621621621621623, + "grad_norm": 0.15128910055561917, + "learning_rate": 2.1428571428571428e-05, + "loss": 0.1263, "step": 6 }, { - "epoch": 0.04516129032258064, - "grad_norm": 0.10195797727709278, - "learning_rate": 2.9787234042553192e-05, - "loss": 0.1119, + "epoch": 0.025225225225225224, + "grad_norm": 0.12625301643275005, + "learning_rate": 2.5e-05, + "loss": 0.1306, "step": 7 }, { - "epoch": 0.05161290322580645, - "grad_norm": 0.13553076939195616, - "learning_rate": 3.4042553191489365e-05, - "loss": 0.1791, + "epoch": 0.02882882882882883, + "grad_norm": 0.1341542973939784, + "learning_rate": 2.857142857142857e-05, + "loss": 0.1132, "step": 8 }, { - "epoch": 0.05806451612903226, - "grad_norm": 0.15732711391586654, - "learning_rate": 3.829787234042553e-05, - "loss": 0.1282, + "epoch": 0.032432432432432434, + "grad_norm": 0.09503727827074428, + "learning_rate": 3.2142857142857144e-05, + "loss": 0.0921, "step": 9 }, { - "epoch": 0.06451612903225806, - "grad_norm": 0.1192108991071351, - "learning_rate": 4.2553191489361704e-05, - "loss": 0.1288, + "epoch": 0.036036036036036036, + "grad_norm": 0.13724411508071346, + "learning_rate": 3.571428571428572e-05, + "loss": 0.123, "step": 10 }, { - "epoch": 0.07096774193548387, - "grad_norm": 0.1534053229135478, - "learning_rate": 4.680851063829788e-05, - "loss": 0.15, + "epoch": 0.03963963963963964, + "grad_norm": 0.1249162520121657, + "learning_rate": 3.928571428571429e-05, + "loss": 0.1183, "step": 11 }, { - "epoch": 0.07741935483870968, - "grad_norm": 0.11808236646609899, - "learning_rate": 5.1063829787234044e-05, - "loss": 0.1098, + "epoch": 0.043243243243243246, + "grad_norm": 0.14002577339626954, + "learning_rate": 4.2857142857142856e-05, + "loss": 0.1297, "step": 12 }, { - "epoch": 0.08387096774193549, - "grad_norm": 0.1127350100020763, - "learning_rate": 5.531914893617022e-05, - "loss": 0.1214, + "epoch": 0.04684684684684685, + "grad_norm": 0.12032689516166056, + "learning_rate": 4.642857142857143e-05, + "loss": 0.1144, "step": 13 }, { - "epoch": 0.09032258064516129, - "grad_norm": 0.13802666747672132, - "learning_rate": 5.9574468085106384e-05, - "loss": 0.1555, + "epoch": 0.05045045045045045, + "grad_norm": 0.11822508923100593, + "learning_rate": 5e-05, + "loss": 0.127, "step": 14 }, { - "epoch": 0.0967741935483871, - "grad_norm": 0.19187355993638458, - "learning_rate": 6.382978723404256e-05, - "loss": 0.1474, + "epoch": 0.05405405405405406, + "grad_norm": 0.12626847998511856, + "learning_rate": 5.3571428571428575e-05, + "loss": 0.1246, "step": 15 }, { - "epoch": 0.1032258064516129, - "grad_norm": 0.16430813087649648, - "learning_rate": 6.808510638297873e-05, - "loss": 0.1461, + "epoch": 0.05765765765765766, + "grad_norm": 0.1394135180306787, + "learning_rate": 5.714285714285714e-05, + "loss": 0.1249, "step": 16 }, { - "epoch": 0.10967741935483871, - "grad_norm": 0.13789502094335715, - "learning_rate": 7.23404255319149e-05, - "loss": 0.1226, + "epoch": 0.06126126126126126, + "grad_norm": 0.1486627737985617, + "learning_rate": 6.0714285714285715e-05, + "loss": 0.1156, "step": 17 }, { - "epoch": 0.11612903225806452, - "grad_norm": 0.173980091579583, - "learning_rate": 7.659574468085106e-05, - "loss": 0.1558, + "epoch": 0.06486486486486487, + "grad_norm": 0.08348858837628631, + "learning_rate": 6.428571428571429e-05, + "loss": 0.0972, "step": 18 }, { - "epoch": 0.12258064516129032, - "grad_norm": 0.1418687450951512, - "learning_rate": 8.085106382978723e-05, - "loss": 0.1379, + "epoch": 0.06846846846846846, + "grad_norm": 0.19888777350730014, + "learning_rate": 6.785714285714286e-05, + "loss": 0.146, "step": 19 }, { - "epoch": 0.12903225806451613, - "grad_norm": 0.1765388069700401, - "learning_rate": 8.510638297872341e-05, - "loss": 0.149, + "epoch": 0.07207207207207207, + "grad_norm": 0.12757635434343284, + "learning_rate": 7.142857142857143e-05, + "loss": 0.109, "step": 20 }, { - "epoch": 0.13548387096774195, - "grad_norm": 0.17659794510341198, - "learning_rate": 8.936170212765958e-05, - "loss": 0.1101, + "epoch": 0.07567567567567568, + "grad_norm": 0.17261365946211904, + "learning_rate": 7.500000000000001e-05, + "loss": 0.139, "step": 21 }, { - "epoch": 0.14193548387096774, - "grad_norm": 0.22633201272355616, - "learning_rate": 9.361702127659576e-05, - "loss": 0.1656, + "epoch": 0.07927927927927927, + "grad_norm": 0.19568066543467844, + "learning_rate": 7.857142857142858e-05, + "loss": 0.1366, "step": 22 }, { - "epoch": 0.14838709677419354, - "grad_norm": 0.16004849032165383, - "learning_rate": 9.787234042553192e-05, - "loss": 0.1616, + "epoch": 0.08288288288288288, + "grad_norm": 0.20224174296046235, + "learning_rate": 8.214285714285714e-05, + "loss": 0.1437, "step": 23 }, { - "epoch": 0.15483870967741936, - "grad_norm": 0.16981049690586422, - "learning_rate": 0.00010212765957446809, - "loss": 0.1321, + "epoch": 0.08648648648648649, + "grad_norm": 0.24075864691751864, + "learning_rate": 8.571428571428571e-05, + "loss": 0.16, "step": 24 }, { - "epoch": 0.16129032258064516, - "grad_norm": 0.13581353274905067, - "learning_rate": 0.00010638297872340425, - "loss": 0.1122, + "epoch": 0.09009009009009009, + "grad_norm": 0.1506076228406242, + "learning_rate": 8.92857142857143e-05, + "loss": 0.1074, "step": 25 }, { - "epoch": 0.16774193548387098, - "grad_norm": 0.2089185781750498, - "learning_rate": 0.00011063829787234043, - "loss": 0.1527, + "epoch": 0.0936936936936937, + "grad_norm": 0.16087708567600026, + "learning_rate": 9.285714285714286e-05, + "loss": 0.1071, "step": 26 }, { - "epoch": 0.17419354838709677, - "grad_norm": 0.16125655225492164, - "learning_rate": 0.00011489361702127661, - "loss": 0.1263, + "epoch": 0.0972972972972973, + "grad_norm": 0.15632293632911032, + "learning_rate": 9.642857142857143e-05, + "loss": 0.115, "step": 27 }, { - "epoch": 0.18064516129032257, - "grad_norm": 0.13753328983503424, - "learning_rate": 0.00011914893617021277, - "loss": 0.1224, + "epoch": 0.1009009009009009, + "grad_norm": 0.15351274455794925, + "learning_rate": 0.0001, + "loss": 0.1083, "step": 28 }, { - "epoch": 0.1870967741935484, - "grad_norm": 0.19089590529877826, - "learning_rate": 0.00012340425531914893, - "loss": 0.168, + "epoch": 0.1045045045045045, + "grad_norm": 0.17410060629498864, + "learning_rate": 0.00010357142857142859, + "loss": 0.1143, "step": 29 }, { - "epoch": 0.1935483870967742, - "grad_norm": 0.15419681175807715, - "learning_rate": 0.00012765957446808513, - "loss": 0.138, + "epoch": 0.10810810810810811, + "grad_norm": 0.11003201267949979, + "learning_rate": 0.00010714285714285715, + "loss": 0.0822, "step": 30 }, { - "epoch": 0.2, - "grad_norm": 0.18178665024271073, - "learning_rate": 0.00013191489361702127, - "loss": 0.1955, + "epoch": 0.11171171171171171, + "grad_norm": 0.14850855070731758, + "learning_rate": 0.00011071428571428572, + "loss": 0.1422, "step": 31 }, { - "epoch": 0.2064516129032258, - "grad_norm": 0.11760523051627117, - "learning_rate": 0.00013617021276595746, - "loss": 0.1367, + "epoch": 0.11531531531531532, + "grad_norm": 0.11574892051893418, + "learning_rate": 0.00011428571428571428, + "loss": 0.0917, "step": 32 }, { - "epoch": 0.2129032258064516, - "grad_norm": 0.10853469531105876, - "learning_rate": 0.00014042553191489363, - "loss": 0.1208, + "epoch": 0.11891891891891893, + "grad_norm": 0.12172342587299105, + "learning_rate": 0.00011785714285714287, + "loss": 0.1125, "step": 33 }, { - "epoch": 0.21935483870967742, - "grad_norm": 0.12890024443339382, - "learning_rate": 0.0001446808510638298, - "loss": 0.142, + "epoch": 0.12252252252252252, + "grad_norm": 0.10233939138594608, + "learning_rate": 0.00012142857142857143, + "loss": 0.0916, "step": 34 }, { - "epoch": 0.22580645161290322, - "grad_norm": 0.12692879603118554, - "learning_rate": 0.00014893617021276596, - "loss": 0.1268, + "epoch": 0.12612612612612611, + "grad_norm": 0.1277262526433454, + "learning_rate": 0.000125, + "loss": 0.1168, "step": 35 }, { - "epoch": 0.23225806451612904, - "grad_norm": 0.1529612097417899, - "learning_rate": 0.00015319148936170213, - "loss": 0.1194, + "epoch": 0.12972972972972974, + "grad_norm": 0.1510932624260595, + "learning_rate": 0.00012857142857142858, + "loss": 0.1366, "step": 36 }, { - "epoch": 0.23870967741935484, - "grad_norm": 0.129432114467059, - "learning_rate": 0.00015744680851063832, - "loss": 0.128, + "epoch": 0.13333333333333333, + "grad_norm": 0.1381015918157766, + "learning_rate": 0.00013214285714285715, + "loss": 0.1112, "step": 37 }, { - "epoch": 0.24516129032258063, - "grad_norm": 0.11343138573627701, - "learning_rate": 0.00016170212765957446, - "loss": 0.102, + "epoch": 0.13693693693693693, + "grad_norm": 0.12744142332679428, + "learning_rate": 0.00013571428571428572, + "loss": 0.1169, "step": 38 }, { - "epoch": 0.25161290322580643, - "grad_norm": 0.16577469112721435, - "learning_rate": 0.00016595744680851065, - "loss": 0.1691, + "epoch": 0.14054054054054055, + "grad_norm": 0.12605036861900049, + "learning_rate": 0.0001392857142857143, + "loss": 0.1265, "step": 39 }, { - "epoch": 0.25806451612903225, - "grad_norm": 0.13614466586953358, - "learning_rate": 0.00017021276595744682, - "loss": 0.1348, + "epoch": 0.14414414414414414, + "grad_norm": 0.09494597572244792, + "learning_rate": 0.00014285714285714287, + "loss": 0.0856, "step": 40 }, { - "epoch": 0.2645161290322581, - "grad_norm": 0.1524812917659128, - "learning_rate": 0.00017446808510638298, - "loss": 0.1281, + "epoch": 0.14774774774774774, + "grad_norm": 0.11563858827548382, + "learning_rate": 0.00014642857142857141, + "loss": 0.091, "step": 41 }, { - "epoch": 0.2709677419354839, - "grad_norm": 0.14519269708506746, - "learning_rate": 0.00017872340425531915, - "loss": 0.1349, + "epoch": 0.15135135135135136, + "grad_norm": 0.10978464408087514, + "learning_rate": 0.00015000000000000001, + "loss": 0.0934, "step": 42 }, { - "epoch": 0.27741935483870966, - "grad_norm": 0.18300481897670345, - "learning_rate": 0.00018297872340425532, - "loss": 0.1694, + "epoch": 0.15495495495495495, + "grad_norm": 0.14748905325763195, + "learning_rate": 0.0001535714285714286, + "loss": 0.1042, "step": 43 }, { - "epoch": 0.2838709677419355, - "grad_norm": 0.11929331561330575, - "learning_rate": 0.0001872340425531915, - "loss": 0.1331, + "epoch": 0.15855855855855855, + "grad_norm": 0.15028736539177057, + "learning_rate": 0.00015714285714285716, + "loss": 0.1232, "step": 44 }, { - "epoch": 0.2903225806451613, - "grad_norm": 0.12604932986068976, - "learning_rate": 0.00019148936170212768, - "loss": 0.0949, + "epoch": 0.16216216216216217, + "grad_norm": 0.16834971174993532, + "learning_rate": 0.00016071428571428573, + "loss": 0.1388, "step": 45 }, { - "epoch": 0.2967741935483871, - "grad_norm": 0.14226790630565908, - "learning_rate": 0.00019574468085106384, - "loss": 0.1477, + "epoch": 0.16576576576576577, + "grad_norm": 0.1364758084150375, + "learning_rate": 0.00016428571428571428, + "loss": 0.1245, "step": 46 }, { - "epoch": 0.3032258064516129, - "grad_norm": 0.1465980952326029, - "learning_rate": 0.0002, - "loss": 0.1408, + "epoch": 0.16936936936936936, + "grad_norm": 0.1906244918264085, + "learning_rate": 0.00016785714285714288, + "loss": 0.1419, "step": 47 }, { - "epoch": 0.3096774193548387, - "grad_norm": 0.14535685898764863, - "learning_rate": 0.0001999971756719333, - "loss": 0.1474, + "epoch": 0.17297297297297298, + "grad_norm": 0.15582113106280285, + "learning_rate": 0.00017142857142857143, + "loss": 0.1284, "step": 48 }, { - "epoch": 0.3161290322580645, - "grad_norm": 0.1559112597851861, - "learning_rate": 0.00019998870284726968, - "loss": 0.1568, + "epoch": 0.17657657657657658, + "grad_norm": 0.1653708513472312, + "learning_rate": 0.000175, + "loss": 0.1265, "step": 49 }, { - "epoch": 0.3225806451612903, - "grad_norm": 0.14392375991483491, - "learning_rate": 0.00019997458200460993, - "loss": 0.1526, + "epoch": 0.18018018018018017, + "grad_norm": 0.11080370021143991, + "learning_rate": 0.0001785714285714286, + "loss": 0.1136, "step": 50 }, { - "epoch": 0.32903225806451614, - "grad_norm": 0.18920444771524633, - "learning_rate": 0.00019995481394159188, - "loss": 0.1384, + "epoch": 0.1837837837837838, + "grad_norm": 0.14497196744014715, + "learning_rate": 0.00018214285714285714, + "loss": 0.1336, "step": 51 }, { - "epoch": 0.33548387096774196, - "grad_norm": 0.17943388616343298, - "learning_rate": 0.0001999293997748454, - "loss": 0.1555, + "epoch": 0.1873873873873874, + "grad_norm": 0.11471991362976224, + "learning_rate": 0.00018571428571428572, + "loss": 0.1009, "step": 52 }, { - "epoch": 0.3419354838709677, - "grad_norm": 0.16752646291327727, - "learning_rate": 0.00019989834093992945, - "loss": 0.1634, + "epoch": 0.19099099099099098, + "grad_norm": 0.1518492774928798, + "learning_rate": 0.0001892857142857143, + "loss": 0.1445, "step": 53 }, { - "epoch": 0.34838709677419355, - "grad_norm": 0.1484740777328073, - "learning_rate": 0.00019986163919125075, - "loss": 0.124, + "epoch": 0.1945945945945946, + "grad_norm": 0.15638927885876117, + "learning_rate": 0.00019285714285714286, + "loss": 0.1095, "step": 54 }, { - "epoch": 0.3548387096774194, - "grad_norm": 0.15268630472434244, - "learning_rate": 0.00019981929660196492, - "loss": 0.1228, + "epoch": 0.1981981981981982, + "grad_norm": 0.12245693248057901, + "learning_rate": 0.00019642857142857144, + "loss": 0.099, "step": 55 }, { - "epoch": 0.36129032258064514, - "grad_norm": 0.19583236181584973, - "learning_rate": 0.0001997713155638592, - "loss": 0.1227, + "epoch": 0.2018018018018018, + "grad_norm": 0.13146029758520172, + "learning_rate": 0.0002, + "loss": 0.1215, "step": 56 }, { - "epoch": 0.36774193548387096, - "grad_norm": 0.1469999186659826, - "learning_rate": 0.00019971769878721743, - "loss": 0.1188, + "epoch": 0.20540540540540542, + "grad_norm": 0.1449923810118862, + "learning_rate": 0.00019999801019909556, + "loss": 0.1376, "step": 57 }, { - "epoch": 0.3741935483870968, - "grad_norm": 0.10845193424043034, - "learning_rate": 0.000199658449300667, - "loss": 0.1177, + "epoch": 0.209009009009009, + "grad_norm": 0.12275432649506118, + "learning_rate": 0.0001999920408755684, + "loss": 0.111, "step": 58 }, { - "epoch": 0.38064516129032255, - "grad_norm": 0.1845188187089657, - "learning_rate": 0.00019959357045100764, - "loss": 0.1726, + "epoch": 0.2126126126126126, + "grad_norm": 0.13294914056261917, + "learning_rate": 0.00019998209226697376, + "loss": 0.1184, "step": 59 }, { - "epoch": 0.3870967741935484, - "grad_norm": 0.14769124665651473, - "learning_rate": 0.00019952306590302247, - "loss": 0.1634, + "epoch": 0.21621621621621623, + "grad_norm": 0.11587311682416103, + "learning_rate": 0.00019996816476922677, + "loss": 0.1029, "step": 60 }, { - "epoch": 0.3935483870967742, - "grad_norm": 0.16408468392163889, - "learning_rate": 0.00019944693963927092, - "loss": 0.1584, + "epoch": 0.21981981981981982, + "grad_norm": 0.211706197616785, + "learning_rate": 0.00019995025893658627, + "loss": 0.1323, "step": 61 }, { - "epoch": 0.4, - "grad_norm": 0.15156206723802879, - "learning_rate": 0.00019936519595986394, - "loss": 0.1454, + "epoch": 0.22342342342342342, + "grad_norm": 0.135734632583536, + "learning_rate": 0.00019992837548163316, + "loss": 0.1073, "step": 62 }, { - "epoch": 0.4064516129032258, - "grad_norm": 0.15835599927161748, - "learning_rate": 0.00019927783948222084, - "loss": 0.1621, + "epoch": 0.22702702702702704, + "grad_norm": 0.1916821730614324, + "learning_rate": 0.00019990251527524178, + "loss": 0.1287, "step": 63 }, { - "epoch": 0.4129032258064516, - "grad_norm": 0.1333411665662589, - "learning_rate": 0.00019918487514080865, - "loss": 0.1293, + "epoch": 0.23063063063063063, + "grad_norm": 0.1696786518231171, + "learning_rate": 0.00019987267934654538, + "loss": 0.1467, "step": 64 }, { - "epoch": 0.41935483870967744, - "grad_norm": 0.1589874051481937, - "learning_rate": 0.00019908630818686338, - "loss": 0.1391, + "epoch": 0.23423423423423423, + "grad_norm": 0.1556597769170162, + "learning_rate": 0.00019983886888289514, + "loss": 0.1074, "step": 65 }, { - "epoch": 0.4258064516129032, - "grad_norm": 0.12399538770065353, - "learning_rate": 0.0001989821441880933, - "loss": 0.1208, + "epoch": 0.23783783783783785, + "grad_norm": 0.11435037299616506, + "learning_rate": 0.00019980108522981284, + "loss": 0.1054, "step": 66 }, { - "epoch": 0.432258064516129, - "grad_norm": 0.16949673743147955, - "learning_rate": 0.00019887238902836448, - "loss": 0.1139, + "epoch": 0.24144144144144145, + "grad_norm": 0.1377527772698083, + "learning_rate": 0.00019975932989093747, + "loss": 0.1167, "step": 67 }, { - "epoch": 0.43870967741935485, - "grad_norm": 0.1875430265393267, - "learning_rate": 0.00019875704890736853, - "loss": 0.1856, + "epoch": 0.24504504504504504, + "grad_norm": 0.13089085083126692, + "learning_rate": 0.00019971360452796522, + "loss": 0.1268, "step": 68 }, { - "epoch": 0.44516129032258067, - "grad_norm": 0.14927152386929934, - "learning_rate": 0.00019863613034027224, - "loss": 0.1516, + "epoch": 0.24864864864864866, + "grad_norm": 0.1461221542311374, + "learning_rate": 0.00019966391096058346, + "loss": 0.1353, "step": 69 }, { - "epoch": 0.45161290322580644, - "grad_norm": 0.17501653378304205, - "learning_rate": 0.0001985096401573497, - "loss": 0.161, + "epoch": 0.25225225225225223, + "grad_norm": 0.10972604571483792, + "learning_rate": 0.0001996102511663983, + "loss": 0.0839, "step": 70 }, { - "epoch": 0.45806451612903226, - "grad_norm": 0.17582762670350804, - "learning_rate": 0.00019837758550359636, - "loss": 0.1564, + "epoch": 0.25585585585585585, + "grad_norm": 0.14694411558687645, + "learning_rate": 0.0001995526272808559, + "loss": 0.1266, "step": 71 }, { - "epoch": 0.4645161290322581, - "grad_norm": 0.15692049884405931, - "learning_rate": 0.0001982399738383255, - "loss": 0.1282, + "epoch": 0.2594594594594595, + "grad_norm": 0.2623900472581046, + "learning_rate": 0.00019949104159715743, + "loss": 0.1192, "step": 72 }, { - "epoch": 0.47096774193548385, - "grad_norm": 0.14436392088728167, - "learning_rate": 0.00019809681293474693, - "loss": 0.1299, + "epoch": 0.26306306306306304, + "grad_norm": 0.14253202316127417, + "learning_rate": 0.0001994254965661679, + "loss": 0.1268, "step": 73 }, { - "epoch": 0.4774193548387097, - "grad_norm": 0.19048463762976417, - "learning_rate": 0.0001979481108795278, - "loss": 0.1983, + "epoch": 0.26666666666666666, + "grad_norm": 0.22775504622269988, + "learning_rate": 0.0001993559947963185, + "loss": 0.1624, "step": 74 }, { - "epoch": 0.4838709677419355, - "grad_norm": 0.13475205452089994, - "learning_rate": 0.00019779387607233586, - "loss": 0.156, + "epoch": 0.2702702702702703, + "grad_norm": 0.18974052313619846, + "learning_rate": 0.00019928253905350296, + "loss": 0.1656, "step": 75 }, { - "epoch": 0.49032258064516127, - "grad_norm": 0.14145398172929924, - "learning_rate": 0.00019763411722536502, - "loss": 0.1355, + "epoch": 0.27387387387387385, + "grad_norm": 0.18281811162027828, + "learning_rate": 0.00019920513226096733, + "loss": 0.1512, "step": 76 }, { - "epoch": 0.4967741935483871, - "grad_norm": 0.14400838677113517, - "learning_rate": 0.00019746884336284317, - "loss": 0.1371, + "epoch": 0.2774774774774775, + "grad_norm": 0.15981121539784604, + "learning_rate": 0.00019912377749919374, + "loss": 0.1414, "step": 77 }, { - "epoch": 0.5032258064516129, - "grad_norm": 0.13421206031331137, - "learning_rate": 0.00019729806382052248, - "loss": 0.1156, + "epoch": 0.2810810810810811, + "grad_norm": 0.11107015310290616, + "learning_rate": 0.00019903847800577777, + "loss": 0.0732, "step": 78 }, { - "epoch": 0.5096774193548387, - "grad_norm": 0.11613915473665105, - "learning_rate": 0.00019712178824515212, - "loss": 0.1293, + "epoch": 0.28468468468468466, + "grad_norm": 0.11807284847655806, + "learning_rate": 0.00019894923717529955, + "loss": 0.1158, "step": 79 }, { - "epoch": 0.5161290322580645, - "grad_norm": 0.12472786830602234, - "learning_rate": 0.00019694002659393305, - "loss": 0.1189, + "epoch": 0.2882882882882883, + "grad_norm": 0.1674132871400004, + "learning_rate": 0.00019885605855918885, + "loss": 0.1363, "step": 80 }, { - "epoch": 0.5225806451612903, - "grad_norm": 0.11050712097688373, - "learning_rate": 0.00019675278913395606, - "loss": 0.12, + "epoch": 0.2918918918918919, + "grad_norm": 0.09521837972620555, + "learning_rate": 0.00019875894586558355, + "loss": 0.0761, "step": 81 }, { - "epoch": 0.5290322580645161, - "grad_norm": 0.1303276052231771, - "learning_rate": 0.0001965600864416213, - "loss": 0.1428, + "epoch": 0.2954954954954955, + "grad_norm": 0.13313059706266978, + "learning_rate": 0.00019865790295918212, + "loss": 0.114, "step": 82 }, { - "epoch": 0.535483870967742, - "grad_norm": 0.13774570876008593, - "learning_rate": 0.00019636192940204134, - "loss": 0.1438, + "epoch": 0.2990990990990991, + "grad_norm": 0.15752991450823575, + "learning_rate": 0.00019855293386108992, + "loss": 0.1143, "step": 83 }, { - "epoch": 0.5419354838709678, - "grad_norm": 0.14810394982940484, - "learning_rate": 0.00019615832920842586, - "loss": 0.1404, + "epoch": 0.3027027027027027, + "grad_norm": 0.11681361221271575, + "learning_rate": 0.0001984440427486591, + "loss": 0.0955, "step": 84 }, { - "epoch": 0.5483870967741935, - "grad_norm": 0.144275852100491, - "learning_rate": 0.00019594929736144976, - "loss": 0.1357, + "epoch": 0.3063063063063063, + "grad_norm": 0.1435158350817726, + "learning_rate": 0.00019833123395532226, + "loss": 0.1292, "step": 85 }, { - "epoch": 0.5548387096774193, - "grad_norm": 0.16879600090790034, - "learning_rate": 0.0001957348456686032, - "loss": 0.1578, + "epoch": 0.3099099099099099, + "grad_norm": 0.1174821097766054, + "learning_rate": 0.00019821451197042026, + "loss": 0.119, "step": 86 }, { - "epoch": 0.5612903225806452, - "grad_norm": 0.1588074767274021, - "learning_rate": 0.00019551498624352496, - "loss": 0.1228, + "epoch": 0.31351351351351353, + "grad_norm": 0.14421204301690782, + "learning_rate": 0.00019809388143902332, + "loss": 0.1313, "step": 87 }, { - "epoch": 0.567741935483871, - "grad_norm": 0.15067916918276408, - "learning_rate": 0.00019528973150531787, - "loss": 0.1323, + "epoch": 0.3171171171171171, + "grad_norm": 0.13517965622709482, + "learning_rate": 0.0001979693471617462, + "loss": 0.1297, "step": 88 }, { - "epoch": 0.5741935483870968, - "grad_norm": 0.16266341913656662, - "learning_rate": 0.00019505909417784754, - "loss": 0.13, + "epoch": 0.3207207207207207, + "grad_norm": 0.17145867539050777, + "learning_rate": 0.00019784091409455728, + "loss": 0.1359, "step": 89 }, { - "epoch": 0.5806451612903226, - "grad_norm": 0.121529537729675, - "learning_rate": 0.00019482308728902356, - "loss": 0.1067, + "epoch": 0.32432432432432434, + "grad_norm": 0.11953352813577937, + "learning_rate": 0.00019770858734858126, + "loss": 0.0878, "step": 90 }, { - "epoch": 0.5870967741935483, - "grad_norm": 0.1740468182649888, - "learning_rate": 0.00019458172417006347, - "loss": 0.1513, + "epoch": 0.3279279279279279, + "grad_norm": 0.13673174823647213, + "learning_rate": 0.00019757237218989563, + "loss": 0.1265, "step": 91 }, { - "epoch": 0.5935483870967742, - "grad_norm": 0.14062538318374462, - "learning_rate": 0.00019433501845473995, - "loss": 0.1438, + "epoch": 0.33153153153153153, + "grad_norm": 0.16490479257041854, + "learning_rate": 0.00019743227403932134, + "loss": 0.1393, "step": 92 }, { - "epoch": 0.6, - "grad_norm": 0.17690034130801896, - "learning_rate": 0.00019408298407861042, - "loss": 0.1356, + "epoch": 0.33513513513513515, + "grad_norm": 0.12343228690652325, + "learning_rate": 0.000197288298472207, + "loss": 0.1174, "step": 93 }, { - "epoch": 0.6064516129032258, - "grad_norm": 0.18789482750619546, - "learning_rate": 0.00019382563527823026, - "loss": 0.1758, + "epoch": 0.3387387387387387, + "grad_norm": 0.16745916159569352, + "learning_rate": 0.00019714045121820676, + "loss": 0.1235, "step": 94 }, { - "epoch": 0.6129032258064516, - "grad_norm": 0.16993067663839118, - "learning_rate": 0.00019356298659034817, - "loss": 0.1599, + "epoch": 0.34234234234234234, + "grad_norm": 0.16922526895888806, + "learning_rate": 0.00019698873816105273, + "loss": 0.0975, "step": 95 }, { - "epoch": 0.6193548387096774, - "grad_norm": 0.16495058136550117, - "learning_rate": 0.00019329505285108542, - "loss": 0.1283, + "epoch": 0.34594594594594597, + "grad_norm": 0.129484224418453, + "learning_rate": 0.00019683316533832042, + "loss": 0.0928, "step": 96 }, { - "epoch": 0.6258064516129033, - "grad_norm": 0.15497395377626808, - "learning_rate": 0.00019302184919509755, - "loss": 0.1493, + "epoch": 0.34954954954954953, + "grad_norm": 0.1514002748369919, + "learning_rate": 0.0001966737389411887, + "loss": 0.1341, "step": 97 }, { - "epoch": 0.632258064516129, - "grad_norm": 0.13660921526912856, - "learning_rate": 0.00019274339105471971, - "loss": 0.1307, + "epoch": 0.35315315315315315, + "grad_norm": 0.11053077601153272, + "learning_rate": 0.00019651046531419332, + "loss": 0.09, "step": 98 }, { - "epoch": 0.6387096774193548, - "grad_norm": 0.18246231884688152, - "learning_rate": 0.00019245969415909465, - "loss": 0.1598, + "epoch": 0.3567567567567568, + "grad_norm": 0.11955256535981768, + "learning_rate": 0.00019634335095497458, + "loss": 0.0978, "step": 99 }, { - "epoch": 0.6451612903225806, - "grad_norm": 0.11344749529118914, - "learning_rate": 0.00019217077453328449, - "loss": 0.1304, + "epoch": 0.36036036036036034, + "grad_norm": 0.12318848470518083, + "learning_rate": 0.0001961724025140185, + "loss": 0.1123, "step": 100 }, { - "epoch": 0.6516129032258065, - "grad_norm": 0.11682283731326468, - "learning_rate": 0.0001918766484973654, - "loss": 0.0977, + "epoch": 0.36396396396396397, + "grad_norm": 0.19878288570661823, + "learning_rate": 0.0001959976267943923, + "loss": 0.1449, "step": 101 }, { - "epoch": 0.6580645161290323, - "grad_norm": 0.14494532290577813, - "learning_rate": 0.00019157733266550575, - "loss": 0.1338, + "epoch": 0.3675675675675676, + "grad_norm": 0.11498348089609609, + "learning_rate": 0.0001958190307514737, + "loss": 0.101, "step": 102 }, { - "epoch": 0.6645161290322581, - "grad_norm": 0.12095275202026515, - "learning_rate": 0.0001912728439450276, - "loss": 0.1513, + "epoch": 0.37117117117117115, + "grad_norm": 0.12807480490548945, + "learning_rate": 0.00019563662149267406, + "loss": 0.1115, "step": 103 }, { - "epoch": 0.6709677419354839, - "grad_norm": 0.13423702981009097, - "learning_rate": 0.00019096319953545185, - "loss": 0.1335, + "epoch": 0.3747747747747748, + "grad_norm": 0.1537951698344796, + "learning_rate": 0.0001954504062771555, + "loss": 0.1099, "step": 104 }, { - "epoch": 0.6774193548387096, - "grad_norm": 0.1670132860946028, - "learning_rate": 0.0001906484169275263, - "loss": 0.1607, + "epoch": 0.3783783783783784, + "grad_norm": 0.13376774584465406, + "learning_rate": 0.0001952603925155422, + "loss": 0.0945, "step": 105 }, { - "epoch": 0.6838709677419355, - "grad_norm": 0.14053854153152684, - "learning_rate": 0.00019032851390223812, - "loss": 0.1365, + "epoch": 0.38198198198198197, + "grad_norm": 0.11095795904499461, + "learning_rate": 0.0001950665877696252, + "loss": 0.1001, "step": 106 }, { - "epoch": 0.6903225806451613, - "grad_norm": 0.1399807021991922, - "learning_rate": 0.00019000350852980909, - "loss": 0.1589, + "epoch": 0.3855855855855856, + "grad_norm": 0.1176293890483276, + "learning_rate": 0.00019486899975206166, + "loss": 0.1114, "step": 107 }, { - "epoch": 0.6967741935483871, - "grad_norm": 0.15473299551506894, - "learning_rate": 0.00018967341916867518, - "loss": 0.166, + "epoch": 0.3891891891891892, + "grad_norm": 0.16600471258328028, + "learning_rate": 0.0001946676363260679, + "loss": 0.1565, "step": 108 }, { - "epoch": 0.7032258064516129, - "grad_norm": 0.1536872117526864, - "learning_rate": 0.00018933826446444933, - "loss": 0.1657, + "epoch": 0.3927927927927928, + "grad_norm": 0.12969105825015786, + "learning_rate": 0.0001944625055051065, + "loss": 0.0942, "step": 109 }, { - "epoch": 0.7096774193548387, - "grad_norm": 0.1282004975556196, - "learning_rate": 0.0001889980633488683, - "loss": 0.1212, + "epoch": 0.3963963963963964, + "grad_norm": 0.1260399594140325, + "learning_rate": 0.00019425361545256727, + "loss": 0.1151, "step": 110 }, { - "epoch": 0.7161290322580646, - "grad_norm": 0.16458328388975405, - "learning_rate": 0.00018865283503872324, - "loss": 0.1655, + "epoch": 0.4, + "grad_norm": 0.12141152738985596, + "learning_rate": 0.00019404097448144257, + "loss": 0.0953, "step": 111 }, { - "epoch": 0.7225806451612903, - "grad_norm": 0.1505113828615181, - "learning_rate": 0.00018830259903477426, - "loss": 0.1571, + "epoch": 0.4036036036036036, + "grad_norm": 0.16522441528864815, + "learning_rate": 0.00019382459105399632, + "loss": 0.1483, "step": 112 }, { - "epoch": 0.7290322580645161, - "grad_norm": 0.16575595088070239, - "learning_rate": 0.0001879473751206489, - "loss": 0.1504, + "epoch": 0.4072072072072072, + "grad_norm": 0.16464941562962845, + "learning_rate": 0.00019360447378142728, + "loss": 0.1145, "step": 113 }, { - "epoch": 0.7354838709677419, - "grad_norm": 0.1484230902451611, - "learning_rate": 0.0001875871833617246, - "loss": 0.1498, + "epoch": 0.41081081081081083, + "grad_norm": 0.1301041115410939, + "learning_rate": 0.00019338063142352644, + "loss": 0.109, "step": 114 }, { - "epoch": 0.7419354838709677, - "grad_norm": 0.2059907492830938, - "learning_rate": 0.0001872220441039952, - "loss": 0.134, + "epoch": 0.4144144144144144, + "grad_norm": 0.15394069789981274, + "learning_rate": 0.00019315307288832835, + "loss": 0.1484, "step": 115 }, { - "epoch": 0.7483870967741936, - "grad_norm": 0.1491671097000444, - "learning_rate": 0.0001868519779729218, - "loss": 0.1374, + "epoch": 0.418018018018018, + "grad_norm": 0.1337025315682798, + "learning_rate": 0.00019292180723175654, + "loss": 0.1083, "step": 116 }, { - "epoch": 0.7548387096774194, - "grad_norm": 0.1727675086328308, - "learning_rate": 0.0001864770058722676, - "loss": 0.1624, + "epoch": 0.42162162162162165, + "grad_norm": 0.11697191221298965, + "learning_rate": 0.00019268684365726326, + "loss": 0.1104, "step": 117 }, { - "epoch": 0.7612903225806451, - "grad_norm": 0.16357318423846662, - "learning_rate": 0.00018609714898291718, - "loss": 0.1528, + "epoch": 0.4252252252252252, + "grad_norm": 0.14885108765057334, + "learning_rate": 0.00019244819151546322, + "loss": 0.1349, "step": 118 }, { - "epoch": 0.7677419354838709, - "grad_norm": 0.1584052674932312, - "learning_rate": 0.00018571242876167996, - "loss": 0.1321, + "epoch": 0.42882882882882883, + "grad_norm": 0.16748465670739565, + "learning_rate": 0.00019220586030376134, + "loss": 0.1375, "step": 119 }, { - "epoch": 0.7741935483870968, - "grad_norm": 0.21714396600094343, - "learning_rate": 0.0001853228669400784, - "loss": 0.1748, + "epoch": 0.43243243243243246, + "grad_norm": 0.16065529576883042, + "learning_rate": 0.00019195985966597494, + "loss": 0.1158, "step": 120 }, { - "epoch": 0.7806451612903226, - "grad_norm": 0.13743651841776636, - "learning_rate": 0.00018492848552312014, - "loss": 0.1493, + "epoch": 0.436036036036036, + "grad_norm": 0.14710431466862364, + "learning_rate": 0.0001917101993919498, + "loss": 0.1123, "step": 121 }, { - "epoch": 0.7870967741935484, - "grad_norm": 0.1541126978032927, - "learning_rate": 0.00018452930678805536, - "loss": 0.1331, + "epoch": 0.43963963963963965, + "grad_norm": 0.1535583565878682, + "learning_rate": 0.00019145688941717075, + "loss": 0.1244, "step": 122 }, { - "epoch": 0.7935483870967742, - "grad_norm": 0.1571822882709535, - "learning_rate": 0.00018412535328311814, - "loss": 0.1427, + "epoch": 0.44324324324324327, + "grad_norm": 0.15887496082691002, + "learning_rate": 0.00019119993982236606, + "loss": 0.1099, "step": 123 }, { - "epoch": 0.8, - "grad_norm": 0.17825934340851243, - "learning_rate": 0.00018371664782625287, - "loss": 0.1871, + "epoch": 0.44684684684684683, + "grad_norm": 0.17132720394894463, + "learning_rate": 0.00019093936083310653, + "loss": 0.1366, "step": 124 }, { - "epoch": 0.8064516129032258, - "grad_norm": 0.1745800846806893, - "learning_rate": 0.00018330321350382544, - "loss": 0.1672, + "epoch": 0.45045045045045046, + "grad_norm": 0.1304195997449305, + "learning_rate": 0.00019067516281939825, + "loss": 0.1042, "step": 125 }, { - "epoch": 0.8129032258064516, - "grad_norm": 0.13970218689990957, - "learning_rate": 0.00018288507366931905, - "loss": 0.1715, + "epoch": 0.4540540540540541, + "grad_norm": 0.13720183539624425, + "learning_rate": 0.00019040735629527027, + "loss": 0.0939, "step": 126 }, { - "epoch": 0.8193548387096774, - "grad_norm": 0.15100292169016535, - "learning_rate": 0.00018246225194201517, - "loss": 0.1411, + "epoch": 0.45765765765765765, + "grad_norm": 0.1878348429175824, + "learning_rate": 0.00019013595191835574, + "loss": 0.1421, "step": 127 }, { - "epoch": 0.8258064516129032, - "grad_norm": 0.1538126122981586, - "learning_rate": 0.00018203477220565912, - "loss": 0.1516, + "epoch": 0.46126126126126127, + "grad_norm": 0.15221296411188612, + "learning_rate": 0.00018986096048946824, + "loss": 0.1207, "step": 128 }, { - "epoch": 0.832258064516129, - "grad_norm": 0.15630735898296536, - "learning_rate": 0.00018160265860711134, - "loss": 0.1636, + "epoch": 0.4648648648648649, + "grad_norm": 0.12530318604533355, + "learning_rate": 0.0001895823929521716, + "loss": 0.1101, "step": 129 }, { - "epoch": 0.8387096774193549, - "grad_norm": 0.1427718560771215, - "learning_rate": 0.00018116593555498307, - "loss": 0.1297, + "epoch": 0.46846846846846846, + "grad_norm": 0.11753990553496706, + "learning_rate": 0.0001893002603923446, + "loss": 0.0814, "step": 130 }, { - "epoch": 0.8451612903225807, - "grad_norm": 0.11911885929622754, - "learning_rate": 0.0001807246277182578, - "loss": 0.1115, + "epoch": 0.4720720720720721, + "grad_norm": 0.15143122574748422, + "learning_rate": 0.00018901457403773967, + "loss": 0.1259, "step": 131 }, { - "epoch": 0.8516129032258064, - "grad_norm": 0.13702745725576418, - "learning_rate": 0.0001802787600248977, - "loss": 0.157, + "epoch": 0.4756756756756757, + "grad_norm": 0.12436714806981373, + "learning_rate": 0.00018872534525753615, + "loss": 0.1148, "step": 132 }, { - "epoch": 0.8580645161290322, - "grad_norm": 0.17220644805792673, - "learning_rate": 0.0001798283576604356, - "loss": 0.1561, + "epoch": 0.47927927927927927, + "grad_norm": 0.13100199539485474, + "learning_rate": 0.00018843258556188787, + "loss": 0.1189, "step": 133 }, { - "epoch": 0.864516129032258, - "grad_norm": 0.196867226472361, - "learning_rate": 0.0001793734460665523, - "loss": 0.1657, + "epoch": 0.4828828828828829, + "grad_norm": 0.1667053146851425, + "learning_rate": 0.00018813630660146488, + "loss": 0.1494, "step": 134 }, { - "epoch": 0.8709677419354839, - "grad_norm": 0.1759359784948508, - "learning_rate": 0.00017891405093963938, - "loss": 0.1909, + "epoch": 0.4864864864864865, + "grad_norm": 0.09073673518826318, + "learning_rate": 0.00018783652016699014, + "loss": 0.0799, "step": 135 }, { - "epoch": 0.8774193548387097, - "grad_norm": 0.16063163300244113, - "learning_rate": 0.0001784501982293479, - "loss": 0.1552, + "epoch": 0.4900900900900901, + "grad_norm": 0.13343062211829884, + "learning_rate": 0.0001875332381887699, + "loss": 0.1241, "step": 136 }, { - "epoch": 0.8838709677419355, - "grad_norm": 0.17034183209183734, - "learning_rate": 0.00017798191413712243, - "loss": 0.1502, + "epoch": 0.4936936936936937, + "grad_norm": 0.12750970397329575, + "learning_rate": 0.0001872264727362194, + "loss": 0.1386, "step": 137 }, { - "epoch": 0.8903225806451613, - "grad_norm": 0.14694529779128243, - "learning_rate": 0.0001775092251147211, - "loss": 0.1277, + "epoch": 0.4972972972972973, + "grad_norm": 0.10441877909622974, + "learning_rate": 0.00018691623601738199, + "loss": 0.0888, "step": 138 }, { - "epoch": 0.896774193548387, - "grad_norm": 0.13174927545002138, - "learning_rate": 0.0001770321578627213, - "loss": 0.1277, + "epoch": 0.5009009009009009, + "grad_norm": 0.14478179850573814, + "learning_rate": 0.00018660254037844388, + "loss": 0.1056, "step": 139 }, { - "epoch": 0.9032258064516129, - "grad_norm": 0.15784079444219237, - "learning_rate": 0.00017655073932901168, - "loss": 0.1534, + "epoch": 0.5045045045045045, + "grad_norm": 0.13712755003139512, + "learning_rate": 0.00018628539830324229, + "loss": 0.1489, "step": 140 }, { - "epoch": 0.9096774193548387, - "grad_norm": 0.19691370909219638, - "learning_rate": 0.0001760649967072697, - "loss": 0.1688, + "epoch": 0.5081081081081081, + "grad_norm": 0.12379021926600628, + "learning_rate": 0.000185964822412769, + "loss": 0.1071, "step": 141 }, { - "epoch": 0.9161290322580645, - "grad_norm": 0.15697904965484202, - "learning_rate": 0.00017557495743542585, - "loss": 0.1523, + "epoch": 0.5117117117117117, + "grad_norm": 0.12266406139545731, + "learning_rate": 0.00018564082546466805, + "loss": 0.1141, "step": 142 }, { - "epoch": 0.9225806451612903, - "grad_norm": 0.1759980765103477, - "learning_rate": 0.00017508064919411344, - "loss": 0.15, + "epoch": 0.5153153153153153, + "grad_norm": 0.09332411107267007, + "learning_rate": 0.00018531342035272766, + "loss": 0.0876, "step": 143 }, { - "epoch": 0.9290322580645162, - "grad_norm": 0.19254834048997346, - "learning_rate": 0.00017458209990510527, - "loss": 0.1474, + "epoch": 0.518918918918919, + "grad_norm": 0.12722229275266542, + "learning_rate": 0.00018498262010636774, + "loss": 0.123, "step": 144 }, { - "epoch": 0.9354838709677419, - "grad_norm": 0.19220369870461818, - "learning_rate": 0.00017407933772973637, - "loss": 0.1678, + "epoch": 0.5225225225225225, + "grad_norm": 0.1934624210241968, + "learning_rate": 0.00018464843789012085, + "loss": 0.1891, "step": 145 }, { - "epoch": 0.9419354838709677, - "grad_norm": 0.14789056250556576, - "learning_rate": 0.00017357239106731317, - "loss": 0.1634, + "epoch": 0.5261261261261261, + "grad_norm": 0.1202206919464269, + "learning_rate": 0.00018431088700310844, + "loss": 0.1157, "step": 146 }, { - "epoch": 0.9483870967741935, - "grad_norm": 0.15823815803270533, - "learning_rate": 0.00017306128855350942, - "loss": 0.1744, + "epoch": 0.5297297297297298, + "grad_norm": 0.11855534138749764, + "learning_rate": 0.0001839699808785118, + "loss": 0.1126, "step": 147 }, { - "epoch": 0.9548387096774194, - "grad_norm": 0.159128130793647, - "learning_rate": 0.0001725460590587486, - "loss": 0.1732, + "epoch": 0.5333333333333333, + "grad_norm": 0.10719514027165045, + "learning_rate": 0.00018362573308303718, + "loss": 0.0907, "step": 148 }, { - "epoch": 0.9612903225806452, - "grad_norm": 0.1420932941022579, - "learning_rate": 0.00017202673168657318, - "loss": 0.1193, + "epoch": 0.5369369369369369, + "grad_norm": 0.11210467216409752, + "learning_rate": 0.00018327815731637612, + "loss": 0.1007, "step": 149 }, { - "epoch": 0.967741935483871, - "grad_norm": 0.1581041276537875, - "learning_rate": 0.0001715033357720006, - "loss": 0.157, + "epoch": 0.5405405405405406, + "grad_norm": 0.12526617885340885, + "learning_rate": 0.00018292726741066007, + "loss": 0.1049, "step": 150 }, { - "epoch": 0.9741935483870968, - "grad_norm": 0.13409040520330398, - "learning_rate": 0.00017097590087986633, - "loss": 0.1187, + "epoch": 0.5441441441441441, + "grad_norm": 0.1477303393799172, + "learning_rate": 0.00018257307732991008, + "loss": 0.1516, "step": 151 }, { - "epoch": 0.9806451612903225, - "grad_norm": 0.15236261967510367, - "learning_rate": 0.00017044445680315372, - "loss": 0.1541, + "epoch": 0.5477477477477477, + "grad_norm": 0.14857702506705278, + "learning_rate": 0.00018221560116948103, + "loss": 0.1453, "step": 152 }, { - "epoch": 0.9870967741935484, - "grad_norm": 0.15028826750982388, - "learning_rate": 0.00016990903356131124, - "loss": 0.1462, + "epoch": 0.5513513513513514, + "grad_norm": 0.14008518634545825, + "learning_rate": 0.0001818548531555006, + "loss": 0.1297, "step": 153 }, { - "epoch": 0.9935483870967742, - "grad_norm": 0.14170487741522195, - "learning_rate": 0.00016936966139855663, - "loss": 0.1275, + "epoch": 0.554954954954955, + "grad_norm": 0.12658212522638404, + "learning_rate": 0.0001814908476443034, + "loss": 0.1155, "step": 154 }, { - "epoch": 1.0, - "grad_norm": 0.14288119090775778, - "learning_rate": 0.00016882637078216868, - "loss": 0.1316, - "step": 155 - }, - { - "epoch": 1.0, - "eval_loss": 0.14944089949131012, - "eval_runtime": 27.6083, - "eval_samples_per_second": 4.745, - "eval_steps_per_second": 0.616, + "epoch": 0.5585585585585585, + "grad_norm": 0.149670716923037, + "learning_rate": 0.00018112359912185924, + "loss": 0.1211, "step": 155 }, { - "epoch": 1.0064516129032257, - "grad_norm": 0.08453811084356862, - "learning_rate": 0.0001682791924007661, - "loss": 0.0732, + "epoch": 0.5621621621621622, + "grad_norm": 0.11342605203968036, + "learning_rate": 0.000180753122203197, + "loss": 0.0899, "step": 156 }, { - "epoch": 1.0129032258064516, - "grad_norm": 0.1163343033490188, - "learning_rate": 0.00016772815716257412, - "loss": 0.0955, + "epoch": 0.5657657657657658, + "grad_norm": 0.15888593819383173, + "learning_rate": 0.00018037943163182283, + "loss": 0.1445, "step": 157 }, { - "epoch": 1.0193548387096774, - "grad_norm": 0.12774969124655083, - "learning_rate": 0.0001671732961936785, - "loss": 0.1154, + "epoch": 0.5693693693693693, + "grad_norm": 0.12437893978089608, + "learning_rate": 0.00018000254227913348, + "loss": 0.1152, "step": 158 }, { - "epoch": 1.0258064516129033, - "grad_norm": 0.10149504758995384, - "learning_rate": 0.00016661464083626734, - "loss": 0.0834, + "epoch": 0.572972972972973, + "grad_norm": 0.11638937373238138, + "learning_rate": 0.0001796224691438244, + "loss": 0.1123, "step": 159 }, { - "epoch": 1.032258064516129, - "grad_norm": 0.1558767636437416, - "learning_rate": 0.00016605222264686086, - "loss": 0.0978, + "epoch": 0.5765765765765766, + "grad_norm": 0.14812854362945038, + "learning_rate": 0.00017923922735129302, + "loss": 0.1263, "step": 160 }, { - "epoch": 1.038709677419355, - "grad_norm": 0.14800857594022712, - "learning_rate": 0.00016548607339452853, - "loss": 0.0783, + "epoch": 0.5801801801801801, + "grad_norm": 0.10770071386782099, + "learning_rate": 0.0001788528321530366, + "loss": 0.0955, "step": 161 }, { - "epoch": 1.0451612903225806, - "grad_norm": 0.10427437820954576, - "learning_rate": 0.00016491622505909482, - "loss": 0.0714, + "epoch": 0.5837837837837838, + "grad_norm": 0.1870539683925041, + "learning_rate": 0.00017846329892604547, + "loss": 0.1124, "step": 162 }, { - "epoch": 1.0516129032258064, - "grad_norm": 0.16509881361577539, - "learning_rate": 0.00016434270982933273, - "loss": 0.0971, + "epoch": 0.5873873873873874, + "grad_norm": 0.1560374478952629, + "learning_rate": 0.00017807064317219094, + "loss": 0.122, "step": 163 }, { - "epoch": 1.0580645161290323, - "grad_norm": 0.15528612822142446, - "learning_rate": 0.0001637655601011454, - "loss": 0.0806, + "epoch": 0.590990990990991, + "grad_norm": 0.14789972168680796, + "learning_rate": 0.00017767488051760857, + "loss": 0.0955, "step": 164 }, { - "epoch": 1.064516129032258, - "grad_norm": 0.1578175598943513, - "learning_rate": 0.00016318480847573642, - "loss": 0.0962, + "epoch": 0.5945945945945946, + "grad_norm": 0.17954009944461283, + "learning_rate": 0.00017727602671207605, + "loss": 0.1326, "step": 165 }, { - "epoch": 1.070967741935484, - "grad_norm": 0.2591491854389689, - "learning_rate": 0.00016260048775776804, - "loss": 0.1107, + "epoch": 0.5981981981981982, + "grad_norm": 0.12473531577026101, + "learning_rate": 0.00017687409762838664, + "loss": 0.139, "step": 166 }, { - "epoch": 1.0774193548387097, - "grad_norm": 0.1557337965418426, - "learning_rate": 0.00016201263095350833, - "loss": 0.0707, + "epoch": 0.6018018018018018, + "grad_norm": 0.18890214448118112, + "learning_rate": 0.00017646910926171747, + "loss": 0.158, "step": 167 }, { - "epoch": 1.0838709677419356, - "grad_norm": 0.24212581528206514, - "learning_rate": 0.0001614212712689668, - "loss": 0.0964, + "epoch": 0.6054054054054054, + "grad_norm": 0.1158510197827391, + "learning_rate": 0.00017606107772899287, + "loss": 0.124, "step": 168 }, { - "epoch": 1.0903225806451613, - "grad_norm": 0.12600203218602474, - "learning_rate": 0.00016082644210801844, - "loss": 0.0649, + "epoch": 0.609009009009009, + "grad_norm": 0.1513359972404607, + "learning_rate": 0.00017565001926824313, + "loss": 0.1535, "step": 169 }, { - "epoch": 1.096774193548387, - "grad_norm": 0.14377456483205683, - "learning_rate": 0.00016022817707051724, - "loss": 0.0778, + "epoch": 0.6126126126126126, + "grad_norm": 0.11561240472832256, + "learning_rate": 0.00017523595023795813, + "loss": 0.097, "step": 170 }, { - "epoch": 1.103225806451613, - "grad_norm": 0.1828183442232092, - "learning_rate": 0.00015962650995039783, - "loss": 0.1068, + "epoch": 0.6162162162162163, + "grad_norm": 0.14453378759822266, + "learning_rate": 0.00017481888711643655, + "loss": 0.1369, "step": 171 }, { - "epoch": 1.1096774193548387, - "grad_norm": 0.1695078607562437, - "learning_rate": 0.00015902147473376694, - "loss": 0.0938, + "epoch": 0.6198198198198198, + "grad_norm": 0.10823698221755142, + "learning_rate": 0.00017439884650112989, + "loss": 0.0854, "step": 172 }, { - "epoch": 1.1161290322580646, - "grad_norm": 0.10525262692642562, - "learning_rate": 0.00015841310559698343, - "loss": 0.0645, + "epoch": 0.6234234234234234, + "grad_norm": 0.16461158555393735, + "learning_rate": 0.0001739758451079821, + "loss": 0.1327, "step": 173 }, { - "epoch": 1.1225806451612903, - "grad_norm": 0.15379971536703851, - "learning_rate": 0.0001578014369047279, - "loss": 0.0752, + "epoch": 0.6270270270270271, + "grad_norm": 0.13330810816894179, + "learning_rate": 0.00017354989977076422, + "loss": 0.0988, "step": 174 }, { - "epoch": 1.129032258064516, - "grad_norm": 0.1352511385337785, - "learning_rate": 0.00015718650320806142, - "loss": 0.0803, + "epoch": 0.6306306306306306, + "grad_norm": 0.1603897957937655, + "learning_rate": 0.00017312102744040467, + "loss": 0.1517, "step": 175 }, { - "epoch": 1.135483870967742, - "grad_norm": 0.1708537982318491, - "learning_rate": 0.00015656833924247398, - "loss": 0.0908, + "epoch": 0.6342342342342342, + "grad_norm": 0.1387499574229483, + "learning_rate": 0.00017268924518431438, + "loss": 0.1159, "step": 176 }, { - "epoch": 1.1419354838709677, - "grad_norm": 0.16847128771716718, - "learning_rate": 0.00015594697992592232, - "loss": 0.0697, + "epoch": 0.6378378378378379, + "grad_norm": 0.15123399261590567, + "learning_rate": 0.0001722545701857079, + "loss": 0.135, "step": 177 }, { - "epoch": 1.1483870967741936, - "grad_norm": 0.14044376651199897, - "learning_rate": 0.00015532246035685756, - "loss": 0.0707, + "epoch": 0.6414414414414414, + "grad_norm": 0.201686818845506, + "learning_rate": 0.0001718170197429193, + "loss": 0.1601, "step": 178 }, { - "epoch": 1.1548387096774193, - "grad_norm": 0.13752732543534765, - "learning_rate": 0.00015469481581224272, - "loss": 0.0769, + "epoch": 0.645045045045045, + "grad_norm": 0.16050791333444517, + "learning_rate": 0.0001713766112687139, + "loss": 0.1376, "step": 179 }, { - "epoch": 1.1612903225806452, - "grad_norm": 0.20084535405957962, - "learning_rate": 0.00015406408174555976, - "loss": 0.1114, + "epoch": 0.6486486486486487, + "grad_norm": 0.13004224853328716, + "learning_rate": 0.00017093336228959536, + "loss": 0.1191, "step": 180 }, { - "epoch": 1.167741935483871, - "grad_norm": 0.16882912932677738, - "learning_rate": 0.0001534302937848073, - "loss": 0.0977, + "epoch": 0.6522522522522523, + "grad_norm": 0.10061992398695434, + "learning_rate": 0.000170487290445108, + "loss": 0.0958, "step": 181 }, { - "epoch": 1.1741935483870969, - "grad_norm": 0.16088758279122345, - "learning_rate": 0.00015279348773048786, - "loss": 0.0862, + "epoch": 0.6558558558558558, + "grad_norm": 0.09779721051938423, + "learning_rate": 0.0001700384134871351, + "loss": 0.098, "step": 182 }, { - "epoch": 1.1806451612903226, - "grad_norm": 0.1610420842518173, - "learning_rate": 0.00015215369955358566, - "loss": 0.0909, + "epoch": 0.6594594594594595, + "grad_norm": 0.12862092154540355, + "learning_rate": 0.0001695867492791921, + "loss": 0.1083, "step": 183 }, { - "epoch": 1.1870967741935483, - "grad_norm": 0.1567858759713509, - "learning_rate": 0.0001515109653935348, - "loss": 0.0988, + "epoch": 0.6630630630630631, + "grad_norm": 0.13476322854527875, + "learning_rate": 0.00016913231579571608, + "loss": 0.1466, "step": 184 }, { - "epoch": 1.1935483870967742, - "grad_norm": 0.12400211503217344, - "learning_rate": 0.00015086532155617784, - "loss": 0.0813, + "epoch": 0.6666666666666666, + "grad_norm": 0.09595530640274692, + "learning_rate": 0.00016867513112135013, + "loss": 0.0842, "step": 185 }, { - "epoch": 1.2, - "grad_norm": 0.12785181430507606, - "learning_rate": 0.00015021680451171498, - "loss": 0.0802, + "epoch": 0.6702702702702703, + "grad_norm": 0.15679543098949758, + "learning_rate": 0.00016821521345022377, + "loss": 0.1338, "step": 186 }, { - "epoch": 1.206451612903226, - "grad_norm": 0.14298258948499543, - "learning_rate": 0.00014956545089264407, - "loss": 0.0843, + "epoch": 0.6738738738738739, + "grad_norm": 0.14388550615027906, + "learning_rate": 0.00016775258108522908, + "loss": 0.1125, "step": 187 }, { - "epoch": 1.2129032258064516, - "grad_norm": 0.20148810623427083, - "learning_rate": 0.0001489112974916912, - "loss": 0.0942, + "epoch": 0.6774774774774774, + "grad_norm": 0.14073204006731552, + "learning_rate": 0.0001672872524372919, + "loss": 0.139, "step": 188 }, { - "epoch": 1.2193548387096773, - "grad_norm": 0.14657915199625932, - "learning_rate": 0.00014825438125973264, - "loss": 0.0829, + "epoch": 0.6810810810810811, + "grad_norm": 0.09327785295917886, + "learning_rate": 0.00016681924602463962, + "loss": 0.0876, "step": 189 }, { - "epoch": 1.2258064516129032, - "grad_norm": 0.16147059815020365, - "learning_rate": 0.00014759473930370736, - "loss": 0.0756, + "epoch": 0.6846846846846847, + "grad_norm": 0.0966354577674113, + "learning_rate": 0.00016634858047206378, + "loss": 0.0817, "step": 190 }, { - "epoch": 1.232258064516129, - "grad_norm": 0.17956881851269735, - "learning_rate": 0.0001469324088845212, - "loss": 0.1007, + "epoch": 0.6882882882882883, + "grad_norm": 0.1298212529485729, + "learning_rate": 0.00016587527451017938, + "loss": 0.1248, "step": 191 }, { - "epoch": 1.238709677419355, - "grad_norm": 0.1578318532174886, - "learning_rate": 0.00014626742741494206, - "loss": 0.0983, + "epoch": 0.6918918918918919, + "grad_norm": 0.15190505228456444, + "learning_rate": 0.00016539934697467894, + "loss": 0.1346, "step": 192 }, { - "epoch": 1.2451612903225806, - "grad_norm": 0.1649792762044239, - "learning_rate": 0.00014559983245748638, - "loss": 0.0905, + "epoch": 0.6954954954954955, + "grad_norm": 0.12074435445615049, + "learning_rate": 0.0001649208168055833, + "loss": 0.1218, "step": 193 }, { - "epoch": 1.2516129032258063, - "grad_norm": 0.1455653603561888, - "learning_rate": 0.00014492966172229777, - "loss": 0.0791, + "epoch": 0.6990990990990991, + "grad_norm": 0.11339361129121636, + "learning_rate": 0.0001644397030464877, + "loss": 0.0945, "step": 194 }, { - "epoch": 1.2580645161290323, - "grad_norm": 0.1312841252501999, - "learning_rate": 0.00014425695306501658, - "loss": 0.0613, + "epoch": 0.7027027027027027, + "grad_norm": 0.1480633681266718, + "learning_rate": 0.00016395602484380406, + "loss": 0.143, "step": 195 }, { - "epoch": 1.2645161290322582, - "grad_norm": 0.1361347125426188, - "learning_rate": 0.00014358174448464154, - "loss": 0.0773, + "epoch": 0.7063063063063063, + "grad_norm": 0.13202765755871132, + "learning_rate": 0.0001634698014459988, + "loss": 0.1256, "step": 196 }, { - "epoch": 1.270967741935484, - "grad_norm": 0.17551022044504175, - "learning_rate": 0.00014290407412138366, - "loss": 0.0985, + "epoch": 0.7099099099099099, + "grad_norm": 0.10905065599283695, + "learning_rate": 0.00016298105220282713, + "loss": 0.1024, "step": 197 }, { - "epoch": 1.2774193548387096, - "grad_norm": 0.1456899710914213, - "learning_rate": 0.00014222398025451135, - "loss": 0.0827, + "epoch": 0.7135135135135136, + "grad_norm": 0.10616436723037755, + "learning_rate": 0.00016248979656456275, + "loss": 0.1066, "step": 198 }, { - "epoch": 1.2838709677419355, - "grad_norm": 0.19482064326323745, - "learning_rate": 0.00014154150130018866, - "loss": 0.0974, + "epoch": 0.7171171171171171, + "grad_norm": 0.1063733952868901, + "learning_rate": 0.0001619960540812239, + "loss": 0.1065, "step": 199 }, { - "epoch": 1.2903225806451613, - "grad_norm": 0.13544624920378107, - "learning_rate": 0.0001408566758093048, - "loss": 0.0742, + "epoch": 0.7207207207207207, + "grad_norm": 0.1648449550913926, + "learning_rate": 0.00016149984440179537, + "loss": 0.1416, "step": 200 }, { - "epoch": 1.2967741935483872, - "grad_norm": 0.12685823273525554, - "learning_rate": 0.00014016954246529696, - "loss": 0.0677, + "epoch": 0.7243243243243244, + "grad_norm": 0.14504142427358913, + "learning_rate": 0.00016100118727344659, + "loss": 0.1323, "step": 201 }, { - "epoch": 1.303225806451613, - "grad_norm": 0.15457862761329938, - "learning_rate": 0.00013948014008196487, - "loss": 0.0815, + "epoch": 0.7279279279279279, + "grad_norm": 0.15511743070098452, + "learning_rate": 0.00016050010254074564, + "loss": 0.1259, "step": 202 }, { - "epoch": 1.3096774193548386, - "grad_norm": 0.1482256655702007, - "learning_rate": 0.0001387885076012785, - "loss": 0.0961, + "epoch": 0.7315315315315315, + "grad_norm": 0.12280785450706579, + "learning_rate": 0.00015999661014486956, + "loss": 0.1165, "step": 203 }, { - "epoch": 1.3161290322580645, - "grad_norm": 0.20896278322687534, - "learning_rate": 0.00013809468409117846, - "loss": 0.1049, + "epoch": 0.7351351351351352, + "grad_norm": 0.13888404263902684, + "learning_rate": 0.00015949073012281093, + "loss": 0.1047, "step": 204 }, { - "epoch": 1.3225806451612903, - "grad_norm": 0.175019480492322, - "learning_rate": 0.00013739870874336898, - "loss": 0.0884, + "epoch": 0.7387387387387387, + "grad_norm": 0.10651036692593711, + "learning_rate": 0.00015898248260658016, + "loss": 0.1181, "step": 205 }, { - "epoch": 1.3290322580645162, - "grad_norm": 0.14363356864891735, - "learning_rate": 0.00013670062087110422, - "loss": 0.0864, + "epoch": 0.7423423423423423, + "grad_norm": 0.10861974936989245, + "learning_rate": 0.0001584718878224047, + "loss": 0.1064, "step": 206 }, { - "epoch": 1.335483870967742, - "grad_norm": 0.1397081258858762, - "learning_rate": 0.00013600045990696762, - "loss": 0.0797, + "epoch": 0.745945945945946, + "grad_norm": 0.10231866176721904, + "learning_rate": 0.00015795896608992378, + "loss": 0.0988, "step": 207 }, { - "epoch": 1.3419354838709676, - "grad_norm": 0.1667674455506937, - "learning_rate": 0.0001352982654006444, - "loss": 0.0977, + "epoch": 0.7495495495495496, + "grad_norm": 0.15024568241023914, + "learning_rate": 0.00015744373782137992, + "loss": 0.1489, "step": 208 }, { - "epoch": 1.3483870967741935, - "grad_norm": 0.1409898972781143, - "learning_rate": 0.00013459407701668763, - "loss": 0.0923, + "epoch": 0.7531531531531531, + "grad_norm": 0.12371588452286458, + "learning_rate": 0.00015692622352080662, + "loss": 0.116, "step": 209 }, { - "epoch": 1.3548387096774195, - "grad_norm": 0.16290520924324786, - "learning_rate": 0.00013388793453227767, - "loss": 0.1033, + "epoch": 0.7567567567567568, + "grad_norm": 0.11392721432010788, + "learning_rate": 0.00015640644378321235, + "loss": 0.1015, "step": 210 }, { - "epoch": 1.3612903225806452, - "grad_norm": 0.14967784709530377, - "learning_rate": 0.0001331798778349752, - "loss": 0.0843, + "epoch": 0.7603603603603604, + "grad_norm": 0.11201427932233406, + "learning_rate": 0.00015588441929376097, + "loss": 0.0863, "step": 211 }, { - "epoch": 1.367741935483871, - "grad_norm": 0.14370370779202904, - "learning_rate": 0.00013246994692046836, - "loss": 0.0845, + "epoch": 0.7639639639639639, + "grad_norm": 0.159849063390471, + "learning_rate": 0.00015536017082694846, + "loss": 0.1651, "step": 212 }, { - "epoch": 1.3741935483870968, - "grad_norm": 0.15288920332301245, - "learning_rate": 0.00013175818189031327, - "loss": 0.0962, + "epoch": 0.7675675675675676, + "grad_norm": 0.13935698141384686, + "learning_rate": 0.00015483371924577635, + "loss": 0.1262, "step": 213 }, { - "epoch": 1.3806451612903226, - "grad_norm": 0.16907810212754584, - "learning_rate": 0.00013104462294966896, - "loss": 0.1033, + "epoch": 0.7711711711711712, + "grad_norm": 0.15388913192797118, + "learning_rate": 0.00015430508550092124, + "loss": 0.1602, "step": 214 }, { - "epoch": 1.3870967741935485, - "grad_norm": 0.17023880837738012, - "learning_rate": 0.00013032931040502627, - "loss": 0.1155, + "epoch": 0.7747747747747747, + "grad_norm": 0.11744911276482749, + "learning_rate": 0.00015377429062990122, + "loss": 0.1082, "step": 215 }, { - "epoch": 1.3935483870967742, - "grad_norm": 0.12189251069561345, - "learning_rate": 0.00012961228466193116, - "loss": 0.0692, + "epoch": 0.7783783783783784, + "grad_norm": 0.14669529425537173, + "learning_rate": 0.00015324135575623857, + "loss": 0.1329, "step": 216 }, { - "epoch": 1.4, - "grad_norm": 0.18108512316647296, - "learning_rate": 0.00012889358622270223, - "loss": 0.086, + "epoch": 0.781981981981982, + "grad_norm": 0.09725689202217797, + "learning_rate": 0.00015270630208861916, + "loss": 0.1001, "step": 217 }, { - "epoch": 1.4064516129032258, - "grad_norm": 0.15804455793477432, - "learning_rate": 0.00012817325568414297, - "loss": 0.0751, + "epoch": 0.7855855855855856, + "grad_norm": 0.09066648478601479, + "learning_rate": 0.00015216915092004847, + "loss": 0.1005, "step": 218 }, { - "epoch": 1.4129032258064516, - "grad_norm": 0.13668879316730062, - "learning_rate": 0.00012745133373524853, - "loss": 0.0786, + "epoch": 0.7891891891891892, + "grad_norm": 0.10556590806339675, + "learning_rate": 0.00015162992362700406, + "loss": 0.104, "step": 219 }, { - "epoch": 1.4193548387096775, - "grad_norm": 0.18610949095399393, - "learning_rate": 0.0001267278611549073, - "loss": 0.0732, + "epoch": 0.7927927927927928, + "grad_norm": 0.10568504388848617, + "learning_rate": 0.00015108864166858506, + "loss": 0.1079, "step": 220 }, { - "epoch": 1.4258064516129032, - "grad_norm": 0.186875270696186, - "learning_rate": 0.00012600287880959763, - "loss": 0.1051, + "epoch": 0.7963963963963964, + "grad_norm": 0.13168798693648778, + "learning_rate": 0.0001505453265856581, + "loss": 0.1319, "step": 221 }, { - "epoch": 1.432258064516129, - "grad_norm": 0.15558801012557805, - "learning_rate": 0.0001252764276510792, - "loss": 0.0879, + "epoch": 0.8, + "grad_norm": 0.11471998852906086, + "learning_rate": 0.00015000000000000001, + "loss": 0.1181, "step": 222 }, { - "epoch": 1.4387096774193548, - "grad_norm": 0.16188457902393685, - "learning_rate": 0.00012454854871407994, - "loss": 0.0887, + "epoch": 0.8036036036036036, + "grad_norm": 0.11715811264986671, + "learning_rate": 0.00014945268361343748, + "loss": 0.1053, "step": 223 }, { - "epoch": 1.4451612903225808, - "grad_norm": 0.1332789968563102, - "learning_rate": 0.00012381928311397806, - "loss": 0.0819, + "epoch": 0.8072072072072072, + "grad_norm": 0.10491568472945026, + "learning_rate": 0.00014890339920698334, + "loss": 0.0931, "step": 224 }, { - "epoch": 1.4516129032258065, - "grad_norm": 0.13104439295337186, - "learning_rate": 0.0001230886720444796, - "loss": 0.0992, + "epoch": 0.8108108108108109, + "grad_norm": 0.1381588520705617, + "learning_rate": 0.00014835216863996975, + "loss": 0.1417, "step": 225 }, { - "epoch": 1.4580645161290322, - "grad_norm": 0.1741921210517873, - "learning_rate": 0.00012235675677529158, - "loss": 0.1049, + "epoch": 0.8144144144144144, + "grad_norm": 0.15507649891779268, + "learning_rate": 0.0001477990138491783, + "loss": 0.1418, "step": 226 }, { - "epoch": 1.4645161290322581, - "grad_norm": 0.17686059237450052, - "learning_rate": 0.00012162357864979072, - "loss": 0.0942, + "epoch": 0.818018018018018, + "grad_norm": 0.11750139732263555, + "learning_rate": 0.0001472439568479671, + "loss": 0.1207, "step": 227 }, { - "epoch": 1.4709677419354839, - "grad_norm": 0.14208255193217198, - "learning_rate": 0.00012088917908268821, - "loss": 0.0932, + "epoch": 0.8216216216216217, + "grad_norm": 0.12736893690378323, + "learning_rate": 0.00014668701972539458, + "loss": 0.1277, "step": 228 }, { - "epoch": 1.4774193548387098, - "grad_norm": 0.16207984639669018, - "learning_rate": 0.00012015359955769021, - "loss": 0.0952, + "epoch": 0.8252252252252252, + "grad_norm": 0.11333975714941213, + "learning_rate": 0.00014612822464534059, + "loss": 0.1113, "step": 229 }, { - "epoch": 1.4838709677419355, - "grad_norm": 0.10567516069189406, - "learning_rate": 0.00011941688162515467, - "loss": 0.0698, + "epoch": 0.8288288288288288, + "grad_norm": 0.11811846805876995, + "learning_rate": 0.00014556759384562416, + "loss": 0.1174, "step": 230 }, { - "epoch": 1.4903225806451612, - "grad_norm": 0.14315458863752037, - "learning_rate": 0.00011867906689974428, - "loss": 0.0706, + "epoch": 0.8324324324324325, + "grad_norm": 0.12819157247369997, + "learning_rate": 0.00014500514963711883, + "loss": 0.1143, "step": 231 }, { - "epoch": 1.4967741935483871, - "grad_norm": 0.14644874259664967, - "learning_rate": 0.00011794019705807584, - "loss": 0.0954, + "epoch": 0.836036036036036, + "grad_norm": 0.11727175144557134, + "learning_rate": 0.0001444409144028644, + "loss": 0.1153, "step": 232 }, { - "epoch": 1.5032258064516129, - "grad_norm": 0.1522258926853921, - "learning_rate": 0.00011720031383636585, - "loss": 0.1026, + "epoch": 0.8396396396396396, + "grad_norm": 0.13457703292067713, + "learning_rate": 0.00014387491059717652, + "loss": 0.1199, "step": 233 }, { - "epoch": 1.5096774193548388, - "grad_norm": 0.2389589012648026, - "learning_rate": 0.00011645945902807341, - "loss": 0.0981, + "epoch": 0.8432432432432433, + "grad_norm": 0.11901299124274167, + "learning_rate": 0.00014330716074475286, + "loss": 0.1147, "step": 234 }, { - "epoch": 1.5161290322580645, - "grad_norm": 0.1539160607687386, - "learning_rate": 0.00011571767448153901, - "loss": 0.074, + "epoch": 0.8468468468468469, + "grad_norm": 0.10353994887251415, + "learning_rate": 0.00014273768743977685, + "loss": 0.1026, "step": 235 }, { - "epoch": 1.5225806451612902, - "grad_norm": 0.17340052794011998, - "learning_rate": 0.00011497500209762102, - "loss": 0.0943, + "epoch": 0.8504504504504504, + "grad_norm": 0.10419191980690304, + "learning_rate": 0.0001421665133450184, + "loss": 0.1063, "step": 236 }, { - "epoch": 1.5290322580645161, - "grad_norm": 0.11294207936842038, - "learning_rate": 0.00011423148382732853, - "loss": 0.0644, + "epoch": 0.8540540540540541, + "grad_norm": 0.12748698891225302, + "learning_rate": 0.00014159366119093214, + "loss": 0.1079, "step": 237 }, { - "epoch": 1.535483870967742, - "grad_norm": 0.14391081206665055, - "learning_rate": 0.00011348716166945195, - "loss": 0.0774, + "epoch": 0.8576576576576577, + "grad_norm": 0.16200721887310557, + "learning_rate": 0.00014101915377475274, + "loss": 0.1152, "step": 238 }, { - "epoch": 1.5419354838709678, - "grad_norm": 0.16298875474891092, - "learning_rate": 0.0001127420776681905, - "loss": 0.0786, + "epoch": 0.8612612612612612, + "grad_norm": 0.15795975577284813, + "learning_rate": 0.0001404430139595877, + "loss": 0.1542, "step": 239 }, { - "epoch": 1.5483870967741935, - "grad_norm": 0.1325076119120941, - "learning_rate": 0.00011199627391077732, - "loss": 0.0889, + "epoch": 0.8648648648648649, + "grad_norm": 0.14933463930244448, + "learning_rate": 0.0001398652646735076, + "loss": 0.1236, "step": 240 }, { - "epoch": 1.5548387096774192, - "grad_norm": 0.10971482413094012, - "learning_rate": 0.00011124979252510208, - "loss": 0.0737, + "epoch": 0.8684684684684685, + "grad_norm": 0.16198753222835588, + "learning_rate": 0.0001392859289086334, + "loss": 0.1375, "step": 241 }, { - "epoch": 1.5612903225806452, - "grad_norm": 0.14614299703062, - "learning_rate": 0.0001105026756773314, - "loss": 0.0682, + "epoch": 0.872072072072072, + "grad_norm": 0.13433467388254222, + "learning_rate": 0.00013870502972022173, + "loss": 0.1323, "step": 242 }, { - "epoch": 1.567741935483871, - "grad_norm": 0.22690104820583093, - "learning_rate": 0.00010975496556952682, - "loss": 0.1094, + "epoch": 0.8756756756756757, + "grad_norm": 0.12593674925296103, + "learning_rate": 0.00013812259022574717, + "loss": 0.1216, "step": 243 }, { - "epoch": 1.5741935483870968, - "grad_norm": 0.2561727438038473, - "learning_rate": 0.00010900670443726135, - "loss": 0.0851, + "epoch": 0.8792792792792793, + "grad_norm": 0.13013719230493928, + "learning_rate": 0.00013753863360398241, + "loss": 0.1247, "step": 244 }, { - "epoch": 1.5806451612903225, - "grad_norm": 0.15371305701947427, - "learning_rate": 0.00010825793454723325, - "loss": 0.0923, + "epoch": 0.8828828828828829, + "grad_norm": 0.11799267349520824, + "learning_rate": 0.0001369531830940757, + "loss": 0.1086, "step": 245 }, { - "epoch": 1.5870967741935482, - "grad_norm": 0.16267574566743875, - "learning_rate": 0.00010750869819487883, - "loss": 0.1036, + "epoch": 0.8864864864864865, + "grad_norm": 0.08312084262618047, + "learning_rate": 0.00013636626199462615, + "loss": 0.0813, "step": 246 }, { - "epoch": 1.5935483870967742, - "grad_norm": 0.15591155698404394, - "learning_rate": 0.00010675903770198333, - "loss": 0.0893, + "epoch": 0.8900900900900901, + "grad_norm": 0.1338651554767216, + "learning_rate": 0.00013577789366275644, + "loss": 0.137, "step": 247 }, { - "epoch": 1.6, - "grad_norm": 0.14338972962339533, - "learning_rate": 0.00010600899541429004, - "loss": 0.0837, + "epoch": 0.8936936936936937, + "grad_norm": 0.10150227632820087, + "learning_rate": 0.0001351881015131833, + "loss": 0.0975, "step": 248 }, { - "epoch": 1.6064516129032258, - "grad_norm": 0.12387607320751257, - "learning_rate": 0.00010525861369910877, - "loss": 0.0755, + "epoch": 0.8972972972972973, + "grad_norm": 0.10189929428402296, + "learning_rate": 0.00013459690901728588, + "loss": 0.0923, "step": 249 }, { - "epoch": 1.6129032258064515, - "grad_norm": 0.16606169294386383, - "learning_rate": 0.00010450793494292224, - "loss": 0.1043, + "epoch": 0.9009009009009009, + "grad_norm": 0.1408210936693087, + "learning_rate": 0.00013400433970217135, + "loss": 0.1378, "step": 250 }, { - "epoch": 1.6193548387096774, - "grad_norm": 0.1795920159350681, - "learning_rate": 0.00010375700154899208, - "loss": 0.1008, + "epoch": 0.9045045045045045, + "grad_norm": 0.11765895193363322, + "learning_rate": 0.000133410417149739, + "loss": 0.1096, "step": 251 }, { - "epoch": 1.6258064516129034, - "grad_norm": 0.13025097291519463, - "learning_rate": 0.00010300585593496348, - "loss": 0.0851, + "epoch": 0.9081081081081082, + "grad_norm": 0.1413792560787727, + "learning_rate": 0.00013281516499574135, + "loss": 0.1401, "step": 252 }, { - "epoch": 1.632258064516129, - "grad_norm": 0.14349816154023654, - "learning_rate": 0.00010225454053046921, - "loss": 0.0807, + "epoch": 0.9117117117117117, + "grad_norm": 0.08054406846656884, + "learning_rate": 0.00013221860692884396, + "loss": 0.0835, "step": 253 }, { - "epoch": 1.6387096774193548, - "grad_norm": 0.16695448888874226, - "learning_rate": 0.00010150309777473306, - "loss": 0.1117, + "epoch": 0.9153153153153153, + "grad_norm": 0.12127761773938303, + "learning_rate": 0.0001316207666896824, + "loss": 0.118, "step": 254 }, { - "epoch": 1.6451612903225805, - "grad_norm": 0.15743712528269815, - "learning_rate": 0.0001007515701141722, - "loss": 0.086, + "epoch": 0.918918918918919, + "grad_norm": 0.10139113989817501, + "learning_rate": 0.00013102166806991768, + "loss": 0.0966, "step": 255 }, { - "epoch": 1.6516129032258065, - "grad_norm": 0.16704335210894908, - "learning_rate": 0.0001, - "loss": 0.0886, + "epoch": 0.9225225225225225, + "grad_norm": 0.10511129293269068, + "learning_rate": 0.00013042133491128935, + "loss": 0.0846, "step": 256 }, { - "epoch": 1.6580645161290324, - "grad_norm": 0.10245531520994122, - "learning_rate": 9.924842988582782e-05, - "loss": 0.0678, + "epoch": 0.9261261261261261, + "grad_norm": 0.13928639672942275, + "learning_rate": 0.00012981979110466654, + "loss": 0.1106, "step": 257 }, { - "epoch": 1.664516129032258, - "grad_norm": 0.1688033836669086, - "learning_rate": 9.849690222526698e-05, - "loss": 0.0958, + "epoch": 0.9297297297297298, + "grad_norm": 0.1575504268549112, + "learning_rate": 0.00012921706058909756, + "loss": 0.1022, "step": 258 }, { - "epoch": 1.6709677419354838, - "grad_norm": 0.1338126992775965, - "learning_rate": 9.77454594695308e-05, - "loss": 0.0782, + "epoch": 0.9333333333333333, + "grad_norm": 0.09456528837585412, + "learning_rate": 0.00012861316735085686, + "loss": 0.0943, "step": 259 }, { - "epoch": 1.6774193548387095, - "grad_norm": 0.1332820451604909, - "learning_rate": 9.699414406503654e-05, - "loss": 0.0856, + "epoch": 0.9369369369369369, + "grad_norm": 0.11421875251828266, + "learning_rate": 0.00012800813542249072, + "loss": 0.0988, "step": 260 }, { - "epoch": 1.6838709677419355, - "grad_norm": 0.1449401720605745, - "learning_rate": 9.624299845100795e-05, - "loss": 0.0882, + "epoch": 0.9405405405405406, + "grad_norm": 0.11985070545179864, + "learning_rate": 0.00012740198888186064, + "loss": 0.1238, "step": 261 }, { - "epoch": 1.6903225806451614, - "grad_norm": 0.19395703151279187, - "learning_rate": 9.549206505707777e-05, - "loss": 0.1148, + "epoch": 0.9441441441441442, + "grad_norm": 0.09679571111756961, + "learning_rate": 0.00012679475185118535, + "loss": 0.1063, "step": 262 }, { - "epoch": 1.696774193548387, - "grad_norm": 0.13482277559178169, - "learning_rate": 9.474138630089124e-05, - "loss": 0.0644, + "epoch": 0.9477477477477477, + "grad_norm": 0.09782919038732428, + "learning_rate": 0.0001261864484960807, + "loss": 0.1039, "step": 263 }, { - "epoch": 1.7032258064516128, - "grad_norm": 0.18530593840863338, - "learning_rate": 9.399100458570997e-05, - "loss": 0.1074, + "epoch": 0.9513513513513514, + "grad_norm": 0.17653628828090737, + "learning_rate": 0.00012557710302459803, + "loss": 0.1354, "step": 264 }, { - "epoch": 1.7096774193548387, - "grad_norm": 0.1897995033582595, - "learning_rate": 9.324096229801674e-05, - "loss": 0.0867, + "epoch": 0.954954954954955, + "grad_norm": 0.1409157686607275, + "learning_rate": 0.00012496673968626068, + "loss": 0.1181, "step": 265 }, { - "epoch": 1.7161290322580647, - "grad_norm": 0.16472780150681127, - "learning_rate": 9.249130180512118e-05, - "loss": 0.0896, + "epoch": 0.9585585585585585, + "grad_norm": 0.16396955244736236, + "learning_rate": 0.0001243553827710992, + "loss": 0.1352, "step": 266 }, { - "epoch": 1.7225806451612904, - "grad_norm": 0.15525608679066774, - "learning_rate": 9.174206545276677e-05, - "loss": 0.0865, + "epoch": 0.9621621621621622, + "grad_norm": 0.11706567681214818, + "learning_rate": 0.0001237430566086844, + "loss": 0.1103, "step": 267 }, { - "epoch": 1.729032258064516, - "grad_norm": 0.15426999703424252, - "learning_rate": 9.099329556273866e-05, - "loss": 0.0853, + "epoch": 0.9657657657657658, + "grad_norm": 0.11951814155751256, + "learning_rate": 0.00012312978556715932, + "loss": 0.1182, "step": 268 }, { - "epoch": 1.7354838709677418, - "grad_norm": 0.19103115451320254, - "learning_rate": 9.024503443047319e-05, - "loss": 0.0993, + "epoch": 0.9693693693693693, + "grad_norm": 0.1098976660754676, + "learning_rate": 0.00012251559405226941, + "loss": 0.0981, "step": 269 }, { - "epoch": 1.7419354838709677, - "grad_norm": 0.12323460303269068, - "learning_rate": 8.949732432266866e-05, - "loss": 0.0723, + "epoch": 0.972972972972973, + "grad_norm": 0.13497038508376635, + "learning_rate": 0.00012190050650639131, + "loss": 0.139, "step": 270 }, { - "epoch": 1.7483870967741937, - "grad_norm": 0.1339668030381976, - "learning_rate": 8.875020747489794e-05, - "loss": 0.0852, + "epoch": 0.9765765765765766, + "grad_norm": 0.10505221561748224, + "learning_rate": 0.00012128454740756014, + "loss": 0.0968, "step": 271 }, { - "epoch": 1.7548387096774194, - "grad_norm": 0.1385020226698345, - "learning_rate": 8.800372608922271e-05, - "loss": 0.0773, + "epoch": 0.9801801801801802, + "grad_norm": 0.09400827669331373, + "learning_rate": 0.00012066774126849529, + "loss": 0.091, "step": 272 }, { - "epoch": 1.761290322580645, - "grad_norm": 0.21835470061774626, - "learning_rate": 8.72579223318095e-05, - "loss": 0.1167, + "epoch": 0.9837837837837838, + "grad_norm": 0.13553635299634834, + "learning_rate": 0.00012005011263562513, + "loss": 0.1269, "step": 273 }, { - "epoch": 1.7677419354838708, - "grad_norm": 0.1425322245524885, - "learning_rate": 8.651283833054809e-05, - "loss": 0.0801, + "epoch": 0.9873873873873874, + "grad_norm": 0.12708467697016343, + "learning_rate": 0.00011943168608810978, + "loss": 0.1393, "step": 274 }, { - "epoch": 1.7741935483870968, - "grad_norm": 0.1794495686810833, - "learning_rate": 8.57685161726715e-05, - "loss": 0.0784, + "epoch": 0.990990990990991, + "grad_norm": 0.12975117728566488, + "learning_rate": 0.00011881248623686338, + "loss": 0.1305, "step": 275 }, { - "epoch": 1.7806451612903227, - "grad_norm": 0.15688310999446023, - "learning_rate": 8.5024997902379e-05, - "loss": 0.1001, + "epoch": 0.9945945945945946, + "grad_norm": 0.13065574753229398, + "learning_rate": 0.00011819253772357442, + "loss": 0.1236, "step": 276 }, { - "epoch": 1.7870967741935484, - "grad_norm": 0.15417421931142297, - "learning_rate": 8.428232551846101e-05, - "loss": 0.0898, + "epoch": 0.9981981981981982, + "grad_norm": 0.11060359555949814, + "learning_rate": 0.00011757186521972512, + "loss": 0.1018, "step": 277 }, { - "epoch": 1.793548387096774, - "grad_norm": 0.17958092903510822, - "learning_rate": 8.35405409719266e-05, - "loss": 0.0921, + "epoch": 0.9981981981981982, + "eval_loss": 0.12383058667182922, + "eval_runtime": 52.8086, + "eval_samples_per_second": 4.431, + "eval_steps_per_second": 0.568, + "step": 277 + }, + { + "epoch": 1.0018018018018018, + "grad_norm": 0.11956545731254414, + "learning_rate": 0.00011695049342560968, + "loss": 0.0926, "step": 278 }, { - "epoch": 1.8, - "grad_norm": 0.165061388870699, - "learning_rate": 8.279968616363418e-05, - "loss": 0.0935, + "epoch": 1.0054054054054054, + "grad_norm": 0.07533035620811855, + "learning_rate": 0.00011632844706935124, + "loss": 0.0797, "step": 279 }, { - "epoch": 1.8064516129032258, - "grad_norm": 0.1116000947584692, - "learning_rate": 8.205980294192421e-05, - "loss": 0.0684, + "epoch": 1.009009009009009, + "grad_norm": 0.07020760288792346, + "learning_rate": 0.00011570575090591791, + "loss": 0.0607, "step": 280 }, { - "epoch": 1.8129032258064517, - "grad_norm": 0.12370745355237868, - "learning_rate": 8.132093310025571e-05, - "loss": 0.0764, + "epoch": 1.0126126126126127, + "grad_norm": 0.07819045444088978, + "learning_rate": 0.00011508242971613741, + "loss": 0.0735, "step": 281 }, { - "epoch": 1.8193548387096774, - "grad_norm": 0.1724137684896053, - "learning_rate": 8.058311837484535e-05, - "loss": 0.0969, + "epoch": 1.0162162162162163, + "grad_norm": 0.10053168911518578, + "learning_rate": 0.0001144585083057111, + "loss": 0.0835, "step": 282 }, { - "epoch": 1.8258064516129031, - "grad_norm": 0.16125193051943912, - "learning_rate": 7.984640044230983e-05, - "loss": 0.0868, + "epoch": 1.0198198198198198, + "grad_norm": 0.10526070984024917, + "learning_rate": 0.0001138340115042267, + "loss": 0.0951, "step": 283 }, { - "epoch": 1.832258064516129, - "grad_norm": 0.13929471780512187, - "learning_rate": 7.911082091731181e-05, - "loss": 0.0701, + "epoch": 1.0234234234234234, + "grad_norm": 0.09945638649949284, + "learning_rate": 0.00011320896416417026, + "loss": 0.0767, "step": 284 }, { - "epoch": 1.838709677419355, - "grad_norm": 0.1172859356132756, - "learning_rate": 7.837642135020929e-05, - "loss": 0.0705, + "epoch": 1.027027027027027, + "grad_norm": 0.07761913145188672, + "learning_rate": 0.00011258339115993696, + "loss": 0.0683, "step": 285 }, { - "epoch": 1.8451612903225807, - "grad_norm": 0.1416371336298496, - "learning_rate": 7.764324322470841e-05, - "loss": 0.0683, + "epoch": 1.0306306306306305, + "grad_norm": 0.09704735378738133, + "learning_rate": 0.0001119573173868415, + "loss": 0.0743, "step": 286 }, { - "epoch": 1.8516129032258064, - "grad_norm": 0.19198949443360017, - "learning_rate": 7.691132795552043e-05, - "loss": 0.0894, + "epoch": 1.0342342342342343, + "grad_norm": 0.07516525486775329, + "learning_rate": 0.000111330767760127, + "loss": 0.055, "step": 287 }, { - "epoch": 1.8580645161290321, - "grad_norm": 0.19384526534149363, - "learning_rate": 7.618071688602199e-05, - "loss": 0.0954, + "epoch": 1.037837837837838, + "grad_norm": 0.12817568478073565, + "learning_rate": 0.00011070376721397373, + "loss": 0.0812, "step": 288 }, { - "epoch": 1.864516129032258, - "grad_norm": 0.21508649416468206, - "learning_rate": 7.54514512859201e-05, - "loss": 0.0986, + "epoch": 1.0414414414414415, + "grad_norm": 0.14184653764167465, + "learning_rate": 0.00011007634070050684, + "loss": 0.1011, "step": 289 }, { - "epoch": 1.870967741935484, - "grad_norm": 0.1952670486171591, - "learning_rate": 7.472357234892082e-05, - "loss": 0.1049, + "epoch": 1.045045045045045, + "grad_norm": 0.12176639431416836, + "learning_rate": 0.00010944851318880314, + "loss": 0.0658, "step": 290 }, { - "epoch": 1.8774193548387097, - "grad_norm": 0.13259304301639457, - "learning_rate": 7.399712119040238e-05, - "loss": 0.0784, + "epoch": 1.0486486486486486, + "grad_norm": 0.12307205920891376, + "learning_rate": 0.00010882030966389766, + "loss": 0.0681, "step": 291 }, { - "epoch": 1.8838709677419354, - "grad_norm": 0.1367442063153438, - "learning_rate": 7.327213884509272e-05, - "loss": 0.0828, + "epoch": 1.0522522522522522, + "grad_norm": 0.10538765068004156, + "learning_rate": 0.00010819175512578926, + "loss": 0.0641, "step": 292 }, { - "epoch": 1.8903225806451613, - "grad_norm": 0.16389743747526248, - "learning_rate": 7.254866626475152e-05, - "loss": 0.0927, + "epoch": 1.055855855855856, + "grad_norm": 0.1835426273917669, + "learning_rate": 0.00010756287458844569, + "loss": 0.0741, "step": 293 }, { - "epoch": 1.896774193548387, - "grad_norm": 0.22089121957380548, - "learning_rate": 7.182674431585704e-05, - "loss": 0.1077, + "epoch": 1.0594594594594595, + "grad_norm": 0.15004556066173333, + "learning_rate": 0.00010693369307880816, + "loss": 0.0697, "step": 294 }, { - "epoch": 1.903225806451613, - "grad_norm": 0.14882797154919192, - "learning_rate": 7.110641377729778e-05, - "loss": 0.0879, + "epoch": 1.063063063063063, + "grad_norm": 0.17087142751095086, + "learning_rate": 0.00010630423563579551, + "loss": 0.0908, "step": 295 }, { - "epoch": 1.9096774193548387, - "grad_norm": 0.12794357419440644, - "learning_rate": 7.038771533806884e-05, - "loss": 0.0719, + "epoch": 1.0666666666666667, + "grad_norm": 0.11365528787847244, + "learning_rate": 0.00010567452730930743, + "loss": 0.0618, "step": 296 }, { - "epoch": 1.9161290322580644, - "grad_norm": 0.1846824594193936, - "learning_rate": 6.967068959497376e-05, - "loss": 0.0895, + "epoch": 1.0702702702702702, + "grad_norm": 0.1164578811980725, + "learning_rate": 0.00010504459315922788, + "loss": 0.0622, "step": 297 }, { - "epoch": 1.9225806451612903, - "grad_norm": 0.17646276082900025, - "learning_rate": 6.895537705033108e-05, - "loss": 0.0996, + "epoch": 1.0738738738738738, + "grad_norm": 0.188496193123827, + "learning_rate": 0.00010441445825442772, + "loss": 0.1077, "step": 298 }, { - "epoch": 1.9290322580645163, - "grad_norm": 0.15604696338212207, - "learning_rate": 6.824181810968675e-05, - "loss": 0.087, + "epoch": 1.0774774774774776, + "grad_norm": 0.10295065488210545, + "learning_rate": 0.00010378414767176705, + "loss": 0.0735, "step": 299 }, { - "epoch": 1.935483870967742, - "grad_norm": 0.15478000466177547, - "learning_rate": 6.753005307953167e-05, - "loss": 0.0831, + "epoch": 1.0810810810810811, + "grad_norm": 0.15374433915948252, + "learning_rate": 0.00010315368649509716, + "loss": 0.085, "step": 300 }, { - "epoch": 1.9419354838709677, - "grad_norm": 0.19642059718505833, - "learning_rate": 6.682012216502484e-05, - "loss": 0.1073, + "epoch": 1.0846846846846847, + "grad_norm": 0.09815844567459187, + "learning_rate": 0.00010252309981426244, + "loss": 0.054, "step": 301 }, { - "epoch": 1.9483870967741934, - "grad_norm": 0.17255606883063224, - "learning_rate": 6.611206546772237e-05, - "loss": 0.1018, + "epoch": 1.0882882882882883, + "grad_norm": 0.09380489686860213, + "learning_rate": 0.0001018924127241019, + "loss": 0.058, "step": 302 }, { - "epoch": 1.9548387096774194, - "grad_norm": 0.1393602083174125, - "learning_rate": 6.54059229833124e-05, - "loss": 0.0859, + "epoch": 1.0918918918918918, + "grad_norm": 0.1284280573750672, + "learning_rate": 0.00010126165032345038, + "loss": 0.0769, "step": 303 }, { - "epoch": 1.9612903225806453, - "grad_norm": 0.16652643955266602, - "learning_rate": 6.47017345993556e-05, - "loss": 0.0899, + "epoch": 1.0954954954954954, + "grad_norm": 0.1229186112985099, + "learning_rate": 0.00010063083771413975, + "loss": 0.0859, "step": 304 }, { - "epoch": 1.967741935483871, - "grad_norm": 0.16878660937501272, - "learning_rate": 6.39995400930324e-05, - "loss": 0.0998, + "epoch": 1.0990990990990992, + "grad_norm": 0.09913923706830541, + "learning_rate": 0.0001, + "loss": 0.077, "step": 305 }, { - "epoch": 1.9741935483870967, - "grad_norm": 0.12969061293334938, - "learning_rate": 6.329937912889582e-05, - "loss": 0.0706, + "epoch": 1.1027027027027028, + "grad_norm": 0.10294321295155191, + "learning_rate": 9.936916228586028e-05, + "loss": 0.0756, "step": 306 }, { - "epoch": 1.9806451612903224, - "grad_norm": 0.15739894009886643, - "learning_rate": 6.260129125663106e-05, - "loss": 0.0888, + "epoch": 1.1063063063063063, + "grad_norm": 0.09262673197538004, + "learning_rate": 9.873834967654964e-05, + "loss": 0.0583, "step": 307 }, { - "epoch": 1.9870967741935484, - "grad_norm": 0.1752067977727585, - "learning_rate": 6.190531590882159e-05, - "loss": 0.0826, + "epoch": 1.10990990990991, + "grad_norm": 0.09476725567348886, + "learning_rate": 9.810758727589813e-05, + "loss": 0.0659, "step": 308 }, { - "epoch": 1.9935483870967743, - "grad_norm": 0.21002280616145647, - "learning_rate": 6.121149239872151e-05, - "loss": 0.123, + "epoch": 1.1135135135135135, + "grad_norm": 0.10386525977756432, + "learning_rate": 9.747690018573757e-05, + "loss": 0.0604, "step": 309 }, { - "epoch": 2.0, - "grad_norm": 0.13850366944373244, - "learning_rate": 6.051985991803517e-05, - "loss": 0.073, + "epoch": 1.117117117117117, + "grad_norm": 0.1368700701066548, + "learning_rate": 9.684631350490287e-05, + "loss": 0.0728, "step": 310 }, { - "epoch": 2.0, - "eval_loss": 0.16690203547477722, - "eval_runtime": 25.4682, - "eval_samples_per_second": 5.144, - "eval_steps_per_second": 0.668, - "step": 310 - }, - { - "epoch": 2.0064516129032257, - "grad_norm": 0.1124990090834071, - "learning_rate": 5.983045753470308e-05, - "loss": 0.0485, + "epoch": 1.1207207207207208, + "grad_norm": 0.1432741421446136, + "learning_rate": 9.621585232823298e-05, + "loss": 0.0922, "step": 311 }, { - "epoch": 2.0129032258064514, - "grad_norm": 0.08246086368829486, - "learning_rate": 5.9143324190695196e-05, - "loss": 0.0411, + "epoch": 1.1243243243243244, + "grad_norm": 0.11931342929926518, + "learning_rate": 9.55855417455723e-05, + "loss": 0.0763, "step": 312 }, { - "epoch": 2.0193548387096776, - "grad_norm": 0.11824209839228844, - "learning_rate": 5.845849869981137e-05, - "loss": 0.0578, + "epoch": 1.127927927927928, + "grad_norm": 0.10803701150902781, + "learning_rate": 9.495540684077216e-05, + "loss": 0.0661, "step": 313 }, { - "epoch": 2.0258064516129033, - "grad_norm": 0.09183702414760068, - "learning_rate": 5.777601974548866e-05, - "loss": 0.0405, + "epoch": 1.1315315315315315, + "grad_norm": 0.12884484782558658, + "learning_rate": 9.432547269069261e-05, + "loss": 0.0606, "step": 314 }, { - "epoch": 2.032258064516129, - "grad_norm": 0.10138990147374953, - "learning_rate": 5.709592587861637e-05, - "loss": 0.045, + "epoch": 1.135135135135135, + "grad_norm": 0.1344643405568192, + "learning_rate": 9.36957643642045e-05, + "loss": 0.0779, "step": 315 }, { - "epoch": 2.0387096774193547, - "grad_norm": 0.13174518232765414, - "learning_rate": 5.6418255515358486e-05, - "loss": 0.0417, + "epoch": 1.1387387387387387, + "grad_norm": 0.12760304961299287, + "learning_rate": 9.306630692119182e-05, + "loss": 0.0603, "step": 316 }, { - "epoch": 2.0451612903225804, - "grad_norm": 0.11134581768568716, - "learning_rate": 5.574304693498346e-05, - "loss": 0.0351, + "epoch": 1.1423423423423422, + "grad_norm": 0.11600505233354397, + "learning_rate": 9.243712541155436e-05, + "loss": 0.0731, "step": 317 }, { - "epoch": 2.0516129032258066, - "grad_norm": 0.10372734844639428, - "learning_rate": 5.507033827770225e-05, - "loss": 0.0458, + "epoch": 1.145945945945946, + "grad_norm": 0.12470577794958661, + "learning_rate": 9.180824487421077e-05, + "loss": 0.0712, "step": 318 }, { - "epoch": 2.0580645161290323, - "grad_norm": 0.09893496819685892, - "learning_rate": 5.4400167542513636e-05, - "loss": 0.0421, + "epoch": 1.1495495495495496, + "grad_norm": 0.1612422337136671, + "learning_rate": 9.117969033610236e-05, + "loss": 0.0683, "step": 319 }, { - "epoch": 2.064516129032258, - "grad_norm": 0.1544849048513997, - "learning_rate": 5.3732572585057974e-05, - "loss": 0.0449, + "epoch": 1.1531531531531531, + "grad_norm": 0.14102273355899492, + "learning_rate": 9.055148681119688e-05, + "loss": 0.0674, "step": 320 }, { - "epoch": 2.0709677419354837, - "grad_norm": 0.12211629106983518, - "learning_rate": 5.306759111547881e-05, - "loss": 0.0341, + "epoch": 1.1567567567567567, + "grad_norm": 0.14126790004481535, + "learning_rate": 8.992365929949319e-05, + "loss": 0.0812, "step": 321 }, { - "epoch": 2.07741935483871, - "grad_norm": 0.1519039459170261, - "learning_rate": 5.240526069629265e-05, - "loss": 0.0477, + "epoch": 1.1603603603603603, + "grad_norm": 0.15229918606873402, + "learning_rate": 8.929623278602627e-05, + "loss": 0.0701, "step": 322 }, { - "epoch": 2.0838709677419356, - "grad_norm": 0.10882973801143421, - "learning_rate": 5.174561874026741e-05, - "loss": 0.0376, + "epoch": 1.163963963963964, + "grad_norm": 0.15483466842409133, + "learning_rate": 8.866923223987302e-05, + "loss": 0.0736, "step": 323 }, { - "epoch": 2.0903225806451613, - "grad_norm": 0.09785322895617321, - "learning_rate": 5.108870250830882e-05, - "loss": 0.0305, + "epoch": 1.1675675675675676, + "grad_norm": 0.14180233144913557, + "learning_rate": 8.80426826131585e-05, + "loss": 0.0783, "step": 324 }, { - "epoch": 2.096774193548387, - "grad_norm": 0.14990301799413153, - "learning_rate": 5.0434549107355944e-05, - "loss": 0.0409, + "epoch": 1.1711711711711712, + "grad_norm": 0.13636974208873606, + "learning_rate": 8.741660884006303e-05, + "loss": 0.0694, "step": 325 }, { - "epoch": 2.1032258064516127, - "grad_norm": 0.1915279670852206, - "learning_rate": 4.978319548828504e-05, - "loss": 0.045, + "epoch": 1.1747747747747748, + "grad_norm": 0.09259441173399019, + "learning_rate": 8.679103583582979e-05, + "loss": 0.0524, "step": 326 }, { - "epoch": 2.109677419354839, - "grad_norm": 0.15445162243894608, - "learning_rate": 4.9134678443822166e-05, - "loss": 0.0334, + "epoch": 1.1783783783783783, + "grad_norm": 0.13715824142259708, + "learning_rate": 8.616598849577333e-05, + "loss": 0.08, "step": 327 }, { - "epoch": 2.1161290322580646, - "grad_norm": 0.34562522599625123, - "learning_rate": 4.8489034606465225e-05, - "loss": 0.0395, + "epoch": 1.181981981981982, + "grad_norm": 0.12376655099748206, + "learning_rate": 8.554149169428894e-05, + "loss": 0.0784, "step": 328 }, { - "epoch": 2.1225806451612903, - "grad_norm": 0.3183911745576579, - "learning_rate": 4.784630044641435e-05, - "loss": 0.0546, + "epoch": 1.1855855855855855, + "grad_norm": 0.09319112288968635, + "learning_rate": 8.491757028386263e-05, + "loss": 0.0586, "step": 329 }, { - "epoch": 2.129032258064516, - "grad_norm": 0.1619196710614726, - "learning_rate": 4.7206512269512124e-05, - "loss": 0.0446, + "epoch": 1.1891891891891893, + "grad_norm": 0.10475390780150441, + "learning_rate": 8.429424909408214e-05, + "loss": 0.0563, "step": 330 }, { - "epoch": 2.135483870967742, - "grad_norm": 0.24657637060341897, - "learning_rate": 4.65697062151927e-05, - "loss": 0.0349, + "epoch": 1.1927927927927928, + "grad_norm": 0.138568291584437, + "learning_rate": 8.367155293064878e-05, + "loss": 0.0894, "step": 331 }, { - "epoch": 2.141935483870968, - "grad_norm": 0.11723613581448042, - "learning_rate": 4.593591825444028e-05, - "loss": 0.0337, + "epoch": 1.1963963963963964, + "grad_norm": 0.10228738965627485, + "learning_rate": 8.304950657439033e-05, + "loss": 0.0571, "step": 332 }, { - "epoch": 2.1483870967741936, - "grad_norm": 0.12413977056977685, - "learning_rate": 4.530518418775733e-05, - "loss": 0.0355, + "epoch": 1.2, + "grad_norm": 0.09804366197069557, + "learning_rate": 8.242813478027492e-05, + "loss": 0.0632, "step": 333 }, { - "epoch": 2.1548387096774193, - "grad_norm": 0.1368012687546921, - "learning_rate": 4.4677539643142454e-05, - "loss": 0.0369, + "epoch": 1.2036036036036035, + "grad_norm": 0.09408876005476795, + "learning_rate": 8.180746227642562e-05, + "loss": 0.0553, "step": 334 }, { - "epoch": 2.161290322580645, - "grad_norm": 0.132092425970093, - "learning_rate": 4.40530200740777e-05, - "loss": 0.0346, + "epoch": 1.2072072072072073, + "grad_norm": 0.13733518876900813, + "learning_rate": 8.118751376313664e-05, + "loss": 0.074, "step": 335 }, { - "epoch": 2.167741935483871, - "grad_norm": 0.17255652388608525, - "learning_rate": 4.343166075752605e-05, - "loss": 0.0425, + "epoch": 1.2108108108108109, + "grad_norm": 0.11537452385210425, + "learning_rate": 8.056831391189023e-05, + "loss": 0.0686, "step": 336 }, { - "epoch": 2.174193548387097, - "grad_norm": 0.10537625462278236, - "learning_rate": 4.281349679193861e-05, - "loss": 0.0363, + "epoch": 1.2144144144144144, + "grad_norm": 0.1072867298305809, + "learning_rate": 7.99498873643749e-05, + "loss": 0.0628, "step": 337 }, { - "epoch": 2.1806451612903226, - "grad_norm": 0.08446560264535163, - "learning_rate": 4.2198563095272116e-05, - "loss": 0.0303, + "epoch": 1.218018018018018, + "grad_norm": 0.10612605300014923, + "learning_rate": 7.93322587315047e-05, + "loss": 0.0678, "step": 338 }, { - "epoch": 2.1870967741935483, - "grad_norm": 0.11155624490505092, - "learning_rate": 4.158689440301657e-05, - "loss": 0.0424, + "epoch": 1.2216216216216216, + "grad_norm": 0.09775413101898157, + "learning_rate": 7.87154525924399e-05, + "loss": 0.0577, "step": 339 }, { - "epoch": 2.193548387096774, - "grad_norm": 0.10614591478252769, - "learning_rate": 4.097852526623307e-05, - "loss": 0.0374, + "epoch": 1.2252252252252251, + "grad_norm": 0.11955490707565056, + "learning_rate": 7.809949349360872e-05, + "loss": 0.0576, "step": 340 }, { - "epoch": 2.2, - "grad_norm": 0.15380955207075353, - "learning_rate": 4.0373490049602204e-05, - "loss": 0.0506, + "epoch": 1.2288288288288287, + "grad_norm": 0.10380886011793584, + "learning_rate": 7.74844059477306e-05, + "loss": 0.0603, "step": 341 }, { - "epoch": 2.206451612903226, - "grad_norm": 0.11905414571153279, - "learning_rate": 3.977182292948283e-05, - "loss": 0.0517, + "epoch": 1.2324324324324325, + "grad_norm": 0.13589511087320075, + "learning_rate": 7.687021443284071e-05, + "loss": 0.0773, "step": 342 }, { - "epoch": 2.2129032258064516, - "grad_norm": 0.14425425935781702, - "learning_rate": 3.9173557891981573e-05, - "loss": 0.0467, + "epoch": 1.236036036036036, + "grad_norm": 0.1167184661521862, + "learning_rate": 7.625694339131564e-05, + "loss": 0.0677, "step": 343 }, { - "epoch": 2.2193548387096773, - "grad_norm": 0.12453230757386329, - "learning_rate": 3.857872873103322e-05, - "loss": 0.0435, + "epoch": 1.2396396396396396, + "grad_norm": 0.12624459810290067, + "learning_rate": 7.564461722890081e-05, + "loss": 0.0802, "step": 344 }, { - "epoch": 2.225806451612903, - "grad_norm": 0.10038267160538758, - "learning_rate": 3.7987369046491684e-05, - "loss": 0.0387, + "epoch": 1.2432432432432432, + "grad_norm": 0.12261184267145957, + "learning_rate": 7.503326031373931e-05, + "loss": 0.0649, "step": 345 }, { - "epoch": 2.232258064516129, - "grad_norm": 0.1438025015993235, - "learning_rate": 3.7399512242231995e-05, - "loss": 0.0346, + "epoch": 1.2468468468468468, + "grad_norm": 0.16140905294131228, + "learning_rate": 7.442289697540201e-05, + "loss": 0.0648, "step": 346 }, { - "epoch": 2.238709677419355, - "grad_norm": 0.26072179652296745, - "learning_rate": 3.6815191524263624e-05, - "loss": 0.0545, + "epoch": 1.2504504504504506, + "grad_norm": 0.1369989260957558, + "learning_rate": 7.381355150391933e-05, + "loss": 0.074, "step": 347 }, { - "epoch": 2.2451612903225806, - "grad_norm": 0.08770511437142381, - "learning_rate": 3.623443989885462e-05, - "loss": 0.0327, + "epoch": 1.2540540540540541, + "grad_norm": 0.10405503701690619, + "learning_rate": 7.32052481488147e-05, + "loss": 0.0683, "step": 348 }, { - "epoch": 2.2516129032258063, - "grad_norm": 0.12301181296016188, - "learning_rate": 3.565729017066729e-05, - "loss": 0.039, + "epoch": 1.2576576576576577, + "grad_norm": 0.11234589174920957, + "learning_rate": 7.25980111181394e-05, + "loss": 0.0643, "step": 349 }, { - "epoch": 2.258064516129032, - "grad_norm": 0.13814667341858822, - "learning_rate": 3.508377494090521e-05, - "loss": 0.0356, + "epoch": 1.2612612612612613, + "grad_norm": 0.09321884974416474, + "learning_rate": 7.19918645775093e-05, + "loss": 0.0571, "step": 350 }, { - "epoch": 2.264516129032258, - "grad_norm": 0.12413533495298362, - "learning_rate": 3.45139266054715e-05, - "loss": 0.0415, + "epoch": 1.2648648648648648, + "grad_norm": 0.12641606453495435, + "learning_rate": 7.138683264914314e-05, + "loss": 0.0702, "step": 351 }, { - "epoch": 2.270967741935484, - "grad_norm": 0.11031185879435731, - "learning_rate": 3.394777735313919e-05, - "loss": 0.0363, + "epoch": 1.2684684684684684, + "grad_norm": 0.09916971519672783, + "learning_rate": 7.078293941090249e-05, + "loss": 0.0669, "step": 352 }, { - "epoch": 2.2774193548387096, - "grad_norm": 0.11353520726841541, - "learning_rate": 3.338535916373266e-05, - "loss": 0.0376, + "epoch": 1.272072072072072, + "grad_norm": 0.11635429968669815, + "learning_rate": 7.018020889533348e-05, + "loss": 0.071, "step": 353 }, { - "epoch": 2.2838709677419353, - "grad_norm": 0.23000368159173218, - "learning_rate": 3.2826703806321525e-05, - "loss": 0.0444, + "epoch": 1.2756756756756757, + "grad_norm": 0.1634329754196539, + "learning_rate": 6.957866508871068e-05, + "loss": 0.0956, "step": 354 }, { - "epoch": 2.2903225806451615, - "grad_norm": 0.10390022981327145, - "learning_rate": 3.227184283742591e-05, - "loss": 0.0308, + "epoch": 1.2792792792792793, + "grad_norm": 0.10617306865400682, + "learning_rate": 6.897833193008231e-05, + "loss": 0.0601, "step": 355 }, { - "epoch": 2.296774193548387, - "grad_norm": 0.14881153611607173, - "learning_rate": 3.17208075992339e-05, - "loss": 0.0428, + "epoch": 1.2828828828828829, + "grad_norm": 0.09334802201378282, + "learning_rate": 6.83792333103176e-05, + "loss": 0.0633, "step": 356 }, { - "epoch": 2.303225806451613, - "grad_norm": 0.22847373170988944, - "learning_rate": 3.117362921783134e-05, - "loss": 0.0449, + "epoch": 1.2864864864864864, + "grad_norm": 0.09803158330755328, + "learning_rate": 6.77813930711561e-05, + "loss": 0.059, "step": 357 }, { - "epoch": 2.3096774193548386, - "grad_norm": 0.1261660214676178, - "learning_rate": 3.063033860144339e-05, - "loss": 0.0354, + "epoch": 1.29009009009009, + "grad_norm": 0.1431245474215182, + "learning_rate": 6.718483500425867e-05, + "loss": 0.0942, "step": 358 }, { - "epoch": 2.3161290322580643, - "grad_norm": 0.16508383182402034, - "learning_rate": 3.0090966438688772e-05, - "loss": 0.0404, + "epoch": 1.2936936936936938, + "grad_norm": 0.093782557814536, + "learning_rate": 6.658958285026102e-05, + "loss": 0.0606, "step": 359 }, { - "epoch": 2.3225806451612905, - "grad_norm": 0.14003607495149023, - "learning_rate": 2.9555543196846292e-05, - "loss": 0.0417, + "epoch": 1.2972972972972974, + "grad_norm": 0.11393416329015467, + "learning_rate": 6.599566029782863e-05, + "loss": 0.0717, "step": 360 }, { - "epoch": 2.329032258064516, - "grad_norm": 0.14067522061651452, - "learning_rate": 2.9024099120133673e-05, - "loss": 0.0325, + "epoch": 1.300900900900901, + "grad_norm": 0.13923517110417213, + "learning_rate": 6.540309098271416e-05, + "loss": 0.0702, "step": 361 }, { - "epoch": 2.335483870967742, - "grad_norm": 0.17575236098692995, - "learning_rate": 2.8496664227999415e-05, - "loss": 0.0533, + "epoch": 1.3045045045045045, + "grad_norm": 0.11247902438190675, + "learning_rate": 6.48118984868167e-05, + "loss": 0.0671, "step": 362 }, { - "epoch": 2.3419354838709676, - "grad_norm": 0.11116834491408094, - "learning_rate": 2.7973268313426837e-05, - "loss": 0.0347, + "epoch": 1.308108108108108, + "grad_norm": 0.08843201816576483, + "learning_rate": 6.42221063372436e-05, + "loss": 0.0528, "step": 363 }, { - "epoch": 2.3483870967741938, - "grad_norm": 0.16113939577201203, - "learning_rate": 2.745394094125141e-05, - "loss": 0.0472, + "epoch": 1.3117117117117116, + "grad_norm": 0.09716724428964281, + "learning_rate": 6.363373800537387e-05, + "loss": 0.064, "step": 364 }, { - "epoch": 2.3548387096774195, - "grad_norm": 0.10955543396615161, - "learning_rate": 2.6938711446490606e-05, - "loss": 0.0357, + "epoch": 1.3153153153153152, + "grad_norm": 0.15158524613568697, + "learning_rate": 6.304681690592431e-05, + "loss": 0.0704, "step": 365 }, { - "epoch": 2.361290322580645, - "grad_norm": 0.17969798691111394, - "learning_rate": 2.6427608932686843e-05, - "loss": 0.0584, + "epoch": 1.318918918918919, + "grad_norm": 0.16465157701305033, + "learning_rate": 6.246136639601764e-05, + "loss": 0.0834, "step": 366 }, { - "epoch": 2.367741935483871, - "grad_norm": 0.08542932983952008, - "learning_rate": 2.5920662270263653e-05, - "loss": 0.032, + "epoch": 1.3225225225225226, + "grad_norm": 0.12163370539770614, + "learning_rate": 6.187740977425285e-05, + "loss": 0.0734, "step": 367 }, { - "epoch": 2.3741935483870966, - "grad_norm": 0.14296192462496796, - "learning_rate": 2.5417900094894744e-05, - "loss": 0.0463, + "epoch": 1.3261261261261261, + "grad_norm": 0.10264467748684972, + "learning_rate": 6.129497027977829e-05, + "loss": 0.0688, "step": 368 }, { - "epoch": 2.3806451612903228, - "grad_norm": 0.11857064948287055, - "learning_rate": 2.4919350805886577e-05, - "loss": 0.0328, + "epoch": 1.3297297297297297, + "grad_norm": 0.0908877614772114, + "learning_rate": 6.071407109136662e-05, + "loss": 0.056, "step": 369 }, { - "epoch": 2.3870967741935485, - "grad_norm": 0.1614783419773078, - "learning_rate": 2.4425042564574184e-05, - "loss": 0.0398, + "epoch": 1.3333333333333333, + "grad_norm": 0.13375636202891336, + "learning_rate": 6.0134735326492456e-05, + "loss": 0.0775, "step": 370 }, { - "epoch": 2.393548387096774, - "grad_norm": 0.12389155943230613, - "learning_rate": 2.3935003292730296e-05, - "loss": 0.0389, + "epoch": 1.336936936936937, + "grad_norm": 0.11306366286295132, + "learning_rate": 5.955698604041231e-05, + "loss": 0.0609, "step": 371 }, { - "epoch": 2.4, - "grad_norm": 0.11380344182842446, - "learning_rate": 2.344926067098836e-05, - "loss": 0.0412, + "epoch": 1.3405405405405406, + "grad_norm": 0.1273835553046169, + "learning_rate": 5.8980846225247286e-05, + "loss": 0.0653, "step": 372 }, { - "epoch": 2.4064516129032256, - "grad_norm": 0.09336252258103142, - "learning_rate": 2.2967842137278706e-05, - "loss": 0.0311, + "epoch": 1.3441441441441442, + "grad_norm": 0.10437088258018255, + "learning_rate": 5.8406338809067874e-05, + "loss": 0.0639, "step": 373 }, { - "epoch": 2.412903225806452, - "grad_norm": 0.1312469591783424, - "learning_rate": 2.2490774885278908e-05, - "loss": 0.0363, + "epoch": 1.3477477477477477, + "grad_norm": 0.10184725937214306, + "learning_rate": 5.7833486654981606e-05, + "loss": 0.0531, "step": 374 }, { - "epoch": 2.4193548387096775, - "grad_norm": 0.1928829483525128, - "learning_rate": 2.201808586287757e-05, - "loss": 0.0447, + "epoch": 1.3513513513513513, + "grad_norm": 0.1439045913318642, + "learning_rate": 5.726231256022316e-05, + "loss": 0.0756, "step": 375 }, { - "epoch": 2.425806451612903, - "grad_norm": 0.1407497782508695, - "learning_rate": 2.15498017706521e-05, - "loss": 0.0425, + "epoch": 1.3549549549549549, + "grad_norm": 0.1161759648717375, + "learning_rate": 5.669283925524715e-05, + "loss": 0.0564, "step": 376 }, { - "epoch": 2.432258064516129, - "grad_norm": 0.12625194439970722, - "learning_rate": 2.1085949060360654e-05, - "loss": 0.0394, + "epoch": 1.3585585585585584, + "grad_norm": 0.14605792072752058, + "learning_rate": 5.6125089402823485e-05, + "loss": 0.0676, "step": 377 }, { - "epoch": 2.4387096774193546, - "grad_norm": 0.1650711686265322, - "learning_rate": 2.0626553933447734e-05, - "loss": 0.0356, + "epoch": 1.3621621621621622, + "grad_norm": 0.11801235296837954, + "learning_rate": 5.555908559713561e-05, + "loss": 0.0922, "step": 378 }, { - "epoch": 2.445161290322581, - "grad_norm": 0.203794329158885, - "learning_rate": 2.01716423395644e-05, - "loss": 0.046, + "epoch": 1.3657657657657658, + "grad_norm": 0.0986919703756885, + "learning_rate": 5.4994850362881214e-05, + "loss": 0.0624, "step": 379 }, { - "epoch": 2.4516129032258065, - "grad_norm": 0.0983865374506401, - "learning_rate": 1.9721239975102313e-05, - "loss": 0.0383, + "epoch": 1.3693693693693694, + "grad_norm": 0.10287784532358656, + "learning_rate": 5.443240615437586e-05, + "loss": 0.0692, "step": 380 }, { - "epoch": 2.458064516129032, - "grad_norm": 0.11475237817508949, - "learning_rate": 1.9275372281742242e-05, - "loss": 0.0341, + "epoch": 1.372972972972973, + "grad_norm": 0.1340740211739596, + "learning_rate": 5.387177535465945e-05, + "loss": 0.0835, "step": 381 }, { - "epoch": 2.464516129032258, - "grad_norm": 0.11696076741124051, - "learning_rate": 1.8834064445016953e-05, - "loss": 0.0351, + "epoch": 1.3765765765765765, + "grad_norm": 0.138304807951036, + "learning_rate": 5.331298027460539e-05, + "loss": 0.0749, "step": 382 }, { - "epoch": 2.4709677419354836, - "grad_norm": 0.11440255447742229, - "learning_rate": 1.839734139288868e-05, - "loss": 0.0344, + "epoch": 1.3801801801801803, + "grad_norm": 0.11145520456541595, + "learning_rate": 5.275604315203293e-05, + "loss": 0.062, "step": 383 }, { - "epoch": 2.47741935483871, - "grad_norm": 0.12722166866147971, - "learning_rate": 1.7965227794340877e-05, - "loss": 0.0327, + "epoch": 1.3837837837837839, + "grad_norm": 0.09247928908715367, + "learning_rate": 5.2200986150821696e-05, + "loss": 0.0571, "step": 384 }, { - "epoch": 2.4838709677419355, - "grad_norm": 0.11977054711252778, - "learning_rate": 1.753774805798486e-05, - "loss": 0.0374, + "epoch": 1.3873873873873874, + "grad_norm": 0.1101460184702829, + "learning_rate": 5.164783136003027e-05, + "loss": 0.076, "step": 385 }, { - "epoch": 2.490322580645161, - "grad_norm": 0.19041811914083892, - "learning_rate": 1.7114926330680957e-05, - "loss": 0.0448, + "epoch": 1.390990990990991, + "grad_norm": 0.09455088802615286, + "learning_rate": 5.109660079301668e-05, + "loss": 0.0708, "step": 386 }, { - "epoch": 2.496774193548387, - "grad_norm": 0.12908385372034334, - "learning_rate": 1.6696786496174578e-05, - "loss": 0.0421, + "epoch": 1.3945945945945946, + "grad_norm": 0.12915569506877214, + "learning_rate": 5.0547316386562507e-05, + "loss": 0.0683, "step": 387 }, { - "epoch": 2.5032258064516126, - "grad_norm": 0.11345958947234494, - "learning_rate": 1.6283352173747145e-05, - "loss": 0.0337, + "epoch": 1.3981981981981981, + "grad_norm": 0.14298967602118384, + "learning_rate": 5.000000000000002e-05, + "loss": 0.0802, "step": 388 }, { - "epoch": 2.509677419354839, - "grad_norm": 0.13882462080577593, - "learning_rate": 1.587464671688187e-05, - "loss": 0.0399, + "epoch": 1.4018018018018017, + "grad_norm": 0.13870069410611544, + "learning_rate": 4.945467341434195e-05, + "loss": 0.0849, "step": 389 }, { - "epoch": 2.5161290322580645, - "grad_norm": 0.15241110627485502, - "learning_rate": 1.5470693211944643e-05, - "loss": 0.0331, + "epoch": 1.4054054054054055, + "grad_norm": 0.1542562430256231, + "learning_rate": 4.891135833141495e-05, + "loss": 0.0875, "step": 390 }, { - "epoch": 2.52258064516129, - "grad_norm": 0.10975692065298487, - "learning_rate": 1.5071514476879878e-05, - "loss": 0.0379, + "epoch": 1.409009009009009, + "grad_norm": 0.11965550746042822, + "learning_rate": 4.837007637299595e-05, + "loss": 0.0599, "step": 391 }, { - "epoch": 2.5290322580645164, - "grad_norm": 0.1379096296948562, - "learning_rate": 1.4677133059921632e-05, - "loss": 0.0362, + "epoch": 1.4126126126126126, + "grad_norm": 0.13624840902702007, + "learning_rate": 4.783084907995156e-05, + "loss": 0.0805, "step": 392 }, { - "epoch": 2.535483870967742, - "grad_norm": 0.10190107251103887, - "learning_rate": 1.4287571238320053e-05, - "loss": 0.0371, + "epoch": 1.4162162162162162, + "grad_norm": 0.0916114816428032, + "learning_rate": 4.729369791138085e-05, + "loss": 0.0523, "step": 393 }, { - "epoch": 2.541935483870968, - "grad_norm": 0.0875699618751271, - "learning_rate": 1.3902851017082864e-05, - "loss": 0.0335, + "epoch": 1.4198198198198198, + "grad_norm": 0.10008527767691701, + "learning_rate": 4.675864424376146e-05, + "loss": 0.0623, "step": 394 }, { - "epoch": 2.5483870967741935, - "grad_norm": 0.13655405710076232, - "learning_rate": 1.3522994127732414e-05, - "loss": 0.0388, + "epoch": 1.4234234234234235, + "grad_norm": 0.1329413030801314, + "learning_rate": 4.622570937009879e-05, + "loss": 0.0768, "step": 395 }, { - "epoch": 2.554838709677419, - "grad_norm": 0.11073272357568731, - "learning_rate": 1.3148022027078222e-05, - "loss": 0.0366, + "epoch": 1.427027027027027, + "grad_norm": 0.09990109338082896, + "learning_rate": 4.569491449907878e-05, + "loss": 0.0624, "step": 396 }, { - "epoch": 2.5612903225806454, - "grad_norm": 0.11393589351391524, - "learning_rate": 1.2777955896004812e-05, - "loss": 0.0375, + "epoch": 1.4306306306306307, + "grad_norm": 0.10994747606781914, + "learning_rate": 4.5166280754223676e-05, + "loss": 0.0554, "step": 397 }, { - "epoch": 2.567741935483871, - "grad_norm": 0.1294793091961239, - "learning_rate": 1.2412816638275404e-05, - "loss": 0.0447, + "epoch": 1.4342342342342342, + "grad_norm": 0.11716673038318011, + "learning_rate": 4.4639829173051554e-05, + "loss": 0.0487, "step": 398 }, { - "epoch": 2.574193548387097, - "grad_norm": 0.12252957295031998, - "learning_rate": 1.2052624879351104e-05, - "loss": 0.036, + "epoch": 1.4378378378378378, + "grad_norm": 0.14740947528399398, + "learning_rate": 4.411558070623907e-05, + "loss": 0.0705, "step": 399 }, { - "epoch": 2.5806451612903225, - "grad_norm": 0.10701768631052949, - "learning_rate": 1.1697400965225747e-05, - "loss": 0.0375, + "epoch": 1.4414414414414414, + "grad_norm": 0.10905802122912368, + "learning_rate": 4.359355621678764e-05, + "loss": 0.0669, "step": 400 }, { - "epoch": 2.587096774193548, - "grad_norm": 0.1826126683217801, - "learning_rate": 1.134716496127679e-05, - "loss": 0.0398, + "epoch": 1.445045045045045, + "grad_norm": 0.1372304462996132, + "learning_rate": 4.307377647919343e-05, + "loss": 0.0754, "step": 401 }, { - "epoch": 2.5935483870967744, - "grad_norm": 0.12179126729794809, - "learning_rate": 1.1001936651131717e-05, - "loss": 0.0396, + "epoch": 1.4486486486486487, + "grad_norm": 0.1220267603792114, + "learning_rate": 4.255626217862013e-05, + "loss": 0.061, "step": 402 }, { - "epoch": 2.6, - "grad_norm": 0.11972903116479075, - "learning_rate": 1.0661735535550666e-05, - "loss": 0.0336, + "epoch": 1.4522522522522523, + "grad_norm": 0.10857496168656595, + "learning_rate": 4.204103391007623e-05, + "loss": 0.0666, "step": 403 }, { - "epoch": 2.606451612903226, - "grad_norm": 0.09797391900877583, - "learning_rate": 1.0326580831324817e-05, - "loss": 0.0291, + "epoch": 1.4558558558558559, + "grad_norm": 0.12862098370827468, + "learning_rate": 4.152811217759529e-05, + "loss": 0.0702, "step": 404 }, { - "epoch": 2.6129032258064515, - "grad_norm": 0.08807259670414254, - "learning_rate": 9.996491470190917e-06, - "loss": 0.0318, + "epoch": 1.4594594594594594, + "grad_norm": 0.12121566089700932, + "learning_rate": 4.1017517393419826e-05, + "loss": 0.0701, "step": 405 }, { - "epoch": 2.6193548387096772, - "grad_norm": 0.12986706205285126, - "learning_rate": 9.671486097761917e-06, - "loss": 0.0408, + "epoch": 1.463063063063063, + "grad_norm": 0.12880561806718493, + "learning_rate": 4.0509269877189106e-05, + "loss": 0.0855, "step": 406 }, { - "epoch": 2.6258064516129034, - "grad_norm": 0.1331746828106328, - "learning_rate": 9.351583072473713e-06, - "loss": 0.0427, + "epoch": 1.4666666666666668, + "grad_norm": 0.1103245174300563, + "learning_rate": 4.000338985513046e-05, + "loss": 0.0645, "step": 407 }, { - "epoch": 2.632258064516129, - "grad_norm": 0.14912986532560119, - "learning_rate": 9.036800464548157e-06, - "loss": 0.0464, + "epoch": 1.4702702702702704, + "grad_norm": 0.11471559496567096, + "learning_rate": 3.9499897459254375e-05, + "loss": 0.069, "step": 408 }, { - "epoch": 2.638709677419355, - "grad_norm": 0.1440292521357826, - "learning_rate": 8.727156054972374e-06, - "loss": 0.0448, + "epoch": 1.473873873873874, + "grad_norm": 0.1068585326163203, + "learning_rate": 3.899881272655342e-05, + "loss": 0.0584, "step": 409 }, { - "epoch": 2.6451612903225805, - "grad_norm": 0.16571651133316453, - "learning_rate": 8.422667334494249e-06, - "loss": 0.0449, + "epoch": 1.4774774774774775, + "grad_norm": 0.12180507516800447, + "learning_rate": 3.8500155598204644e-05, + "loss": 0.0767, "step": 410 }, { - "epoch": 2.6516129032258062, - "grad_norm": 0.15006924502881824, - "learning_rate": 8.123351502634625e-06, - "loss": 0.0371, + "epoch": 1.481081081081081, + "grad_norm": 0.10787694069789268, + "learning_rate": 3.8003945918776143e-05, + "loss": 0.0652, "step": 411 }, { - "epoch": 2.6580645161290324, - "grad_norm": 0.3766926409227814, - "learning_rate": 7.82922546671555e-06, - "loss": 0.0522, + "epoch": 1.4846846846846846, + "grad_norm": 0.11726705088810202, + "learning_rate": 3.75102034354373e-05, + "loss": 0.0629, "step": 412 }, { - "epoch": 2.664516129032258, - "grad_norm": 0.1693245533307095, - "learning_rate": 7.54030584090537e-06, - "loss": 0.0523, + "epoch": 1.4882882882882882, + "grad_norm": 0.10762739211432558, + "learning_rate": 3.701894779717286e-05, + "loss": 0.0555, "step": 413 }, { - "epoch": 2.670967741935484, - "grad_norm": 0.16933380856734512, - "learning_rate": 7.256608945280319e-06, - "loss": 0.0448, + "epoch": 1.491891891891892, + "grad_norm": 0.09308134521067353, + "learning_rate": 3.653019855400123e-05, + "loss": 0.0564, "step": 414 }, { - "epoch": 2.6774193548387095, - "grad_norm": 0.12372656470631721, - "learning_rate": 6.97815080490245e-06, - "loss": 0.041, + "epoch": 1.4954954954954955, + "grad_norm": 0.1524049565519016, + "learning_rate": 3.6043975156195987e-05, + "loss": 0.0809, "step": 415 }, { - "epoch": 2.6838709677419352, - "grad_norm": 0.1139014768011305, - "learning_rate": 6.704947148914609e-06, - "loss": 0.034, + "epoch": 1.499099099099099, + "grad_norm": 0.13279511071977756, + "learning_rate": 3.5560296953512295e-05, + "loss": 0.069, "step": 416 }, { - "epoch": 2.6903225806451614, - "grad_norm": 0.14509781428626975, - "learning_rate": 6.437013409651849e-06, - "loss": 0.0289, + "epoch": 1.5027027027027027, + "grad_norm": 0.13286192000673344, + "learning_rate": 3.507918319441672e-05, + "loss": 0.0748, "step": 417 }, { - "epoch": 2.696774193548387, - "grad_norm": 0.1403482292705624, - "learning_rate": 6.174364721769743e-06, - "loss": 0.0493, + "epoch": 1.5063063063063065, + "grad_norm": 0.17493415887546465, + "learning_rate": 3.460065302532108e-05, + "loss": 0.0828, "step": 418 }, { - "epoch": 2.703225806451613, - "grad_norm": 0.12152856591733673, - "learning_rate": 5.917015921389568e-06, - "loss": 0.0343, + "epoch": 1.50990990990991, + "grad_norm": 0.11057059837297634, + "learning_rate": 3.4124725489820645e-05, + "loss": 0.063, "step": 419 }, { - "epoch": 2.709677419354839, - "grad_norm": 0.10494471511565585, - "learning_rate": 5.664981545260073e-06, - "loss": 0.03, + "epoch": 1.5135135135135136, + "grad_norm": 0.12735355112497881, + "learning_rate": 3.365141952793622e-05, + "loss": 0.0732, "step": 420 }, { - "epoch": 2.7161290322580647, - "grad_norm": 0.2176951472197835, - "learning_rate": 5.418275829936537e-06, - "loss": 0.0486, + "epoch": 1.5171171171171172, + "grad_norm": 0.1632337270590702, + "learning_rate": 3.3180753975360415e-05, + "loss": 0.0775, "step": 421 }, { - "epoch": 2.7225806451612904, - "grad_norm": 0.09677627092271344, - "learning_rate": 5.176912710976467e-06, - "loss": 0.0318, + "epoch": 1.5207207207207207, + "grad_norm": 0.14697481590979153, + "learning_rate": 3.2712747562708115e-05, + "loss": 0.0863, "step": 422 }, { - "epoch": 2.729032258064516, - "grad_norm": 0.14416553291024004, - "learning_rate": 4.940905822152453e-06, - "loss": 0.0356, + "epoch": 1.5243243243243243, + "grad_norm": 0.1660001616377742, + "learning_rate": 3.224741891477095e-05, + "loss": 0.082, "step": 423 }, { - "epoch": 2.735483870967742, - "grad_norm": 0.10141360651760384, - "learning_rate": 4.710268494682146e-06, - "loss": 0.0346, + "epoch": 1.5279279279279279, + "grad_norm": 0.13048897912831442, + "learning_rate": 3.178478654977624e-05, + "loss": 0.0794, "step": 424 }, { - "epoch": 2.741935483870968, - "grad_norm": 0.13463566674635727, - "learning_rate": 4.485013756475076e-06, - "loss": 0.0443, + "epoch": 1.5315315315315314, + "grad_norm": 0.11489435507746414, + "learning_rate": 3.132486887864992e-05, + "loss": 0.0694, "step": 425 }, { - "epoch": 2.7483870967741937, - "grad_norm": 0.26842778961550184, - "learning_rate": 4.2651543313968145e-06, - "loss": 0.0445, + "epoch": 1.535135135135135, + "grad_norm": 0.07639223925814809, + "learning_rate": 3.086768420428392e-05, + "loss": 0.0413, "step": 426 }, { - "epoch": 2.7548387096774194, - "grad_norm": 0.16014595507517537, - "learning_rate": 4.050702638550275e-06, - "loss": 0.0469, + "epoch": 1.5387387387387388, + "grad_norm": 0.11453408038112112, + "learning_rate": 3.0413250720807883e-05, + "loss": 0.0658, "step": 427 }, { - "epoch": 2.761290322580645, - "grad_norm": 0.2690022963645026, - "learning_rate": 3.841670791574137e-06, - "loss": 0.0423, + "epoch": 1.5423423423423424, + "grad_norm": 0.1667771876584119, + "learning_rate": 2.9961586512864947e-05, + "loss": 0.0798, "step": 428 }, { - "epoch": 2.767741935483871, - "grad_norm": 0.2323526833237967, - "learning_rate": 3.638070597958665e-06, - "loss": 0.035, + "epoch": 1.545945945945946, + "grad_norm": 0.11714414917895159, + "learning_rate": 2.9512709554892003e-05, + "loss": 0.0693, "step": 429 }, { - "epoch": 2.774193548387097, - "grad_norm": 0.11643458974488918, - "learning_rate": 3.4399135583787043e-06, - "loss": 0.0401, + "epoch": 1.5495495495495497, + "grad_norm": 0.1229110888950789, + "learning_rate": 2.9066637710404675e-05, + "loss": 0.0747, "step": 430 }, { - "epoch": 2.7806451612903227, - "grad_norm": 0.12278976372135998, - "learning_rate": 3.2472108660439706e-06, - "loss": 0.04, + "epoch": 1.5531531531531533, + "grad_norm": 0.1759221433787884, + "learning_rate": 2.8623388731286093e-05, + "loss": 0.0795, "step": 431 }, { - "epoch": 2.7870967741935484, - "grad_norm": 0.11369002878299092, - "learning_rate": 3.059973406066963e-06, - "loss": 0.0335, + "epoch": 1.5567567567567568, + "grad_norm": 0.09594983270629338, + "learning_rate": 2.818298025708075e-05, + "loss": 0.059, "step": 432 }, { - "epoch": 2.793548387096774, - "grad_norm": 0.1426764505452139, - "learning_rate": 2.878211754847926e-06, - "loss": 0.0431, + "epoch": 1.5603603603603604, + "grad_norm": 0.090367685537466, + "learning_rate": 2.7745429814292145e-05, + "loss": 0.0531, "step": 433 }, { - "epoch": 2.8, - "grad_norm": 0.17614309627774977, - "learning_rate": 2.7019361794775156e-06, - "loss": 0.0336, + "epoch": 1.563963963963964, + "grad_norm": 0.13195671170326462, + "learning_rate": 2.7310754815685624e-05, + "loss": 0.075, "step": 434 }, { - "epoch": 2.806451612903226, - "grad_norm": 0.13119646329419238, - "learning_rate": 2.5311566371568507e-06, - "loss": 0.0388, + "epoch": 1.5675675675675675, + "grad_norm": 0.09101802024714302, + "learning_rate": 2.687897255959536e-05, + "loss": 0.0541, "step": 435 }, { - "epoch": 2.8129032258064517, - "grad_norm": 0.2274358694692015, - "learning_rate": 2.365882774634998e-06, - "loss": 0.0483, + "epoch": 1.571171171171171, + "grad_norm": 0.12030212380166792, + "learning_rate": 2.6450100229235795e-05, + "loss": 0.0739, "step": 436 }, { - "epoch": 2.8193548387096774, - "grad_norm": 0.16809646480030987, - "learning_rate": 2.206123927664161e-06, - "loss": 0.0428, + "epoch": 1.5747747747747747, + "grad_norm": 0.09282633668248956, + "learning_rate": 2.6024154892017937e-05, + "loss": 0.0585, "step": 437 }, { - "epoch": 2.825806451612903, - "grad_norm": 0.08861174696327136, - "learning_rate": 2.0518891204722168e-06, - "loss": 0.0297, + "epoch": 1.5783783783783782, + "grad_norm": 0.1437367048181832, + "learning_rate": 2.5601153498870134e-05, + "loss": 0.0744, "step": 438 }, { - "epoch": 2.832258064516129, - "grad_norm": 0.1516310249747229, - "learning_rate": 1.903187065253076e-06, - "loss": 0.0486, + "epoch": 1.581981981981982, + "grad_norm": 0.12980503831327814, + "learning_rate": 2.518111288356345e-05, + "loss": 0.0741, "step": 439 }, { - "epoch": 2.838709677419355, - "grad_norm": 0.1154092981393085, - "learning_rate": 1.7600261616745106e-06, - "loss": 0.044, + "epoch": 1.5855855855855856, + "grad_norm": 0.12335416962202174, + "learning_rate": 2.4764049762041874e-05, + "loss": 0.072, "step": 440 }, { - "epoch": 2.8451612903225807, - "grad_norm": 0.14626098217303432, - "learning_rate": 1.6224144964036681e-06, - "loss": 0.045, + "epoch": 1.5891891891891892, + "grad_norm": 0.10492372446642, + "learning_rate": 2.4349980731756894e-05, + "loss": 0.0716, "step": 441 }, { - "epoch": 2.8516129032258064, - "grad_norm": 0.09272906884038755, - "learning_rate": 1.4903598426503241e-06, - "loss": 0.0305, + "epoch": 1.592792792792793, + "grad_norm": 0.14899587708147966, + "learning_rate": 2.3938922271007147e-05, + "loss": 0.0925, "step": 442 }, { - "epoch": 2.858064516129032, - "grad_norm": 0.1688562089326058, - "learning_rate": 1.3638696597277679e-06, - "loss": 0.0402, + "epoch": 1.5963963963963965, + "grad_norm": 0.13084935137363646, + "learning_rate": 2.353089073828255e-05, + "loss": 0.0823, "step": 443 }, { - "epoch": 2.864516129032258, - "grad_norm": 0.11628209549950047, - "learning_rate": 1.2429510926314836e-06, - "loss": 0.0309, + "epoch": 1.6, + "grad_norm": 0.13030296264089844, + "learning_rate": 2.312590237161335e-05, + "loss": 0.0725, "step": 444 }, { - "epoch": 2.870967741935484, - "grad_norm": 0.14018105443781761, - "learning_rate": 1.1276109716355287e-06, - "loss": 0.0466, + "epoch": 1.6036036036036037, + "grad_norm": 0.09700200874554347, + "learning_rate": 2.2723973287923962e-05, + "loss": 0.0664, "step": 445 }, { - "epoch": 2.8774193548387097, - "grad_norm": 0.12354234788520546, - "learning_rate": 1.0178558119067315e-06, - "loss": 0.0315, + "epoch": 1.6072072072072072, + "grad_norm": 0.10633986810502474, + "learning_rate": 2.2325119482391467e-05, + "loss": 0.0679, "step": 446 }, { - "epoch": 2.8838709677419354, - "grad_norm": 0.15472531322652747, - "learning_rate": 9.136918131366412e-07, - "loss": 0.0436, + "epoch": 1.6108108108108108, + "grad_norm": 0.11142544471141212, + "learning_rate": 2.1929356827809057e-05, + "loss": 0.0614, "step": 447 }, { - "epoch": 2.8903225806451616, - "grad_norm": 0.16727260037004013, - "learning_rate": 8.151248591913518e-07, - "loss": 0.0413, + "epoch": 1.6144144144144144, + "grad_norm": 0.09250950825140586, + "learning_rate": 2.1536701073954558e-05, + "loss": 0.0588, "step": 448 }, { - "epoch": 2.896774193548387, - "grad_norm": 0.15358557638143366, - "learning_rate": 7.221605177791691e-07, - "loss": 0.0477, + "epoch": 1.618018018018018, + "grad_norm": 0.1332480809778604, + "learning_rate": 2.1147167846963422e-05, + "loss": 0.0803, "step": 449 }, { - "epoch": 2.903225806451613, - "grad_norm": 0.11899732193116695, - "learning_rate": 6.348040401360833e-07, - "loss": 0.0337, + "epoch": 1.6216216216216215, + "grad_norm": 0.14121311299673173, + "learning_rate": 2.0760772648707016e-05, + "loss": 0.0947, "step": 450 }, { - "epoch": 2.9096774193548387, - "grad_norm": 0.19561410636845064, - "learning_rate": 5.530603607290851e-07, - "loss": 0.0556, + "epoch": 1.6252252252252253, + "grad_norm": 0.10826628920408898, + "learning_rate": 2.037753085617563e-05, + "loss": 0.0704, "step": 451 }, { - "epoch": 2.9161290322580644, - "grad_norm": 0.14981745610583072, - "learning_rate": 4.76934096977566e-07, - "loss": 0.0372, + "epoch": 1.6288288288288288, + "grad_norm": 0.08498951848877008, + "learning_rate": 1.999745772086655e-05, + "loss": 0.0416, "step": 452 }, { - "epoch": 2.9225806451612906, - "grad_norm": 0.1039480982798509, - "learning_rate": 4.0642954899238197e-07, - "loss": 0.0343, + "epoch": 1.6324324324324324, + "grad_norm": 0.15866638731251254, + "learning_rate": 1.9620568368177184e-05, + "loss": 0.0836, "step": 453 }, { - "epoch": 2.9290322580645163, - "grad_norm": 0.11333580055218463, - "learning_rate": 3.415506993330153e-07, - "loss": 0.0388, + "epoch": 1.6360360360360362, + "grad_norm": 0.11879058663162903, + "learning_rate": 1.924687779680302e-05, + "loss": 0.0721, "step": 454 }, { - "epoch": 2.935483870967742, - "grad_norm": 0.10481858385197364, - "learning_rate": 2.8230121278257637e-07, - "loss": 0.0313, + "epoch": 1.6396396396396398, + "grad_norm": 0.16212624210699175, + "learning_rate": 1.8876400878140775e-05, + "loss": 0.0788, "step": 455 }, { - "epoch": 2.9419354838709677, - "grad_norm": 0.15064675470229316, - "learning_rate": 2.2868443614082469e-07, - "loss": 0.0449, + "epoch": 1.6432432432432433, + "grad_norm": 0.1205599470750737, + "learning_rate": 1.8509152355696623e-05, + "loss": 0.0891, "step": 456 }, { - "epoch": 2.9483870967741934, - "grad_norm": 0.13918413557689208, - "learning_rate": 1.8070339803509807e-07, - "loss": 0.0414, + "epoch": 1.646846846846847, + "grad_norm": 0.10723522883909493, + "learning_rate": 1.8145146844499383e-05, + "loss": 0.0702, "step": 457 }, { - "epoch": 2.9548387096774196, - "grad_norm": 0.15721864030858848, - "learning_rate": 1.3836080874926049e-07, - "loss": 0.0342, + "epoch": 1.6504504504504505, + "grad_norm": 0.10243191710751155, + "learning_rate": 1.7784398830519e-05, + "loss": 0.0558, "step": 458 }, { - "epoch": 2.9612903225806453, - "grad_norm": 0.15411456561887094, - "learning_rate": 1.0165906007056914e-07, - "loss": 0.0418, + "epoch": 1.654054054054054, + "grad_norm": 0.08896982101996669, + "learning_rate": 1.742692267008996e-05, + "loss": 0.0603, "step": 459 }, { - "epoch": 2.967741935483871, - "grad_norm": 0.1513944041768792, - "learning_rate": 7.060022515460451e-08, - "loss": 0.0364, + "epoch": 1.6576576576576576, + "grad_norm": 0.13618088732046185, + "learning_rate": 1.7072732589339955e-05, + "loss": 0.0744, "step": 460 }, { - "epoch": 2.9741935483870967, - "grad_norm": 0.22512284396579382, - "learning_rate": 4.518605840815315e-08, - "loss": 0.0447, + "epoch": 1.6612612612612612, + "grad_norm": 0.14798363521421304, + "learning_rate": 1.672184268362391e-05, + "loss": 0.0875, "step": 461 }, { - "epoch": 2.9806451612903224, - "grad_norm": 0.1529295717768871, - "learning_rate": 2.5417995390086824e-08, - "loss": 0.0449, + "epoch": 1.6648648648648647, + "grad_norm": 0.10230547180028401, + "learning_rate": 1.6374266916962832e-05, + "loss": 0.0497, "step": 462 }, { - "epoch": 2.9870967741935486, - "grad_norm": 0.13942712092352366, - "learning_rate": 1.129715273033849e-08, - "loss": 0.0367, + "epoch": 1.6684684684684683, + "grad_norm": 0.10092039325506573, + "learning_rate": 1.6030019121488227e-05, + "loss": 0.0611, "step": 463 }, { - "epoch": 2.9935483870967743, - "grad_norm": 0.13569899513050301, - "learning_rate": 2.824328066730608e-09, - "loss": 0.0398, + "epoch": 1.672072072072072, + "grad_norm": 0.10528958039623297, + "learning_rate": 1.5689112996891576e-05, + "loss": 0.0719, "step": 464 }, { - "epoch": 3.0, - "grad_norm": 0.11004047385534296, - "learning_rate": 0.0, - "loss": 0.0305, + "epoch": 1.6756756756756757, + "grad_norm": 0.1357872146845403, + "learning_rate": 1.535156210987917e-05, + "loss": 0.0792, "step": 465 }, { - "epoch": 3.0, - "eval_loss": 0.2107405662536621, - "eval_runtime": 25.5157, - "eval_samples_per_second": 5.134, - "eval_steps_per_second": 0.666, - "step": 465 + "epoch": 1.6792792792792792, + "grad_norm": 0.16092347024456755, + "learning_rate": 1.5017379893632255e-05, + "loss": 0.0984, + "step": 466 + }, + { + "epoch": 1.682882882882883, + "grad_norm": 0.152681018425233, + "learning_rate": 1.4686579647272336e-05, + "loss": 0.0665, + "step": 467 + }, + { + "epoch": 1.6864864864864866, + "grad_norm": 0.10940364333158531, + "learning_rate": 1.4359174535331999e-05, + "loss": 0.0678, + "step": 468 + }, + { + "epoch": 1.6900900900900901, + "grad_norm": 0.10224571124448578, + "learning_rate": 1.4035177587230996e-05, + "loss": 0.0681, + "step": 469 + }, + { + "epoch": 1.6936936936936937, + "grad_norm": 0.1117508851307017, + "learning_rate": 1.3714601696757712e-05, + "loss": 0.0705, + "step": 470 + }, + { + "epoch": 1.6972972972972973, + "grad_norm": 0.11380708386619733, + "learning_rate": 1.339745962155613e-05, + "loss": 0.0787, + "step": 471 + }, + { + "epoch": 1.7009009009009008, + "grad_norm": 0.13014691697383945, + "learning_rate": 1.3083763982618025e-05, + "loss": 0.0746, + "step": 472 + }, + { + "epoch": 1.7045045045045044, + "grad_norm": 0.14178781462677711, + "learning_rate": 1.2773527263780626e-05, + "loss": 0.0802, + "step": 473 + }, + { + "epoch": 1.708108108108108, + "grad_norm": 0.13903277880623008, + "learning_rate": 1.2466761811230098e-05, + "loss": 0.0727, + "step": 474 + }, + { + "epoch": 1.7117117117117115, + "grad_norm": 0.14354542440047877, + "learning_rate": 1.2163479833009894e-05, + "loss": 0.0698, + "step": 475 + }, + { + "epoch": 1.7153153153153153, + "grad_norm": 0.12299158086851347, + "learning_rate": 1.1863693398535114e-05, + "loss": 0.0659, + "step": 476 + }, + { + "epoch": 1.718918918918919, + "grad_norm": 0.09882489616646957, + "learning_rate": 1.1567414438112156e-05, + "loss": 0.0626, + "step": 477 + }, + { + "epoch": 1.7225225225225225, + "grad_norm": 0.11570669617801674, + "learning_rate": 1.1274654742463841e-05, + "loss": 0.0646, + "step": 478 + }, + { + "epoch": 1.7261261261261263, + "grad_norm": 0.09504741769642676, + "learning_rate": 1.0985425962260343e-05, + "loss": 0.0587, + "step": 479 + }, + { + "epoch": 1.7297297297297298, + "grad_norm": 0.12981804877027894, + "learning_rate": 1.0699739607655435e-05, + "loss": 0.0646, + "step": 480 + }, + { + "epoch": 1.7333333333333334, + "grad_norm": 0.10868480851483209, + "learning_rate": 1.0417607047828426e-05, + "loss": 0.0671, + "step": 481 + }, + { + "epoch": 1.736936936936937, + "grad_norm": 0.13696903097667856, + "learning_rate": 1.01390395105318e-05, + "loss": 0.0786, + "step": 482 + }, + { + "epoch": 1.7405405405405405, + "grad_norm": 0.1157828731078967, + "learning_rate": 9.864048081644261e-06, + "loss": 0.0714, + "step": 483 + }, + { + "epoch": 1.744144144144144, + "grad_norm": 0.08900480095300936, + "learning_rate": 9.592643704729753e-06, + "loss": 0.0544, + "step": 484 + }, + { + "epoch": 1.7477477477477477, + "grad_norm": 0.10882498606926169, + "learning_rate": 9.324837180601741e-06, + "loss": 0.0645, + "step": 485 + }, + { + "epoch": 1.7513513513513512, + "grad_norm": 0.1879024889958637, + "learning_rate": 9.060639166893493e-06, + "loss": 0.0682, + "step": 486 + }, + { + "epoch": 1.7549549549549548, + "grad_norm": 0.1072698067292599, + "learning_rate": 8.80006017763395e-06, + "loss": 0.0558, + "step": 487 + }, + { + "epoch": 1.7585585585585586, + "grad_norm": 0.1177990807822665, + "learning_rate": 8.543110582829272e-06, + "loss": 0.0592, + "step": 488 + }, + { + "epoch": 1.7621621621621621, + "grad_norm": 0.1144052781708329, + "learning_rate": 8.289800608050202e-06, + "loss": 0.0685, + "step": 489 + }, + { + "epoch": 1.7657657657657657, + "grad_norm": 0.13290723111377578, + "learning_rate": 8.040140334025082e-06, + "loss": 0.0787, + "step": 490 + }, + { + "epoch": 1.7693693693693695, + "grad_norm": 0.14325581569648474, + "learning_rate": 7.794139696238645e-06, + "loss": 0.0767, + "step": 491 + }, + { + "epoch": 1.772972972972973, + "grad_norm": 0.13940373462189817, + "learning_rate": 7.551808484536782e-06, + "loss": 0.0713, + "step": 492 + }, + { + "epoch": 1.7765765765765766, + "grad_norm": 0.13465478912453277, + "learning_rate": 7.313156342736738e-06, + "loss": 0.0838, + "step": 493 + }, + { + "epoch": 1.7801801801801802, + "grad_norm": 0.10122791673864097, + "learning_rate": 7.078192768243486e-06, + "loss": 0.0577, + "step": 494 + }, + { + "epoch": 1.7837837837837838, + "grad_norm": 0.1571652297573745, + "learning_rate": 6.846927111671686e-06, + "loss": 0.0905, + "step": 495 + }, + { + "epoch": 1.7873873873873873, + "grad_norm": 0.12718467808306833, + "learning_rate": 6.61936857647355e-06, + "loss": 0.0566, + "step": 496 + }, + { + "epoch": 1.790990990990991, + "grad_norm": 0.15255418113431185, + "learning_rate": 6.395526218572723e-06, + "loss": 0.0646, + "step": 497 + }, + { + "epoch": 1.7945945945945945, + "grad_norm": 0.13721542773233897, + "learning_rate": 6.175408946003703e-06, + "loss": 0.0752, + "step": 498 + }, + { + "epoch": 1.798198198198198, + "grad_norm": 0.11503986327868332, + "learning_rate": 5.959025518557437e-06, + "loss": 0.0753, + "step": 499 + }, + { + "epoch": 1.8018018018018018, + "grad_norm": 0.09966839333280018, + "learning_rate": 5.746384547432737e-06, + "loss": 0.0658, + "step": 500 + }, + { + "epoch": 1.8054054054054054, + "grad_norm": 0.13097929394232063, + "learning_rate": 5.5374944948935135e-06, + "loss": 0.0647, + "step": 501 + }, + { + "epoch": 1.809009009009009, + "grad_norm": 0.11916565098885482, + "learning_rate": 5.332363673932106e-06, + "loss": 0.0683, + "step": 502 + }, + { + "epoch": 1.8126126126126128, + "grad_norm": 0.14740354567299807, + "learning_rate": 5.131000247938367e-06, + "loss": 0.0855, + "step": 503 + }, + { + "epoch": 1.8162162162162163, + "grad_norm": 0.12344512852482942, + "learning_rate": 4.933412230374812e-06, + "loss": 0.0689, + "step": 504 + }, + { + "epoch": 1.8198198198198199, + "grad_norm": 0.15079351397270196, + "learning_rate": 4.7396074844577975e-06, + "loss": 0.0865, + "step": 505 + }, + { + "epoch": 1.8234234234234235, + "grad_norm": 0.13387613481208308, + "learning_rate": 4.549593722844492e-06, + "loss": 0.0761, + "step": 506 + }, + { + "epoch": 1.827027027027027, + "grad_norm": 0.07611010028345803, + "learning_rate": 4.363378507325955e-06, + "loss": 0.0431, + "step": 507 + }, + { + "epoch": 1.8306306306306306, + "grad_norm": 0.13798397287470085, + "learning_rate": 4.180969248526334e-06, + "loss": 0.0806, + "step": 508 + }, + { + "epoch": 1.8342342342342342, + "grad_norm": 0.17673529491591075, + "learning_rate": 4.002373205607723e-06, + "loss": 0.0974, + "step": 509 + }, + { + "epoch": 1.8378378378378377, + "grad_norm": 0.09469787111672756, + "learning_rate": 3.827597485981527e-06, + "loss": 0.0592, + "step": 510 + }, + { + "epoch": 1.8414414414414413, + "grad_norm": 0.10244301409759372, + "learning_rate": 3.6566490450254286e-06, + "loss": 0.0629, + "step": 511 + }, + { + "epoch": 1.845045045045045, + "grad_norm": 0.09239653678543874, + "learning_rate": 3.4895346858066724e-06, + "loss": 0.0637, + "step": 512 + }, + { + "epoch": 1.8486486486486486, + "grad_norm": 0.136056185401784, + "learning_rate": 3.3262610588113307e-06, + "loss": 0.0784, + "step": 513 + }, + { + "epoch": 1.8522522522522522, + "grad_norm": 0.12238600483310333, + "learning_rate": 3.1668346616795963e-06, + "loss": 0.0703, + "step": 514 + }, + { + "epoch": 1.855855855855856, + "grad_norm": 0.15777844339693373, + "learning_rate": 3.011261838947277e-06, + "loss": 0.0861, + "step": 515 + }, + { + "epoch": 1.8594594594594596, + "grad_norm": 0.149256801647933, + "learning_rate": 2.859548781793242e-06, + "loss": 0.0816, + "step": 516 + }, + { + "epoch": 1.8630630630630631, + "grad_norm": 0.1158625228168308, + "learning_rate": 2.711701527793031e-06, + "loss": 0.0757, + "step": 517 + }, + { + "epoch": 1.8666666666666667, + "grad_norm": 0.127467190710186, + "learning_rate": 2.5677259606786684e-06, + "loss": 0.0718, + "step": 518 + }, + { + "epoch": 1.8702702702702703, + "grad_norm": 0.11178985379767997, + "learning_rate": 2.4276278101044046e-06, + "loss": 0.0729, + "step": 519 + }, + { + "epoch": 1.8738738738738738, + "grad_norm": 0.13318873038147186, + "learning_rate": 2.291412651418778e-06, + "loss": 0.0874, + "step": 520 + }, + { + "epoch": 1.8774774774774774, + "grad_norm": 0.11095794613520046, + "learning_rate": 2.159085905442737e-06, + "loss": 0.0632, + "step": 521 + }, + { + "epoch": 1.881081081081081, + "grad_norm": 0.13466988936312552, + "learning_rate": 2.03065283825381e-06, + "loss": 0.0696, + "step": 522 + }, + { + "epoch": 1.8846846846846845, + "grad_norm": 0.1501784745699571, + "learning_rate": 1.9061185609766995e-06, + "loss": 0.0781, + "step": 523 + }, + { + "epoch": 1.8882882882882883, + "grad_norm": 0.12439813775960844, + "learning_rate": 1.7854880295797405e-06, + "loss": 0.0778, + "step": 524 + }, + { + "epoch": 1.8918918918918919, + "grad_norm": 0.10543026463333441, + "learning_rate": 1.6687660446777277e-06, + "loss": 0.0658, + "step": 525 + }, + { + "epoch": 1.8954954954954955, + "grad_norm": 0.1493387910647338, + "learning_rate": 1.5559572513409338e-06, + "loss": 0.0733, + "step": 526 + }, + { + "epoch": 1.8990990990990992, + "grad_norm": 0.11530728589499968, + "learning_rate": 1.4470661389100804e-06, + "loss": 0.066, + "step": 527 + }, + { + "epoch": 1.9027027027027028, + "grad_norm": 0.0937287880770091, + "learning_rate": 1.3420970408178913e-06, + "loss": 0.0508, + "step": 528 + }, + { + "epoch": 1.9063063063063064, + "grad_norm": 0.1140049075150384, + "learning_rate": 1.241054134416464e-06, + "loss": 0.0722, + "step": 529 + }, + { + "epoch": 1.90990990990991, + "grad_norm": 0.10812439212587106, + "learning_rate": 1.143941440811147e-06, + "loss": 0.0607, + "step": 530 + }, + { + "epoch": 1.9135135135135135, + "grad_norm": 0.10082812889823659, + "learning_rate": 1.0507628247004465e-06, + "loss": 0.0611, + "step": 531 + }, + { + "epoch": 1.917117117117117, + "grad_norm": 0.1646769037018901, + "learning_rate": 9.615219942222474e-07, + "loss": 0.0907, + "step": 532 + }, + { + "epoch": 1.9207207207207206, + "grad_norm": 0.13342387053057897, + "learning_rate": 8.762225008062674e-07, + "loss": 0.0705, + "step": 533 + }, + { + "epoch": 1.9243243243243242, + "grad_norm": 0.10099025782756708, + "learning_rate": 7.948677390326786e-07, + "loss": 0.0594, + "step": 534 + }, + { + "epoch": 1.9279279279279278, + "grad_norm": 0.10984097417427825, + "learning_rate": 7.174609464970505e-07, + "loss": 0.064, + "step": 535 + }, + { + "epoch": 1.9315315315315316, + "grad_norm": 0.09250058769796181, + "learning_rate": 6.440052036815081e-07, + "loss": 0.0488, + "step": 536 + }, + { + "epoch": 1.9351351351351351, + "grad_norm": 0.11346138285376553, + "learning_rate": 5.745034338321187e-07, + "loss": 0.0709, + "step": 537 + }, + { + "epoch": 1.9387387387387387, + "grad_norm": 0.12145882785931153, + "learning_rate": 5.089584028425743e-07, + "loss": 0.0628, + "step": 538 + }, + { + "epoch": 1.9423423423423425, + "grad_norm": 0.09764192386128882, + "learning_rate": 4.4737271914411236e-07, + "loss": 0.0581, + "step": 539 + }, + { + "epoch": 1.945945945945946, + "grad_norm": 0.11336911485782475, + "learning_rate": 3.8974883360169966e-07, + "loss": 0.0652, + "step": 540 + }, + { + "epoch": 1.9495495495495496, + "grad_norm": 0.1197688852133187, + "learning_rate": 3.360890394165539e-07, + "loss": 0.0797, + "step": 541 + }, + { + "epoch": 1.9531531531531532, + "grad_norm": 0.12068524258623581, + "learning_rate": 2.86395472034795e-07, + "loss": 0.0796, + "step": 542 + }, + { + "epoch": 1.9567567567567568, + "grad_norm": 0.10022614951335301, + "learning_rate": 2.4067010906254626e-07, + "loss": 0.0651, + "step": 543 + }, + { + "epoch": 1.9603603603603603, + "grad_norm": 0.1030646452744324, + "learning_rate": 1.989147701871641e-07, + "loss": 0.0536, + "step": 544 + }, + { + "epoch": 1.9639639639639639, + "grad_norm": 0.07714530191454869, + "learning_rate": 1.611311171048735e-07, + "loss": 0.0494, + "step": 545 + }, + { + "epoch": 1.9675675675675675, + "grad_norm": 0.1157905922358179, + "learning_rate": 1.2732065345462118e-07, + "loss": 0.075, + "step": 546 + }, + { + "epoch": 1.971171171171171, + "grad_norm": 0.07486976400470267, + "learning_rate": 9.748472475823444e-08, + "loss": 0.0454, + "step": 547 + }, + { + "epoch": 1.9747747747747748, + "grad_norm": 0.09262953733370366, + "learning_rate": 7.162451836685291e-08, + "loss": 0.0556, + "step": 548 + }, + { + "epoch": 1.9783783783783784, + "grad_norm": 0.11616787129919331, + "learning_rate": 4.974106341374407e-08, + "loss": 0.0767, + "step": 549 + }, + { + "epoch": 1.981981981981982, + "grad_norm": 0.11058833058189024, + "learning_rate": 3.183523077324724e-08, + "loss": 0.0618, + "step": 550 + }, + { + "epoch": 1.9855855855855857, + "grad_norm": 0.10561626473162405, + "learning_rate": 1.7907733026223394e-08, + "loss": 0.0719, + "step": 551 + }, + { + "epoch": 1.9891891891891893, + "grad_norm": 0.1403058364172833, + "learning_rate": 7.959124431622389e-09, + "loss": 0.092, + "step": 552 + }, + { + "epoch": 1.9927927927927929, + "grad_norm": 0.1326365804299446, + "learning_rate": 1.989800904445005e-09, + "loss": 0.0678, + "step": 553 + }, + { + "epoch": 1.9963963963963964, + "grad_norm": 0.14065192361832085, + "learning_rate": 0.0, + "loss": 0.0822, + "step": 554 + }, + { + "epoch": 1.9963963963963964, + "eval_loss": 0.13327383995056152, + "eval_runtime": 50.2794, + "eval_samples_per_second": 4.654, + "eval_steps_per_second": 0.597, + "step": 554 }, { - "epoch": 3.0, - "step": 465, - "total_flos": 261374226563072.0, - "train_loss": 0.09038178783751304, - "train_runtime": 3542.1658, - "train_samples_per_second": 2.097, - "train_steps_per_second": 0.131 + "epoch": 1.9963963963963964, + "step": 554, + "total_flos": 450799970877440.0, + "train_loss": 0.0942714535009237, + "train_runtime": 4963.7264, + "train_samples_per_second": 1.787, + "train_steps_per_second": 0.112 } ], "logging_steps": 1, - "max_steps": 465, + "max_steps": 554, "num_input_tokens_seen": 0, - "num_train_epochs": 3, + "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { @@ -3314,7 +3929,7 @@ "attributes": {} } }, - "total_flos": 261374226563072.0, + "total_flos": 450799970877440.0, "train_batch_size": 2, "trial_name": null, "trial_params": null