{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 465, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0064516129032258064, "grad_norm": 1.8496088327765188, "learning_rate": 4.255319148936171e-06, "loss": 0.9425, "step": 1 }, { "epoch": 0.012903225806451613, "grad_norm": 1.464006720275199, "learning_rate": 8.510638297872341e-06, "loss": 0.6748, "step": 2 }, { "epoch": 0.01935483870967742, "grad_norm": 1.195864584909173, "learning_rate": 1.2765957446808511e-05, "loss": 0.5742, "step": 3 }, { "epoch": 0.025806451612903226, "grad_norm": 1.4795865650630229, "learning_rate": 1.7021276595744682e-05, "loss": 0.7095, "step": 4 }, { "epoch": 0.03225806451612903, "grad_norm": 1.3747220363223371, "learning_rate": 2.1276595744680852e-05, "loss": 0.6648, "step": 5 }, { "epoch": 0.03870967741935484, "grad_norm": 1.301406005018193, "learning_rate": 2.5531914893617022e-05, "loss": 0.6255, "step": 6 }, { "epoch": 0.04516129032258064, "grad_norm": 1.0187483254912535, "learning_rate": 2.9787234042553192e-05, "loss": 0.4435, "step": 7 }, { "epoch": 0.05161290322580645, "grad_norm": 1.0700513081990584, "learning_rate": 3.4042553191489365e-05, "loss": 0.4263, "step": 8 }, { "epoch": 0.05806451612903226, "grad_norm": 0.7712554505163577, "learning_rate": 3.829787234042553e-05, "loss": 0.2511, "step": 9 }, { "epoch": 0.06451612903225806, "grad_norm": 0.7232887438916186, "learning_rate": 4.2553191489361704e-05, "loss": 0.1781, "step": 10 }, { "epoch": 0.07096774193548387, "grad_norm": 0.8652521068380956, "learning_rate": 4.680851063829788e-05, "loss": 0.1926, "step": 11 }, { "epoch": 0.07741935483870968, "grad_norm": 0.5403017280043989, "learning_rate": 5.1063829787234044e-05, "loss": 0.1125, "step": 12 }, { "epoch": 0.08387096774193549, "grad_norm": 0.7070226880901681, "learning_rate": 5.531914893617022e-05, "loss": 0.101, "step": 13 }, { "epoch": 0.09032258064516129, "grad_norm": 0.2650073598784149, "learning_rate": 5.9574468085106384e-05, "loss": 0.1058, "step": 14 }, { "epoch": 0.0967741935483871, "grad_norm": 0.131557187963195, "learning_rate": 6.382978723404256e-05, "loss": 0.1113, "step": 15 }, { "epoch": 0.1032258064516129, "grad_norm": 0.09751111122397599, "learning_rate": 6.808510638297873e-05, "loss": 0.0834, "step": 16 }, { "epoch": 0.10967741935483871, "grad_norm": 0.10433139009644296, "learning_rate": 7.23404255319149e-05, "loss": 0.0961, "step": 17 }, { "epoch": 0.11612903225806452, "grad_norm": 0.1020133297592548, "learning_rate": 7.659574468085106e-05, "loss": 0.097, "step": 18 }, { "epoch": 0.12258064516129032, "grad_norm": 0.07145294843882412, "learning_rate": 8.085106382978723e-05, "loss": 0.0916, "step": 19 }, { "epoch": 0.12903225806451613, "grad_norm": 0.08981924516933591, "learning_rate": 8.510638297872341e-05, "loss": 0.101, "step": 20 }, { "epoch": 0.13548387096774195, "grad_norm": 0.0681524232238508, "learning_rate": 8.936170212765958e-05, "loss": 0.0736, "step": 21 }, { "epoch": 0.14193548387096774, "grad_norm": 0.0981002108730681, "learning_rate": 9.361702127659576e-05, "loss": 0.1091, "step": 22 }, { "epoch": 0.14838709677419354, "grad_norm": 0.11001940230271333, "learning_rate": 9.787234042553192e-05, "loss": 0.0991, "step": 23 }, { "epoch": 0.15483870967741936, "grad_norm": 0.06614356059405047, "learning_rate": 0.00010212765957446809, "loss": 0.0591, "step": 24 }, { "epoch": 0.16129032258064516, "grad_norm": 0.05937801367835722, "learning_rate": 0.00010638297872340425, "loss": 0.0633, "step": 25 }, { "epoch": 0.16774193548387098, "grad_norm": 0.07592110532595596, "learning_rate": 0.00011063829787234043, "loss": 0.095, "step": 26 }, { "epoch": 0.17419354838709677, "grad_norm": 0.07287278501663387, "learning_rate": 0.00011489361702127661, "loss": 0.09, "step": 27 }, { "epoch": 0.18064516129032257, "grad_norm": 0.07270501072864007, "learning_rate": 0.00011914893617021277, "loss": 0.0764, "step": 28 }, { "epoch": 0.1870967741935484, "grad_norm": 0.08202627980248832, "learning_rate": 0.00012340425531914893, "loss": 0.0841, "step": 29 }, { "epoch": 0.1935483870967742, "grad_norm": 0.10309403095606792, "learning_rate": 0.00012765957446808513, "loss": 0.0989, "step": 30 }, { "epoch": 0.2, "grad_norm": 0.08068773698268775, "learning_rate": 0.00013191489361702127, "loss": 0.0844, "step": 31 }, { "epoch": 0.2064516129032258, "grad_norm": 0.06817985516792972, "learning_rate": 0.00013617021276595746, "loss": 0.0793, "step": 32 }, { "epoch": 0.2129032258064516, "grad_norm": 0.06656651609987188, "learning_rate": 0.00014042553191489363, "loss": 0.0666, "step": 33 }, { "epoch": 0.21935483870967742, "grad_norm": 0.09101885574528384, "learning_rate": 0.0001446808510638298, "loss": 0.0856, "step": 34 }, { "epoch": 0.22580645161290322, "grad_norm": 0.11244141382637182, "learning_rate": 0.00014893617021276596, "loss": 0.1076, "step": 35 }, { "epoch": 0.23225806451612904, "grad_norm": 0.07641687845159767, "learning_rate": 0.00015319148936170213, "loss": 0.0678, "step": 36 }, { "epoch": 0.23870967741935484, "grad_norm": 0.08682080497813398, "learning_rate": 0.00015744680851063832, "loss": 0.0859, "step": 37 }, { "epoch": 0.24516129032258063, "grad_norm": 0.07050044811298162, "learning_rate": 0.00016170212765957446, "loss": 0.0746, "step": 38 }, { "epoch": 0.25161290322580643, "grad_norm": 0.10418569203255569, "learning_rate": 0.00016595744680851065, "loss": 0.1028, "step": 39 }, { "epoch": 0.25806451612903225, "grad_norm": 0.14211262039099226, "learning_rate": 0.00017021276595744682, "loss": 0.101, "step": 40 }, { "epoch": 0.2645161290322581, "grad_norm": 0.065192138802457, "learning_rate": 0.00017446808510638298, "loss": 0.0591, "step": 41 }, { "epoch": 0.2709677419354839, "grad_norm": 0.08691739931276934, "learning_rate": 0.00017872340425531915, "loss": 0.0816, "step": 42 }, { "epoch": 0.27741935483870966, "grad_norm": 0.09038372940039405, "learning_rate": 0.00018297872340425532, "loss": 0.0923, "step": 43 }, { "epoch": 0.2838709677419355, "grad_norm": 0.07006837751282566, "learning_rate": 0.0001872340425531915, "loss": 0.0781, "step": 44 }, { "epoch": 0.2903225806451613, "grad_norm": 0.08875720051203487, "learning_rate": 0.00019148936170212768, "loss": 0.0672, "step": 45 }, { "epoch": 0.2967741935483871, "grad_norm": 0.07220322606120361, "learning_rate": 0.00019574468085106384, "loss": 0.0853, "step": 46 }, { "epoch": 0.3032258064516129, "grad_norm": 0.07992244290240197, "learning_rate": 0.0002, "loss": 0.0846, "step": 47 }, { "epoch": 0.3096774193548387, "grad_norm": 0.09917015506018256, "learning_rate": 0.0001999971756719333, "loss": 0.0934, "step": 48 }, { "epoch": 0.3161290322580645, "grad_norm": 0.16671519547672903, "learning_rate": 0.00019998870284726968, "loss": 0.1228, "step": 49 }, { "epoch": 0.3225806451612903, "grad_norm": 0.09107913287594624, "learning_rate": 0.00019997458200460993, "loss": 0.0847, "step": 50 }, { "epoch": 0.32903225806451614, "grad_norm": 0.13795927374096006, "learning_rate": 0.00019995481394159188, "loss": 0.0904, "step": 51 }, { "epoch": 0.33548387096774196, "grad_norm": 0.11415771239086442, "learning_rate": 0.0001999293997748454, "loss": 0.0782, "step": 52 }, { "epoch": 0.3419354838709677, "grad_norm": 0.08413093725717319, "learning_rate": 0.00019989834093992945, "loss": 0.0818, "step": 53 }, { "epoch": 0.34838709677419355, "grad_norm": 0.06942673059881972, "learning_rate": 0.00019986163919125075, "loss": 0.0746, "step": 54 }, { "epoch": 0.3548387096774194, "grad_norm": 0.06779490919317474, "learning_rate": 0.00019981929660196492, "loss": 0.0704, "step": 55 }, { "epoch": 0.36129032258064514, "grad_norm": 0.0705554071643389, "learning_rate": 0.0001997713155638592, "loss": 0.0718, "step": 56 }, { "epoch": 0.36774193548387096, "grad_norm": 0.06094574022360802, "learning_rate": 0.00019971769878721743, "loss": 0.0615, "step": 57 }, { "epoch": 0.3741935483870968, "grad_norm": 0.08174462699234748, "learning_rate": 0.000199658449300667, "loss": 0.0772, "step": 58 }, { "epoch": 0.38064516129032255, "grad_norm": 0.15240897489693175, "learning_rate": 0.00019959357045100764, "loss": 0.1136, "step": 59 }, { "epoch": 0.3870967741935484, "grad_norm": 0.12657819665542305, "learning_rate": 0.00019952306590302247, "loss": 0.1066, "step": 60 }, { "epoch": 0.3935483870967742, "grad_norm": 0.06847012277175818, "learning_rate": 0.00019944693963927092, "loss": 0.0809, "step": 61 }, { "epoch": 0.4, "grad_norm": 0.06995242717454851, "learning_rate": 0.00019936519595986394, "loss": 0.0732, "step": 62 }, { "epoch": 0.4064516129032258, "grad_norm": 0.08400016052467398, "learning_rate": 0.00019927783948222084, "loss": 0.0838, "step": 63 }, { "epoch": 0.4129032258064516, "grad_norm": 0.06773211017815356, "learning_rate": 0.00019918487514080865, "loss": 0.0786, "step": 64 }, { "epoch": 0.41935483870967744, "grad_norm": 0.05982739234697708, "learning_rate": 0.00019908630818686338, "loss": 0.0643, "step": 65 }, { "epoch": 0.4258064516129032, "grad_norm": 0.07075319425828745, "learning_rate": 0.0001989821441880933, "loss": 0.0765, "step": 66 }, { "epoch": 0.432258064516129, "grad_norm": 0.08372038666535586, "learning_rate": 0.00019887238902836448, "loss": 0.0669, "step": 67 }, { "epoch": 0.43870967741935485, "grad_norm": 0.13201060477850413, "learning_rate": 0.00019875704890736853, "loss": 0.1129, "step": 68 }, { "epoch": 0.44516129032258067, "grad_norm": 0.15774619872849757, "learning_rate": 0.00019863613034027224, "loss": 0.1031, "step": 69 }, { "epoch": 0.45161290322580644, "grad_norm": 0.09239020494576224, "learning_rate": 0.0001985096401573497, "loss": 0.0827, "step": 70 }, { "epoch": 0.45806451612903226, "grad_norm": 0.08710163284471478, "learning_rate": 0.00019837758550359636, "loss": 0.0898, "step": 71 }, { "epoch": 0.4645161290322581, "grad_norm": 0.09970425528976597, "learning_rate": 0.0001982399738383255, "loss": 0.0671, "step": 72 }, { "epoch": 0.47096774193548385, "grad_norm": 0.11084567093271626, "learning_rate": 0.00019809681293474693, "loss": 0.0978, "step": 73 }, { "epoch": 0.4774193548387097, "grad_norm": 0.08583289505322891, "learning_rate": 0.0001979481108795278, "loss": 0.0851, "step": 74 }, { "epoch": 0.4838709677419355, "grad_norm": 0.08349769120006256, "learning_rate": 0.00019779387607233586, "loss": 0.096, "step": 75 }, { "epoch": 0.49032258064516127, "grad_norm": 0.08822285487830493, "learning_rate": 0.00019763411722536502, "loss": 0.0797, "step": 76 }, { "epoch": 0.4967741935483871, "grad_norm": 0.08213973307253318, "learning_rate": 0.00019746884336284317, "loss": 0.0798, "step": 77 }, { "epoch": 0.5032258064516129, "grad_norm": 0.13109929672968634, "learning_rate": 0.00019729806382052248, "loss": 0.1037, "step": 78 }, { "epoch": 0.5096774193548387, "grad_norm": 0.09245218089323069, "learning_rate": 0.00019712178824515212, "loss": 0.106, "step": 79 }, { "epoch": 0.5161290322580645, "grad_norm": 0.08163307640075247, "learning_rate": 0.00019694002659393305, "loss": 0.062, "step": 80 }, { "epoch": 0.5225806451612903, "grad_norm": 0.05950648712895881, "learning_rate": 0.00019675278913395606, "loss": 0.0597, "step": 81 }, { "epoch": 0.5290322580645161, "grad_norm": 0.06692605001225288, "learning_rate": 0.0001965600864416213, "loss": 0.0822, "step": 82 }, { "epoch": 0.535483870967742, "grad_norm": 0.08343609115222997, "learning_rate": 0.00019636192940204134, "loss": 0.0795, "step": 83 }, { "epoch": 0.5419354838709678, "grad_norm": 0.06269743932265057, "learning_rate": 0.00019615832920842586, "loss": 0.0723, "step": 84 }, { "epoch": 0.5483870967741935, "grad_norm": 0.08562276972837224, "learning_rate": 0.00019594929736144976, "loss": 0.0815, "step": 85 }, { "epoch": 0.5548387096774193, "grad_norm": 0.15766855113117675, "learning_rate": 0.0001957348456686032, "loss": 0.116, "step": 86 }, { "epoch": 0.5612903225806452, "grad_norm": 0.09597817519413815, "learning_rate": 0.00019551498624352496, "loss": 0.0748, "step": 87 }, { "epoch": 0.567741935483871, "grad_norm": 0.0781406533241935, "learning_rate": 0.00019528973150531787, "loss": 0.0767, "step": 88 }, { "epoch": 0.5741935483870968, "grad_norm": 0.06202378155483603, "learning_rate": 0.00019505909417784754, "loss": 0.0824, "step": 89 }, { "epoch": 0.5806451612903226, "grad_norm": 0.07132758636096014, "learning_rate": 0.00019482308728902356, "loss": 0.0711, "step": 90 }, { "epoch": 0.5870967741935483, "grad_norm": 0.08324703218713819, "learning_rate": 0.00019458172417006347, "loss": 0.0908, "step": 91 }, { "epoch": 0.5935483870967742, "grad_norm": 0.07808035645363601, "learning_rate": 0.00019433501845473995, "loss": 0.0798, "step": 92 }, { "epoch": 0.6, "grad_norm": 0.09922957657013898, "learning_rate": 0.00019408298407861042, "loss": 0.0938, "step": 93 }, { "epoch": 0.6064516129032258, "grad_norm": 0.11239213177839262, "learning_rate": 0.00019382563527823026, "loss": 0.0958, "step": 94 }, { "epoch": 0.6129032258064516, "grad_norm": 0.11717297126046855, "learning_rate": 0.00019356298659034817, "loss": 0.1095, "step": 95 }, { "epoch": 0.6193548387096774, "grad_norm": 0.058447164828301125, "learning_rate": 0.00019329505285108542, "loss": 0.0656, "step": 96 }, { "epoch": 0.6258064516129033, "grad_norm": 0.06781770204369802, "learning_rate": 0.00019302184919509755, "loss": 0.0731, "step": 97 }, { "epoch": 0.632258064516129, "grad_norm": 0.07151161214033872, "learning_rate": 0.00019274339105471971, "loss": 0.0715, "step": 98 }, { "epoch": 0.6387096774193548, "grad_norm": 0.07604008792488949, "learning_rate": 0.00019245969415909465, "loss": 0.0742, "step": 99 }, { "epoch": 0.6451612903225806, "grad_norm": 0.10649337867077462, "learning_rate": 0.00019217077453328449, "loss": 0.0888, "step": 100 }, { "epoch": 0.6516129032258065, "grad_norm": 0.06658172686723186, "learning_rate": 0.0001918766484973654, "loss": 0.0596, "step": 101 }, { "epoch": 0.6580645161290323, "grad_norm": 0.07673513301158036, "learning_rate": 0.00019157733266550575, "loss": 0.0807, "step": 102 }, { "epoch": 0.6645161290322581, "grad_norm": 0.10741827222890765, "learning_rate": 0.0001912728439450276, "loss": 0.0957, "step": 103 }, { "epoch": 0.6709677419354839, "grad_norm": 0.07943997555150896, "learning_rate": 0.00019096319953545185, "loss": 0.0732, "step": 104 }, { "epoch": 0.6774193548387096, "grad_norm": 0.09157580363747424, "learning_rate": 0.0001906484169275263, "loss": 0.0964, "step": 105 }, { "epoch": 0.6838709677419355, "grad_norm": 0.07483334970504105, "learning_rate": 0.00019032851390223812, "loss": 0.0813, "step": 106 }, { "epoch": 0.6903225806451613, "grad_norm": 0.08004302588923752, "learning_rate": 0.00019000350852980909, "loss": 0.0792, "step": 107 }, { "epoch": 0.6967741935483871, "grad_norm": 0.10990654224626453, "learning_rate": 0.00018967341916867518, "loss": 0.096, "step": 108 }, { "epoch": 0.7032258064516129, "grad_norm": 0.16146890814677625, "learning_rate": 0.00018933826446444933, "loss": 0.1212, "step": 109 }, { "epoch": 0.7096774193548387, "grad_norm": 0.0695644520734436, "learning_rate": 0.0001889980633488683, "loss": 0.072, "step": 110 }, { "epoch": 0.7161290322580646, "grad_norm": 0.07364701758879359, "learning_rate": 0.00018865283503872324, "loss": 0.0831, "step": 111 }, { "epoch": 0.7225806451612903, "grad_norm": 0.06683550033086046, "learning_rate": 0.00018830259903477426, "loss": 0.0801, "step": 112 }, { "epoch": 0.7290322580645161, "grad_norm": 0.12012664311858479, "learning_rate": 0.0001879473751206489, "loss": 0.1029, "step": 113 }, { "epoch": 0.7354838709677419, "grad_norm": 0.09146802828659024, "learning_rate": 0.0001875871833617246, "loss": 0.0894, "step": 114 }, { "epoch": 0.7419354838709677, "grad_norm": 0.07579863274704472, "learning_rate": 0.0001872220441039952, "loss": 0.0713, "step": 115 }, { "epoch": 0.7483870967741936, "grad_norm": 0.07086039028499559, "learning_rate": 0.0001868519779729218, "loss": 0.0731, "step": 116 }, { "epoch": 0.7548387096774194, "grad_norm": 0.07163280313926258, "learning_rate": 0.0001864770058722676, "loss": 0.0727, "step": 117 }, { "epoch": 0.7612903225806451, "grad_norm": 0.09459308338109298, "learning_rate": 0.00018609714898291718, "loss": 0.074, "step": 118 }, { "epoch": 0.7677419354838709, "grad_norm": 0.07703917957047136, "learning_rate": 0.00018571242876167996, "loss": 0.08, "step": 119 }, { "epoch": 0.7741935483870968, "grad_norm": 0.09836589908180506, "learning_rate": 0.0001853228669400784, "loss": 0.0922, "step": 120 }, { "epoch": 0.7806451612903226, "grad_norm": 0.09387479622355775, "learning_rate": 0.00018492848552312014, "loss": 0.1033, "step": 121 }, { "epoch": 0.7870967741935484, "grad_norm": 0.12284239686014672, "learning_rate": 0.00018452930678805536, "loss": 0.0999, "step": 122 }, { "epoch": 0.7935483870967742, "grad_norm": 0.09213360621981392, "learning_rate": 0.00018412535328311814, "loss": 0.0953, "step": 123 }, { "epoch": 0.8, "grad_norm": 0.09837327897834967, "learning_rate": 0.00018371664782625287, "loss": 0.1015, "step": 124 }, { "epoch": 0.8064516129032258, "grad_norm": 0.11241908687558025, "learning_rate": 0.00018330321350382544, "loss": 0.0871, "step": 125 }, { "epoch": 0.8129032258064516, "grad_norm": 0.10016314695009264, "learning_rate": 0.00018288507366931905, "loss": 0.1017, "step": 126 }, { "epoch": 0.8193548387096774, "grad_norm": 0.08602103688908509, "learning_rate": 0.00018246225194201517, "loss": 0.0812, "step": 127 }, { "epoch": 0.8258064516129032, "grad_norm": 0.12818530303124728, "learning_rate": 0.00018203477220565912, "loss": 0.1021, "step": 128 }, { "epoch": 0.832258064516129, "grad_norm": 0.10135096833159363, "learning_rate": 0.00018160265860711134, "loss": 0.1006, "step": 129 }, { "epoch": 0.8387096774193549, "grad_norm": 0.06634065993081388, "learning_rate": 0.00018116593555498307, "loss": 0.0771, "step": 130 }, { "epoch": 0.8451612903225807, "grad_norm": 0.08489626818724719, "learning_rate": 0.0001807246277182578, "loss": 0.0786, "step": 131 }, { "epoch": 0.8516129032258064, "grad_norm": 0.08888467555852377, "learning_rate": 0.0001802787600248977, "loss": 0.0991, "step": 132 }, { "epoch": 0.8580645161290322, "grad_norm": 0.10636042561644678, "learning_rate": 0.0001798283576604356, "loss": 0.1011, "step": 133 }, { "epoch": 0.864516129032258, "grad_norm": 0.07381978120350188, "learning_rate": 0.0001793734460665523, "loss": 0.0942, "step": 134 }, { "epoch": 0.8709677419354839, "grad_norm": 0.09141476973335388, "learning_rate": 0.00017891405093963938, "loss": 0.1003, "step": 135 }, { "epoch": 0.8774193548387097, "grad_norm": 0.08895622256143133, "learning_rate": 0.0001784501982293479, "loss": 0.0852, "step": 136 }, { "epoch": 0.8838709677419355, "grad_norm": 0.08177526392628556, "learning_rate": 0.00017798191413712243, "loss": 0.084, "step": 137 }, { "epoch": 0.8903225806451613, "grad_norm": 0.07208124492736653, "learning_rate": 0.0001775092251147211, "loss": 0.0765, "step": 138 }, { "epoch": 0.896774193548387, "grad_norm": 0.06475169326738762, "learning_rate": 0.0001770321578627213, "loss": 0.0677, "step": 139 }, { "epoch": 0.9032258064516129, "grad_norm": 0.12805303578104438, "learning_rate": 0.00017655073932901168, "loss": 0.1159, "step": 140 }, { "epoch": 0.9096774193548387, "grad_norm": 0.09062553054321057, "learning_rate": 0.0001760649967072697, "loss": 0.1001, "step": 141 }, { "epoch": 0.9161290322580645, "grad_norm": 0.09471211412797043, "learning_rate": 0.00017557495743542585, "loss": 0.0984, "step": 142 }, { "epoch": 0.9225806451612903, "grad_norm": 0.07126884163211589, "learning_rate": 0.00017508064919411344, "loss": 0.0675, "step": 143 }, { "epoch": 0.9290322580645162, "grad_norm": 0.06754737396081824, "learning_rate": 0.00017458209990510527, "loss": 0.0782, "step": 144 }, { "epoch": 0.9354838709677419, "grad_norm": 0.09114175386188283, "learning_rate": 0.00017407933772973637, "loss": 0.0955, "step": 145 }, { "epoch": 0.9419354838709677, "grad_norm": 0.09779279853308924, "learning_rate": 0.00017357239106731317, "loss": 0.1061, "step": 146 }, { "epoch": 0.9483870967741935, "grad_norm": 0.09679111364819645, "learning_rate": 0.00017306128855350942, "loss": 0.0942, "step": 147 }, { "epoch": 0.9548387096774194, "grad_norm": 0.1619352621242093, "learning_rate": 0.0001725460590587486, "loss": 0.1215, "step": 148 }, { "epoch": 0.9612903225806452, "grad_norm": 0.07218712567651438, "learning_rate": 0.00017202673168657318, "loss": 0.0716, "step": 149 }, { "epoch": 0.967741935483871, "grad_norm": 0.0768978967349938, "learning_rate": 0.0001715033357720006, "loss": 0.0921, "step": 150 }, { "epoch": 0.9741935483870968, "grad_norm": 0.0716182862543501, "learning_rate": 0.00017097590087986633, "loss": 0.069, "step": 151 }, { "epoch": 0.9806451612903225, "grad_norm": 0.09641833742826841, "learning_rate": 0.00017044445680315372, "loss": 0.0975, "step": 152 }, { "epoch": 0.9870967741935484, "grad_norm": 0.1015814477912073, "learning_rate": 0.00016990903356131124, "loss": 0.0935, "step": 153 }, { "epoch": 0.9935483870967742, "grad_norm": 0.07050697207500581, "learning_rate": 0.00016936966139855663, "loss": 0.0728, "step": 154 }, { "epoch": 1.0, "grad_norm": 0.12410114043097807, "learning_rate": 0.00016882637078216868, "loss": 0.0918, "step": 155 }, { "epoch": 1.0, "eval_loss": 0.08952951431274414, "eval_runtime": 27.6637, "eval_samples_per_second": 4.735, "eval_steps_per_second": 0.615, "step": 155 }, { "epoch": 1.0064516129032257, "grad_norm": 0.05504498458319807, "learning_rate": 0.0001682791924007661, "loss": 0.0464, "step": 156 }, { "epoch": 1.0129032258064516, "grad_norm": 0.06037793847523705, "learning_rate": 0.00016772815716257412, "loss": 0.0569, "step": 157 }, { "epoch": 1.0193548387096774, "grad_norm": 0.06519500890305997, "learning_rate": 0.0001671732961936785, "loss": 0.0785, "step": 158 }, { "epoch": 1.0258064516129033, "grad_norm": 0.06362711429608407, "learning_rate": 0.00016661464083626734, "loss": 0.0492, "step": 159 }, { "epoch": 1.032258064516129, "grad_norm": 0.059790795449393114, "learning_rate": 0.00016605222264686086, "loss": 0.0511, "step": 160 }, { "epoch": 1.038709677419355, "grad_norm": 0.062409758409412354, "learning_rate": 0.00016548607339452853, "loss": 0.0561, "step": 161 }, { "epoch": 1.0451612903225806, "grad_norm": 0.05212609330281915, "learning_rate": 0.00016491622505909482, "loss": 0.0482, "step": 162 }, { "epoch": 1.0516129032258064, "grad_norm": 0.07852785833231216, "learning_rate": 0.00016434270982933273, "loss": 0.0632, "step": 163 }, { "epoch": 1.0580645161290323, "grad_norm": 0.0844199396392266, "learning_rate": 0.0001637655601011454, "loss": 0.0634, "step": 164 }, { "epoch": 1.064516129032258, "grad_norm": 0.07478417013950381, "learning_rate": 0.00016318480847573642, "loss": 0.0705, "step": 165 }, { "epoch": 1.070967741935484, "grad_norm": 0.14677307886341195, "learning_rate": 0.00016260048775776804, "loss": 0.0707, "step": 166 }, { "epoch": 1.0774193548387097, "grad_norm": 0.11159415413369748, "learning_rate": 0.00016201263095350833, "loss": 0.0552, "step": 167 }, { "epoch": 1.0838709677419356, "grad_norm": 0.12431222248668082, "learning_rate": 0.0001614212712689668, "loss": 0.0521, "step": 168 }, { "epoch": 1.0903225806451613, "grad_norm": 0.08991938158139547, "learning_rate": 0.00016082644210801844, "loss": 0.0454, "step": 169 }, { "epoch": 1.096774193548387, "grad_norm": 0.08464763972774585, "learning_rate": 0.00016022817707051724, "loss": 0.0622, "step": 170 }, { "epoch": 1.103225806451613, "grad_norm": 0.12349871251730524, "learning_rate": 0.00015962650995039783, "loss": 0.0704, "step": 171 }, { "epoch": 1.1096774193548387, "grad_norm": 0.1309389973485375, "learning_rate": 0.00015902147473376694, "loss": 0.0711, "step": 172 }, { "epoch": 1.1161290322580646, "grad_norm": 0.06816581304698818, "learning_rate": 0.00015841310559698343, "loss": 0.0566, "step": 173 }, { "epoch": 1.1225806451612903, "grad_norm": 0.0628939350533328, "learning_rate": 0.0001578014369047279, "loss": 0.0515, "step": 174 }, { "epoch": 1.129032258064516, "grad_norm": 0.0859769654461784, "learning_rate": 0.00015718650320806142, "loss": 0.0666, "step": 175 }, { "epoch": 1.135483870967742, "grad_norm": 0.09102699765206346, "learning_rate": 0.00015656833924247398, "loss": 0.0617, "step": 176 }, { "epoch": 1.1419354838709677, "grad_norm": 0.0765372107398411, "learning_rate": 0.00015594697992592232, "loss": 0.0488, "step": 177 }, { "epoch": 1.1483870967741936, "grad_norm": 0.06699310113728325, "learning_rate": 0.00015532246035685756, "loss": 0.0573, "step": 178 }, { "epoch": 1.1548387096774193, "grad_norm": 0.08417544962609826, "learning_rate": 0.00015469481581224272, "loss": 0.0636, "step": 179 }, { "epoch": 1.1612903225806452, "grad_norm": 0.1377222230063782, "learning_rate": 0.00015406408174555976, "loss": 0.0675, "step": 180 }, { "epoch": 1.167741935483871, "grad_norm": 0.11567828351658707, "learning_rate": 0.0001534302937848073, "loss": 0.0694, "step": 181 }, { "epoch": 1.1741935483870969, "grad_norm": 0.07435102139494404, "learning_rate": 0.00015279348773048786, "loss": 0.0619, "step": 182 }, { "epoch": 1.1806451612903226, "grad_norm": 0.16550300010492666, "learning_rate": 0.00015215369955358566, "loss": 0.0863, "step": 183 }, { "epoch": 1.1870967741935483, "grad_norm": 0.12805046103524215, "learning_rate": 0.0001515109653935348, "loss": 0.0787, "step": 184 }, { "epoch": 1.1935483870967742, "grad_norm": 0.07302921753172271, "learning_rate": 0.00015086532155617784, "loss": 0.0616, "step": 185 }, { "epoch": 1.2, "grad_norm": 0.06678947449139945, "learning_rate": 0.00015021680451171498, "loss": 0.049, "step": 186 }, { "epoch": 1.206451612903226, "grad_norm": 0.07635737870842979, "learning_rate": 0.00014956545089264407, "loss": 0.06, "step": 187 }, { "epoch": 1.2129032258064516, "grad_norm": 0.09502911856234499, "learning_rate": 0.0001489112974916912, "loss": 0.0717, "step": 188 }, { "epoch": 1.2193548387096773, "grad_norm": 0.10739599373969563, "learning_rate": 0.00014825438125973264, "loss": 0.0695, "step": 189 }, { "epoch": 1.2258064516129032, "grad_norm": 0.07169666556967867, "learning_rate": 0.00014759473930370736, "loss": 0.0457, "step": 190 }, { "epoch": 1.232258064516129, "grad_norm": 0.0938431700551618, "learning_rate": 0.0001469324088845212, "loss": 0.0512, "step": 191 }, { "epoch": 1.238709677419355, "grad_norm": 0.08141491065850508, "learning_rate": 0.00014626742741494206, "loss": 0.0636, "step": 192 }, { "epoch": 1.2451612903225806, "grad_norm": 0.07323985237455122, "learning_rate": 0.00014559983245748638, "loss": 0.0547, "step": 193 }, { "epoch": 1.2516129032258063, "grad_norm": 0.07650175159597551, "learning_rate": 0.00014492966172229777, "loss": 0.0589, "step": 194 }, { "epoch": 1.2580645161290323, "grad_norm": 0.06313573515968769, "learning_rate": 0.00014425695306501658, "loss": 0.047, "step": 195 }, { "epoch": 1.2645161290322582, "grad_norm": 0.08819272769189934, "learning_rate": 0.00014358174448464154, "loss": 0.0578, "step": 196 }, { "epoch": 1.270967741935484, "grad_norm": 0.1258683666392954, "learning_rate": 0.00014290407412138366, "loss": 0.0694, "step": 197 }, { "epoch": 1.2774193548387096, "grad_norm": 0.11414016288586973, "learning_rate": 0.00014222398025451135, "loss": 0.0604, "step": 198 }, { "epoch": 1.2838709677419355, "grad_norm": 0.11137244002349718, "learning_rate": 0.00014154150130018866, "loss": 0.0756, "step": 199 }, { "epoch": 1.2903225806451613, "grad_norm": 0.07153259224988148, "learning_rate": 0.0001408566758093048, "loss": 0.0538, "step": 200 }, { "epoch": 1.2967741935483872, "grad_norm": 0.09526746882665606, "learning_rate": 0.00014016954246529696, "loss": 0.0494, "step": 201 }, { "epoch": 1.303225806451613, "grad_norm": 0.08649485022046847, "learning_rate": 0.00013948014008196487, "loss": 0.0553, "step": 202 }, { "epoch": 1.3096774193548386, "grad_norm": 0.1050606321924827, "learning_rate": 0.0001387885076012785, "loss": 0.0726, "step": 203 }, { "epoch": 1.3161290322580645, "grad_norm": 0.12765438794483042, "learning_rate": 0.00013809468409117846, "loss": 0.084, "step": 204 }, { "epoch": 1.3225806451612903, "grad_norm": 0.0778070748347935, "learning_rate": 0.00013739870874336898, "loss": 0.0549, "step": 205 }, { "epoch": 1.3290322580645162, "grad_norm": 0.08829501946025207, "learning_rate": 0.00013670062087110422, "loss": 0.054, "step": 206 }, { "epoch": 1.335483870967742, "grad_norm": 0.09272743824756946, "learning_rate": 0.00013600045990696762, "loss": 0.0533, "step": 207 }, { "epoch": 1.3419354838709676, "grad_norm": 0.09752395648449025, "learning_rate": 0.0001352982654006444, "loss": 0.0682, "step": 208 }, { "epoch": 1.3483870967741935, "grad_norm": 0.0787282052904063, "learning_rate": 0.00013459407701668763, "loss": 0.061, "step": 209 }, { "epoch": 1.3548387096774195, "grad_norm": 0.09965818076523, "learning_rate": 0.00013388793453227767, "loss": 0.0708, "step": 210 }, { "epoch": 1.3612903225806452, "grad_norm": 0.08510816398839181, "learning_rate": 0.0001331798778349752, "loss": 0.0612, "step": 211 }, { "epoch": 1.367741935483871, "grad_norm": 0.13415837375152884, "learning_rate": 0.00013246994692046836, "loss": 0.0589, "step": 212 }, { "epoch": 1.3741935483870968, "grad_norm": 0.10509841268799006, "learning_rate": 0.00013175818189031327, "loss": 0.0771, "step": 213 }, { "epoch": 1.3806451612903226, "grad_norm": 0.10611783573942027, "learning_rate": 0.00013104462294966896, "loss": 0.0772, "step": 214 }, { "epoch": 1.3870967741935485, "grad_norm": 0.10192686799206796, "learning_rate": 0.00013032931040502627, "loss": 0.0679, "step": 215 }, { "epoch": 1.3935483870967742, "grad_norm": 0.0734691690524852, "learning_rate": 0.00012961228466193116, "loss": 0.0536, "step": 216 }, { "epoch": 1.4, "grad_norm": 0.08238968788193186, "learning_rate": 0.00012889358622270223, "loss": 0.0469, "step": 217 }, { "epoch": 1.4064516129032258, "grad_norm": 0.107308663713204, "learning_rate": 0.00012817325568414297, "loss": 0.0472, "step": 218 }, { "epoch": 1.4129032258064516, "grad_norm": 0.08899570061026768, "learning_rate": 0.00012745133373524853, "loss": 0.0621, "step": 219 }, { "epoch": 1.4193548387096775, "grad_norm": 0.09218740393476688, "learning_rate": 0.0001267278611549073, "loss": 0.0449, "step": 220 }, { "epoch": 1.4258064516129032, "grad_norm": 0.17826386309202488, "learning_rate": 0.00012600287880959763, "loss": 0.0617, "step": 221 }, { "epoch": 1.432258064516129, "grad_norm": 0.10232360588380689, "learning_rate": 0.0001252764276510792, "loss": 0.0636, "step": 222 }, { "epoch": 1.4387096774193548, "grad_norm": 0.07733076855697436, "learning_rate": 0.00012454854871407994, "loss": 0.0524, "step": 223 }, { "epoch": 1.4451612903225808, "grad_norm": 0.1006676835885761, "learning_rate": 0.00012381928311397806, "loss": 0.057, "step": 224 }, { "epoch": 1.4516129032258065, "grad_norm": 0.080575053343473, "learning_rate": 0.0001230886720444796, "loss": 0.0663, "step": 225 }, { "epoch": 1.4580645161290322, "grad_norm": 0.1246560932677115, "learning_rate": 0.00012235675677529158, "loss": 0.0652, "step": 226 }, { "epoch": 1.4645161290322581, "grad_norm": 0.10632779078755829, "learning_rate": 0.00012162357864979072, "loss": 0.065, "step": 227 }, { "epoch": 1.4709677419354839, "grad_norm": 0.095566354892694, "learning_rate": 0.00012088917908268821, "loss": 0.0704, "step": 228 }, { "epoch": 1.4774193548387098, "grad_norm": 0.1101864261825649, "learning_rate": 0.00012015359955769021, "loss": 0.069, "step": 229 }, { "epoch": 1.4838709677419355, "grad_norm": 0.08091933130202467, "learning_rate": 0.00011941688162515467, "loss": 0.0513, "step": 230 }, { "epoch": 1.4903225806451612, "grad_norm": 0.12491882510784613, "learning_rate": 0.00011867906689974428, "loss": 0.0481, "step": 231 }, { "epoch": 1.4967741935483871, "grad_norm": 0.13567923305564322, "learning_rate": 0.00011794019705807584, "loss": 0.0689, "step": 232 }, { "epoch": 1.5032258064516129, "grad_norm": 0.15901158424809853, "learning_rate": 0.00011720031383636585, "loss": 0.0647, "step": 233 }, { "epoch": 1.5096774193548388, "grad_norm": 0.10314424853828444, "learning_rate": 0.00011645945902807341, "loss": 0.0596, "step": 234 }, { "epoch": 1.5161290322580645, "grad_norm": 0.08231901481570739, "learning_rate": 0.00011571767448153901, "loss": 0.0425, "step": 235 }, { "epoch": 1.5225806451612902, "grad_norm": 0.24497759631459304, "learning_rate": 0.00011497500209762102, "loss": 0.0861, "step": 236 }, { "epoch": 1.5290322580645161, "grad_norm": 0.08192342540576594, "learning_rate": 0.00011423148382732853, "loss": 0.037, "step": 237 }, { "epoch": 1.535483870967742, "grad_norm": 0.086893577170591, "learning_rate": 0.00011348716166945195, "loss": 0.053, "step": 238 }, { "epoch": 1.5419354838709678, "grad_norm": 0.10993883303276072, "learning_rate": 0.0001127420776681905, "loss": 0.0584, "step": 239 }, { "epoch": 1.5483870967741935, "grad_norm": 0.08488031387161675, "learning_rate": 0.00011199627391077732, "loss": 0.0618, "step": 240 }, { "epoch": 1.5548387096774192, "grad_norm": 0.059796713120191955, "learning_rate": 0.00011124979252510208, "loss": 0.0468, "step": 241 }, { "epoch": 1.5612903225806452, "grad_norm": 0.1333781764848824, "learning_rate": 0.0001105026756773314, "loss": 0.0435, "step": 242 }, { "epoch": 1.567741935483871, "grad_norm": 0.13677532337123358, "learning_rate": 0.00010975496556952682, "loss": 0.0822, "step": 243 }, { "epoch": 1.5741935483870968, "grad_norm": 0.07575070108462173, "learning_rate": 0.00010900670443726135, "loss": 0.0518, "step": 244 }, { "epoch": 1.5806451612903225, "grad_norm": 0.10448261881956887, "learning_rate": 0.00010825793454723325, "loss": 0.0754, "step": 245 }, { "epoch": 1.5870967741935482, "grad_norm": 0.1295950737836306, "learning_rate": 0.00010750869819487883, "loss": 0.0745, "step": 246 }, { "epoch": 1.5935483870967742, "grad_norm": 0.09240489021108271, "learning_rate": 0.00010675903770198333, "loss": 0.067, "step": 247 }, { "epoch": 1.6, "grad_norm": 0.07808190236709202, "learning_rate": 0.00010600899541429004, "loss": 0.0546, "step": 248 }, { "epoch": 1.6064516129032258, "grad_norm": 0.0790414641966529, "learning_rate": 0.00010525861369910877, "loss": 0.0498, "step": 249 }, { "epoch": 1.6129032258064515, "grad_norm": 0.08616829997306262, "learning_rate": 0.00010450793494292224, "loss": 0.0655, "step": 250 }, { "epoch": 1.6193548387096774, "grad_norm": 0.08883827111615517, "learning_rate": 0.00010375700154899208, "loss": 0.0657, "step": 251 }, { "epoch": 1.6258064516129034, "grad_norm": 0.09734167988079584, "learning_rate": 0.00010300585593496348, "loss": 0.0614, "step": 252 }, { "epoch": 1.632258064516129, "grad_norm": 0.08926667751389035, "learning_rate": 0.00010225454053046921, "loss": 0.0562, "step": 253 }, { "epoch": 1.6387096774193548, "grad_norm": 0.13520312388410863, "learning_rate": 0.00010150309777473306, "loss": 0.0731, "step": 254 }, { "epoch": 1.6451612903225805, "grad_norm": 0.08417129846577816, "learning_rate": 0.0001007515701141722, "loss": 0.0496, "step": 255 }, { "epoch": 1.6516129032258065, "grad_norm": 0.15813812964913973, "learning_rate": 0.0001, "loss": 0.0622, "step": 256 }, { "epoch": 1.6580645161290324, "grad_norm": 0.07426217779998688, "learning_rate": 9.924842988582782e-05, "loss": 0.0487, "step": 257 }, { "epoch": 1.664516129032258, "grad_norm": 0.14075828336643512, "learning_rate": 9.849690222526698e-05, "loss": 0.0623, "step": 258 }, { "epoch": 1.6709677419354838, "grad_norm": 0.13317164150901403, "learning_rate": 9.77454594695308e-05, "loss": 0.0551, "step": 259 }, { "epoch": 1.6774193548387095, "grad_norm": 0.08655308767786873, "learning_rate": 9.699414406503654e-05, "loss": 0.0678, "step": 260 }, { "epoch": 1.6838709677419355, "grad_norm": 0.11118369085651197, "learning_rate": 9.624299845100795e-05, "loss": 0.0622, "step": 261 }, { "epoch": 1.6903225806451614, "grad_norm": 0.11077421899130901, "learning_rate": 9.549206505707777e-05, "loss": 0.0745, "step": 262 }, { "epoch": 1.696774193548387, "grad_norm": 0.08190755444484397, "learning_rate": 9.474138630089124e-05, "loss": 0.0537, "step": 263 }, { "epoch": 1.7032258064516128, "grad_norm": 0.1113159750162081, "learning_rate": 9.399100458570997e-05, "loss": 0.0691, "step": 264 }, { "epoch": 1.7096774193548387, "grad_norm": 0.09279661384255962, "learning_rate": 9.324096229801674e-05, "loss": 0.0583, "step": 265 }, { "epoch": 1.7161290322580647, "grad_norm": 0.07523224454997395, "learning_rate": 9.249130180512118e-05, "loss": 0.0621, "step": 266 }, { "epoch": 1.7225806451612904, "grad_norm": 0.09015314442775083, "learning_rate": 9.174206545276677e-05, "loss": 0.0669, "step": 267 }, { "epoch": 1.729032258064516, "grad_norm": 0.10636283133034866, "learning_rate": 9.099329556273866e-05, "loss": 0.0739, "step": 268 }, { "epoch": 1.7354838709677418, "grad_norm": 0.1416802448850876, "learning_rate": 9.024503443047319e-05, "loss": 0.0684, "step": 269 }, { "epoch": 1.7419354838709677, "grad_norm": 0.09949812802402141, "learning_rate": 8.949732432266866e-05, "loss": 0.0517, "step": 270 }, { "epoch": 1.7483870967741937, "grad_norm": 0.09911727857006153, "learning_rate": 8.875020747489794e-05, "loss": 0.0695, "step": 271 }, { "epoch": 1.7548387096774194, "grad_norm": 0.09965173501261688, "learning_rate": 8.800372608922271e-05, "loss": 0.0601, "step": 272 }, { "epoch": 1.761290322580645, "grad_norm": 0.11483628184027991, "learning_rate": 8.72579223318095e-05, "loss": 0.0747, "step": 273 }, { "epoch": 1.7677419354838708, "grad_norm": 0.08398542909094164, "learning_rate": 8.651283833054809e-05, "loss": 0.0545, "step": 274 }, { "epoch": 1.7741935483870968, "grad_norm": 0.09206084337768013, "learning_rate": 8.57685161726715e-05, "loss": 0.0626, "step": 275 }, { "epoch": 1.7806451612903227, "grad_norm": 0.10640419292629341, "learning_rate": 8.5024997902379e-05, "loss": 0.0694, "step": 276 }, { "epoch": 1.7870967741935484, "grad_norm": 0.09655305743665399, "learning_rate": 8.428232551846101e-05, "loss": 0.063, "step": 277 }, { "epoch": 1.793548387096774, "grad_norm": 0.10280172089235683, "learning_rate": 8.35405409719266e-05, "loss": 0.0628, "step": 278 }, { "epoch": 1.8, "grad_norm": 0.0924442773479908, "learning_rate": 8.279968616363418e-05, "loss": 0.0614, "step": 279 }, { "epoch": 1.8064516129032258, "grad_norm": 0.09056943407276584, "learning_rate": 8.205980294192421e-05, "loss": 0.0587, "step": 280 }, { "epoch": 1.8129032258064517, "grad_norm": 0.08536247550150874, "learning_rate": 8.132093310025571e-05, "loss": 0.0457, "step": 281 }, { "epoch": 1.8193548387096774, "grad_norm": 0.10278594457682778, "learning_rate": 8.058311837484535e-05, "loss": 0.0743, "step": 282 }, { "epoch": 1.8258064516129031, "grad_norm": 0.10489767053221469, "learning_rate": 7.984640044230983e-05, "loss": 0.059, "step": 283 }, { "epoch": 1.832258064516129, "grad_norm": 0.07066039733968037, "learning_rate": 7.911082091731181e-05, "loss": 0.05, "step": 284 }, { "epoch": 1.838709677419355, "grad_norm": 0.08869091516941932, "learning_rate": 7.837642135020929e-05, "loss": 0.0468, "step": 285 }, { "epoch": 1.8451612903225807, "grad_norm": 0.07614529156417063, "learning_rate": 7.764324322470841e-05, "loss": 0.0504, "step": 286 }, { "epoch": 1.8516129032258064, "grad_norm": 0.12289914291233693, "learning_rate": 7.691132795552043e-05, "loss": 0.0654, "step": 287 }, { "epoch": 1.8580645161290321, "grad_norm": 0.19187133428553324, "learning_rate": 7.618071688602199e-05, "loss": 0.0819, "step": 288 }, { "epoch": 1.864516129032258, "grad_norm": 0.1132288808580514, "learning_rate": 7.54514512859201e-05, "loss": 0.068, "step": 289 }, { "epoch": 1.870967741935484, "grad_norm": 0.213823719636533, "learning_rate": 7.472357234892082e-05, "loss": 0.0761, "step": 290 }, { "epoch": 1.8774193548387097, "grad_norm": 0.08954961479842181, "learning_rate": 7.399712119040238e-05, "loss": 0.0599, "step": 291 }, { "epoch": 1.8838709677419354, "grad_norm": 0.10264188935178613, "learning_rate": 7.327213884509272e-05, "loss": 0.0597, "step": 292 }, { "epoch": 1.8903225806451613, "grad_norm": 0.12757793483602733, "learning_rate": 7.254866626475152e-05, "loss": 0.0736, "step": 293 }, { "epoch": 1.896774193548387, "grad_norm": 0.1424378202330991, "learning_rate": 7.182674431585704e-05, "loss": 0.0758, "step": 294 }, { "epoch": 1.903225806451613, "grad_norm": 0.09267218373713473, "learning_rate": 7.110641377729778e-05, "loss": 0.0602, "step": 295 }, { "epoch": 1.9096774193548387, "grad_norm": 0.07573069220027309, "learning_rate": 7.038771533806884e-05, "loss": 0.0479, "step": 296 }, { "epoch": 1.9161290322580644, "grad_norm": 0.11850380653093652, "learning_rate": 6.967068959497376e-05, "loss": 0.061, "step": 297 }, { "epoch": 1.9225806451612903, "grad_norm": 0.0952951197418935, "learning_rate": 6.895537705033108e-05, "loss": 0.0719, "step": 298 }, { "epoch": 1.9290322580645163, "grad_norm": 0.0869206540087956, "learning_rate": 6.824181810968675e-05, "loss": 0.0614, "step": 299 }, { "epoch": 1.935483870967742, "grad_norm": 0.11514530265371259, "learning_rate": 6.753005307953167e-05, "loss": 0.0631, "step": 300 }, { "epoch": 1.9419354838709677, "grad_norm": 0.113758246927865, "learning_rate": 6.682012216502484e-05, "loss": 0.0737, "step": 301 }, { "epoch": 1.9483870967741934, "grad_norm": 0.1202740564887612, "learning_rate": 6.611206546772237e-05, "loss": 0.0665, "step": 302 }, { "epoch": 1.9548387096774194, "grad_norm": 0.12119681869029483, "learning_rate": 6.54059229833124e-05, "loss": 0.0617, "step": 303 }, { "epoch": 1.9612903225806453, "grad_norm": 0.11167959831458651, "learning_rate": 6.47017345993556e-05, "loss": 0.0636, "step": 304 }, { "epoch": 1.967741935483871, "grad_norm": 0.10236314721384843, "learning_rate": 6.39995400930324e-05, "loss": 0.0691, "step": 305 }, { "epoch": 1.9741935483870967, "grad_norm": 0.08978452227249356, "learning_rate": 6.329937912889582e-05, "loss": 0.0486, "step": 306 }, { "epoch": 1.9806451612903224, "grad_norm": 0.09402215683973497, "learning_rate": 6.260129125663106e-05, "loss": 0.0575, "step": 307 }, { "epoch": 1.9870967741935484, "grad_norm": 0.09176207813711797, "learning_rate": 6.190531590882159e-05, "loss": 0.0616, "step": 308 }, { "epoch": 1.9935483870967743, "grad_norm": 0.1461148677177572, "learning_rate": 6.121149239872151e-05, "loss": 0.084, "step": 309 }, { "epoch": 2.0, "grad_norm": 0.0936208966101867, "learning_rate": 6.051985991803517e-05, "loss": 0.0463, "step": 310 }, { "epoch": 2.0, "eval_loss": 0.09596683084964752, "eval_runtime": 25.5767, "eval_samples_per_second": 5.122, "eval_steps_per_second": 0.665, "step": 310 }, { "epoch": 2.0064516129032257, "grad_norm": 0.06773450997938296, "learning_rate": 5.983045753470308e-05, "loss": 0.0453, "step": 311 }, { "epoch": 2.0129032258064514, "grad_norm": 0.05341229626959274, "learning_rate": 5.9143324190695196e-05, "loss": 0.036, "step": 312 }, { "epoch": 2.0193548387096776, "grad_norm": 0.06711247984899421, "learning_rate": 5.845849869981137e-05, "loss": 0.046, "step": 313 }, { "epoch": 2.0258064516129033, "grad_norm": 0.061859413521882546, "learning_rate": 5.777601974548866e-05, "loss": 0.0352, "step": 314 }, { "epoch": 2.032258064516129, "grad_norm": 0.05446833363095474, "learning_rate": 5.709592587861637e-05, "loss": 0.0369, "step": 315 }, { "epoch": 2.0387096774193547, "grad_norm": 0.0782453373649888, "learning_rate": 5.6418255515358486e-05, "loss": 0.0349, "step": 316 }, { "epoch": 2.0451612903225804, "grad_norm": 0.04790490679204087, "learning_rate": 5.574304693498346e-05, "loss": 0.0308, "step": 317 }, { "epoch": 2.0516129032258066, "grad_norm": 0.062494284774837545, "learning_rate": 5.507033827770225e-05, "loss": 0.0353, "step": 318 }, { "epoch": 2.0580645161290323, "grad_norm": 0.07037383356374984, "learning_rate": 5.4400167542513636e-05, "loss": 0.0412, "step": 319 }, { "epoch": 2.064516129032258, "grad_norm": 0.08297196040492472, "learning_rate": 5.3732572585057974e-05, "loss": 0.045, "step": 320 }, { "epoch": 2.0709677419354837, "grad_norm": 0.05677712601652311, "learning_rate": 5.306759111547881e-05, "loss": 0.0314, "step": 321 }, { "epoch": 2.07741935483871, "grad_norm": 0.10970235904680202, "learning_rate": 5.240526069629265e-05, "loss": 0.0426, "step": 322 }, { "epoch": 2.0838709677419356, "grad_norm": 0.1074904876869991, "learning_rate": 5.174561874026741e-05, "loss": 0.0453, "step": 323 }, { "epoch": 2.0903225806451613, "grad_norm": 0.05559023402895479, "learning_rate": 5.108870250830882e-05, "loss": 0.0295, "step": 324 }, { "epoch": 2.096774193548387, "grad_norm": 0.09545856608388173, "learning_rate": 5.0434549107355944e-05, "loss": 0.0441, "step": 325 }, { "epoch": 2.1032258064516127, "grad_norm": 0.09664814857644259, "learning_rate": 4.978319548828504e-05, "loss": 0.0335, "step": 326 }, { "epoch": 2.109677419354839, "grad_norm": 0.09467665510676392, "learning_rate": 4.9134678443822166e-05, "loss": 0.0251, "step": 327 }, { "epoch": 2.1161290322580646, "grad_norm": 0.1384628187897895, "learning_rate": 4.8489034606465225e-05, "loss": 0.0369, "step": 328 }, { "epoch": 2.1225806451612903, "grad_norm": 0.12578896351994917, "learning_rate": 4.784630044641435e-05, "loss": 0.0392, "step": 329 }, { "epoch": 2.129032258064516, "grad_norm": 0.16521279278081405, "learning_rate": 4.7206512269512124e-05, "loss": 0.0463, "step": 330 }, { "epoch": 2.135483870967742, "grad_norm": 0.11984973498381633, "learning_rate": 4.65697062151927e-05, "loss": 0.0394, "step": 331 }, { "epoch": 2.141935483870968, "grad_norm": 0.09544223502654232, "learning_rate": 4.593591825444028e-05, "loss": 0.037, "step": 332 }, { "epoch": 2.1483870967741936, "grad_norm": 0.13562773315198307, "learning_rate": 4.530518418775733e-05, "loss": 0.039, "step": 333 }, { "epoch": 2.1548387096774193, "grad_norm": 0.08782158354204284, "learning_rate": 4.4677539643142454e-05, "loss": 0.0392, "step": 334 }, { "epoch": 2.161290322580645, "grad_norm": 0.08174883141208888, "learning_rate": 4.40530200740777e-05, "loss": 0.0313, "step": 335 }, { "epoch": 2.167741935483871, "grad_norm": 0.09796897925277187, "learning_rate": 4.343166075752605e-05, "loss": 0.0337, "step": 336 }, { "epoch": 2.174193548387097, "grad_norm": 0.11851301692643428, "learning_rate": 4.281349679193861e-05, "loss": 0.039, "step": 337 }, { "epoch": 2.1806451612903226, "grad_norm": 0.10807728159748675, "learning_rate": 4.2198563095272116e-05, "loss": 0.0301, "step": 338 }, { "epoch": 2.1870967741935483, "grad_norm": 0.2141192740082474, "learning_rate": 4.158689440301657e-05, "loss": 0.0354, "step": 339 }, { "epoch": 2.193548387096774, "grad_norm": 0.07257549552594048, "learning_rate": 4.097852526623307e-05, "loss": 0.0339, "step": 340 }, { "epoch": 2.2, "grad_norm": 0.19631930583890622, "learning_rate": 4.0373490049602204e-05, "loss": 0.0497, "step": 341 }, { "epoch": 2.206451612903226, "grad_norm": 0.08740964228587876, "learning_rate": 3.977182292948283e-05, "loss": 0.0478, "step": 342 }, { "epoch": 2.2129032258064516, "grad_norm": 0.08272419362723894, "learning_rate": 3.9173557891981573e-05, "loss": 0.0379, "step": 343 }, { "epoch": 2.2193548387096773, "grad_norm": 0.08040985475184476, "learning_rate": 3.857872873103322e-05, "loss": 0.0447, "step": 344 }, { "epoch": 2.225806451612903, "grad_norm": 0.09245277473267594, "learning_rate": 3.7987369046491684e-05, "loss": 0.0391, "step": 345 }, { "epoch": 2.232258064516129, "grad_norm": 0.12614672525793685, "learning_rate": 3.7399512242231995e-05, "loss": 0.0342, "step": 346 }, { "epoch": 2.238709677419355, "grad_norm": 0.09266235955840761, "learning_rate": 3.6815191524263624e-05, "loss": 0.0354, "step": 347 }, { "epoch": 2.2451612903225806, "grad_norm": 0.06080492053598232, "learning_rate": 3.623443989885462e-05, "loss": 0.0335, "step": 348 }, { "epoch": 2.2516129032258063, "grad_norm": 0.07807865557506005, "learning_rate": 3.565729017066729e-05, "loss": 0.0347, "step": 349 }, { "epoch": 2.258064516129032, "grad_norm": 0.08087459186315774, "learning_rate": 3.508377494090521e-05, "loss": 0.036, "step": 350 }, { "epoch": 2.264516129032258, "grad_norm": 0.08899211474214212, "learning_rate": 3.45139266054715e-05, "loss": 0.0343, "step": 351 }, { "epoch": 2.270967741935484, "grad_norm": 0.07056377080769806, "learning_rate": 3.394777735313919e-05, "loss": 0.033, "step": 352 }, { "epoch": 2.2774193548387096, "grad_norm": 0.0786360395215486, "learning_rate": 3.338535916373266e-05, "loss": 0.0319, "step": 353 }, { "epoch": 2.2838709677419353, "grad_norm": 0.0870351921736199, "learning_rate": 3.2826703806321525e-05, "loss": 0.0408, "step": 354 }, { "epoch": 2.2903225806451615, "grad_norm": 0.07340127696415631, "learning_rate": 3.227184283742591e-05, "loss": 0.0348, "step": 355 }, { "epoch": 2.296774193548387, "grad_norm": 0.08727491345423612, "learning_rate": 3.17208075992339e-05, "loss": 0.0349, "step": 356 }, { "epoch": 2.303225806451613, "grad_norm": 0.08641599877739083, "learning_rate": 3.117362921783134e-05, "loss": 0.0435, "step": 357 }, { "epoch": 2.3096774193548386, "grad_norm": 0.06573525447698249, "learning_rate": 3.063033860144339e-05, "loss": 0.0316, "step": 358 }, { "epoch": 2.3161290322580643, "grad_norm": 0.07221179151878702, "learning_rate": 3.0090966438688772e-05, "loss": 0.0324, "step": 359 }, { "epoch": 2.3225806451612905, "grad_norm": 0.18424602936724097, "learning_rate": 2.9555543196846292e-05, "loss": 0.0396, "step": 360 }, { "epoch": 2.329032258064516, "grad_norm": 0.08124593683663159, "learning_rate": 2.9024099120133673e-05, "loss": 0.0365, "step": 361 }, { "epoch": 2.335483870967742, "grad_norm": 0.10243519568473496, "learning_rate": 2.8496664227999415e-05, "loss": 0.0431, "step": 362 }, { "epoch": 2.3419354838709676, "grad_norm": 0.08863145799938825, "learning_rate": 2.7973268313426837e-05, "loss": 0.0431, "step": 363 }, { "epoch": 2.3483870967741938, "grad_norm": 0.1157770288973089, "learning_rate": 2.745394094125141e-05, "loss": 0.0384, "step": 364 }, { "epoch": 2.3548387096774195, "grad_norm": 0.06937293824933606, "learning_rate": 2.6938711446490606e-05, "loss": 0.0326, "step": 365 }, { "epoch": 2.361290322580645, "grad_norm": 0.11070613688051265, "learning_rate": 2.6427608932686843e-05, "loss": 0.0453, "step": 366 }, { "epoch": 2.367741935483871, "grad_norm": 0.07304602035921624, "learning_rate": 2.5920662270263653e-05, "loss": 0.0349, "step": 367 }, { "epoch": 2.3741935483870966, "grad_norm": 0.0957880185255825, "learning_rate": 2.5417900094894744e-05, "loss": 0.0384, "step": 368 }, { "epoch": 2.3806451612903228, "grad_norm": 0.06844023851775795, "learning_rate": 2.4919350805886577e-05, "loss": 0.0312, "step": 369 }, { "epoch": 2.3870967741935485, "grad_norm": 0.10510477036054879, "learning_rate": 2.4425042564574184e-05, "loss": 0.0337, "step": 370 }, { "epoch": 2.393548387096774, "grad_norm": 0.0969617018143466, "learning_rate": 2.3935003292730296e-05, "loss": 0.035, "step": 371 }, { "epoch": 2.4, "grad_norm": 0.07967820297616653, "learning_rate": 2.344926067098836e-05, "loss": 0.037, "step": 372 }, { "epoch": 2.4064516129032256, "grad_norm": 0.07497176613411485, "learning_rate": 2.2967842137278706e-05, "loss": 0.0328, "step": 373 }, { "epoch": 2.412903225806452, "grad_norm": 0.09754092780042643, "learning_rate": 2.2490774885278908e-05, "loss": 0.0313, "step": 374 }, { "epoch": 2.4193548387096775, "grad_norm": 0.08185743462021658, "learning_rate": 2.201808586287757e-05, "loss": 0.031, "step": 375 }, { "epoch": 2.425806451612903, "grad_norm": 0.08264089376813998, "learning_rate": 2.15498017706521e-05, "loss": 0.0385, "step": 376 }, { "epoch": 2.432258064516129, "grad_norm": 0.0823740031947979, "learning_rate": 2.1085949060360654e-05, "loss": 0.039, "step": 377 }, { "epoch": 2.4387096774193546, "grad_norm": 0.07247982019193418, "learning_rate": 2.0626553933447734e-05, "loss": 0.0264, "step": 378 }, { "epoch": 2.445161290322581, "grad_norm": 0.08404435825498824, "learning_rate": 2.01716423395644e-05, "loss": 0.0479, "step": 379 }, { "epoch": 2.4516129032258065, "grad_norm": 0.08666218883884351, "learning_rate": 1.9721239975102313e-05, "loss": 0.0377, "step": 380 }, { "epoch": 2.458064516129032, "grad_norm": 0.0813934369010793, "learning_rate": 1.9275372281742242e-05, "loss": 0.0287, "step": 381 }, { "epoch": 2.464516129032258, "grad_norm": 0.08490400932239225, "learning_rate": 1.8834064445016953e-05, "loss": 0.0287, "step": 382 }, { "epoch": 2.4709677419354836, "grad_norm": 0.0745619544814596, "learning_rate": 1.839734139288868e-05, "loss": 0.0313, "step": 383 }, { "epoch": 2.47741935483871, "grad_norm": 0.07603999026318416, "learning_rate": 1.7965227794340877e-05, "loss": 0.0282, "step": 384 }, { "epoch": 2.4838709677419355, "grad_norm": 0.07482170653624513, "learning_rate": 1.753774805798486e-05, "loss": 0.0338, "step": 385 }, { "epoch": 2.490322580645161, "grad_norm": 0.11239247582773948, "learning_rate": 1.7114926330680957e-05, "loss": 0.0332, "step": 386 }, { "epoch": 2.496774193548387, "grad_norm": 0.08250289864329428, "learning_rate": 1.6696786496174578e-05, "loss": 0.0402, "step": 387 }, { "epoch": 2.5032258064516126, "grad_norm": 0.10452752517237242, "learning_rate": 1.6283352173747145e-05, "loss": 0.0333, "step": 388 }, { "epoch": 2.509677419354839, "grad_norm": 0.09847614894022073, "learning_rate": 1.587464671688187e-05, "loss": 0.0349, "step": 389 }, { "epoch": 2.5161290322580645, "grad_norm": 0.08854153674623017, "learning_rate": 1.5470693211944643e-05, "loss": 0.0294, "step": 390 }, { "epoch": 2.52258064516129, "grad_norm": 0.07568785710192437, "learning_rate": 1.5071514476879878e-05, "loss": 0.0304, "step": 391 }, { "epoch": 2.5290322580645164, "grad_norm": 0.10747782045229992, "learning_rate": 1.4677133059921632e-05, "loss": 0.034, "step": 392 }, { "epoch": 2.535483870967742, "grad_norm": 0.07944602809272536, "learning_rate": 1.4287571238320053e-05, "loss": 0.0362, "step": 393 }, { "epoch": 2.541935483870968, "grad_norm": 0.09637365953273927, "learning_rate": 1.3902851017082864e-05, "loss": 0.0351, "step": 394 }, { "epoch": 2.5483870967741935, "grad_norm": 0.07895627852796791, "learning_rate": 1.3522994127732414e-05, "loss": 0.0348, "step": 395 }, { "epoch": 2.554838709677419, "grad_norm": 0.11668198814666138, "learning_rate": 1.3148022027078222e-05, "loss": 0.0377, "step": 396 }, { "epoch": 2.5612903225806454, "grad_norm": 0.08859887668842747, "learning_rate": 1.2777955896004812e-05, "loss": 0.0339, "step": 397 }, { "epoch": 2.567741935483871, "grad_norm": 0.0955648903311151, "learning_rate": 1.2412816638275404e-05, "loss": 0.0429, "step": 398 }, { "epoch": 2.574193548387097, "grad_norm": 0.08953857592938266, "learning_rate": 1.2052624879351104e-05, "loss": 0.0354, "step": 399 }, { "epoch": 2.5806451612903225, "grad_norm": 0.12572795234550732, "learning_rate": 1.1697400965225747e-05, "loss": 0.0403, "step": 400 }, { "epoch": 2.587096774193548, "grad_norm": 0.10694439782055375, "learning_rate": 1.134716496127679e-05, "loss": 0.0336, "step": 401 }, { "epoch": 2.5935483870967744, "grad_norm": 0.06513844231274085, "learning_rate": 1.1001936651131717e-05, "loss": 0.0329, "step": 402 }, { "epoch": 2.6, "grad_norm": 0.09974875770813588, "learning_rate": 1.0661735535550666e-05, "loss": 0.0365, "step": 403 }, { "epoch": 2.606451612903226, "grad_norm": 0.06817268977991534, "learning_rate": 1.0326580831324817e-05, "loss": 0.0299, "step": 404 }, { "epoch": 2.6129032258064515, "grad_norm": 0.06220845407702803, "learning_rate": 9.996491470190917e-06, "loss": 0.027, "step": 405 }, { "epoch": 2.6193548387096772, "grad_norm": 0.08453636414206647, "learning_rate": 9.671486097761917e-06, "loss": 0.0418, "step": 406 }, { "epoch": 2.6258064516129034, "grad_norm": 0.10895972341564872, "learning_rate": 9.351583072473713e-06, "loss": 0.0411, "step": 407 }, { "epoch": 2.632258064516129, "grad_norm": 0.08350751437942873, "learning_rate": 9.036800464548157e-06, "loss": 0.0434, "step": 408 }, { "epoch": 2.638709677419355, "grad_norm": 0.10366684902670063, "learning_rate": 8.727156054972374e-06, "loss": 0.039, "step": 409 }, { "epoch": 2.6451612903225805, "grad_norm": 0.09956778774869587, "learning_rate": 8.422667334494249e-06, "loss": 0.0408, "step": 410 }, { "epoch": 2.6516129032258062, "grad_norm": 0.08758251356693646, "learning_rate": 8.123351502634625e-06, "loss": 0.0348, "step": 411 }, { "epoch": 2.6580645161290324, "grad_norm": 0.10844348207137618, "learning_rate": 7.82922546671555e-06, "loss": 0.0408, "step": 412 }, { "epoch": 2.664516129032258, "grad_norm": 0.10636911692675921, "learning_rate": 7.54030584090537e-06, "loss": 0.0429, "step": 413 }, { "epoch": 2.670967741935484, "grad_norm": 0.09542792766451057, "learning_rate": 7.256608945280319e-06, "loss": 0.0447, "step": 414 }, { "epoch": 2.6774193548387095, "grad_norm": 0.07697171392030314, "learning_rate": 6.97815080490245e-06, "loss": 0.0367, "step": 415 }, { "epoch": 2.6838709677419352, "grad_norm": 0.072266260251159, "learning_rate": 6.704947148914609e-06, "loss": 0.0325, "step": 416 }, { "epoch": 2.6903225806451614, "grad_norm": 0.0945534978466976, "learning_rate": 6.437013409651849e-06, "loss": 0.0296, "step": 417 }, { "epoch": 2.696774193548387, "grad_norm": 0.10612283288506409, "learning_rate": 6.174364721769743e-06, "loss": 0.0492, "step": 418 }, { "epoch": 2.703225806451613, "grad_norm": 0.08110795988519982, "learning_rate": 5.917015921389568e-06, "loss": 0.0313, "step": 419 }, { "epoch": 2.709677419354839, "grad_norm": 0.0662490047037332, "learning_rate": 5.664981545260073e-06, "loss": 0.028, "step": 420 }, { "epoch": 2.7161290322580647, "grad_norm": 0.11866120174733467, "learning_rate": 5.418275829936537e-06, "loss": 0.0335, "step": 421 }, { "epoch": 2.7225806451612904, "grad_norm": 0.09690307384076605, "learning_rate": 5.176912710976467e-06, "loss": 0.0328, "step": 422 }, { "epoch": 2.729032258064516, "grad_norm": 0.08332254956064335, "learning_rate": 4.940905822152453e-06, "loss": 0.0376, "step": 423 }, { "epoch": 2.735483870967742, "grad_norm": 0.10534480078797799, "learning_rate": 4.710268494682146e-06, "loss": 0.0427, "step": 424 }, { "epoch": 2.741935483870968, "grad_norm": 0.07751933249718136, "learning_rate": 4.485013756475076e-06, "loss": 0.0351, "step": 425 }, { "epoch": 2.7483870967741937, "grad_norm": 0.08421370184302716, "learning_rate": 4.2651543313968145e-06, "loss": 0.0349, "step": 426 }, { "epoch": 2.7548387096774194, "grad_norm": 0.09663498717781645, "learning_rate": 4.050702638550275e-06, "loss": 0.0375, "step": 427 }, { "epoch": 2.761290322580645, "grad_norm": 0.10109504165697188, "learning_rate": 3.841670791574137e-06, "loss": 0.0389, "step": 428 }, { "epoch": 2.767741935483871, "grad_norm": 0.07242313321239978, "learning_rate": 3.638070597958665e-06, "loss": 0.032, "step": 429 }, { "epoch": 2.774193548387097, "grad_norm": 0.07339732439560019, "learning_rate": 3.4399135583787043e-06, "loss": 0.0354, "step": 430 }, { "epoch": 2.7806451612903227, "grad_norm": 0.10075765784942739, "learning_rate": 3.2472108660439706e-06, "loss": 0.0492, "step": 431 }, { "epoch": 2.7870967741935484, "grad_norm": 0.07589971575113699, "learning_rate": 3.059973406066963e-06, "loss": 0.031, "step": 432 }, { "epoch": 2.793548387096774, "grad_norm": 0.08019217757997281, "learning_rate": 2.878211754847926e-06, "loss": 0.0421, "step": 433 }, { "epoch": 2.8, "grad_norm": 0.12861410711276888, "learning_rate": 2.7019361794775156e-06, "loss": 0.0365, "step": 434 }, { "epoch": 2.806451612903226, "grad_norm": 0.09493420573436114, "learning_rate": 2.5311566371568507e-06, "loss": 0.0326, "step": 435 }, { "epoch": 2.8129032258064517, "grad_norm": 0.07274924242928238, "learning_rate": 2.365882774634998e-06, "loss": 0.0398, "step": 436 }, { "epoch": 2.8193548387096774, "grad_norm": 0.08169132644155476, "learning_rate": 2.206123927664161e-06, "loss": 0.0379, "step": 437 }, { "epoch": 2.825806451612903, "grad_norm": 0.07933207328170928, "learning_rate": 2.0518891204722168e-06, "loss": 0.0255, "step": 438 }, { "epoch": 2.832258064516129, "grad_norm": 0.09957580250269343, "learning_rate": 1.903187065253076e-06, "loss": 0.0445, "step": 439 }, { "epoch": 2.838709677419355, "grad_norm": 0.08137735740771984, "learning_rate": 1.7600261616745106e-06, "loss": 0.0422, "step": 440 }, { "epoch": 2.8451612903225807, "grad_norm": 0.10587692098340466, "learning_rate": 1.6224144964036681e-06, "loss": 0.0364, "step": 441 }, { "epoch": 2.8516129032258064, "grad_norm": 0.10343920804144822, "learning_rate": 1.4903598426503241e-06, "loss": 0.0327, "step": 442 }, { "epoch": 2.858064516129032, "grad_norm": 0.07946725527009221, "learning_rate": 1.3638696597277679e-06, "loss": 0.0341, "step": 443 }, { "epoch": 2.864516129032258, "grad_norm": 0.09586325289816539, "learning_rate": 1.2429510926314836e-06, "loss": 0.0267, "step": 444 }, { "epoch": 2.870967741935484, "grad_norm": 0.0827004341912287, "learning_rate": 1.1276109716355287e-06, "loss": 0.0416, "step": 445 }, { "epoch": 2.8774193548387097, "grad_norm": 0.07005026862609978, "learning_rate": 1.0178558119067315e-06, "loss": 0.0344, "step": 446 }, { "epoch": 2.8838709677419354, "grad_norm": 0.09331217539366578, "learning_rate": 9.136918131366412e-07, "loss": 0.0353, "step": 447 }, { "epoch": 2.8903225806451616, "grad_norm": 0.08846943376776958, "learning_rate": 8.151248591913518e-07, "loss": 0.0386, "step": 448 }, { "epoch": 2.896774193548387, "grad_norm": 0.1051100476797595, "learning_rate": 7.221605177791691e-07, "loss": 0.0411, "step": 449 }, { "epoch": 2.903225806451613, "grad_norm": 0.10646364294182206, "learning_rate": 6.348040401360833e-07, "loss": 0.0346, "step": 450 }, { "epoch": 2.9096774193548387, "grad_norm": 0.14514647433288821, "learning_rate": 5.530603607290851e-07, "loss": 0.0432, "step": 451 }, { "epoch": 2.9161290322580644, "grad_norm": 0.08548221741715605, "learning_rate": 4.76934096977566e-07, "loss": 0.0318, "step": 452 }, { "epoch": 2.9225806451612906, "grad_norm": 0.07886089519019772, "learning_rate": 4.0642954899238197e-07, "loss": 0.0387, "step": 453 }, { "epoch": 2.9290322580645163, "grad_norm": 0.10187728036061035, "learning_rate": 3.415506993330153e-07, "loss": 0.0385, "step": 454 }, { "epoch": 2.935483870967742, "grad_norm": 0.06877182838764333, "learning_rate": 2.8230121278257637e-07, "loss": 0.0268, "step": 455 }, { "epoch": 2.9419354838709677, "grad_norm": 0.07135072782410999, "learning_rate": 2.2868443614082469e-07, "loss": 0.0348, "step": 456 }, { "epoch": 2.9483870967741934, "grad_norm": 0.0784846474201083, "learning_rate": 1.8070339803509807e-07, "loss": 0.0401, "step": 457 }, { "epoch": 2.9548387096774196, "grad_norm": 0.07055206711925587, "learning_rate": 1.3836080874926049e-07, "loss": 0.0341, "step": 458 }, { "epoch": 2.9612903225806453, "grad_norm": 0.12997432093488728, "learning_rate": 1.0165906007056914e-07, "loss": 0.0397, "step": 459 }, { "epoch": 2.967741935483871, "grad_norm": 0.07787008340791496, "learning_rate": 7.060022515460451e-08, "loss": 0.0327, "step": 460 }, { "epoch": 2.9741935483870967, "grad_norm": 0.08207645810528069, "learning_rate": 4.518605840815315e-08, "loss": 0.0433, "step": 461 }, { "epoch": 2.9806451612903224, "grad_norm": 0.0969168591567496, "learning_rate": 2.5417995390086824e-08, "loss": 0.0358, "step": 462 }, { "epoch": 2.9870967741935486, "grad_norm": 0.08709768527509966, "learning_rate": 1.129715273033849e-08, "loss": 0.0363, "step": 463 }, { "epoch": 2.9935483870967743, "grad_norm": 0.09987493142651711, "learning_rate": 2.824328066730608e-09, "loss": 0.0362, "step": 464 }, { "epoch": 3.0, "grad_norm": 0.06314894199443474, "learning_rate": 0.0, "loss": 0.0271, "step": 465 }, { "epoch": 3.0, "eval_loss": 0.11768443137407303, "eval_runtime": 25.378, "eval_samples_per_second": 5.162, "eval_steps_per_second": 0.67, "step": 465 }, { "epoch": 3.0, "step": 465, "total_flos": 261374226563072.0, "train_loss": 0.07172503977693537, "train_runtime": 3552.9052, "train_samples_per_second": 2.091, "train_steps_per_second": 0.131 } ], "logging_steps": 1, "max_steps": 465, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 261374226563072.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }