{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9739884393063583, "eval_steps": 129, "global_step": 258, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007707129094412331, "grad_norm": 27.94101905822754, "learning_rate": 2.0000000000000002e-07, "loss": 2.3083, "step": 1 }, { "epoch": 0.007707129094412331, "eval_loss": 2.273209810256958, "eval_runtime": 27.5737, "eval_samples_per_second": 1.813, "eval_steps_per_second": 0.471, "step": 1 }, { "epoch": 0.015414258188824663, "grad_norm": 27.76470184326172, "learning_rate": 4.0000000000000003e-07, "loss": 2.2483, "step": 2 }, { "epoch": 0.023121387283236993, "grad_norm": 28.12770652770996, "learning_rate": 6.000000000000001e-07, "loss": 2.29, "step": 3 }, { "epoch": 0.030828516377649325, "grad_norm": 29.105730056762695, "learning_rate": 8.000000000000001e-07, "loss": 2.3396, "step": 4 }, { "epoch": 0.038535645472061654, "grad_norm": 28.90327262878418, "learning_rate": 1.0000000000000002e-06, "loss": 2.2654, "step": 5 }, { "epoch": 0.046242774566473986, "grad_norm": 30.192190170288086, "learning_rate": 1.2000000000000002e-06, "loss": 2.2205, "step": 6 }, { "epoch": 0.05394990366088632, "grad_norm": 27.94385528564453, "learning_rate": 1.4000000000000001e-06, "loss": 2.2161, "step": 7 }, { "epoch": 0.06165703275529865, "grad_norm": 17.948062896728516, "learning_rate": 1.6000000000000001e-06, "loss": 2.0695, "step": 8 }, { "epoch": 0.06936416184971098, "grad_norm": 17.047054290771484, "learning_rate": 1.8000000000000001e-06, "loss": 2.0742, "step": 9 }, { "epoch": 0.07707129094412331, "grad_norm": 15.427838325500488, "learning_rate": 2.0000000000000003e-06, "loss": 1.9962, "step": 10 }, { "epoch": 0.08477842003853564, "grad_norm": 6.273721694946289, "learning_rate": 2.2e-06, "loss": 1.859, "step": 11 }, { "epoch": 0.09248554913294797, "grad_norm": 5.851183891296387, "learning_rate": 2.4000000000000003e-06, "loss": 1.8696, "step": 12 }, { "epoch": 0.1001926782273603, "grad_norm": 4.672307968139648, "learning_rate": 2.6e-06, "loss": 1.8348, "step": 13 }, { "epoch": 0.10789980732177264, "grad_norm": 2.6562161445617676, "learning_rate": 2.8000000000000003e-06, "loss": 1.7719, "step": 14 }, { "epoch": 0.11560693641618497, "grad_norm": 4.172055721282959, "learning_rate": 3e-06, "loss": 1.7984, "step": 15 }, { "epoch": 0.1233140655105973, "grad_norm": 4.055249214172363, "learning_rate": 3.2000000000000003e-06, "loss": 1.8025, "step": 16 }, { "epoch": 0.13102119460500963, "grad_norm": 3.3719887733459473, "learning_rate": 3.4000000000000005e-06, "loss": 1.739, "step": 17 }, { "epoch": 0.13872832369942195, "grad_norm": 2.81038498878479, "learning_rate": 3.6000000000000003e-06, "loss": 1.7478, "step": 18 }, { "epoch": 0.1464354527938343, "grad_norm": 2.2064859867095947, "learning_rate": 3.8000000000000005e-06, "loss": 1.7384, "step": 19 }, { "epoch": 0.15414258188824662, "grad_norm": 1.940885305404663, "learning_rate": 4.000000000000001e-06, "loss": 1.7135, "step": 20 }, { "epoch": 0.16184971098265896, "grad_norm": 1.9488122463226318, "learning_rate": 4.2000000000000004e-06, "loss": 1.7108, "step": 21 }, { "epoch": 0.16955684007707128, "grad_norm": 1.7049647569656372, "learning_rate": 4.4e-06, "loss": 1.6868, "step": 22 }, { "epoch": 0.17726396917148363, "grad_norm": 1.5429236888885498, "learning_rate": 4.600000000000001e-06, "loss": 1.6947, "step": 23 }, { "epoch": 0.18497109826589594, "grad_norm": 1.5304620265960693, "learning_rate": 4.800000000000001e-06, "loss": 1.6846, "step": 24 }, { "epoch": 0.1926782273603083, "grad_norm": 1.5696897506713867, "learning_rate": 5e-06, "loss": 1.6844, "step": 25 }, { "epoch": 0.2003853564547206, "grad_norm": 1.4362632036209106, "learning_rate": 5.2e-06, "loss": 1.6732, "step": 26 }, { "epoch": 0.20809248554913296, "grad_norm": 1.3416928052902222, "learning_rate": 5.400000000000001e-06, "loss": 1.6424, "step": 27 }, { "epoch": 0.21579961464354527, "grad_norm": 1.3142507076263428, "learning_rate": 5.600000000000001e-06, "loss": 1.677, "step": 28 }, { "epoch": 0.22350674373795762, "grad_norm": 1.342984676361084, "learning_rate": 5.8e-06, "loss": 1.6762, "step": 29 }, { "epoch": 0.23121387283236994, "grad_norm": 1.2972025871276855, "learning_rate": 6e-06, "loss": 1.6716, "step": 30 }, { "epoch": 0.23892100192678228, "grad_norm": 1.2904590368270874, "learning_rate": 6.200000000000001e-06, "loss": 1.6234, "step": 31 }, { "epoch": 0.2466281310211946, "grad_norm": 1.1942962408065796, "learning_rate": 6.4000000000000006e-06, "loss": 1.6533, "step": 32 }, { "epoch": 0.2543352601156069, "grad_norm": 1.124014973640442, "learning_rate": 6.600000000000001e-06, "loss": 1.6604, "step": 33 }, { "epoch": 0.26204238921001927, "grad_norm": 1.2125813961029053, "learning_rate": 6.800000000000001e-06, "loss": 1.6335, "step": 34 }, { "epoch": 0.2697495183044316, "grad_norm": 1.2104367017745972, "learning_rate": 7e-06, "loss": 1.6356, "step": 35 }, { "epoch": 0.2774566473988439, "grad_norm": 1.1877591609954834, "learning_rate": 7.2000000000000005e-06, "loss": 1.6349, "step": 36 }, { "epoch": 0.28516377649325625, "grad_norm": 1.2402458190917969, "learning_rate": 7.4e-06, "loss": 1.6463, "step": 37 }, { "epoch": 0.2928709055876686, "grad_norm": 1.1922346353530884, "learning_rate": 7.600000000000001e-06, "loss": 1.5998, "step": 38 }, { "epoch": 0.30057803468208094, "grad_norm": 1.197464942932129, "learning_rate": 7.800000000000002e-06, "loss": 1.6265, "step": 39 }, { "epoch": 0.30828516377649323, "grad_norm": 1.291739821434021, "learning_rate": 8.000000000000001e-06, "loss": 1.6077, "step": 40 }, { "epoch": 0.3159922928709056, "grad_norm": 1.145663857460022, "learning_rate": 8.2e-06, "loss": 1.6152, "step": 41 }, { "epoch": 0.3236994219653179, "grad_norm": 1.1572788953781128, "learning_rate": 8.400000000000001e-06, "loss": 1.6058, "step": 42 }, { "epoch": 0.33140655105973027, "grad_norm": 1.3273899555206299, "learning_rate": 8.6e-06, "loss": 1.6223, "step": 43 }, { "epoch": 0.33911368015414256, "grad_norm": 1.1160943508148193, "learning_rate": 8.8e-06, "loss": 1.5969, "step": 44 }, { "epoch": 0.3468208092485549, "grad_norm": 1.3087902069091797, "learning_rate": 9e-06, "loss": 1.6464, "step": 45 }, { "epoch": 0.35452793834296725, "grad_norm": 1.1589637994766235, "learning_rate": 9.200000000000002e-06, "loss": 1.5799, "step": 46 }, { "epoch": 0.3622350674373796, "grad_norm": 1.159191370010376, "learning_rate": 9.4e-06, "loss": 1.6153, "step": 47 }, { "epoch": 0.3699421965317919, "grad_norm": 1.206766128540039, "learning_rate": 9.600000000000001e-06, "loss": 1.5982, "step": 48 }, { "epoch": 0.37764932562620424, "grad_norm": 1.1924678087234497, "learning_rate": 9.800000000000001e-06, "loss": 1.6054, "step": 49 }, { "epoch": 0.3853564547206166, "grad_norm": 1.2029445171356201, "learning_rate": 1e-05, "loss": 1.6205, "step": 50 }, { "epoch": 0.3930635838150289, "grad_norm": 1.1406632661819458, "learning_rate": 1.02e-05, "loss": 1.6158, "step": 51 }, { "epoch": 0.4007707129094412, "grad_norm": 1.1437443494796753, "learning_rate": 1.04e-05, "loss": 1.6045, "step": 52 }, { "epoch": 0.40847784200385356, "grad_norm": 1.127734661102295, "learning_rate": 1.0600000000000002e-05, "loss": 1.5968, "step": 53 }, { "epoch": 0.4161849710982659, "grad_norm": 1.1851099729537964, "learning_rate": 1.0800000000000002e-05, "loss": 1.6045, "step": 54 }, { "epoch": 0.4238921001926782, "grad_norm": 1.1298301219940186, "learning_rate": 1.1000000000000001e-05, "loss": 1.5908, "step": 55 }, { "epoch": 0.43159922928709055, "grad_norm": 1.095090627670288, "learning_rate": 1.1200000000000001e-05, "loss": 1.5901, "step": 56 }, { "epoch": 0.4393063583815029, "grad_norm": 1.1739152669906616, "learning_rate": 1.14e-05, "loss": 1.6275, "step": 57 }, { "epoch": 0.44701348747591524, "grad_norm": 1.1687606573104858, "learning_rate": 1.16e-05, "loss": 1.5938, "step": 58 }, { "epoch": 0.45472061657032753, "grad_norm": 1.1895908117294312, "learning_rate": 1.18e-05, "loss": 1.6016, "step": 59 }, { "epoch": 0.4624277456647399, "grad_norm": 1.199129581451416, "learning_rate": 1.2e-05, "loss": 1.6317, "step": 60 }, { "epoch": 0.4701348747591522, "grad_norm": 1.2785886526107788, "learning_rate": 1.22e-05, "loss": 1.5672, "step": 61 }, { "epoch": 0.47784200385356457, "grad_norm": 1.2036688327789307, "learning_rate": 1.2400000000000002e-05, "loss": 1.5636, "step": 62 }, { "epoch": 0.48554913294797686, "grad_norm": 1.2586396932601929, "learning_rate": 1.2600000000000001e-05, "loss": 1.5806, "step": 63 }, { "epoch": 0.4932562620423892, "grad_norm": 1.1760581731796265, "learning_rate": 1.2800000000000001e-05, "loss": 1.5724, "step": 64 }, { "epoch": 0.5009633911368016, "grad_norm": 1.1171916723251343, "learning_rate": 1.3000000000000001e-05, "loss": 1.5982, "step": 65 }, { "epoch": 0.5086705202312138, "grad_norm": 1.234012484550476, "learning_rate": 1.3200000000000002e-05, "loss": 1.597, "step": 66 }, { "epoch": 0.5163776493256262, "grad_norm": 1.1812013387680054, "learning_rate": 1.3400000000000002e-05, "loss": 1.6064, "step": 67 }, { "epoch": 0.5240847784200385, "grad_norm": 1.1740922927856445, "learning_rate": 1.3600000000000002e-05, "loss": 1.5915, "step": 68 }, { "epoch": 0.5317919075144508, "grad_norm": 1.277176856994629, "learning_rate": 1.38e-05, "loss": 1.5711, "step": 69 }, { "epoch": 0.5394990366088632, "grad_norm": 1.1419289112091064, "learning_rate": 1.4e-05, "loss": 1.5934, "step": 70 }, { "epoch": 0.5472061657032755, "grad_norm": 1.2002787590026855, "learning_rate": 1.4200000000000001e-05, "loss": 1.6021, "step": 71 }, { "epoch": 0.5549132947976878, "grad_norm": 1.1920689344406128, "learning_rate": 1.4400000000000001e-05, "loss": 1.5893, "step": 72 }, { "epoch": 0.5626204238921002, "grad_norm": 1.2546113729476929, "learning_rate": 1.46e-05, "loss": 1.558, "step": 73 }, { "epoch": 0.5703275529865125, "grad_norm": 1.2610082626342773, "learning_rate": 1.48e-05, "loss": 1.5842, "step": 74 }, { "epoch": 0.5780346820809249, "grad_norm": 1.1725729703903198, "learning_rate": 1.5000000000000002e-05, "loss": 1.5746, "step": 75 }, { "epoch": 0.5857418111753372, "grad_norm": 1.1732540130615234, "learning_rate": 1.5200000000000002e-05, "loss": 1.5804, "step": 76 }, { "epoch": 0.5934489402697495, "grad_norm": 1.281145691871643, "learning_rate": 1.54e-05, "loss": 1.5884, "step": 77 }, { "epoch": 0.6011560693641619, "grad_norm": 1.1668535470962524, "learning_rate": 1.5600000000000003e-05, "loss": 1.6048, "step": 78 }, { "epoch": 0.6088631984585742, "grad_norm": 1.2680914402008057, "learning_rate": 1.58e-05, "loss": 1.5893, "step": 79 }, { "epoch": 0.6165703275529865, "grad_norm": 1.1659042835235596, "learning_rate": 1.6000000000000003e-05, "loss": 1.5791, "step": 80 }, { "epoch": 0.6242774566473989, "grad_norm": 1.2156031131744385, "learning_rate": 1.62e-05, "loss": 1.5916, "step": 81 }, { "epoch": 0.6319845857418112, "grad_norm": 1.1217319965362549, "learning_rate": 1.64e-05, "loss": 1.5725, "step": 82 }, { "epoch": 0.6396917148362236, "grad_norm": 1.307479977607727, "learning_rate": 1.66e-05, "loss": 1.5723, "step": 83 }, { "epoch": 0.6473988439306358, "grad_norm": 1.1636345386505127, "learning_rate": 1.6800000000000002e-05, "loss": 1.5906, "step": 84 }, { "epoch": 0.6551059730250481, "grad_norm": 1.3260914087295532, "learning_rate": 1.7e-05, "loss": 1.5972, "step": 85 }, { "epoch": 0.6628131021194605, "grad_norm": 1.14360511302948, "learning_rate": 1.72e-05, "loss": 1.5621, "step": 86 }, { "epoch": 0.6705202312138728, "grad_norm": 1.4284840822219849, "learning_rate": 1.7400000000000003e-05, "loss": 1.571, "step": 87 }, { "epoch": 0.6782273603082851, "grad_norm": 1.1513473987579346, "learning_rate": 1.76e-05, "loss": 1.6015, "step": 88 }, { "epoch": 0.6859344894026975, "grad_norm": 1.3102519512176514, "learning_rate": 1.7800000000000002e-05, "loss": 1.576, "step": 89 }, { "epoch": 0.6936416184971098, "grad_norm": 1.2329882383346558, "learning_rate": 1.8e-05, "loss": 1.5759, "step": 90 }, { "epoch": 0.7013487475915221, "grad_norm": 1.1875412464141846, "learning_rate": 1.8200000000000002e-05, "loss": 1.555, "step": 91 }, { "epoch": 0.7090558766859345, "grad_norm": 1.1887799501419067, "learning_rate": 1.8400000000000003e-05, "loss": 1.5926, "step": 92 }, { "epoch": 0.7167630057803468, "grad_norm": 1.3002405166625977, "learning_rate": 1.86e-05, "loss": 1.5849, "step": 93 }, { "epoch": 0.7244701348747592, "grad_norm": 1.194841980934143, "learning_rate": 1.88e-05, "loss": 1.5724, "step": 94 }, { "epoch": 0.7321772639691715, "grad_norm": 1.315577745437622, "learning_rate": 1.9e-05, "loss": 1.5296, "step": 95 }, { "epoch": 0.7398843930635838, "grad_norm": 1.239837884902954, "learning_rate": 1.9200000000000003e-05, "loss": 1.5845, "step": 96 }, { "epoch": 0.7475915221579962, "grad_norm": 1.3335014581680298, "learning_rate": 1.94e-05, "loss": 1.5843, "step": 97 }, { "epoch": 0.7552986512524085, "grad_norm": 1.2278801202774048, "learning_rate": 1.9600000000000002e-05, "loss": 1.588, "step": 98 }, { "epoch": 0.7630057803468208, "grad_norm": 1.3168463706970215, "learning_rate": 1.98e-05, "loss": 1.5758, "step": 99 }, { "epoch": 0.7707129094412332, "grad_norm": 1.3854187726974487, "learning_rate": 2e-05, "loss": 1.5489, "step": 100 }, { "epoch": 0.7784200385356455, "grad_norm": 1.1883262395858765, "learning_rate": 1.9998023297700656e-05, "loss": 1.5707, "step": 101 }, { "epoch": 0.7861271676300579, "grad_norm": 1.3683229684829712, "learning_rate": 1.999209397227302e-05, "loss": 1.5916, "step": 102 }, { "epoch": 0.7938342967244701, "grad_norm": 1.2290884256362915, "learning_rate": 1.998221436781933e-05, "loss": 1.6028, "step": 103 }, { "epoch": 0.8015414258188824, "grad_norm": 1.422328233718872, "learning_rate": 1.996838839014696e-05, "loss": 1.5753, "step": 104 }, { "epoch": 0.8092485549132948, "grad_norm": 1.2604609727859497, "learning_rate": 1.9950621505224276e-05, "loss": 1.5537, "step": 105 }, { "epoch": 0.8169556840077071, "grad_norm": 1.1725685596466064, "learning_rate": 1.9928920737019735e-05, "loss": 1.5745, "step": 106 }, { "epoch": 0.8246628131021194, "grad_norm": 1.284792423248291, "learning_rate": 1.9903294664725023e-05, "loss": 1.5868, "step": 107 }, { "epoch": 0.8323699421965318, "grad_norm": 1.1779919862747192, "learning_rate": 1.9873753419363336e-05, "loss": 1.5824, "step": 108 }, { "epoch": 0.8400770712909441, "grad_norm": 1.1214483976364136, "learning_rate": 1.9840308679784207e-05, "loss": 1.5486, "step": 109 }, { "epoch": 0.8477842003853564, "grad_norm": 1.2342500686645508, "learning_rate": 1.9802973668046364e-05, "loss": 1.5867, "step": 110 }, { "epoch": 0.8554913294797688, "grad_norm": 1.1915156841278076, "learning_rate": 1.976176314419051e-05, "loss": 1.579, "step": 111 }, { "epoch": 0.8631984585741811, "grad_norm": 1.215820550918579, "learning_rate": 1.97166934004041e-05, "loss": 1.6014, "step": 112 }, { "epoch": 0.8709055876685935, "grad_norm": 1.2331247329711914, "learning_rate": 1.9667782254580373e-05, "loss": 1.5653, "step": 113 }, { "epoch": 0.8786127167630058, "grad_norm": 1.1282511949539185, "learning_rate": 1.9615049043274207e-05, "loss": 1.5584, "step": 114 }, { "epoch": 0.8863198458574181, "grad_norm": 1.2783879041671753, "learning_rate": 1.955851461405761e-05, "loss": 1.5956, "step": 115 }, { "epoch": 0.8940269749518305, "grad_norm": 1.2387332916259766, "learning_rate": 1.949820131727783e-05, "loss": 1.5604, "step": 116 }, { "epoch": 0.9017341040462428, "grad_norm": 1.3010255098342896, "learning_rate": 1.9434132997221347e-05, "loss": 1.5557, "step": 117 }, { "epoch": 0.9094412331406551, "grad_norm": 1.3249139785766602, "learning_rate": 1.936633498268728e-05, "loss": 1.561, "step": 118 }, { "epoch": 0.9171483622350675, "grad_norm": 1.2389734983444214, "learning_rate": 1.9294834076973872e-05, "loss": 1.5726, "step": 119 }, { "epoch": 0.9248554913294798, "grad_norm": 1.258575677871704, "learning_rate": 1.921965854728207e-05, "loss": 1.5504, "step": 120 }, { "epoch": 0.9325626204238922, "grad_norm": 1.2949562072753906, "learning_rate": 1.9140838113540347e-05, "loss": 1.576, "step": 121 }, { "epoch": 0.9402697495183044, "grad_norm": 1.2721818685531616, "learning_rate": 1.9058403936655235e-05, "loss": 1.5697, "step": 122 }, { "epoch": 0.9479768786127167, "grad_norm": 1.198541522026062, "learning_rate": 1.8972388606192124e-05, "loss": 1.5672, "step": 123 }, { "epoch": 0.9556840077071291, "grad_norm": 1.2318319082260132, "learning_rate": 1.888282612749132e-05, "loss": 1.5511, "step": 124 }, { "epoch": 0.9633911368015414, "grad_norm": 1.3235722780227661, "learning_rate": 1.878975190822434e-05, "loss": 1.5972, "step": 125 }, { "epoch": 0.9710982658959537, "grad_norm": 1.2950528860092163, "learning_rate": 1.869320274439583e-05, "loss": 1.5696, "step": 126 }, { "epoch": 0.9788053949903661, "grad_norm": 1.2997064590454102, "learning_rate": 1.8593216805796612e-05, "loss": 1.5751, "step": 127 }, { "epoch": 0.9865125240847784, "grad_norm": 1.429874062538147, "learning_rate": 1.8489833620913644e-05, "loss": 1.5706, "step": 128 }, { "epoch": 0.9942196531791907, "grad_norm": 1.2658491134643555, "learning_rate": 1.8383094061302767e-05, "loss": 1.5681, "step": 129 }, { "epoch": 0.9942196531791907, "eval_loss": 1.590910792350769, "eval_runtime": 27.4214, "eval_samples_per_second": 1.823, "eval_steps_per_second": 0.474, "step": 129 }, { "epoch": 1.001926782273603, "grad_norm": 1.3823826313018799, "learning_rate": 1.8273040325430575e-05, "loss": 1.5453, "step": 130 }, { "epoch": 1.0096339113680155, "grad_norm": 1.174560546875, "learning_rate": 1.8159715921991612e-05, "loss": 1.5485, "step": 131 }, { "epoch": 1.0028901734104045, "grad_norm": 1.3361918926239014, "learning_rate": 1.804316565270765e-05, "loss": 1.511, "step": 132 }, { "epoch": 1.010597302504817, "grad_norm": 1.5681639909744263, "learning_rate": 1.7923435594615744e-05, "loss": 1.3985, "step": 133 }, { "epoch": 1.0183044315992293, "grad_norm": 1.3742421865463257, "learning_rate": 1.7800573081852124e-05, "loss": 1.4214, "step": 134 }, { "epoch": 1.0260115606936415, "grad_norm": 1.375709056854248, "learning_rate": 1.767462668693908e-05, "loss": 1.3857, "step": 135 }, { "epoch": 1.033718689788054, "grad_norm": 1.5805290937423706, "learning_rate": 1.7545646201582304e-05, "loss": 1.3975, "step": 136 }, { "epoch": 1.0414258188824663, "grad_norm": 1.4905924797058105, "learning_rate": 1.7413682616986185e-05, "loss": 1.3663, "step": 137 }, { "epoch": 1.0491329479768785, "grad_norm": 1.4072234630584717, "learning_rate": 1.7278788103694944e-05, "loss": 1.3969, "step": 138 }, { "epoch": 1.056840077071291, "grad_norm": 1.3707107305526733, "learning_rate": 1.71410159909675e-05, "loss": 1.3932, "step": 139 }, { "epoch": 1.0645472061657033, "grad_norm": 1.376590609550476, "learning_rate": 1.7000420745694256e-05, "loss": 1.3656, "step": 140 }, { "epoch": 1.0722543352601157, "grad_norm": 1.2771958112716675, "learning_rate": 1.6857057950864134e-05, "loss": 1.3694, "step": 141 }, { "epoch": 1.079961464354528, "grad_norm": 1.4592013359069824, "learning_rate": 1.671098428359037e-05, "loss": 1.3913, "step": 142 }, { "epoch": 1.0876685934489403, "grad_norm": 1.3110437393188477, "learning_rate": 1.6562257492703756e-05, "loss": 1.3768, "step": 143 }, { "epoch": 1.0953757225433527, "grad_norm": 1.344575047492981, "learning_rate": 1.64109363759222e-05, "loss": 1.3778, "step": 144 }, { "epoch": 1.1030828516377649, "grad_norm": 1.277384638786316, "learning_rate": 1.62570807566056e-05, "loss": 1.3499, "step": 145 }, { "epoch": 1.1107899807321773, "grad_norm": 1.2886083126068115, "learning_rate": 1.6100751460105244e-05, "loss": 1.3669, "step": 146 }, { "epoch": 1.1184971098265897, "grad_norm": 1.3069369792938232, "learning_rate": 1.5942010289717108e-05, "loss": 1.3918, "step": 147 }, { "epoch": 1.1262042389210019, "grad_norm": 1.2955520153045654, "learning_rate": 1.5780920002248484e-05, "loss": 1.3645, "step": 148 }, { "epoch": 1.1339113680154143, "grad_norm": 1.3005629777908325, "learning_rate": 1.561754428320771e-05, "loss": 1.3522, "step": 149 }, { "epoch": 1.1416184971098267, "grad_norm": 1.413831114768982, "learning_rate": 1.5451947721626676e-05, "loss": 1.4064, "step": 150 }, { "epoch": 1.1493256262042388, "grad_norm": 1.2129186391830444, "learning_rate": 1.5284195784526196e-05, "loss": 1.3576, "step": 151 }, { "epoch": 1.1570327552986512, "grad_norm": 1.3991036415100098, "learning_rate": 1.5114354791034225e-05, "loss": 1.3735, "step": 152 }, { "epoch": 1.1647398843930636, "grad_norm": 1.2813304662704468, "learning_rate": 1.494249188616723e-05, "loss": 1.3689, "step": 153 }, { "epoch": 1.1724470134874758, "grad_norm": 1.3265056610107422, "learning_rate": 1.4768675014285063e-05, "loss": 1.3714, "step": 154 }, { "epoch": 1.1801541425818882, "grad_norm": 1.244061827659607, "learning_rate": 1.4592972892229779e-05, "loss": 1.371, "step": 155 }, { "epoch": 1.1878612716763006, "grad_norm": 1.2477822303771973, "learning_rate": 1.4415454982159121e-05, "loss": 1.3705, "step": 156 }, { "epoch": 1.1955684007707128, "grad_norm": 1.3200701475143433, "learning_rate": 1.4236191464085286e-05, "loss": 1.3657, "step": 157 }, { "epoch": 1.2032755298651252, "grad_norm": 1.237042784690857, "learning_rate": 1.405525320812994e-05, "loss": 1.3602, "step": 158 }, { "epoch": 1.2109826589595376, "grad_norm": 1.30637526512146, "learning_rate": 1.3872711746506413e-05, "loss": 1.3758, "step": 159 }, { "epoch": 1.21868978805395, "grad_norm": 1.3186436891555786, "learning_rate": 1.3688639245240078e-05, "loss": 1.3907, "step": 160 }, { "epoch": 1.2263969171483622, "grad_norm": 1.2071219682693481, "learning_rate": 1.3503108475638244e-05, "loss": 1.3698, "step": 161 }, { "epoch": 1.2341040462427746, "grad_norm": 1.1885581016540527, "learning_rate": 1.331619278552068e-05, "loss": 1.3774, "step": 162 }, { "epoch": 1.241811175337187, "grad_norm": 1.1943105459213257, "learning_rate": 1.3127966070222273e-05, "loss": 1.3538, "step": 163 }, { "epoch": 1.2495183044315992, "grad_norm": 1.1982208490371704, "learning_rate": 1.2938502743379212e-05, "loss": 1.3797, "step": 164 }, { "epoch": 1.2572254335260116, "grad_norm": 1.191636562347412, "learning_rate": 1.2747877707510252e-05, "loss": 1.371, "step": 165 }, { "epoch": 1.264932562620424, "grad_norm": 1.2649930715560913, "learning_rate": 1.2556166324404747e-05, "loss": 1.3789, "step": 166 }, { "epoch": 1.2726396917148362, "grad_norm": 1.206629753112793, "learning_rate": 1.2363444385329052e-05, "loss": 1.4232, "step": 167 }, { "epoch": 1.2803468208092486, "grad_norm": 1.3122280836105347, "learning_rate": 1.2169788081063181e-05, "loss": 1.3871, "step": 168 }, { "epoch": 1.288053949903661, "grad_norm": 1.1735293865203857, "learning_rate": 1.1975273971779528e-05, "loss": 1.3741, "step": 169 }, { "epoch": 1.2957610789980731, "grad_norm": 1.3187175989151, "learning_rate": 1.1779978956775507e-05, "loss": 1.3644, "step": 170 }, { "epoch": 1.3034682080924855, "grad_norm": 1.2720284461975098, "learning_rate": 1.158398024407215e-05, "loss": 1.3661, "step": 171 }, { "epoch": 1.311175337186898, "grad_norm": 1.3094247579574585, "learning_rate": 1.1387355319890685e-05, "loss": 1.3617, "step": 172 }, { "epoch": 1.3188824662813103, "grad_norm": 1.2710013389587402, "learning_rate": 1.119018191801905e-05, "loss": 1.373, "step": 173 }, { "epoch": 1.3265895953757225, "grad_norm": 1.2845216989517212, "learning_rate": 1.0992537989080618e-05, "loss": 1.3712, "step": 174 }, { "epoch": 1.334296724470135, "grad_norm": 1.277942419052124, "learning_rate": 1.0794501669717146e-05, "loss": 1.3676, "step": 175 }, { "epoch": 1.342003853564547, "grad_norm": 1.190983533859253, "learning_rate": 1.05961512516982e-05, "loss": 1.3906, "step": 176 }, { "epoch": 1.3497109826589595, "grad_norm": 1.3649415969848633, "learning_rate": 1.039756515096926e-05, "loss": 1.3883, "step": 177 }, { "epoch": 1.357418111753372, "grad_norm": 1.2454570531845093, "learning_rate": 1.0198821876650702e-05, "loss": 1.3581, "step": 178 }, { "epoch": 1.3651252408477843, "grad_norm": 1.2593861818313599, "learning_rate": 1e-05, "loss": 1.3726, "step": 179 }, { "epoch": 1.3728323699421965, "grad_norm": 1.2473970651626587, "learning_rate": 9.801178123349298e-06, "loss": 1.4003, "step": 180 }, { "epoch": 1.3805394990366089, "grad_norm": 1.210317611694336, "learning_rate": 9.602434849030747e-06, "loss": 1.3875, "step": 181 }, { "epoch": 1.388246628131021, "grad_norm": 1.2112162113189697, "learning_rate": 9.403848748301802e-06, "loss": 1.3769, "step": 182 }, { "epoch": 1.3959537572254335, "grad_norm": 1.1812710762023926, "learning_rate": 9.205498330282857e-06, "loss": 1.3521, "step": 183 }, { "epoch": 1.4036608863198459, "grad_norm": 1.2227439880371094, "learning_rate": 9.007462010919387e-06, "loss": 1.3781, "step": 184 }, { "epoch": 1.4113680154142583, "grad_norm": 1.2190202474594116, "learning_rate": 8.809818081980954e-06, "loss": 1.3529, "step": 185 }, { "epoch": 1.4190751445086704, "grad_norm": 1.1302087306976318, "learning_rate": 8.61264468010932e-06, "loss": 1.3937, "step": 186 }, { "epoch": 1.4267822736030829, "grad_norm": 1.1406745910644531, "learning_rate": 8.416019755927851e-06, "loss": 1.3715, "step": 187 }, { "epoch": 1.4344894026974953, "grad_norm": 1.1421207189559937, "learning_rate": 8.2200210432245e-06, "loss": 1.3441, "step": 188 }, { "epoch": 1.4421965317919074, "grad_norm": 1.1282238960266113, "learning_rate": 8.024726028220474e-06, "loss": 1.3484, "step": 189 }, { "epoch": 1.4499036608863198, "grad_norm": 1.1182270050048828, "learning_rate": 7.83021191893682e-06, "loss": 1.3736, "step": 190 }, { "epoch": 1.4576107899807322, "grad_norm": 1.1618040800094604, "learning_rate": 7.636555614670953e-06, "loss": 1.3481, "step": 191 }, { "epoch": 1.4653179190751446, "grad_norm": 1.1137522459030151, "learning_rate": 7.443833675595254e-06, "loss": 1.3523, "step": 192 }, { "epoch": 1.4730250481695568, "grad_norm": 1.2066893577575684, "learning_rate": 7.252122292489747e-06, "loss": 1.3616, "step": 193 }, { "epoch": 1.4807321772639692, "grad_norm": 1.1276185512542725, "learning_rate": 7.061497256620793e-06, "loss": 1.353, "step": 194 }, { "epoch": 1.4884393063583814, "grad_norm": 1.1631989479064941, "learning_rate": 6.872033929777731e-06, "loss": 1.3483, "step": 195 }, { "epoch": 1.4961464354527938, "grad_norm": 1.1466474533081055, "learning_rate": 6.683807214479323e-06, "loss": 1.3678, "step": 196 }, { "epoch": 1.5038535645472062, "grad_norm": 1.132791519165039, "learning_rate": 6.496891524361757e-06, "loss": 1.3576, "step": 197 }, { "epoch": 1.5115606936416186, "grad_norm": 1.1244217157363892, "learning_rate": 6.311360754759923e-06, "loss": 1.3832, "step": 198 }, { "epoch": 1.5192678227360308, "grad_norm": 1.1384022235870361, "learning_rate": 6.127288253493591e-06, "loss": 1.3578, "step": 199 }, { "epoch": 1.5269749518304432, "grad_norm": 1.1305923461914062, "learning_rate": 5.944746791870062e-06, "loss": 1.368, "step": 200 }, { "epoch": 1.5346820809248554, "grad_norm": 1.1514254808425903, "learning_rate": 5.7638085359147235e-06, "loss": 1.3533, "step": 201 }, { "epoch": 1.5423892100192678, "grad_norm": 1.1174412965774536, "learning_rate": 5.584545017840886e-06, "loss": 1.3729, "step": 202 }, { "epoch": 1.5500963391136802, "grad_norm": 1.0917550325393677, "learning_rate": 5.40702710777022e-06, "loss": 1.3539, "step": 203 }, { "epoch": 1.5578034682080926, "grad_norm": 1.0902245044708252, "learning_rate": 5.231324985714942e-06, "loss": 1.3711, "step": 204 }, { "epoch": 1.565510597302505, "grad_norm": 1.1163016557693481, "learning_rate": 5.057508113832772e-06, "loss": 1.3782, "step": 205 }, { "epoch": 1.5732177263969171, "grad_norm": 1.1419026851654053, "learning_rate": 4.885645208965779e-06, "loss": 1.3825, "step": 206 }, { "epoch": 1.5809248554913293, "grad_norm": 1.1543022394180298, "learning_rate": 4.7158042154738094e-06, "loss": 1.3551, "step": 207 }, { "epoch": 1.5886319845857417, "grad_norm": 1.0950229167938232, "learning_rate": 4.548052278373327e-06, "loss": 1.3375, "step": 208 }, { "epoch": 1.5963391136801541, "grad_norm": 1.1293272972106934, "learning_rate": 4.382455716792291e-06, "loss": 1.3498, "step": 209 }, { "epoch": 1.6040462427745665, "grad_norm": 1.123294472694397, "learning_rate": 4.219079997751515e-06, "loss": 1.3519, "step": 210 }, { "epoch": 1.611753371868979, "grad_norm": 1.114963412284851, "learning_rate": 4.057989710282897e-06, "loss": 1.3597, "step": 211 }, { "epoch": 1.6194605009633911, "grad_norm": 1.0550687313079834, "learning_rate": 3.899248539894756e-06, "loss": 1.3594, "step": 212 }, { "epoch": 1.6271676300578035, "grad_norm": 1.0849530696868896, "learning_rate": 3.7429192433944016e-06, "loss": 1.3585, "step": 213 }, { "epoch": 1.6348747591522157, "grad_norm": 1.0992666482925415, "learning_rate": 3.589063624077802e-06, "loss": 1.3765, "step": 214 }, { "epoch": 1.642581888246628, "grad_norm": 1.1028841733932495, "learning_rate": 3.4377425072962467e-06, "loss": 1.3551, "step": 215 }, { "epoch": 1.6502890173410405, "grad_norm": 1.0943406820297241, "learning_rate": 3.2890157164096315e-06, "loss": 1.3426, "step": 216 }, { "epoch": 1.657996146435453, "grad_norm": 1.0819505453109741, "learning_rate": 3.1429420491358696e-06, "loss": 1.37, "step": 217 }, { "epoch": 1.665703275529865, "grad_norm": 1.0802693367004395, "learning_rate": 2.999579254305748e-06, "loss": 1.363, "step": 218 }, { "epoch": 1.6734104046242775, "grad_norm": 1.0993719100952148, "learning_rate": 2.8589840090325028e-06, "loss": 1.373, "step": 219 }, { "epoch": 1.6811175337186897, "grad_norm": 1.1456190347671509, "learning_rate": 2.721211896305059e-06, "loss": 1.337, "step": 220 }, { "epoch": 1.688824662813102, "grad_norm": 1.1663914918899536, "learning_rate": 2.5863173830138212e-06, "loss": 1.3695, "step": 221 }, { "epoch": 1.6965317919075145, "grad_norm": 1.10584557056427, "learning_rate": 2.454353798417698e-06, "loss": 1.336, "step": 222 }, { "epoch": 1.7042389210019269, "grad_norm": 1.0759963989257812, "learning_rate": 2.325373313060919e-06, "loss": 1.3436, "step": 223 }, { "epoch": 1.7119460500963393, "grad_norm": 1.0870240926742554, "learning_rate": 2.19942691814788e-06, "loss": 1.3458, "step": 224 }, { "epoch": 1.7196531791907514, "grad_norm": 1.101758360862732, "learning_rate": 2.0765644053842583e-06, "loss": 1.3562, "step": 225 }, { "epoch": 1.7273603082851636, "grad_norm": 1.0890095233917236, "learning_rate": 1.9568343472923524e-06, "loss": 1.3717, "step": 226 }, { "epoch": 1.735067437379576, "grad_norm": 1.0680255889892578, "learning_rate": 1.840284078008393e-06, "loss": 1.3402, "step": 227 }, { "epoch": 1.7427745664739884, "grad_norm": 1.0939226150512695, "learning_rate": 1.7269596745694295e-06, "loss": 1.3688, "step": 228 }, { "epoch": 1.7504816955684008, "grad_norm": 1.0921400785446167, "learning_rate": 1.6169059386972342e-06, "loss": 1.3316, "step": 229 }, { "epoch": 1.7581888246628132, "grad_norm": 1.048248529434204, "learning_rate": 1.5101663790863597e-06, "loss": 1.3347, "step": 230 }, { "epoch": 1.7658959537572254, "grad_norm": 1.080112338066101, "learning_rate": 1.4067831942033904e-06, "loss": 1.3476, "step": 231 }, { "epoch": 1.7736030828516378, "grad_norm": 1.0681742429733276, "learning_rate": 1.3067972556041753e-06, "loss": 1.3833, "step": 232 }, { "epoch": 1.78131021194605, "grad_norm": 1.070648431777954, "learning_rate": 1.2102480917756632e-06, "loss": 1.3601, "step": 233 }, { "epoch": 1.7890173410404624, "grad_norm": 1.0644824504852295, "learning_rate": 1.1171738725086833e-06, "loss": 1.3503, "step": 234 }, { "epoch": 1.7967244701348748, "grad_norm": 1.0739105939865112, "learning_rate": 1.0276113938078768e-06, "loss": 1.3686, "step": 235 }, { "epoch": 1.8044315992292872, "grad_norm": 1.0678924322128296, "learning_rate": 9.415960633447674e-07, "loss": 1.348, "step": 236 }, { "epoch": 1.8121387283236994, "grad_norm": 1.0799516439437866, "learning_rate": 8.591618864596541e-07, "loss": 1.3571, "step": 237 }, { "epoch": 1.8198458574181118, "grad_norm": 1.0634883642196655, "learning_rate": 7.803414527179343e-07, "loss": 1.3383, "step": 238 }, { "epoch": 1.827552986512524, "grad_norm": 1.0771961212158203, "learning_rate": 7.051659230261299e-07, "loss": 1.363, "step": 239 }, { "epoch": 1.8352601156069364, "grad_norm": 1.0874431133270264, "learning_rate": 6.336650173127224e-07, "loss": 1.3617, "step": 240 }, { "epoch": 1.8429672447013488, "grad_norm": 1.0974795818328857, "learning_rate": 5.658670027786561e-07, "loss": 1.3685, "step": 241 }, { "epoch": 1.8506743737957612, "grad_norm": 1.0479196310043335, "learning_rate": 5.017986827221733e-07, "loss": 1.3502, "step": 242 }, { "epoch": 1.8583815028901736, "grad_norm": 1.0492548942565918, "learning_rate": 4.4148538594239176e-07, "loss": 1.335, "step": 243 }, { "epoch": 1.8660886319845857, "grad_norm": 1.0853266716003418, "learning_rate": 3.8495095672579584e-07, "loss": 1.374, "step": 244 }, { "epoch": 1.873795761078998, "grad_norm": 1.0355820655822754, "learning_rate": 3.322177454196285e-07, "loss": 1.3624, "step": 245 }, { "epoch": 1.8815028901734103, "grad_norm": 1.0834838151931763, "learning_rate": 2.8330659959589944e-07, "loss": 1.3798, "step": 246 }, { "epoch": 1.8892100192678227, "grad_norm": 1.0652782917022705, "learning_rate": 2.3823685580949273e-07, "loss": 1.3659, "step": 247 }, { "epoch": 1.8969171483622351, "grad_norm": 1.0404101610183716, "learning_rate": 1.9702633195363918e-07, "loss": 1.3512, "step": 248 }, { "epoch": 1.9046242774566475, "grad_norm": 1.0568671226501465, "learning_rate": 1.5969132021579347e-07, "loss": 1.3674, "step": 249 }, { "epoch": 1.9123314065510597, "grad_norm": 1.0594927072525024, "learning_rate": 1.262465806366664e-07, "loss": 1.3562, "step": 250 }, { "epoch": 1.920038535645472, "grad_norm": 1.0357680320739746, "learning_rate": 9.670533527498139e-08, "loss": 1.3609, "step": 251 }, { "epoch": 1.9277456647398843, "grad_norm": 1.043979525566101, "learning_rate": 7.10792629802659e-08, "loss": 1.3224, "step": 252 }, { "epoch": 1.9354527938342967, "grad_norm": 1.045696496963501, "learning_rate": 4.937849477572587e-08, "loss": 1.3764, "step": 253 }, { "epoch": 1.943159922928709, "grad_norm": 1.0686546564102173, "learning_rate": 3.161160985304168e-08, "loss": 1.3563, "step": 254 }, { "epoch": 1.9508670520231215, "grad_norm": 1.0238714218139648, "learning_rate": 1.77856321806702e-08, "loss": 1.3303, "step": 255 }, { "epoch": 1.9585741811175337, "grad_norm": 1.0332682132720947, "learning_rate": 7.906027726981568e-09, "loss": 1.3412, "step": 256 }, { "epoch": 1.966281310211946, "grad_norm": 1.079695224761963, "learning_rate": 1.9767022993444353e-09, "loss": 1.3787, "step": 257 }, { "epoch": 1.9739884393063583, "grad_norm": 1.0400174856185913, "learning_rate": 0.0, "loss": 1.3201, "step": 258 }, { "epoch": 1.9739884393063583, "eval_loss": 1.5724855661392212, "eval_runtime": 27.4882, "eval_samples_per_second": 1.819, "eval_steps_per_second": 0.473, "step": 258 } ], "logging_steps": 1, "max_steps": 258, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 43, "total_flos": 3.045490266539557e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }